/*
 * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/sched.h>
#include <linux/mutex.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)

/* Each CPU is put into a group. In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group. The
 * exception is group NR_CPUS which is the default group. This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy. A busy CPU might be running the busy loop
 * below or general kernel code. The count is decremented on entry to
 * the old pm_idle handler and incremented on exit. The aim is to
 * avoid the count going to zero or negative. This situation can
 * occur temporarily during module unload or CPU hot-plug but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS+1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

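/* Instrumentation hook; presumably used for tracing the idle-loop
 * state in debug builds. In this build it expands to nothing. The
 * state values passed below appear to be: 0 = leaving the idle loop,
 * 1 = busy again, 2 = entering the busy-wait, 3 = deferring to the
 * old pm_idle handler. */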
#define sys_tune_set_state(CPU,STATE) \
        do { } while(0)

/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

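/* Replacement pm_idle handler. While no reschedule is pending, this
 * CPU busy-waits with interrupts enabled so that interrupts are
 * serviced with minimal wake-up latency. It only defers to the
 * original pm_idle handler when enough other CPUs in its group are
 * busy that the group's busy count stays at or above one. */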
static void sys_tune_pm_idle(void)
{
        atomic_t *busy_cpus_ptr;
        int busy_cpus;
        int cpu = smp_processor_id();

        busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

        sys_tune_set_state(cpu, 2);

        local_irq_enable();
        while (!need_resched()) {
                busy_cpus = atomic_read(busy_cpus_ptr);

                /* If other CPUs in this group are busy then let this
                 * CPU go idle. We mustn't let the number of busy
                 * CPUs drop below 1. */
                if (busy_cpus > 1 &&
                    old_pm_idle != NULL &&
                    (atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
                                    busy_cpus - 1) == busy_cpus)) {
                        local_irq_disable();
                        sys_tune_set_state(cpu, 3);
                        /* This check might not be necessary, but it
                         * seems safest to include it because there
                         * might be a kernel version which requires
                         * it. */
                        if (need_resched())
                                local_irq_enable();
                        else
                                old_pm_idle();
                        /* This CPU is busy again. */
                        sys_tune_set_state(cpu, 1);
                        atomic_add(1, busy_cpus_ptr);
                        return;
                }

                cpu_relax();
        }
        sys_tune_set_state(cpu, 0);
}

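/* Per-CPU work item handler. The work does nothing itself; merely
 * running it in process context is enough to force the target CPU's
 * idle thread out of the idle loop. */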
void sys_tune_work_func(struct work_struct *work)
{
        /* Do nothing. Since this function is running in process
         * context, the idle thread isn't running on this CPU. */
}

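/* IPI callback, run on every CPU by sys_tune_refresh(). Queues this
 * CPU's work item so the idle thread gets preempted once the work
 * runs. */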
#ifdef CONFIG_SMP
static void sys_tune_smp_call(void *info)
{
        schedule_work(&get_cpu_var(sys_tune_cpu_work));
        put_cpu_var(sys_tune_cpu_work);
}
#endif

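/* Force every CPU out of the idle loop so it re-reads its group
 * assignment and busy count. Older kernels took an extra "retry"
 * argument to on_each_cpu(), hence the two call signatures. */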
#ifdef CONFIG_SMP
static void sys_tune_refresh(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
        on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
        on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
        /* The current thread is executing on the one and only CPU so
         * the idle thread isn't running. */
}
#endif

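/* Pick the group for a CPU. When the sibling-thread topology is
 * available, all hyperthreads of a core share one group so that only
 * one of them is kept busy; otherwise each CPU forms its own group. */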
static int sys_tune_cpu_group(int cpu)
{
#ifdef CONFIG_SMP
        const cpumask_t *mask;
        int other_cpu;
        int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
        /* Keep one hyperthread busy per core. */
        mask = topology_thread_cpumask(cpu);
#else
        return cpu;
#endif
        for_each_cpu_mask(other_cpu, *(mask)) {
                group = per_cpu(idle_cpu_group, other_cpu);
                if (group != NR_CPUS)
                        return group;
        }
#endif

        return cpu;
}

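/* Assign a CPU to its group and count it as busy. Called with
 * sys_tune_startup_mutex held. */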
static void sys_tune_add_cpu(int cpu)
{
        int group;

        /* Do nothing if this CPU has already been added. */
        if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
                return;

        group = sys_tune_cpu_group(cpu);
        per_cpu(idle_cpu_group, cpu) = group;
        atomic_inc(&(busy_cpu_count[group]));
}

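/* Remove a CPU from its group and drop its contribution to the
 * group's busy count. Called with sys_tune_startup_mutex held. */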
static void sys_tune_del_cpu(int cpu)
{
        int group;

        if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
                return;

        group = per_cpu(idle_cpu_group, cpu);
        /* If the CPU was busy, this can cause the count to drop to
         * zero. To rectify this, we need to cause one of the other
         * CPUs in the group to exit the idle loop. If the CPU was
         * not busy then this causes the contribution for this CPU to
         * go to -1 which can cause the overall count to drop to zero
         * or go negative. To rectify this situation we need to cause
         * this CPU to exit the idle loop. */
        atomic_dec(&(busy_cpu_count[group]));
        per_cpu(idle_cpu_group, cpu) = NR_CPUS;
}

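/* CPU hotplug notifier. Keeps the group assignments and busy counts
 * up to date as CPUs come online or go away. */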
static int sys_tune_cpu_notify(struct notifier_block *self,
                               unsigned long action, void *hcpu)
{
        int cpu = (long)hcpu;

        switch (action) {
#ifdef CPU_ONLINE_FROZEN
        case CPU_ONLINE_FROZEN:
#endif
        case CPU_ONLINE:
                mutex_lock(&sys_tune_startup_mutex);
                sys_tune_add_cpu(cpu);
                mutex_unlock(&sys_tune_startup_mutex);
                /* The CPU might have already entered the idle loop in
                 * the wrong group. Make sure it exits the idle loop
                 * so that it picks up the correct group. */
                sys_tune_refresh();
                break;

#ifdef CPU_DEAD_FROZEN
        case CPU_DEAD_FROZEN:
#endif
        case CPU_DEAD:
                mutex_lock(&sys_tune_startup_mutex);
                sys_tune_del_cpu(cpu);
                mutex_unlock(&sys_tune_startup_mutex);
                /* The deleted CPU may have been the only busy CPU in
                 * the group. Make sure one of the other CPUs in the
                 * group exits the idle loop. */
                sys_tune_refresh();
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block sys_tune_cpu_nb = {
        .notifier_call = sys_tune_cpu_notify,
};

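/* Install sys_tune_pm_idle as the system idle handler, saving the
 * previous handler in old_pm_idle so it can be chained to and later
 * restored. */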
static void sys_tune_ensure_init(void)
{
        BUG_ON(old_pm_idle != NULL);

        /* Atomically update pm_idle to &sys_tune_pm_idle. The old value
         * is stored in old_pm_idle before installing the new
         * handler. */
        do {
                old_pm_idle = pm_idle;
        } while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
                 old_pm_idle);
}
#endif

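/* Module teardown: stop listening for hotplug events, restore the
 * original pm_idle handler, then kick every CPU out of the idle loop
 * and wait for the work items so that no CPU is still running
 * sys_tune_pm_idle when the module is unloaded. */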
void sys_tune_fini(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
        void (*old)(void);
        int cpu;

        unregister_cpu_notifier(&sys_tune_cpu_nb);

        mutex_lock(&sys_tune_startup_mutex);

        old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);

        for_each_online_cpu(cpu)
                sys_tune_del_cpu(cpu);

        mutex_unlock(&sys_tune_startup_mutex);

        /* Our handler may still be executing on other CPUs.
         * Schedule this thread on all CPUs to make sure all
         * idle threads get interrupted. */
        sys_tune_refresh();

        /* Make sure the work item has finished executing on all CPUs.
         * This in turn ensures that all idle threads have been
         * interrupted. */
        flush_scheduled_work();
#endif /* CONFIG_X86 */
}

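/* Module init: prepare the per-CPU work items, register the hotplug
 * notifier, assign every online CPU to a group, and finally install
 * the replacement idle handler. */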
void sys_tune_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
        int cpu;

        for_each_possible_cpu(cpu) {
                INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
                          sys_tune_work_func);
        }

        /* Start by registering the handler to ensure we don't miss
         * any updates. */
        register_cpu_notifier(&sys_tune_cpu_nb);

        mutex_lock(&sys_tune_startup_mutex);

        for_each_online_cpu(cpu)
                sys_tune_add_cpu(cpu);

        sys_tune_ensure_init();

        mutex_unlock(&sys_tune_startup_mutex);

        /* Ensure our idle handler starts to run. */
        sys_tune_refresh();
#endif
}