cpufreq(4): Add support for Intel Speed Shift

Intel Speed Shift is Intel's technology to control frequency in hardware,
with hints from software.

Let's get a working version of this in the tree and we can refine it from
here.

Submitted by:	bwidawsk, scottph
Reviewed by:	bcr (manpages), myself
Discussed with:	jhb, kib (earlier versions)
With feedback from:	Greg V, gallatin, freebsdnewbie AT freenet.de
Relnotes:	yes
Differential Revision:	https://reviews.freebsd.org/D18028
This commit is contained in:
Conrad Meyer 2020-01-22 23:28:42 +00:00
parent 7ec5e1c4cd
commit 4577cf3744
10 changed files with 843 additions and 122 deletions

View File

@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd March 3, 2006
.Dd January 22, 2020
.Dt CPUFREQ 4
.Os
.Sh NAME
@ -85,6 +85,10 @@ sysctl entry.
.Bl -tag -width indent
.It Va dev.cpu.%d.freq
Current active CPU frequency in MHz.
.It Va dev.cpu.%d.freq_driver
The specific
.Nm
driver used by this cpu.
.It Va dev.cpu.%d.freq_levels
Currently available levels for the CPU (frequency/power usage).
Values are in units of MHz and milliwatts.

View File

@ -0,0 +1,89 @@
.\"
.\" Copyright (c) 2019 Intel Corporation
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd January 22, 2020
.Dt HWPSTATE_INTEL 4
.Os
.Sh NAME
.Nm hwpstate_intel
.Nd Intel Speed Shift Technology driver
.Sh SYNOPSIS
To compile this driver into your kernel
place the following line in your kernel
configuration file:
.Bd -ragged -offset indent
.Cd "device cpufreq"
.Ed
.Sh DESCRIPTION
The
.Nm
driver provides support for hardware-controlled performance states on Intel
platforms, also known as Intel Speed Shift Technology.
.Sh LOADER TUNABLES
.Bl -tag -width indent
.It Va hint.hwpstate_intel.0.disabled
Can be used to disable
.Nm ,
allowing other compatible drivers to manage performance states, like
.Xr est 4 .
.Pq default 0
.El
.Sh SYSCTL VARIABLES
The following
.Xr sysctl 8
values are available:
.Bl -tag -width indent
.It Va dev.hwpstate_intel.%d.\%desc
Describes the attached driver.
.It dev.hwpstate_intel.0.%desc: Intel Speed Shift
.It Va dev.hwpstate_intel.%d.\%driver
Driver in use, always hwpstate_intel.
.It dev.hwpstate_intel.0.%driver: hwpstate_intel
.It Va dev.hwpstate_intel.%d.\%parent
The CPU that is exposing these frequencies.
For example
.Va cpu0 .
.It dev.hwpstate_intel.0.%parent: cpu0
.It Va dev.hwpstate_intel.%d.epp
Energy/Performance Preference.
Valid values range from 0 to 100.
Setting this field conveys a hint to the hardware regarding a preference towards
performance (at value 0), energy efficiency (at value 100), or somewhere in
between.
.It dev.hwpstate_intel.0.epp: 0
.El
.Sh COMPATIBILITY
.Nm
is only found on supported Intel CPUs.
.Sh SEE ALSO
.Xr cpufreq 4
.Rs
.%T "Intel 64 and IA-32 Architectures Software Developer Manuals"
.%U "http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html"
.Re
.Sh AUTHORS
This manual page was written by
.An D Scott Phillips Aq Mt scottph@FreeBSD.org .

View File

@ -290,7 +290,8 @@ x86/acpica/srat.c optional acpi
x86/bios/smbios.c optional smbios
x86/bios/vpd.c optional vpd
x86/cpufreq/est.c optional cpufreq
x86/cpufreq/hwpstate.c optional cpufreq
x86/cpufreq/hwpstate_amd.c optional cpufreq
x86/cpufreq/hwpstate_intel.c optional cpufreq
x86/cpufreq/p4tcc.c optional cpufreq
x86/cpufreq/powernow.c optional cpufreq
x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci

View File

@ -76,6 +76,7 @@ struct cpufreq_softc {
int all_count;
int max_mhz;
device_t dev;
device_t cf_drv_dev;
struct sysctl_ctx_list sysctl_ctx;
struct task startup_task;
struct cf_level *levels_buf;
@ -142,6 +143,11 @@ SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1,
SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1,
"Print verbose debugging messages");
/*
* This is called as the result of a hardware specific frequency control driver
* calling cpufreq_register. It provides a general interface for system wide
* frequency controls and operates on a per cpu basis.
*/
static int
cpufreq_attach(device_t dev)
{
@ -149,7 +155,6 @@ cpufreq_attach(device_t dev)
struct pcpu *pc;
device_t parent;
uint64_t rate;
int numdevs;
CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
sc = device_get_softc(dev);
@ -164,6 +169,7 @@ cpufreq_attach(device_t dev)
sc->max_mhz = cpu_get_nominal_mhz(dev);
/* If that fails, try to measure the current rate */
if (sc->max_mhz <= 0) {
CF_DEBUG("Unable to obtain nominal frequency.\n");
pc = cpu_get_pcpu(dev);
if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
sc->max_mhz = rate / 1000000;
@ -171,15 +177,6 @@ cpufreq_attach(device_t dev)
sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
}
/*
* Only initialize one set of sysctls for all CPUs. In the future,
* if multiple CPUs can have different settings, we can move these
* sysctls to be under every CPU instead of just the first one.
*/
numdevs = devclass_get_count(cpufreq_dc);
if (numdevs > 1)
return (0);
CF_DEBUG("initializing one-time data for %s\n",
device_get_nameunit(dev));
sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
@ -216,7 +213,6 @@ cpufreq_detach(device_t dev)
{
struct cpufreq_softc *sc;
struct cf_saved_freq *saved_freq;
int numdevs;
CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
sc = device_get_softc(dev);
@ -227,12 +223,7 @@ cpufreq_detach(device_t dev)
free(saved_freq, M_TEMP);
}
/* Only clean up these resources when the last device is detaching. */
numdevs = devclass_get_count(cpufreq_dc);
if (numdevs == 1) {
CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
free(sc->levels_buf, M_DEVBUF);
}
return (0);
}
@ -421,26 +412,75 @@ out:
return (error);
}
/*
 * Ask the given cpufreq driver for its current setting and return the
 * frequency field of that setting, or -1 if the driver's get method fails.
 */
static int
cpufreq_get_frequency(device_t dev)
{
	struct cf_setting cur;
	int error;

	error = CPUFREQ_DRV_GET(dev, &cur);
	return (error == 0 ? cur.freq : -1);
}
/*
 * Look up the driver's current frequency in levels[0..count-1].
 *
 * Returns the index of the first level whose total_set.freq equals the
 * frequency reported by the driver, or -1 if the frequency could not be
 * obtained or no level matches it.
 */
static int
cpufreq_get_level(device_t dev, struct cf_level *levels, int count)
{
	int cur_freq, idx;

	cur_freq = cpufreq_get_frequency(dev);
	if (cur_freq < 0)
		return (-1);

	for (idx = 0; idx < count; idx++) {
		if (levels[idx].total_set.freq == cur_freq)
			return (idx);
	}

	return (-1);
}
/*
* Used by the cpufreq core, this function will populate *level with the current
* frequency as either determined by a cached value sc->curr_level, or in the
* case the lower level driver has set the CPUFREQ_FLAG_UNCACHED flag, it will
* obtain the frequency from the driver itself.
*/
static int
cf_get_method(device_t dev, struct cf_level *level)
{
struct cpufreq_softc *sc;
struct cf_level *levels;
struct cf_setting *curr_set, set;
struct cf_setting *curr_set;
struct pcpu *pc;
device_t *devs;
int bdiff, count, diff, error, i, n, numdevs;
int bdiff, count, diff, error, i, type;
uint64_t rate;
sc = device_get_softc(dev);
error = 0;
levels = NULL;
/* If we already know the current frequency, we're done. */
/*
* If we already know the current frequency, and the driver didn't ask
* for uncached usage, we're done.
*/
CF_MTX_LOCK(&sc->lock);
curr_set = &sc->curr_level.total_set;
if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
error = CPUFREQ_DRV_TYPE(sc->cf_drv_dev, &type);
if (error == 0 && (type & CPUFREQ_FLAG_UNCACHED)) {
struct cf_setting set;
/*
* If the driver wants to always report back the real frequency,
* first try the driver and if that fails, fall back to
* estimating.
*/
if (CPUFREQ_DRV_GET(sc->cf_drv_dev, &set) != 0)
goto estimate;
sc->curr_level.total_set = set;
CF_DEBUG("get returning immediate freq %d\n", curr_set->freq);
goto out;
} else if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
CF_DEBUG("get returning known freq %d\n", curr_set->freq);
error = 0;
goto out;
}
CF_MTX_UNLOCK(&sc->lock);
@ -461,11 +501,6 @@ cf_get_method(device_t dev, struct cf_level *level)
free(levels, M_TEMP);
return (error);
}
error = device_get_children(device_get_parent(dev), &devs, &numdevs);
if (error) {
free(levels, M_TEMP);
return (error);
}
/*
* Reacquire the lock and search for the given level.
@ -476,24 +511,21 @@ cf_get_method(device_t dev, struct cf_level *level)
* The estimation code below catches this case though.
*/
CF_MTX_LOCK(&sc->lock);
for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
if (!device_is_attached(devs[n]))
continue;
if (CPUFREQ_DRV_GET(devs[n], &set) != 0)
continue;
for (i = 0; i < count; i++) {
if (set.freq == levels[i].total_set.freq) {
i = cpufreq_get_level(sc->cf_drv_dev, levels, count);
if (i >= 0)
sc->curr_level = levels[i];
break;
}
}
}
free(devs, M_TEMP);
else
CF_DEBUG("Couldn't find supported level for %s\n",
device_get_nameunit(sc->cf_drv_dev));
if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
goto out;
}
estimate:
CF_MTX_ASSERT(&sc->lock);
/*
* We couldn't find an exact match, so attempt to estimate and then
* match against a level.
@ -525,56 +557,47 @@ out:
return (error);
}
/*
* Either directly obtain settings from the cpufreq driver, or build a list of
* relative settings to be integrated later against an absolute max.
*/
static int
cf_levels_method(device_t dev, struct cf_level *levels, int *count)
cpufreq_add_levels(device_t cf_dev, struct cf_setting_lst *rel_sets)
{
struct cf_setting_array *set_arr;
struct cf_setting_lst rel_sets;
struct cpufreq_softc *sc;
struct cf_level *lev;
struct cf_setting *sets;
struct pcpu *pc;
device_t *devs;
int error, i, numdevs, set_count, type;
uint64_t rate;
device_t dev;
struct cpufreq_softc *sc;
int type, set_count, error;
if (levels == NULL || count == NULL)
return (EINVAL);
sc = device_get_softc(cf_dev);
dev = sc->cf_drv_dev;
TAILQ_INIT(&rel_sets);
sc = device_get_softc(dev);
error = device_get_children(device_get_parent(dev), &devs, &numdevs);
if (error)
return (error);
sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
if (sets == NULL) {
free(devs, M_TEMP);
return (ENOMEM);
}
/* Get settings from all cpufreq drivers. */
CF_MTX_LOCK(&sc->lock);
for (i = 0; i < numdevs; i++) {
/* Skip devices that aren't ready. */
if (!device_is_attached(devs[i]))
continue;
if (!device_is_attached(cf_dev))
return (0);
/*
* Get settings, skipping drivers that offer no settings or
* provide settings for informational purposes only.
*/
error = CPUFREQ_DRV_TYPE(devs[i], &type);
if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) {
error = CPUFREQ_DRV_TYPE(dev, &type);
if (error != 0 || (type & CPUFREQ_FLAG_INFO_ONLY)) {
if (error == 0) {
CF_DEBUG("skipping info-only driver %s\n",
device_get_nameunit(devs[i]));
device_get_nameunit(cf_dev));
}
continue;
return (error);
}
sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
if (sets == NULL)
return (ENOMEM);
set_count = MAX_SETTINGS;
error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count);
if (error || set_count == 0)
continue;
error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
if (error != 0 || set_count == 0)
goto out;
/* Add the settings to our absolute/relative lists. */
switch (type & CPUFREQ_TYPE_MASK) {
@ -590,20 +613,48 @@ cf_levels_method(device_t dev, struct cf_level *levels, int *count)
}
bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
set_arr->count = set_count;
TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
TAILQ_INSERT_TAIL(rel_sets, set_arr, link);
break;
default:
error = EINVAL;
}
out:
free(sets, M_TEMP);
return (error);
}
static int
cf_levels_method(device_t dev, struct cf_level *levels, int *count)
{
struct cf_setting_array *set_arr;
struct cf_setting_lst rel_sets;
struct cpufreq_softc *sc;
struct cf_level *lev;
struct pcpu *pc;
int error, i;
uint64_t rate;
if (levels == NULL || count == NULL)
return (EINVAL);
TAILQ_INIT(&rel_sets);
sc = device_get_softc(dev);
CF_MTX_LOCK(&sc->lock);
error = cpufreq_add_levels(sc->dev, &rel_sets);
if (error)
goto out;
}
/*
* If there are no absolute levels, create a fake one at 100%. We
* then cache the clockrate for later use as our base frequency.
*/
if (TAILQ_EMPTY(&sc->all_levels)) {
struct cf_setting set;
CF_DEBUG("No absolute levels returned by driver\n");
if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
sc->max_mhz = cpu_get_nominal_mhz(dev);
/*
@ -617,10 +668,10 @@ cf_levels_method(device_t dev, struct cf_level *levels, int *count)
sc->max_mhz = rate / 1000000;
}
}
memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets));
sets[0].freq = sc->max_mhz;
sets[0].dev = NULL;
error = cpufreq_insert_abs(sc, sets, 1);
memset(&set, CPUFREQ_VAL_UNKNOWN, sizeof(set));
set.freq = sc->max_mhz;
set.dev = NULL;
error = cpufreq_insert_abs(sc, &set, 1);
if (error)
goto out;
}
@ -665,8 +716,6 @@ out:
TAILQ_REMOVE(&rel_sets, set_arr, link);
free(set_arr, M_TEMP);
}
free(devs, M_TEMP);
free(sets, M_TEMP);
return (error);
}
@ -1011,11 +1060,24 @@ out:
return (error);
}
/*
 * Publish a read-only "freq_driver" string sysctl beneath the cpufreq
 * device's sysctl tree, naming the driver device recorded in
 * sc->cf_drv_dev.  The softc's cf_drv_dev must be set before this is called.
 */
static void
cpufreq_add_freq_driver_sysctl(device_t cf_dev)
{
	struct cpufreq_softc *sc;
	struct sysctl_oid *tree;
	const char *drv_name;

	sc = device_get_softc(cf_dev);
	tree = device_get_sysctl_tree(cf_dev);
	drv_name = device_get_nameunit(sc->cf_drv_dev);

	SYSCTL_ADD_CONST_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(tree),
	    OID_AUTO, "freq_driver", CTLFLAG_RD, drv_name,
	    "cpufreq driver used by this cpu");
}
int
cpufreq_register(device_t dev)
{
struct cpufreq_softc *sc;
device_t cf_dev, cpu_dev;
int error;
/* Add a sysctl to get each driver's settings separately. */
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
@ -1031,6 +1093,7 @@ cpufreq_register(device_t dev)
if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
sc = device_get_softc(cf_dev);
sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
MPASS(sc->cf_drv_dev != NULL);
return (0);
}
@ -1040,40 +1103,36 @@ cpufreq_register(device_t dev)
return (ENOMEM);
device_quiet(cf_dev);
return (device_probe_and_attach(cf_dev));
error = device_probe_and_attach(cf_dev);
if (error)
return (error);
sc = device_get_softc(cf_dev);
sc->cf_drv_dev = dev;
cpufreq_add_freq_driver_sysctl(cf_dev);
return (error);
}
int
cpufreq_unregister(device_t dev)
{
device_t cf_dev, *devs;
int cfcount, devcount, error, i, type;
device_t cf_dev;
struct cpufreq_softc *sc;
/*
* If this is the last cpufreq child device, remove the control
* device as well. We identify cpufreq children by calling a method
* they support.
*/
error = device_get_children(device_get_parent(dev), &devs, &devcount);
if (error)
return (error);
cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
if (cf_dev == NULL) {
device_printf(dev,
"warning: cpufreq_unregister called with no cpufreq device active\n");
free(devs, M_TEMP);
return (0);
}
cfcount = 0;
for (i = 0; i < devcount; i++) {
if (!device_is_attached(devs[i]))
continue;
if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0)
cfcount++;
}
if (cfcount <= 1)
sc = device_get_softc(cf_dev);
MPASS(sc->cf_drv_dev == dev);
device_delete_child(device_get_parent(cf_dev), cf_dev);
free(devs, M_TEMP);
return (0);
}

View File

@ -11,7 +11,7 @@ SRCS+= bus_if.h cpufreq_if.h device_if.h pci_if.h
.PATH: ${SRCTOP}/sys/x86/cpufreq
SRCS+= acpi_if.h opt_acpi.h
SRCS+= est.c hwpstate.c p4tcc.c powernow.c
SRCS+= est.c hwpstate_amd.c p4tcc.c powernow.c hwpstate_intel.c
.endif
.if ${MACHINE} == "i386"

View File

@ -120,11 +120,16 @@ TAILQ_HEAD(cf_level_lst, cf_level);
* information about settings but rely on another machine-dependent driver
* for actually performing the frequency transition (e.g., ACPI performance
* states of type "functional fixed hardware.")
*
* The "uncached" flag tells CPUFREQ_DRV_GET to try obtaining the real
* instantaneous frequency from the underlying hardware regardless of cached
* state. It is probably a bug to not combine this with "info only".
*/
#define CPUFREQ_TYPE_MASK 0xffff
#define CPUFREQ_TYPE_RELATIVE (1<<0)
#define CPUFREQ_TYPE_ABSOLUTE (1<<1)
#define CPUFREQ_FLAG_INFO_ONLY (1<<16)
#define CPUFREQ_FLAG_UNCACHED (1<<17)
/*
* When setting a level, the caller indicates the priority of this request.

View File

@ -50,6 +50,8 @@ __FBSDID("$FreeBSD$");
#include <dev/acpica/acpivar.h>
#include "acpi_if.h"
#include <x86/cpufreq/hwpstate_intel_internal.h>
/* Status/control registers (from the IA-32 System Programming Guide). */
#define MSR_PERF_STATUS 0x198
#define MSR_PERF_CTL 0x199
@ -898,6 +900,7 @@ static driver_t est_driver = {
static devclass_t est_devclass;
DRIVER_MODULE(est, cpu, est_driver, est_devclass, 0, 0);
MODULE_DEPEND(est, hwpstate_intel, 1, 1, 1);
static int
est_features(driver_t *driver, u_int *features)
@ -916,6 +919,15 @@ est_identify(driver_t *driver, device_t parent)
{
device_t child;
/*
* Defer to hwpstate if it is present. This priority logic
* should be replaced with normal newbus probing in the
* future.
*/
intel_hwpstate_identify(NULL, parent);
if (device_find_child(parent, "hwpstate_intel", -1) != NULL)
return;
/* Make sure we're not being doubly invoked. */
if (device_find_child(parent, "est", -1) != NULL)
return;

View File

@ -0,0 +1,516 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2018 Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted providing that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/sbuf.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/smp.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <dev/acpica/acpivar.h>
#include <x86/cpufreq/hwpstate_intel_internal.h>
#include "acpi_if.h"
#include "cpufreq_if.h"
extern uint64_t tsc_freq;
/*
 * Forward declarations for the device and cpufreq methods implemented in
 * this file.  intel_hwpstate_identify is not listed here because it is not
 * static; its prototype comes from hwpstate_intel_internal.h.
 */
static int intel_hwpstate_probe(device_t dev);
static int intel_hwpstate_attach(device_t dev);
static int intel_hwpstate_detach(device_t dev);
static int intel_hwpstate_suspend(device_t dev);
static int intel_hwpstate_resume(device_t dev);
static int intel_hwpstate_get(device_t dev, struct cf_setting *cf);
static int intel_hwpstate_type(device_t dev, int *type);
/*
 * Method table for the hwpstate_intel driver.  Only the "get" and "type"
 * cpufreq methods are registered here; no set or settings method is listed.
 */
static device_method_t intel_hwpstate_methods[] = {
/* Device interface */
DEVMETHOD(device_identify, intel_hwpstate_identify),
DEVMETHOD(device_probe, intel_hwpstate_probe),
DEVMETHOD(device_attach, intel_hwpstate_attach),
DEVMETHOD(device_detach, intel_hwpstate_detach),
DEVMETHOD(device_suspend, intel_hwpstate_suspend),
DEVMETHOD(device_resume, intel_hwpstate_resume),
/* cpufreq interface */
DEVMETHOD(cpufreq_drv_get, intel_hwpstate_get),
DEVMETHOD(cpufreq_drv_type, intel_hwpstate_type),
DEVMETHOD_END
};
/*
 * Per-device state for one hwpstate_intel instance.
 */
struct hwp_softc {
/* Back pointer to the owning device; set in intel_hwpstate_attach(). */
device_t dev;
/*
 * Feature flags.  intel_hwpstate_attach() sets each one to true when the
 * corresponding CPUTPM1_HWP_* bit is present in CPUID leaf 6 EAX.
 */
bool hwp_notifications;
bool hwp_activity_window;
bool hwp_pref_ctrl;
bool hwp_pkg_ctrl;
/*
 * Value read from and written to MSR_IA32_HWP_REQUEST by
 * set_autonomous_hwp(); rewritten to the MSR by intel_hwpstate_resume().
 */
uint64_t req; /* Cached copy of last request */
/* Performance levels decoded from MSR_IA32_HWP_CAPABILITIES. */
uint8_t high;
uint8_t guaranteed;
uint8_t efficient;
uint8_t low;
};
/* Device class handle passed to DRIVER_MODULE below. */
static devclass_t hwpstate_intel_devclass;
/*
 * Driver description: name, method table, and the size of the per-device
 * softc (struct hwp_softc).
 */
static driver_t hwpstate_intel_driver = {
"hwpstate_intel",
intel_hwpstate_methods,
sizeof(struct hwp_softc),
};
/* Register the driver on the "cpu" bus; no module event handler is given. */
DRIVER_MODULE(hwpstate_intel, cpu, hwpstate_intel_driver,
hwpstate_intel_devclass, NULL, NULL);
/*
 * Sysctl handler that produces a human-readable dump of the HWP state of the
 * CPU backing this device.
 *
 * arg1 is the device's struct hwp_softc.  The calling thread is bound to the
 * target CPU while the MSRs are read and is unbound before returning.  The
 * output reports whether HWP is enabled and, if so, the four levels from
 * MSR_IA32_HWP_CAPABILITIES followed by the requested EPP, desired, maximum
 * and minimum fields.  When package control is supported, each requested
 * field is taken from the per-thread MSR if its "valid" bit is set there and
 * from the package MSR otherwise.
 *
 * Returns ENXIO if the device has no pcpu, otherwise the result of
 * sbuf_finish() or SYSCTL_OUT().
 */
static int
intel_hwp_dump_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	device_t dev;
	struct pcpu *pc;
	struct sbuf *sb;
	struct hwp_softc *sc;
	uint64_t data, data2;
	int ret;

	sc = (struct hwp_softc *)arg1;
	dev = sc->dev;

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	sb = sbuf_new(NULL, NULL, 1024, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
	sbuf_putc(sb, '\n');

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	/*
	 * The rdmsr_safe() results are not checked below, so pre-zero the
	 * destination before each read; a failed read then reports zeros
	 * instead of an indeterminate or stale value.
	 */
	data = 0;
	rdmsr_safe(MSR_IA32_PM_ENABLE, &data);
	sbuf_printf(sb, "CPU%d: HWP %sabled\n", pc->pc_cpuid,
	    ((data & 1) ? "En" : "Dis"));

	if (data == 0)
		goto out;

	/*
	 * uint64_t is not "unsigned long" on every x86 target, so print via
	 * uintmax_t and %ju rather than %lu.
	 */
	data = 0;
	rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &data);
	sbuf_printf(sb, "\tHighest Performance: %03ju\n",
	    (uintmax_t)(data & 0xff));
	sbuf_printf(sb, "\tGuaranteed Performance: %03ju\n",
	    (uintmax_t)((data >> 8) & 0xff));
	sbuf_printf(sb, "\tEfficient Performance: %03ju\n",
	    (uintmax_t)((data >> 16) & 0xff));
	sbuf_printf(sb, "\tLowest Performance: %03ju\n",
	    (uintmax_t)((data >> 24) & 0xff));

	data = 0;
	rdmsr_safe(MSR_IA32_HWP_REQUEST, &data);

	/*
	 * data2 is consulted by pkg_print() whenever package control is
	 * supported and a per-thread valid bit is clear, even if the package
	 * MSR was never read; give it a defined value for that case.
	 */
	data2 = 0;
	if (sc->hwp_pkg_ctrl && (data & IA32_HWP_REQUEST_PACKAGE_CONTROL))
		rdmsr_safe(MSR_IA32_HWP_REQUEST_PKG, &data2);

	sbuf_putc(sb, '\n');

#define pkg_print(x, name, offset) do {					\
	if (!sc->hwp_pkg_ctrl || (data & x) != 0)			\
		sbuf_printf(sb, "\t%s: %03ju\n", name,			\
		    (uintmax_t)((data >> offset) & 0xff));		\
	else								\
		sbuf_printf(sb, "\t%s: %03ju\n", name,			\
		    (uintmax_t)((data2 >> offset) & 0xff));		\
} while (0)

	pkg_print(IA32_HWP_REQUEST_EPP_VALID,
	    "Requested Efficiency Performance Preference", 24);
	pkg_print(IA32_HWP_REQUEST_DESIRED_VALID,
	    "Requested Desired Performance", 16);
	pkg_print(IA32_HWP_REQUEST_MAXIMUM_VALID,
	    "Requested Maximum Performance", 8);
	pkg_print(IA32_HWP_REQUEST_MINIMUM_VALID,
	    "Requested Minimum Performance", 0);
#undef pkg_print

	sbuf_putc(sb, '\n');

out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	ret = sbuf_finish(sb);
	if (ret == 0)
		ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
	sbuf_delete(sb);

	return (ret);
}
/*
 * Scale a percentage in [0, 100] onto the 8-bit range [0, 0xff], truncating
 * toward zero.
 */
static inline int
percent_to_raw(int x)
{
	MPASS(x >= 0 && x <= 100);

	return (x * 0xff / 100);
}
/*
 * Convert a value expressed in tenths (x * 10, expected in [0, 1000]) to the
 * nearest integer x, with halves rounding up.
 *
 * Rounding instead of truncating lets friendly percentages survive a trip
 * through the raw encoding: without it, user-supplied values such as 25, 50
 * and 75 would read back as 24, 49 and 74.
 */
static inline int
round10(int xtimes10)
{
	const int biased = xtimes10 + 5;

	return (biased / 10);
}
/*
 * Scale an 8-bit raw value in [0, 0xff] to a percentage, rounded to the
 * nearest integer via round10().
 */
static inline int
raw_to_percent(int x)
{
	int tenths;

	MPASS(x >= 0 && x <= 0xff);

	tenths = x * 1000 / 0xff;
	return (round10(tenths));
}
static int
sysctl_epp_select(SYSCTL_HANDLER_ARGS)
{
device_t dev;
struct pcpu *pc;
uint64_t requested;
uint32_t val;
int ret;
dev = oidp->oid_arg1;
pc = cpu_get_pcpu(dev);
if (pc == NULL)
return (ENXIO);
thread_lock(curthread);
sched_bind(curthread, pc->pc_cpuid);
thread_unlock(curthread);
rdmsr_safe(MSR_IA32_HWP_REQUEST, &requested);
val = (requested & IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE) >> 24;
val = raw_to_percent(val);
MPASS(val >= 0 && val <= 100);
ret = sysctl_handle_int(oidp, &val, 0, req);
if (ret || req->newptr == NULL)
goto out;
if (val > 100) {
ret = EINVAL;
goto out;
}
val = percent_to_raw(val);
requested &= ~IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE;
requested |= val << 24;
wrmsr_safe(MSR_IA32_HWP_REQUEST, requested);
out:
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
return (ret);
}
/*
 * Identify routine: add a "hwpstate_intel" child to the given parent when
 * the processor advertises HWP support.
 *
 * Nothing is added if such a child already exists, the CPU vendor is not
 * Intel, the driver is disabled through its resource hint, or CPUID does
 * not report the HWP baseline feature.  The driver argument is not used.
 */
void
intel_hwpstate_identify(driver_t *driver, device_t parent)
{
	uint32_t cpuid_regs[4];

	/* Already present on this parent; do not add a second child. */
	if (device_find_child(parent, "hwpstate_intel", -1) != NULL)
		return;

	/* Intel only, and only when not disabled by hint. */
	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
	    resource_disabled("hwpstate_intel", 0))
		return;

	/*
	 * Intel SDM 14.4.1 (HWP Programming Interfaces):
	 * HWP support is discovered with CPUID; executing it with EAX=06H
	 * returns 5 bit flags in bits 7 through 11 of CPUID.06H:EAX.  The
	 * leaf must exist before it can be queried.
	 */
	if (cpu_high < 6)
		return;

	/*
	 * Intel SDM 14.4.1 (HWP Programming Interfaces):
	 * CPUID.06H:EAX[bit 7] signals the HWP baseline resource and
	 * capability.  When set, HWP provides the architectural MSRs
	 * IA32_PM_ENABLE, IA32_HWP_CAPABILITIES, IA32_HWP_REQUEST and
	 * IA32_HWP_STATUS.
	 */
	do_cpuid(6, cpuid_regs);
	if ((cpuid_regs[0] & CPUTPM1_HWP) == 0)
		return;

	if (BUS_ADD_CHILD(parent, 10, "hwpstate_intel", -1) != NULL &&
	    bootverbose)
		device_printf(parent, "hwpstate registered\n");
}
/*
 * Probe routine: set the device description and return
 * BUS_PROBE_NOWILDCARD unconditionally.
 */
static int
intel_hwpstate_probe(device_t dev)
{
	const char *desc = "Intel Speed Shift";

	device_set_desc(dev, desc);
	return (BUS_PROBE_NOWILDCARD);
}
/* FIXME: Need to support PKG variant */
static int
set_autonomous_hwp(struct hwp_softc *sc)
{
struct pcpu *pc;
device_t dev;
uint64_t caps;
int ret;
dev = sc->dev;
pc = cpu_get_pcpu(dev);
if (pc == NULL)
return (ENXIO);
thread_lock(curthread);
sched_bind(curthread, pc->pc_cpuid);
thread_unlock(curthread);
/* XXX: Many MSRs aren't readable until feature is enabled */
ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
if (ret) {
device_printf(dev, "Failed to enable HWP for cpu%d (%d)\n",
pc->pc_cpuid, ret);
goto out;
}
ret = rdmsr_safe(MSR_IA32_HWP_REQUEST, &sc->req);
if (ret)
return (ret);
ret = rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &caps);
if (ret)
return (ret);
sc->high = IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(caps);
sc->guaranteed = IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(caps);
sc->efficient = IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(caps);
sc->low = IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(caps);
/* hardware autonomous selection determines the performance target */
sc->req &= ~IA32_HWP_DESIRED_PERFORMANCE;
/* enable HW dynamic selection of window size */
sc->req &= ~IA32_HWP_ACTIVITY_WINDOW;
/* IA32_HWP_REQUEST.Minimum_Performance = IA32_HWP_CAPABILITIES.Lowest_Performance */
sc->req &= ~IA32_HWP_MINIMUM_PERFORMANCE;
sc->req |= sc->low;
/* IA32_HWP_REQUEST.Maximum_Performance = IA32_HWP_CAPABILITIES.Highest_Performance. */
sc->req &= ~IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE;
sc->req |= sc->high << 8;
ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
if (ret) {
device_printf(dev,
"Failed to setup autonomous HWP for cpu%d (file a bug)\n",
pc->pc_cpuid);
}
out:
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
return (ret);
}
/*
 * Attach routine: record which optional HWP features CPUID leaf 6 reports,
 * switch the CPU to autonomous HWP, create the debug dump and "epp" sysctls,
 * and register the device with cpufreq.
 *
 * Returns the error from set_autonomous_hwp() if that step fails, otherwise
 * the result of cpufreq_register().
 */
static int
intel_hwpstate_attach(device_t dev)
{
	struct hwp_softc *sc;
	uint32_t regs[4];
	uint32_t features;
	int error;

	sc = device_get_softc(dev);
	sc->dev = dev;

	/* Feature flags are only ever raised here, never cleared. */
	do_cpuid(6, regs);
	features = regs[0];
	if ((features & CPUTPM1_HWP_NOTIFICATION) != 0)
		sc->hwp_notifications = true;
	if ((features & CPUTPM1_HWP_ACTIVITY_WINDOW) != 0)
		sc->hwp_activity_window = true;
	if ((features & CPUTPM1_HWP_PERF_PREF) != 0)
		sc->hwp_pref_ctrl = true;
	if ((features & CPUTPM1_HWP_PKG) != 0)
		sc->hwp_pkg_ctrl = true;

	error = set_autonomous_hwp(sc);
	if (error != 0)
		return (error);

	/* Hidden (CTLFLAG_SKIP) state dump under the static debug node. */
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO,
	    device_get_nameunit(dev),
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP,
	    sc, 0, intel_hwp_dump_sysctl_handler, "A", "");

	/* Per-device energy/performance preference knob. */
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
	    "epp", CTLTYPE_INT | CTLFLAG_RWTUN, dev, sizeof(dev),
	    sysctl_epp_select, "I",
	    "Efficiency/Performance Preference "
	    "(range from 0, most performant, through 100, most efficient)");

	return (cpufreq_register(dev));
}
/*
 * Detach routine: unregister the device from cpufreq and pass that result
 * back to the caller.
 */
static int
intel_hwpstate_detach(device_t dev)
{
	int error;

	error = cpufreq_unregister(dev);
	return (error);
}
/*
 * cpufreq "get" method: report the current frequency of the device's CPU.
 *
 * The whole setting is first filled with CPUFREQ_VAL_UNKNOWN bytes, then
 * set->dev is pointed at this device and set->freq is set to the estimated
 * clock rate in MHz.  If the estimate fails, freq keeps its filled-in value
 * and the call still returns 0.  volts, power and lat are always reported
 * as CPUFREQ_VAL_UNKNOWN.
 *
 * Returns EINVAL if set is NULL, ENXIO if the device has no pcpu, and 0
 * otherwise.
 */
static int
intel_hwpstate_get(device_t dev, struct cf_setting *set)
{
	struct pcpu *pc;
	uint64_t hz;

	if (set == NULL)
		return (EINVAL);

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	memset(set, CPUFREQ_VAL_UNKNOWN, sizeof(*set));
	set->dev = dev;

	if (cpu_est_clockrate(pc->pc_cpuid, &hz) == 0)
		set->freq = hz / 1000000;

	set->volts = CPUFREQ_VAL_UNKNOWN;
	set->power = CPUFREQ_VAL_UNKNOWN;
	set->lat = CPUFREQ_VAL_UNKNOWN;

	return (0);
}
/*
 * cpufreq "type" method: describe this driver as absolute, info-only and
 * uncached.
 *
 * Returns EINVAL if type is NULL, otherwise stores the flags and returns 0.
 */
static int
intel_hwpstate_type(device_t dev, int *type)
{
	int flags;

	if (type == NULL)
		return (EINVAL);

	flags = CPUFREQ_TYPE_ABSOLUTE;
	flags |= CPUFREQ_FLAG_INFO_ONLY;
	flags |= CPUFREQ_FLAG_UNCACHED;
	*type = flags;

	return (0);
}
/*
 * Suspend routine: nothing is saved here; always returns 0.  The request
 * value rewritten by intel_hwpstate_resume() is the copy cached in the softc.
 */
static int
intel_hwpstate_suspend(device_t dev)
{
return (0);
}
/*
 * Resume routine: repeat the part of set_autonomous_hwp() that touches the
 * hardware, i.e. re-enable HWP and rewrite MSR_IA32_HWP_REQUEST from the
 * cached sc->req.  This path is untested; it exists because testers saw a
 * bogus MSR_IA32_HWP_REQUEST after resume without it.
 *
 * The calling thread is bound to the device's CPU for the MSR writes and
 * unbound before returning.  Returns ENXIO if the device has no pcpu,
 * otherwise the result of the last wrmsr_safe() attempted.
 */
static int
intel_hwpstate_resume(device_t dev)
{
	struct hwp_softc *sc;
	struct pcpu *pc;
	int error;

	sc = device_get_softc(dev);

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	error = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
	if (error != 0) {
		device_printf(dev,
		    "Failed to enable HWP for cpu%d after suspend (%d)\n",
		    pc->pc_cpuid, error);
	} else {
		/* Only restore the request once HWP is enabled again. */
		error = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
		if (error != 0) {
			device_printf(dev,
			    "Failed to setup autonomous HWP for cpu%d after suspend\n",
			    pc->pc_cpuid);
		}
	}

	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (error);
}

View File

@ -0,0 +1,35 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2018 Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted providing that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __X86_CPUFREQ_HWPSTATE_INTEL_INTERNAL_H
#define __X86_CPUFREQ_HWPSTATE_INTEL_INTERNAL_H
/*
 * Identify routine of the hwpstate_intel driver.  Declared here so that code
 * outside hwpstate_intel.c (est's identify routine) can invoke it directly.
 */
void intel_hwpstate_identify(driver_t *driver, device_t parent);
#endif