numam-dpdk/lib/librte_power/rte_power_pmd_mgmt.c
Liang Ma 682a645438 power: add ethdev power management
Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will wait until either
a TSC timestamp expires or packets arrive.

This API mandates a core-to-single-queue mapping (that is, multiple
queues per device are supported, but they have to be polled on different
cores).

This design uses PMD RX callbacks.
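
For reference, below is a minimal sketch of the ethdev RX callback hook
this design builds on; the callback and helper names are illustrative
only and not part of this patch:

/* Illustrative RX callback: invoked after the PMD has filled 'pkts' for
 * one queue. Returning nb_rx unchanged leaves the burst as-is; a power
 * management callback counts empty bursts (nb_rx == 0) here.
 */
static uint16_t
example_rx_cb(uint16_t port_id __rte_unused, uint16_t queue_id __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *user_param __rte_unused)
{
	return nb_rx;
}

/* Illustrative registration: the callback then runs on every
 * rte_eth_rx_burst() call for that port/queue.
 */
static const struct rte_eth_rxtx_callback *
attach_example_cb(uint16_t port_id, uint16_t queue_id)
{
	return rte_eth_add_rx_callback(port_id, queue_id, example_rx_cb, NULL);
}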

1. UMWAIT/UMONITOR:

   When a certain threshold of empty polls is reached, the core will go
   into a power-optimized sleep while waiting for the address of the next
   RX descriptor to be written to.

2. TPAUSE/Pause instruction:

   This method uses the pause (or TPAUSE, if available) instruction to
   avoid busy polling.

3. Frequency scaling:

   Reuse the existing DPDK power library to scale core frequency up/down
   depending on traffic volume.
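
As a rough usage sketch (the function names and the choice of MONITOR
mode below are illustrative, not part of this patch), an application that
polls one RX queue per lcore could enable and later disable power
management like this, once the port and its RX queues are configured:

#include <rte_power_pmd_mgmt.h>

/* Illustrative setup: lcore_id polls exactly one RX queue (port_id,
 * queue_id), as this API requires. RTE_POWER_MGMT_TYPE_PAUSE or
 * RTE_POWER_MGMT_TYPE_SCALE could be requested instead.
 */
static int
setup_rx_power_mgmt(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id)
{
	int ret;

	ret = rte_power_ethdev_pmgmt_queue_enable(lcore_id, port_id,
			queue_id, RTE_POWER_MGMT_TYPE_MONITOR);
	if (ret < 0)
		/* e.g. -ENOTSUP when UMWAIT/UMONITOR or the PMD's monitor
		 * address callback is not available
		 */
		return ret;

	/* ... lcore_id then runs its usual rte_eth_rx_burst() loop ... */
	return 0;
}

/* Illustrative teardown, called before the queue/port is torn down */
static int
teardown_rx_power_mgmt(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id)
{
	return rte_power_ethdev_pmgmt_queue_disable(lcore_id, port_id,
			queue_id);
}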

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
2021-01-29 15:29:48 +01:00


/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>
#include "rte_power_pmd_mgmt.h"
#define EMPTYPOLL_MAX 512
/* store some internal state */
static struct pmd_conf_data {
/** what do we support? */
struct rte_cpu_intrinsics intrinsics_support;
/** pre-calculated tsc diff for 1us */
uint64_t tsc_per_us;
/** how many rte_pause can we fit in a microsecond? */
uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
	/** Device power management is disabled. */
	PMD_MGMT_DISABLED = 0,
	/** Device power management is enabled. */
	PMD_MGMT_ENABLED
};

struct pmd_queue_cfg {
	volatile enum pmd_mgmt_state pwr_mgmt_state;
	/**< State of power management for this queue */
	enum rte_power_pmd_mgmt_type cb_mode;
	/**< Callback mode for this queue */
	const struct rte_eth_rxtx_callback *cur_cb;
	/**< Callback instance */
	volatile bool umwait_in_progress;
	/**< are we currently sleeping? */
	uint64_t empty_poll_stats;
	/**< Number of empty polls */
} __rte_cache_aligned;

static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];

static void
calc_tsc(void)
{
	const uint64_t hz = rte_get_timer_hz();
	const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

	global_data.tsc_per_us = tsc_per_us;

	/* only do this if we don't have tpause */
	if (!global_data.intrinsics_support.power_pause) {
		const uint64_t start = rte_rdtsc_precise();
		const uint32_t n_pauses = 10000;
		double us, us_per_pause;
		uint64_t end;
		unsigned int i;

		/* estimate number of rte_pause() calls per us */
		for (i = 0; i < n_pauses; i++)
			rte_pause();
		end = rte_rdtsc_precise();

		us = (end - start) / (double)tsc_per_us;
		us_per_pause = us / n_pauses;

		global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
	}
}

/*
 * RX callback for RTE_POWER_MGMT_TYPE_MONITOR: once EMPTYPOLL_MAX empty
 * polls are seen on this queue, sleep until the next RX descriptor is
 * written (or until we are woken up).
 */
static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			struct rte_power_monitor_cond pmc;
			uint16_t ret;

			/*
			 * we might get a cancellation request while being
			 * inside the callback, in which case the wakeup
			 * wouldn't work because it would've arrived too early.
			 *
			 * to get around this, we notify the other thread that
			 * we're sleeping, so that it can spin until we're done.
			 * unsolicited wakeups are perfectly safe.
			 */
			q_conf->umwait_in_progress = true;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

			/* check if we need to cancel sleep */
			if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
				/* use monitoring condition to sleep */
				ret = rte_eth_get_monitor_addr(port_id, qidx,
						&pmc);
				if (ret == 0)
					rte_power_monitor(&pmc, -1ULL);
			}
			q_conf->umwait_in_progress = false;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

/*
 * RX callback for RTE_POWER_MGMT_TYPE_PAUSE: once EMPTYPOLL_MAX empty polls
 * are seen on this queue, sleep for ~1 microsecond using TPAUSE if it is
 * available, or a calibrated rte_pause() loop otherwise.
 */
static uint16_t
clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		/* sleep for 1 microsecond */
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			/* use tpause if we have it */
			if (global_data.intrinsics_support.power_pause) {
				const uint64_t cur = rte_rdtsc();
				const uint64_t wait_tsc =
						cur + global_data.tsc_per_us;
				rte_power_pause(wait_tsc);
			} else {
				uint64_t i;

				for (i = 0; i < global_data.pause_per_us; i++)
					rte_pause();
			}
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

/*
 * RX callback for RTE_POWER_MGMT_TYPE_SCALE: scale the polling core's
 * frequency down after EMPTYPOLL_MAX empty polls and back up as soon as
 * traffic is seen again.
 */
static uint16_t
clb_scale_freq(uint16_t port_id, uint16_t qidx,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
			/* scale down freq */
			rte_power_freq_min(rte_lcore_id());
	} else {
		q_conf->empty_poll_stats = 0;
		/* scale up freq */
		rte_power_freq_max(rte_lcore_id());
	}

	return nb_rx;
}

int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
	struct pmd_queue_cfg *queue_cfg;
	struct rte_eth_dev_info info;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
		ret = -EINVAL;
		goto end;
	}

	if (rte_eth_dev_info_get(port_id, &info) < 0) {
		ret = -EINVAL;
		goto end;
	}

	/* check if queue id is valid */
	if (queue_id >= info.nb_rx_queues) {
		ret = -EINVAL;
		goto end;
	}

	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
		ret = -EINVAL;
		goto end;
	}

	/* we need this in various places */
	rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

	switch (mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		struct rte_power_monitor_cond dummy;

		/* check if rte_power_monitor is supported */
		if (!global_data.intrinsics_support.power_monitor) {
			RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
			ret = -ENOTSUP;
			goto end;
		}

		/* check if the device supports the necessary PMD API */
		if (rte_eth_get_monitor_addr(port_id, queue_id,
				&dummy) == -ENOTSUP) {
			RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->umwait_in_progress = false;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* ensure we update our state before callback starts */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_umwait, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_SCALE:
	{
		enum power_management_env env;

		/* only PSTATE and ACPI modes are supported */
		if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
				!rte_power_check_env_supported(
					PM_ENV_PSTATE_CPUFREQ)) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* ensure we could initialize the power library */
		if (rte_power_init(lcore_id)) {
			ret = -EINVAL;
			goto end;
		}
		/* ensure we initialized the correct env */
		env = rte_power_get_env();
		if (env != PM_ENV_ACPI_CPUFREQ &&
				env != PM_ENV_PSTATE_CPUFREQ) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* this is not necessary here, but do it anyway */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
				queue_id, clb_scale_freq, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_PAUSE:
		/* figure out various time-to-tsc conversions */
		if (global_data.tsc_per_us == 0)
			calc_tsc();

		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* this is not necessary here, but do it anyway */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_pause, NULL);
		break;
	}
	ret = 0;
end:
	return ret;
}

int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
		uint16_t port_id, uint16_t queue_id)
{
	struct pmd_queue_cfg *queue_cfg;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
		return -EINVAL;

	/* no need to check queue id as wrong queue id would not be enabled */
	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
		return -EINVAL;

	/* stop any callbacks from progressing */
	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

	/* ensure we update our state before continuing */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

	switch (queue_cfg->cb_mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		bool exit = false;
		do {
			/*
			 * we may request cancellation while the other thread
			 * has just entered the callback but hasn't started
			 * sleeping yet, so keep waking it up until we know
			 * it's done sleeping.
			 */
			if (queue_cfg->umwait_in_progress)
				rte_power_monitor_wakeup(lcore_id);
			else
				exit = true;
		} while (!exit);
	}
	/* fall-through */
	case RTE_POWER_MGMT_TYPE_PAUSE:
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		rte_power_freq_max(lcore_id);
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		rte_power_exit(lcore_id);
		break;
	}
	/*
	 * we don't free the RX callback here because it is unsafe to do so
	 * unless we know for a fact that all data plane threads have stopped.
	 */
	queue_cfg->cur_cb = NULL;

	return 0;
}