power: add ethdev power management

Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when that number reaches a certain threshold, entering an
architecture-defined optimized power state that lasts either until a
TSC timestamp expires or until packets arrive.

This API mandates a core-to-single-queue mapping (that is, multiple
queues per device are supported, but they have to be polled on
different cores).

This design uses PMD RX callbacks; a minimal usage sketch is shown
after the list of methods below.

1. UMWAIT/UMONITOR:

   When a certain threshold of empty polls is reached, the core will go
   into a power optimized sleep while waiting on an address of next RX
   descriptor to be written to.

2. TPAUSE/Pause instruction:

   This method uses the pause (or TPAUSE, if available) instruction to
   avoid busy polling.

3. Frequency scaling:

   Reuse the existing DPDK power library to scale the core frequency
   up/down depending on traffic volume.
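
For illustration only (not part of this patch), enabling one of these
schemes from application code looks roughly like the sketch below, with
hypothetical port/queue numbers; the call must happen before the queue
is polled, and the queue must then be polled from the given lcore:

    /* enable PAUSE-based power management on port 0, queue 0,
     * polled from the current lcore */
    ret = rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(), 0, 0,
            RTE_POWER_MGMT_TYPE_PAUSE);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Cannot enable PMD power management\n");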

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
commit 682a645438 (parent abc0cade20)
Liang Ma, 2021-01-22 17:12:16 +00:00; committed by Thomas Monjalon
6 changed files with 513 additions and 1 deletion


@@ -192,6 +192,48 @@ User Cases
----------
The mechanism can be applied to any device that is based on polling, e.g. NIC, FPGA.

Ethernet PMD Power Management API
---------------------------------

Abstract
~~~~~~~~

Existing power management mechanisms require developers
to change the application design or change code to make use of them.

The PMD power management API provides a convenient alternative
by utilizing Ethernet PMD RX callbacks,
and triggering power saving whenever the empty poll count reaches a certain number.

Monitor
   This power saving scheme will put the CPU into an optimized power state
   and use the ``rte_power_monitor()`` function
   to monitor the Ethernet PMD RX descriptor address,
   waking the CPU up whenever there is new traffic.

Pause
   This power saving scheme will avoid busy polling
   by either entering a power-optimized sleep state
   with the ``rte_power_pause()`` function,
   or, if it is not available, using ``rte_pause()``.

Frequency scaling
   This power saving scheme will use ``librte_power`` library
   functionality to scale the core frequency up/down
   depending on traffic volume.

.. note::

   Currently, this power management API is limited to mandatory mapping
   of 1 queue to 1 core (multiple queues are supported,
   but they must be polled from different cores).

API Overview for Ethernet PMD Power Management
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* **Queue Enable**: Enable a specific power scheme for a certain queue/port/core.

* **Queue Disable**: Disable the power scheme for a certain queue/port/core.
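
As an illustration (a sketch, not part of the API definition), enabling
and disabling the monitor scheme on one Rx queue might look as follows;
the helper names are hypothetical, and the calling lcore is assumed to be
the one that polls the queue:

.. code-block:: c

   #include <rte_lcore.h>
   #include <rte_power_pmd_mgmt.h>

   /* Enable monitor-based power management for the given Rx queue,
    * polled from the current lcore; must run before polling starts.
    * May return -ENOTSUP if the CPU or the PMD lacks monitor support. */
   static int
   queue_pmgmt_setup(uint16_t port_id, uint16_t queue_id)
   {
       return rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(),
               port_id, queue_id, RTE_POWER_MGMT_TYPE_MONITOR);
   }

   /* Remove power management again once polling has stopped. */
   static int
   queue_pmgmt_teardown(uint16_t port_id, uint16_t queue_id)
   {
       return rte_power_ethdev_pmgmt_queue_disable(rte_lcore_id(),
               port_id, queue_id);
   }
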
References
----------


@@ -60,6 +60,16 @@ New Features

  Added ``rte_eth_get_monitor_addr()``, to be used in conjunction with
  ``rte_power_monitor()`` to enable automatic power management for PMDs.

* **Added Ethernet PMD power management helper API.**

  A new helper API has been added to make using Ethernet PMD power management
  easier for the user: ``rte_power_ethdev_pmgmt_queue_enable()``. Three power
  management schemes are supported initially:

  * Power saving based on UMWAIT instruction (x86 only)
  * Power saving based on ``rte_pause()`` (generic) or TPAUSE instruction (x86 only)
  * Power saving based on frequency scaling through the ``librte_power`` library

* **Added GENEVE TLV option in rte_flow.**

  Added support for matching and raw encap/decap of GENEVE TLV option.


@@ -9,7 +9,9 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
        'power_kvm_vm.c', 'guest_channel.c',
        'rte_power_empty_poll.c',
        'power_pstate_cpufreq.c',
        'rte_power_pmd_mgmt.c',
        'power_common.c')
headers = files('rte_power.h','rte_power_empty_poll.h',
        'rte_power_pmd_mgmt.h',
        'rte_power_guest_channel.h')
deps += ['timer']
deps += ['timer', 'ethdev']


@@ -0,0 +1,365 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

#include "rte_power_pmd_mgmt.h"

#define EMPTYPOLL_MAX 512

/* store some internal state */
static struct pmd_conf_data {
    /** what do we support? */
    struct rte_cpu_intrinsics intrinsics_support;
    /** pre-calculated tsc diff for 1us */
    uint64_t tsc_per_us;
    /** how many rte_pause can we fit in a microsecond? */
    uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
    /** Device power management is disabled. */
    PMD_MGMT_DISABLED = 0,
    /** Device power management is enabled. */
    PMD_MGMT_ENABLED
};

struct pmd_queue_cfg {
    volatile enum pmd_mgmt_state pwr_mgmt_state;
    /**< State of power management for this queue */
    enum rte_power_pmd_mgmt_type cb_mode;
    /**< Callback mode for this queue */
    const struct rte_eth_rxtx_callback *cur_cb;
    /**< Callback instance */
    volatile bool umwait_in_progress;
    /**< are we currently sleeping? */
    uint64_t empty_poll_stats;
    /**< Number of empty polls */
} __rte_cache_aligned;

static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];

static void
calc_tsc(void)
{
    const uint64_t hz = rte_get_timer_hz();
    const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

    global_data.tsc_per_us = tsc_per_us;

    /* only do this if we don't have tpause */
    if (!global_data.intrinsics_support.power_pause) {
        const uint64_t start = rte_rdtsc_precise();
        const uint32_t n_pauses = 10000;
        double us, us_per_pause;
        uint64_t end;
        unsigned int i;

        /* estimate number of rte_pause() calls per us */
        for (i = 0; i < n_pauses; i++)
            rte_pause();

        end = rte_rdtsc_precise();
        us = (end - start) / (double)tsc_per_us;
        us_per_pause = us / n_pauses;

        global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
    }
}

static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
        uint16_t nb_rx, uint16_t max_pkts __rte_unused,
        void *addr __rte_unused)
{
    struct pmd_queue_cfg *q_conf;

    q_conf = &port_cfg[port_id][qidx];

    if (unlikely(nb_rx == 0)) {
        q_conf->empty_poll_stats++;
        if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
            struct rte_power_monitor_cond pmc;
            uint16_t ret;

            /*
             * we might get a cancellation request while being
             * inside the callback, in which case the wakeup
             * wouldn't work because it would've arrived too early.
             *
             * to get around this, we notify the other thread that
             * we're sleeping, so that it can spin until we're done.
             * unsolicited wakeups are perfectly safe.
             */
            q_conf->umwait_in_progress = true;

            rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

            /* check if we need to cancel sleep */
            if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
                /* use monitoring condition to sleep */
                ret = rte_eth_get_monitor_addr(port_id, qidx,
                        &pmc);
                if (ret == 0)
                    rte_power_monitor(&pmc, -1ULL);
            }
            q_conf->umwait_in_progress = false;

            rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
        }
    } else
        q_conf->empty_poll_stats = 0;

    return nb_rx;
}

static uint16_t
clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
        uint16_t nb_rx, uint16_t max_pkts __rte_unused,
        void *addr __rte_unused)
{
    struct pmd_queue_cfg *q_conf;

    q_conf = &port_cfg[port_id][qidx];

    if (unlikely(nb_rx == 0)) {
        q_conf->empty_poll_stats++;
        /* sleep for 1 microsecond */
        if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
            /* use tpause if we have it */
            if (global_data.intrinsics_support.power_pause) {
                const uint64_t cur = rte_rdtsc();
                const uint64_t wait_tsc =
                        cur + global_data.tsc_per_us;
                rte_power_pause(wait_tsc);
            } else {
                uint64_t i;
                for (i = 0; i < global_data.pause_per_us; i++)
                    rte_pause();
            }
        }
    } else
        q_conf->empty_poll_stats = 0;

    return nb_rx;
}

static uint16_t
clb_scale_freq(uint16_t port_id, uint16_t qidx,
        struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
        uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
    struct pmd_queue_cfg *q_conf;

    q_conf = &port_cfg[port_id][qidx];

    if (unlikely(nb_rx == 0)) {
        q_conf->empty_poll_stats++;
        if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
            /* scale down freq */
            rte_power_freq_min(rte_lcore_id());
    } else {
        q_conf->empty_poll_stats = 0;
        /* scale up freq */
        rte_power_freq_max(rte_lcore_id());
    }

    return nb_rx;
}

int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
        uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
    struct pmd_queue_cfg *queue_cfg;
    struct rte_eth_dev_info info;
    int ret;

    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

    if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
        ret = -EINVAL;
        goto end;
    }

    if (rte_eth_dev_info_get(port_id, &info) < 0) {
        ret = -EINVAL;
        goto end;
    }

    /* check if queue id is valid */
    if (queue_id >= info.nb_rx_queues) {
        ret = -EINVAL;
        goto end;
    }

    queue_cfg = &port_cfg[port_id][queue_id];

    if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
        ret = -EINVAL;
        goto end;
    }

    /* we need this in various places */
    rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

    switch (mode) {
    case RTE_POWER_MGMT_TYPE_MONITOR:
    {
        struct rte_power_monitor_cond dummy;

        /* check if rte_power_monitor is supported */
        if (!global_data.intrinsics_support.power_monitor) {
            RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
            ret = -ENOTSUP;
            goto end;
        }

        /* check if the device supports the necessary PMD API */
        if (rte_eth_get_monitor_addr(port_id, queue_id,
                &dummy) == -ENOTSUP) {
            RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
            ret = -ENOTSUP;
            goto end;
        }

        /* initialize data before enabling the callback */
        queue_cfg->empty_poll_stats = 0;
        queue_cfg->cb_mode = mode;
        queue_cfg->umwait_in_progress = false;
        queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

        /* ensure we update our state before callback starts */
        rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

        queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
                clb_umwait, NULL);
        break;
    }
    case RTE_POWER_MGMT_TYPE_SCALE:
    {
        enum power_management_env env;

        /* only PSTATE and ACPI modes are supported */
        if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
                !rte_power_check_env_supported(
                    PM_ENV_PSTATE_CPUFREQ)) {
            RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
            ret = -ENOTSUP;
            goto end;
        }
        /* ensure we could initialize the power library */
        if (rte_power_init(lcore_id)) {
            ret = -EINVAL;
            goto end;
        }
        /* ensure we initialized the correct env */
        env = rte_power_get_env();
        if (env != PM_ENV_ACPI_CPUFREQ &&
                env != PM_ENV_PSTATE_CPUFREQ) {
            RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
            ret = -ENOTSUP;
            goto end;
        }

        /* initialize data before enabling the callback */
        queue_cfg->empty_poll_stats = 0;
        queue_cfg->cb_mode = mode;
        queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

        /* this is not necessary here, but do it anyway */
        rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

        queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
                queue_id, clb_scale_freq, NULL);
        break;
    }
    case RTE_POWER_MGMT_TYPE_PAUSE:
        /* figure out various time-to-tsc conversions */
        if (global_data.tsc_per_us == 0)
            calc_tsc();

        /* initialize data before enabling the callback */
        queue_cfg->empty_poll_stats = 0;
        queue_cfg->cb_mode = mode;
        queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

        /* this is not necessary here, but do it anyway */
        rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

        queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
                clb_pause, NULL);
        break;
    }
    ret = 0;

end:
    return ret;
}

int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
        uint16_t port_id, uint16_t queue_id)
{
    struct pmd_queue_cfg *queue_cfg;

    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

    if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
        return -EINVAL;

    /* no need to check queue id as wrong queue id would not be enabled */
    queue_cfg = &port_cfg[port_id][queue_id];

    if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
        return -EINVAL;

    /* stop any callbacks from progressing */
    queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

    /* ensure we update our state before continuing */
    rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

    switch (queue_cfg->cb_mode) {
    case RTE_POWER_MGMT_TYPE_MONITOR:
    {
        bool exit = false;
        do {
            /*
             * we may request cancellation while the other thread
             * has just entered the callback but hasn't started
             * sleeping yet, so keep waking it up until we know it's
             * done sleeping.
             */
            if (queue_cfg->umwait_in_progress)
                rte_power_monitor_wakeup(lcore_id);
            else
                exit = true;
        } while (!exit);
    }
    /* fall-through */
    case RTE_POWER_MGMT_TYPE_PAUSE:
        rte_eth_remove_rx_callback(port_id, queue_id,
                queue_cfg->cur_cb);
        break;
    case RTE_POWER_MGMT_TYPE_SCALE:
        rte_power_freq_max(lcore_id);
        rte_eth_remove_rx_callback(port_id, queue_id,
                queue_cfg->cur_cb);
        rte_power_exit(lcore_id);
        break;
    }

    /*
     * we don't free the RX callback here because it is unsafe to do so
     * unless we know for a fact that all data plane threads have stopped.
     */
    queue_cfg->cur_cb = NULL;

    return 0;
}


@@ -0,0 +1,91 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#ifndef _RTE_POWER_PMD_MGMT_H
#define _RTE_POWER_PMD_MGMT_H

/**
 * @file
 * RTE PMD Power Management
 */

#include <stdint.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_power.h>
#include <rte_atomic.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * PMD Power Management Type
 */
enum rte_power_pmd_mgmt_type {
    /** Use power-optimized monitoring to wait for incoming traffic */
    RTE_POWER_MGMT_TYPE_MONITOR = 1,
    /** Use power-optimized sleep to avoid busy polling */
    RTE_POWER_MGMT_TYPE_PAUSE,
    /** Use frequency scaling when traffic is low */
    RTE_POWER_MGMT_TYPE_SCALE,
};

/**
 * @warning
 * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
 *
 * Enable power management on a specified Ethernet device Rx queue and lcore.
 *
 * @note This function is not thread-safe.
 *
 * @param lcore_id
 *   The lcore the Rx queue will be polled from.
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue identifier of the Ethernet device.
 * @param mode
 *   The power management scheme to use for specified Rx queue.
 * @return
 *   0 on success
 *   <0 on error
 */
__rte_experimental
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
        uint16_t port_id, uint16_t queue_id,
        enum rte_power_pmd_mgmt_type mode);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
 *
 * Disable power management on a specified Ethernet device Rx queue and lcore.
 *
 * @note This function is not thread-safe.
 *
 * @param lcore_id
 *   The lcore the Rx queue is polled from.
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue identifier of the Ethernet device.
 * @return
 *   0 on success
 *   <0 on error
 */
__rte_experimental
int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
        uint16_t port_id, uint16_t queue_id);

#ifdef __cplusplus
}
#endif

#endif


@@ -36,6 +36,8 @@ EXPERIMENTAL {
    rte_power_poll_stat_update;

    # added in 21.02
    rte_power_ethdev_pmgmt_queue_disable;
    rte_power_ethdev_pmgmt_queue_enable;
    rte_power_guest_channel_receive_msg;
    rte_power_guest_channel_send_msg;
};