power: add ethdev power management

Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will wait until either
a TSC timestamp expires or packets arrive.

This API mandates a core-to-single-queue mapping (that is, multiple
queues per device are supported, but they have to be polled on different
cores).

This design uses PMD RX callbacks.

1. UMWAIT/UMONITOR:

   When a certain threshold of empty polls is reached, the core will go
   into a power-optimized sleep while waiting for the address of the next
   RX descriptor to be written to.

2. TPAUSE/PAUSE instruction:

   This method uses the pause (or TPAUSE, if available) instruction to
   avoid busy polling.

3. Frequency scaling:

   Reuse the existing DPDK power library to scale core frequency up/down
   depending on traffic volume.
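
For example, an application could enable one of the schemes for an RX
queue as follows (a minimal sketch; the port/queue numbers are
illustrative and error handling is omitted):

    /* save power on the lcore polling port 0, queue 0 */
    rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(), 0, 0,
            RTE_POWER_MGMT_TYPE_MONITOR);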

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
Liang Ma 2021-01-22 17:12:16 +00:00 committed by Thomas Monjalon
parent abc0cade20
commit 682a645438
6 changed files with 513 additions and 1 deletion


@@ -192,6 +192,48 @@ User Cases
----------

The mechanism can be applied to any device which is based on polling, e.g. NIC, FPGA.

Ethernet PMD Power Management API
---------------------------------

Abstract
~~~~~~~~

Existing power management mechanisms require developers
to change application design or change code to make use of them.

The PMD power management API provides a convenient alternative
by utilizing Ethernet PMD RX callbacks,
and triggering power saving whenever the empty poll count reaches a certain number.

Monitor
   This power saving scheme will put the CPU into an optimized power state
   and use the ``rte_power_monitor()`` function
   to monitor the Ethernet PMD RX descriptor address,
   and wake the CPU up whenever there is new traffic.

Pause
   This power saving scheme will avoid busy polling
   by either entering a power-optimized sleep state
   with the ``rte_power_pause()`` function,
   or, if it is not available, using ``rte_pause()``.

Frequency scaling
   This power saving scheme will use ``librte_power`` library
   functionality to scale the core frequency up/down
   depending on traffic volume.

.. note::

   Currently, this power management API is limited to a mandatory mapping
   of 1 queue to 1 core (multiple queues are supported,
   but they must be polled from different cores).

API Overview for Ethernet PMD Power Management
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* **Queue Enable**: Enable a specific power scheme for a certain queue/port/core.

* **Queue Disable**: Disable the power scheme for a certain queue/port/core.
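
For example, an application polling queue 0 of port 0 from the current lcore
could enable and later disable power management as follows
(a minimal sketch: the port and queue numbers are illustrative,
and error handling is abbreviated):

.. code-block:: c

   int ret;

   /* enable monitor-based power saving before polling starts */
   ret = rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(), 0, 0,
                   RTE_POWER_MGMT_TYPE_MONITOR);
   if (ret < 0)
       rte_exit(EXIT_FAILURE, "Cannot enable PMD power management\n");

   /* ... poll the queue with rte_eth_rx_burst() as usual ... */

   /* disable power saving before stopping the port */
   rte_power_ethdev_pmgmt_queue_disable(rte_lcore_id(), 0, 0);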

References
----------


@@ -60,6 +60,16 @@ New Features

  Added ``rte_eth_get_monitor_addr()``, to be used in conjunction with
  ``rte_power_monitor()`` to enable automatic power management for PMDs.

* **Added Ethernet PMD power management helper API.**

  A new helper API has been added to make using Ethernet PMD power management
  easier for the user: ``rte_power_ethdev_pmgmt_queue_enable()``. Three power
  management schemes are supported initially:

  * Power saving based on UMWAIT instruction (x86 only)
  * Power saving based on ``rte_pause()`` (generic) or TPAUSE instruction (x86 only)
  * Power saving based on frequency scaling through the ``librte_power`` library

* **Added GENEVE TLV option in rte_flow.**

  Added support for matching and raw encap/decap of GENEVE TLV option.


@@ -9,7 +9,9 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
		'power_kvm_vm.c', 'guest_channel.c',
		'rte_power_empty_poll.c',
		'power_pstate_cpufreq.c',
		'rte_power_pmd_mgmt.c',
		'power_common.c')
headers = files('rte_power.h','rte_power_empty_poll.h',
		'rte_power_pmd_mgmt.h',
		'rte_power_guest_channel.h')
deps += ['timer', 'ethdev']


@@ -0,0 +1,365 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>
#include "rte_power_pmd_mgmt.h"
#define EMPTYPOLL_MAX 512
/* store some internal state */
static struct pmd_conf_data {
/** what do we support? */
struct rte_cpu_intrinsics intrinsics_support;
/** pre-calculated tsc diff for 1us */
uint64_t tsc_per_us;
/** how many rte_pause can we fit in a microsecond? */
uint64_t pause_per_us;
} global_data;
/**
* Possible power management states of an ethdev port.
*/
enum pmd_mgmt_state {
/** Device power management is disabled. */
PMD_MGMT_DISABLED = 0,
/** Device power management is enabled. */
PMD_MGMT_ENABLED
};
struct pmd_queue_cfg {
volatile enum pmd_mgmt_state pwr_mgmt_state;
/**< State of power management for this queue */
enum rte_power_pmd_mgmt_type cb_mode;
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
volatile bool umwait_in_progress;
/**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
} __rte_cache_aligned;
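/* per-queue power management configuration, for every port and queue */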
static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
static void
calc_tsc(void)
{
const uint64_t hz = rte_get_timer_hz();
const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */
global_data.tsc_per_us = tsc_per_us;
/* only do this if we don't have tpause */
if (!global_data.intrinsics_support.power_pause) {
const uint64_t start = rte_rdtsc_precise();
const uint32_t n_pauses = 10000;
double us, us_per_pause;
uint64_t end;
unsigned int i;
/* estimate number of rte_pause() calls per us */
for (i = 0; i < n_pauses; i++)
rte_pause();
end = rte_rdtsc_precise();
us = (end - start) / (double)tsc_per_us;
us_per_pause = us / n_pauses;
global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
}
}
static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused,
void *addr __rte_unused)
{
struct pmd_queue_cfg *q_conf;
q_conf = &port_cfg[port_id][qidx];
if (unlikely(nb_rx == 0)) {
q_conf->empty_poll_stats++;
if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
struct rte_power_monitor_cond pmc;
uint16_t ret;
/*
* we might get a cancellation request while being
* inside the callback, in which case the wakeup
* wouldn't work because it would've arrived too early.
*
* to get around this, we notify the other thread that
* we're sleeping, so that it can spin until we're done.
* unsolicited wakeups are perfectly safe.
*/
q_conf->umwait_in_progress = true;
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
/* check if we need to cancel sleep */
if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
/* use monitoring condition to sleep */
ret = rte_eth_get_monitor_addr(port_id, qidx,
&pmc);
if (ret == 0)
rte_power_monitor(&pmc, -1ULL);
}
q_conf->umwait_in_progress = false;
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
}
} else
q_conf->empty_poll_stats = 0;
return nb_rx;
}
static uint16_t
clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused,
void *addr __rte_unused)
{
struct pmd_queue_cfg *q_conf;
q_conf = &port_cfg[port_id][qidx];
if (unlikely(nb_rx == 0)) {
q_conf->empty_poll_stats++;
/* sleep for 1 microsecond */
if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
/* use tpause if we have it */
if (global_data.intrinsics_support.power_pause) {
const uint64_t cur = rte_rdtsc();
const uint64_t wait_tsc =
cur + global_data.tsc_per_us;
rte_power_pause(wait_tsc);
} else {
uint64_t i;
for (i = 0; i < global_data.pause_per_us; i++)
rte_pause();
}
}
} else
q_conf->empty_poll_stats = 0;
return nb_rx;
}
static uint16_t
clb_scale_freq(uint16_t port_id, uint16_t qidx,
struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
struct pmd_queue_cfg *q_conf;
q_conf = &port_cfg[port_id][qidx];
if (unlikely(nb_rx == 0)) {
q_conf->empty_poll_stats++;
if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
/* scale down freq */
rte_power_freq_min(rte_lcore_id());
} else {
q_conf->empty_poll_stats = 0;
/* scale up freq */
rte_power_freq_max(rte_lcore_id());
}
return nb_rx;
}
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
struct pmd_queue_cfg *queue_cfg;
struct rte_eth_dev_info info;
int ret;
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
ret = -EINVAL;
goto end;
}
if (rte_eth_dev_info_get(port_id, &info) < 0) {
ret = -EINVAL;
goto end;
}
/* check if queue id is valid */
if (queue_id >= info.nb_rx_queues) {
ret = -EINVAL;
goto end;
}
queue_cfg = &port_cfg[port_id][queue_id];
if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
ret = -EINVAL;
goto end;
}
/* query intrinsics support once; it is needed by multiple modes below */
rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);
switch (mode) {
case RTE_POWER_MGMT_TYPE_MONITOR:
{
struct rte_power_monitor_cond dummy;
/* check if rte_power_monitor is supported */
if (!global_data.intrinsics_support.power_monitor) {
RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
ret = -ENOTSUP;
goto end;
}
/* check if the device supports the necessary PMD API */
if (rte_eth_get_monitor_addr(port_id, queue_id,
&dummy) == -ENOTSUP) {
RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
ret = -ENOTSUP;
goto end;
}
/* initialize data before enabling the callback */
queue_cfg->empty_poll_stats = 0;
queue_cfg->cb_mode = mode;
queue_cfg->umwait_in_progress = false;
queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
/* ensure we update our state before callback starts */
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
clb_umwait, NULL);
break;
}
case RTE_POWER_MGMT_TYPE_SCALE:
{
enum power_management_env env;
/* only PSTATE and ACPI modes are supported */
if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
!rte_power_check_env_supported(
PM_ENV_PSTATE_CPUFREQ)) {
RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
ret = -ENOTSUP;
goto end;
}
/* ensure we could initialize the power library */
if (rte_power_init(lcore_id)) {
ret = -EINVAL;
goto end;
}
/* ensure we initialized the correct env */
env = rte_power_get_env();
if (env != PM_ENV_ACPI_CPUFREQ &&
env != PM_ENV_PSTATE_CPUFREQ) {
RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
ret = -ENOTSUP;
goto end;
}
/* initialize data before enabling the callback */
queue_cfg->empty_poll_stats = 0;
queue_cfg->cb_mode = mode;
queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
/* this is not necessary here, but do it anyway */
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
queue_id, clb_scale_freq, NULL);
break;
}
case RTE_POWER_MGMT_TYPE_PAUSE:
/* figure out various time-to-tsc conversions */
if (global_data.tsc_per_us == 0)
calc_tsc();
/* initialize data before enabling the callback */
queue_cfg->empty_poll_stats = 0;
queue_cfg->cb_mode = mode;
queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
/* this is not necessary here, but do it anyway */
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
clb_pause, NULL);
break;
}
ret = 0;
end:
return ret;
}
int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
uint16_t port_id, uint16_t queue_id)
{
struct pmd_queue_cfg *queue_cfg;
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
return -EINVAL;
/* no need to check queue id as wrong queue id would not be enabled */
queue_cfg = &port_cfg[port_id][queue_id];
if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
return -EINVAL;
/* stop any callbacks from progressing */
queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
/* ensure we update our state before continuing */
rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
switch (queue_cfg->cb_mode) {
case RTE_POWER_MGMT_TYPE_MONITOR:
{
bool exit = false;
do {
/*
* we may request cancellation while the other thread
* has just entered the callback but hasn't started
* sleeping yet, so keep waking it up until we know it's
* done sleeping.
*/
if (queue_cfg->umwait_in_progress)
rte_power_monitor_wakeup(lcore_id);
else
exit = true;
} while (!exit);
}
/* fall-through */
case RTE_POWER_MGMT_TYPE_PAUSE:
rte_eth_remove_rx_callback(port_id, queue_id,
queue_cfg->cur_cb);
break;
case RTE_POWER_MGMT_TYPE_SCALE:
rte_power_freq_max(lcore_id);
rte_eth_remove_rx_callback(port_id, queue_id,
queue_cfg->cur_cb);
rte_power_exit(lcore_id);
break;
}
/*
* we don't free the RX callback here because it is unsafe to do so
* unless we know for a fact that all data plane threads have stopped.
*/
queue_cfg->cur_cb = NULL;
return 0;
}


@@ -0,0 +1,91 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#ifndef _RTE_POWER_PMD_MGMT_H
#define _RTE_POWER_PMD_MGMT_H
/**
* @file
* RTE PMD Power Management
*/
#include <stdint.h>
#include <stdbool.h>
#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_power.h>
#include <rte_atomic.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* PMD Power Management Type
*/
enum rte_power_pmd_mgmt_type {
/** Use power-optimized monitoring to wait for incoming traffic */
RTE_POWER_MGMT_TYPE_MONITOR = 1,
/** Use power-optimized sleep to avoid busy polling */
RTE_POWER_MGMT_TYPE_PAUSE,
/** Use frequency scaling when traffic is low */
RTE_POWER_MGMT_TYPE_SCALE,
};
/**
* @warning
* @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
*
* Enable power management on a specified Ethernet device Rx queue and lcore.
*
* @note This function is not thread-safe.
*
* @param lcore_id
* The lcore the Rx queue will be polled from.
* @param port_id
* The port identifier of the Ethernet device.
* @param queue_id
* The queue identifier of the Ethernet device.
* @param mode
* The power management scheme to use for the specified Rx queue.
* @return
* 0 on success
* <0 on error
*/
__rte_experimental
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
uint16_t port_id, uint16_t queue_id,
enum rte_power_pmd_mgmt_type mode);
/**
* @warning
* @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
*
* Disable power management on a specified Ethernet device Rx queue and lcore.
*
* @note This function is not thread-safe.
*
* @param lcore_id
* The lcore the Rx queue is polled from.
* @param port_id
* The port identifier of the Ethernet device.
* @param queue_id
* The queue identifier of the Ethernet device.
* @return
* 0 on success
* <0 on error
*/
__rte_experimental
int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
uint16_t port_id, uint16_t queue_id);
#ifdef __cplusplus
}
#endif
#endif


@@ -36,6 +36,8 @@ EXPERIMENTAL {
	rte_power_poll_stat_update;

	# added in 21.02
	rte_power_ethdev_pmgmt_queue_disable;
	rte_power_ethdev_pmgmt_queue_enable;
	rte_power_guest_channel_receive_msg;
	rte_power_guest_channel_send_msg;
};