power: add traffic pattern aware power control
1. Abstract
For packet processing workloads such as DPDK, polling is continuous.
This means CPU cores always show 100% busy independent of how much work
those cores are doing. Accurately determining how busy a core is
is hugely important for the following reasons:
* No indication of overload conditions.
* User does not know how much real load is on a system, resulting
in wasted energy as no power management is utilized.
Compared to the original l3fwd-power design, instead of going to sleep
after detecting an empty poll, the new mechanism just lowers the core
frequency. As a result, the application does not stop polling the device,
which leads to improved handling of bursts of traffic.
When the system becomes busy, the empty poll mechanism can also increase the
core frequency (including turbo) to do best effort for intensive traffic.
This gives us more flexible and balanced traffic awareness over the
standard l3fwd-power application.
2. Proposed solution
The proposed solution focuses on how many times empty polls are executed.
The fewer empty polls there are, the busier the current core is with
processing workload, and therefore the higher the frequency needed. A high
empty poll number indicates the current core is not doing any real work;
therefore, we can lower the frequency to save power.
In the current implementation, each core has 1 empty-poll counter, which
assumes 1 core is dedicated to 1 queue. This will need to be expanded in the
future to support multiple queues per core.
2.1 Power state definition:
LOW: Not currently used, reserved for future use.
MED: the frequency is used to process modest traffic workload.
HIGH: the frequency is used to process busy traffic workload.
2.2 There are two phases to establish the power management system:
a.Initialization/Training phase. The training phase is necessary
in order to figure out the system polling baseline numbers from
idle to busy. The highest poll count will be during idle, where
all polls are empty. These poll counts will be different between
systems due to the many possible processor micro-arch, cache
and device configurations, hence the training phase.
In the training phase, traffic is blocked so the training
algorithm can average the empty-poll numbers for the LOW, MED and
HIGH power states in order to create a baseline.
Each core's counters are collected every 10ms, and the training
phase will take 2 seconds.
Training is disabled in the default configuration, and the default
parameters are applied. The sample app can still trigger training
if needed. Once the training phase has been executed once on
a system, the application can then be started with the relevant
thresholds provided on the command line, allowing the application
to start passing traffic immediately.
b.Normal phase. Traffic starts immediately based on the default
thresholds, or based on the user supplied thresholds via the
command line parameters. The run-time poll counts are compared with
the baseline and the decision will be taken to move to MED power
state or HIGH power state. The counters are calculated every 10ms.
3. Proposed API
1. rte_power_empty_poll_stat_init(struct ep_params **eptr,
uint8_t *freq_tlb, struct ep_policy *policy);
which is used to initialize the power management system.
2. rte_power_empty_poll_stat_free(void);
which is used to free the resources held by the power management system.
3. rte_power_empty_poll_stat_update(unsigned int lcore_id);
which is used to update specific core empty poll counter, not thread safe
4. rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt);
which is used to update specific core valid poll counter, not thread safe
5. rte_power_empty_poll_stat_fetch(unsigned int lcore_id);
which is used to get specific core empty poll counter.
6. rte_power_poll_stat_fetch(unsigned int lcore_id);
which is used to get specific core valid poll counter.
7. rte_empty_poll_detection(struct rte_timer *tim, void *arg);
which is used to detect empty poll state changes then take action.
Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Reviewed-by: Lei Yao <lei.a.yao@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
2018-10-19 12:07:18 +01:00
|
|
|
/* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
* Copyright(c) 2010-2018 Intel Corporation
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _RTE_EMPTY_POLL_H
|
|
|
|
#define _RTE_EMPTY_POLL_H
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
* RTE Power Management
|
|
|
|
*/
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <stdbool.h>
|
|
|
|
|
|
|
|
#include <rte_common.h>
|
|
|
|
#include <rte_byteorder.h>
|
|
|
|
#include <rte_log.h>
|
|
|
|
#include <rte_string_fns.h>
|
|
|
|
#include <rte_power.h>
|
|
|
|
#include <rte_timer.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Number of frequency slots; mirrors RTE_MAX_LCORE_FREQS. */
#define NUM_FREQS  RTE_MAX_LCORE_FREQS

/* Number of bins in the averaging arrays; has to be a power of 2. */
#define BINS_AV  4

/*
 * NOTE(review): NUM_DIRECTIONS and NUM_DEVICES are not defined in the
 * visible part of this header, so DROP only expands successfully if the
 * includer provides them - confirm whether this macro is still needed.
 */
#define DROP  (NUM_DIRECTIONS * NUM_DEVICES)

#define NUM_PRIORITIES  2

#define NUM_NODES  256  /* Max core number */
|
|
|
|
|
|
|
|
/* Processor Power State */
|
|
|
|
enum freq_val {
|
|
|
|
LOW,
|
|
|
|
MED,
|
|
|
|
HGH,
|
|
|
|
NUM_FREQ = NUM_FREQS
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Queue Polling State.
 *
 * Enumerator order is significant: values are assigned implicitly,
 * starting at 0 for TRAINING.
 */
enum queue_state {
	TRAINING,	/* NO TRAFFIC */
	MED_NORMAL,	/* MED */
	HGH_BUSY,	/* HIGH */
	LOW_PURGE,	/* LOW */
};
|
|
|
|
|
|
|
|
/*
 * Queue Stats.
 *
 * Threshold record for one power state.
 */
struct freq_threshold {

	/* NOTE(review): presumably the baseline empty-poll delta per
	 * interval ("edpi") for this power state - confirm against the
	 * implementation.
	 */
	uint64_t base_edpi;
	/* NOTE(review): presumably set once training has produced
	 * base_edpi - confirm.
	 */
	bool trained;
	uint32_t threshold_percent;
	/* NOTE(review): presumably the number of training iterations
	 * run so far - confirm.
	 */
	uint32_t cur_train_iter;
};
|
|
|
|
|
2019-04-26 16:14:22 +01:00
|
|
|
/* Each Worker Thread Empty Poll Stats */
|
power: add traffic pattern aware power control
1. Abstract
For packet processing workloads such as DPDK, polling is continuous.
This means CPU cores always show 100% busy independent of how much work
those cores are doing. Accurately determining how busy a core is
is hugely important for the following reasons:
* No indication of overload conditions.
* User does not know how much real load is on a system, resulting
in wasted energy as no power management is utilized.
Compared to the original l3fwd-power design, instead of going to sleep
after detecting an empty poll, the new mechanism just lowers the core
frequency. As a result, the application does not stop polling the device,
which leads to improved handling of bursts of traffic.
When the system becomes busy, the empty poll mechanism can also increase the
core frequency (including turbo) to do best effort for intensive traffic.
This gives us more flexible and balanced traffic awareness over the
standard l3fwd-power application.
2. Proposed solution
The proposed solution focuses on how many times empty polls are executed.
The fewer empty polls there are, the busier the current core is with
processing workload, and therefore the higher the frequency needed. A high
empty poll number indicates the current core is not doing any real work;
therefore, we can lower the frequency to save power.
In the current implementation, each core has 1 empty-poll counter, which
assumes 1 core is dedicated to 1 queue. This will need to be expanded in the
future to support multiple queues per core.
2.1 Power state definition:
LOW: Not currently used, reserved for future use.
MED: the frequency is used to process modest traffic workload.
HIGH: the frequency is used to process busy traffic workload.
2.2 There are two phases to establish the power management system:
a.Initialization/Training phase. The training phase is necessary
in order to figure out the system polling baseline numbers from
idle to busy. The highest poll count will be during idle, where
all polls are empty. These poll counts will be different between
systems due to the many possible processor micro-arch, cache
and device configurations, hence the training phase.
In the training phase, traffic is blocked so the training
algorithm can average the empty-poll numbers for the LOW, MED and
HIGH power states in order to create a baseline.
Each core's counters are collected every 10ms, and the training
phase will take 2 seconds.
Training is disabled in the default configuration, and the default
parameters are applied. The sample app can still trigger training
if needed. Once the training phase has been executed once on
a system, the application can then be started with the relevant
thresholds provided on the command line, allowing the application
to start passing traffic immediately.
b.Normal phase. Traffic starts immediately based on the default
thresholds, or based on the user supplied thresholds via the
command line parameters. The run-time poll counts are compared with
the baseline and the decision will be taken to move to MED power
state or HIGH power state. The counters are calculated every 10ms.
3. Proposed API
1. rte_power_empty_poll_stat_init(struct ep_params **eptr,
uint8_t *freq_tlb, struct ep_policy *policy);
which is used to initialize the power management system.
2. rte_power_empty_poll_stat_free(void);
which is used to free the resources held by the power management system.
3. rte_power_empty_poll_stat_update(unsigned int lcore_id);
which is used to update specific core empty poll counter, not thread safe
4. rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt);
which is used to update specific core valid poll counter, not thread safe
5. rte_power_empty_poll_stat_fetch(unsigned int lcore_id);
which is used to get specific core empty poll counter.
6. rte_power_poll_stat_fetch(unsigned int lcore_id);
which is used to get specific core valid poll counter.
7. rte_empty_poll_detection(struct rte_timer *tim, void *arg);
which is used to detect empty poll state changes then take action.
Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Reviewed-by: Lei Yao <lei.a.yao@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
2018-10-19 12:07:18 +01:00
|
|
|
struct priority_worker {
|
|
|
|
|
|
|
|
/* Current dequeue and throughput counts */
|
|
|
|
/* These 2 are written to by the worker threads */
|
|
|
|
/* So keep them on their own cache line */
|
|
|
|
uint64_t empty_dequeues;
|
|
|
|
uint64_t num_dequeue_pkts;
|
|
|
|
|
|
|
|
enum queue_state queue_state;
|
|
|
|
|
|
|
|
uint64_t empty_dequeues_prev;
|
|
|
|
uint64_t num_dequeue_pkts_prev;
|
|
|
|
|
|
|
|
/* Used for training only */
|
|
|
|
struct freq_threshold thresh[NUM_FREQ];
|
|
|
|
enum freq_val cur_freq;
|
|
|
|
|
|
|
|
/* bucket arrays to calculate the averages */
|
|
|
|
/* edpi mean empty poll counter difference per interval */
|
|
|
|
uint64_t edpi_av[BINS_AV];
|
|
|
|
/* empty poll counter */
|
|
|
|
uint32_t ec;
|
|
|
|
/* ppi mean valid poll counter per interval */
|
|
|
|
uint64_t ppi_av[BINS_AV];
|
|
|
|
/* valid poll counter */
|
|
|
|
uint32_t pc;
|
|
|
|
|
|
|
|
uint32_t lcore_id;
|
|
|
|
uint32_t iter_counter;
|
|
|
|
uint32_t threshold_ctr;
|
|
|
|
uint32_t display_ctr;
|
|
|
|
uint8_t dev_id;
|
|
|
|
|
|
|
|
} __rte_cache_aligned;
|
|
|
|
|
|
|
|
|
|
|
|
struct stats_data {
|
|
|
|
|
|
|
|
struct priority_worker wrk_stats[NUM_NODES];
|
|
|
|
|
|
|
|
/* flag to stop rx threads processing packets until training over */
|
|
|
|
bool start_rx;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Empty Poll Parameters */
|
|
|
|
struct ep_params {
|
|
|
|
|
|
|
|
/* Timer related stuff */
|
|
|
|
uint64_t interval_ticks;
|
|
|
|
uint32_t max_train_iter;
|
|
|
|
|
|
|
|
struct rte_timer timer0;
|
|
|
|
struct stats_data wrk_data;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* Sample App Init information */
|
|
|
|
struct ep_policy {
|
|
|
|
|
|
|
|
uint64_t med_base_edpi;
|
|
|
|
uint64_t hgh_base_edpi;
|
|
|
|
|
|
|
|
enum queue_state state;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize the power management system.
|
|
|
|
*
|
|
|
|
* @param eptr
|
|
|
|
* the structure of empty poll configuration
|
|
|
|
* @param freq_tlb
|
|
|
|
* the power state/frequency mapping table
|
|
|
|
* @param policy
|
|
|
|
* the initialization policy from sample app
|
|
|
|
*
|
|
|
|
* @return
|
|
|
|
* - 0 on success.
|
|
|
|
* - Negative on error.
|
|
|
|
*/
|
|
|
|
int __rte_experimental
|
|
|
|
rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb,
|
|
|
|
struct ep_policy *policy);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Free the resource hold by power management system.
|
|
|
|
*/
|
|
|
|
void __rte_experimental
|
|
|
|
rte_power_empty_poll_stat_free(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Update specific core empty poll counter
|
|
|
|
* It's not thread safe.
|
|
|
|
*
|
|
|
|
* @param lcore_id
|
|
|
|
* lcore id
|
|
|
|
*
|
|
|
|
* @return
|
|
|
|
* - 0 on success.
|
|
|
|
* - Negative on error.
|
|
|
|
*/
|
|
|
|
int __rte_experimental
|
|
|
|
rte_power_empty_poll_stat_update(unsigned int lcore_id);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Update specific core valid poll counter, not thread safe.
|
|
|
|
*
|
|
|
|
* @param lcore_id
|
|
|
|
* lcore id.
|
|
|
|
* @param nb_pkt
|
|
|
|
* The packet number of one valid poll.
|
|
|
|
*
|
|
|
|
* @return
|
|
|
|
* - 0 on success.
|
|
|
|
* - Negative on error.
|
|
|
|
*/
|
|
|
|
int __rte_experimental
|
|
|
|
rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Fetch specific core empty poll counter.
|
|
|
|
*
|
|
|
|
* @param lcore_id
|
|
|
|
* lcore id
|
|
|
|
*
|
|
|
|
* @return
|
|
|
|
* Current lcore empty poll counter value.
|
|
|
|
*/
|
|
|
|
uint64_t __rte_experimental
|
|
|
|
rte_power_empty_poll_stat_fetch(unsigned int lcore_id);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Fetch specific core valid poll counter.
|
|
|
|
*
|
|
|
|
* @param lcore_id
|
|
|
|
* lcore id
|
|
|
|
*
|
|
|
|
* @return
|
|
|
|
* Current lcore valid poll counter value.
|
|
|
|
*/
|
|
|
|
uint64_t __rte_experimental
|
|
|
|
rte_power_poll_stat_fetch(unsigned int lcore_id);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Empty poll state change detection function
|
|
|
|
*
|
|
|
|
* @param tim
|
|
|
|
* The timer structure
|
|
|
|
* @param arg
|
|
|
|
* The customized parameter
|
|
|
|
*/
|
|
|
|
void __rte_experimental
|
|
|
|
rte_empty_poll_detection(struct rte_timer *tim, void *arg);
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif
|