test/stack: check stack performance
stack_perf_autotest tests the following with one lcore:
- Cycles to attempt to pop an empty stack
- Cycles to push then pop a single object
- Cycles to push then pop a burst of 32 objects

It also tests the cycles to push then pop a burst of 8 and 32 objects with the following lcore combinations (if possible):
- Two hyperthreads
- Two physical cores
- Two physical cores on separate NUMA nodes
- All available lcores

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Olivier Matz <olivier.matz@6wind.com>
This commit is contained in:
parent
5e2e61b99e
commit
cfe6fab029
@ -91,6 +91,7 @@ endif
|
||||
SRCS-y += test_rwlock.c
|
||||
|
||||
SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack.c
|
||||
SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack_perf.c
|
||||
|
||||
SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c
|
||||
SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c
|
||||
|
@ -96,6 +96,7 @@ test_sources = files('commands.c',
|
||||
'test_service_cores.c',
|
||||
'test_spinlock.c',
|
||||
'test_stack.c',
|
||||
'test_stack_perf.c',
|
||||
'test_string_fns.c',
|
||||
'test_table.c',
|
||||
'test_table_acl.c',
|
||||
@ -241,6 +242,7 @@ perf_test_names = [
|
||||
'distributor_perf_autotest',
|
||||
'ring_pmd_perf_autotest',
|
||||
'pmd_perf_autotest',
|
||||
'stack_perf_autotest',
|
||||
]
|
||||
|
||||
# All test cases in driver_test_names list are non-parallel
|
||||
|
345
app/test/test_stack_perf.c
Normal file
345
app/test/test_stack_perf.c
Normal file
@ -0,0 +1,345 @@
|
||||
/* SPDX-License-Identifier: BSD-3-Clause
|
||||
* Copyright(c) 2019 Intel Corporation
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include <rte_atomic.h>
|
||||
#include <rte_cycles.h>
|
||||
#include <rte_launch.h>
|
||||
#include <rte_pause.h>
|
||||
#include <rte_stack.h>
|
||||
|
||||
#include "test.h"
|
||||
|
||||
#define STACK_NAME "STACK_PERF"
|
||||
#define MAX_BURST 32
|
||||
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
|
||||
|
||||
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
|
||||
|
||||
/*
|
||||
* Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
|
||||
* constants.
|
||||
*/
|
||||
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
|
||||
|
||||
static rte_atomic32_t lcore_barrier;
|
||||
|
||||
/* Two lcore ids chosen to run a benchmark at the same time. */
struct lcore_pair {
	unsigned int c1; /* first lcore id of the pair */
	unsigned int c2; /* second lcore id of the pair */
};
|
||||
|
||||
static int
|
||||
get_two_hyperthreads(struct lcore_pair *lcp)
|
||||
{
|
||||
unsigned int socket[2];
|
||||
unsigned int core[2];
|
||||
unsigned int id[2];
|
||||
|
||||
RTE_LCORE_FOREACH(id[0]) {
|
||||
RTE_LCORE_FOREACH(id[1]) {
|
||||
if (id[0] == id[1])
|
||||
continue;
|
||||
core[0] = lcore_config[id[0]].core_id;
|
||||
core[1] = lcore_config[id[1]].core_id;
|
||||
socket[0] = lcore_config[id[0]].socket_id;
|
||||
socket[1] = lcore_config[id[1]].socket_id;
|
||||
if ((core[0] == core[1]) && (socket[0] == socket[1])) {
|
||||
lcp->c1 = id[0];
|
||||
lcp->c2 = id[1];
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
get_two_cores(struct lcore_pair *lcp)
|
||||
{
|
||||
unsigned int socket[2];
|
||||
unsigned int core[2];
|
||||
unsigned int id[2];
|
||||
|
||||
RTE_LCORE_FOREACH(id[0]) {
|
||||
RTE_LCORE_FOREACH(id[1]) {
|
||||
if (id[0] == id[1])
|
||||
continue;
|
||||
core[0] = lcore_config[id[0]].core_id;
|
||||
core[1] = lcore_config[id[1]].core_id;
|
||||
socket[0] = lcore_config[id[0]].socket_id;
|
||||
socket[1] = lcore_config[id[1]].socket_id;
|
||||
if ((core[0] != core[1]) && (socket[0] == socket[1])) {
|
||||
lcp->c1 = id[0];
|
||||
lcp->c2 = id[1];
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
get_two_sockets(struct lcore_pair *lcp)
|
||||
{
|
||||
unsigned int socket[2];
|
||||
unsigned int id[2];
|
||||
|
||||
RTE_LCORE_FOREACH(id[0]) {
|
||||
RTE_LCORE_FOREACH(id[1]) {
|
||||
if (id[0] == id[1])
|
||||
continue;
|
||||
socket[0] = lcore_config[id[0]].socket_id;
|
||||
socket[1] = lcore_config[id[1]].socket_id;
|
||||
if (socket[0] != socket[1]) {
|
||||
lcp->c1 = id[0];
|
||||
lcp->c2 = id[1];
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Measure the cycle cost of popping an empty stack. */
|
||||
static void
|
||||
test_empty_pop(struct rte_stack *s)
|
||||
{
|
||||
unsigned int iterations = 100000000;
|
||||
void *objs[MAX_BURST];
|
||||
unsigned int i;
|
||||
|
||||
uint64_t start = rte_rdtsc();
|
||||
|
||||
for (i = 0; i < iterations; i++)
|
||||
rte_stack_pop(s, objs, bulk_sizes[0]);
|
||||
|
||||
uint64_t end = rte_rdtsc();
|
||||
|
||||
printf("Stack empty pop: %.2F\n",
|
||||
(double)(end - start) / iterations);
|
||||
}
|
||||
|
||||
/* Per-lcore argument and result record handed to bulk_push_pop(). */
struct thread_args {
	struct rte_stack *s; /* in: stack to push to and pop from */
	unsigned int sz;     /* in: number of objects per push/pop call */
	double avg;          /* out: measured average cycles per object */
};
|
||||
|
||||
/* Measure the average per-pointer cycle cost of stack push and pop */
|
||||
static int
|
||||
bulk_push_pop(void *p)
|
||||
{
|
||||
unsigned int iterations = 1000000;
|
||||
struct thread_args *args = p;
|
||||
void *objs[MAX_BURST] = {0};
|
||||
unsigned int size, i;
|
||||
struct rte_stack *s;
|
||||
|
||||
s = args->s;
|
||||
size = args->sz;
|
||||
|
||||
rte_atomic32_sub(&lcore_barrier, 1);
|
||||
while (rte_atomic32_read(&lcore_barrier) != 0)
|
||||
rte_pause();
|
||||
|
||||
uint64_t start = rte_rdtsc();
|
||||
|
||||
for (i = 0; i < iterations; i++) {
|
||||
rte_stack_push(s, objs, size);
|
||||
rte_stack_pop(s, objs, size);
|
||||
}
|
||||
|
||||
uint64_t end = rte_rdtsc();
|
||||
|
||||
args->avg = ((double)(end - start))/(iterations * size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
|
||||
* perf when between hyperthread siblings, cores on the same socket, and cores
|
||||
* on different sockets.
|
||||
*/
|
||||
static void
|
||||
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
|
||||
lcore_function_t fn)
|
||||
{
|
||||
struct thread_args args[2];
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
|
||||
rte_atomic32_set(&lcore_barrier, 2);
|
||||
|
||||
args[0].sz = args[1].sz = bulk_sizes[i];
|
||||
args[0].s = args[1].s = s;
|
||||
|
||||
if (cores->c1 == rte_get_master_lcore()) {
|
||||
rte_eal_remote_launch(fn, &args[1], cores->c2);
|
||||
fn(&args[0]);
|
||||
rte_eal_wait_lcore(cores->c2);
|
||||
} else {
|
||||
rte_eal_remote_launch(fn, &args[0], cores->c1);
|
||||
rte_eal_remote_launch(fn, &args[1], cores->c2);
|
||||
rte_eal_wait_lcore(cores->c1);
|
||||
rte_eal_wait_lcore(cores->c2);
|
||||
}
|
||||
|
||||
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
|
||||
bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Run bulk_push_pop() simultaneously on 1+ cores. */
|
||||
static void
|
||||
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
|
||||
{
|
||||
struct thread_args args[RTE_MAX_LCORE];
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
|
||||
unsigned int lcore_id;
|
||||
int cnt = 0;
|
||||
double avg;
|
||||
|
||||
rte_atomic32_set(&lcore_barrier, n);
|
||||
|
||||
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
|
||||
if (++cnt >= n)
|
||||
break;
|
||||
|
||||
args[lcore_id].s = s;
|
||||
args[lcore_id].sz = bulk_sizes[i];
|
||||
|
||||
if (rte_eal_remote_launch(fn, &args[lcore_id],
|
||||
lcore_id))
|
||||
rte_panic("Failed to launch lcore %d\n",
|
||||
lcore_id);
|
||||
}
|
||||
|
||||
lcore_id = rte_lcore_id();
|
||||
|
||||
args[lcore_id].s = s;
|
||||
args[lcore_id].sz = bulk_sizes[i];
|
||||
|
||||
fn(&args[lcore_id]);
|
||||
|
||||
rte_eal_mp_wait_lcore();
|
||||
|
||||
avg = args[rte_lcore_id()].avg;
|
||||
|
||||
cnt = 0;
|
||||
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
|
||||
if (++cnt >= n)
|
||||
break;
|
||||
avg += args[lcore_id].avg;
|
||||
}
|
||||
|
||||
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
|
||||
bulk_sizes[i], avg / n);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Time repeated push-then-pop of one pointer on stack s from the calling
 * lcore and print the average TSC cycles per push/pop round.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	const unsigned int iterations = 16000000;
	uint64_t start, end;
	void *obj = NULL;
	unsigned int i;

	start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}
	end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}
|
||||
|
||||
/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
|
||||
static void
|
||||
test_bulk_push_pop(struct rte_stack *s)
|
||||
{
|
||||
unsigned int iterations = 8000000;
|
||||
void *objs[MAX_BURST];
|
||||
unsigned int sz, i;
|
||||
|
||||
for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
|
||||
uint64_t start = rte_rdtsc();
|
||||
|
||||
for (i = 0; i < iterations; i++) {
|
||||
rte_stack_push(s, objs, bulk_sizes[sz]);
|
||||
rte_stack_pop(s, objs, bulk_sizes[sz]);
|
||||
}
|
||||
|
||||
uint64_t end = rte_rdtsc();
|
||||
|
||||
double avg = ((double)(end - start) /
|
||||
(iterations * bulk_sizes[sz]));
|
||||
|
||||
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
|
||||
bulk_sizes[sz], avg);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
test_stack_perf(void)
|
||||
{
|
||||
struct lcore_pair cores;
|
||||
struct rte_stack *s;
|
||||
|
||||
rte_atomic32_init(&lcore_barrier);
|
||||
|
||||
s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), 0);
|
||||
if (s == NULL) {
|
||||
printf("[%s():%u] failed to create a stack\n",
|
||||
__func__, __LINE__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("### Testing single element push/pop ###\n");
|
||||
test_single_push_pop(s);
|
||||
|
||||
printf("\n### Testing empty pop ###\n");
|
||||
test_empty_pop(s);
|
||||
|
||||
printf("\n### Testing using a single lcore ###\n");
|
||||
test_bulk_push_pop(s);
|
||||
|
||||
if (get_two_hyperthreads(&cores) == 0) {
|
||||
printf("\n### Testing using two hyperthreads ###\n");
|
||||
run_on_core_pair(&cores, s, bulk_push_pop);
|
||||
}
|
||||
if (get_two_cores(&cores) == 0) {
|
||||
printf("\n### Testing using two physical cores ###\n");
|
||||
run_on_core_pair(&cores, s, bulk_push_pop);
|
||||
}
|
||||
if (get_two_sockets(&cores) == 0) {
|
||||
printf("\n### Testing using two NUMA nodes ###\n");
|
||||
run_on_core_pair(&cores, s, bulk_push_pop);
|
||||
}
|
||||
|
||||
printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
|
||||
run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
|
||||
|
||||
rte_stack_free(s);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Register test_stack_perf() under the test command name stack_perf_autotest. */
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
|
Loading…
Reference in New Issue
Block a user