numam-dpdk/app/test/test_stack_perf.c
Stanislaw Kardach 1abb185d6c stack: allow lock-free only on relevant architectures
Since commit 7911ba0473 ("stack: enable lock-free implementation for
aarch64"), the lock-free stack has been supported on arm64, but this was
missing from the doxygen description of the flag.

Currently it is impossible to detect programmatically whether the
lock-free implementation of rte_stack is supported. One could check
whether the header guard for the lock-free stubs is defined
(_RTE_STACK_LF_STUBS_H_), but that is an unstable implementation detail.
Because of that, all lock-free stack creations currently succeed silently
(as long as the stack header is 16B long), which later leads to push and
pop operations being NOPs. The observable effect is that stack_lf_autotest
fails on platforms that do not support the lock-free implementation,
when it should instead skip the lock-free test altogether.

This commit adds a new errno value (ENOTSUP) that may be returned by
rte_stack_create() to indicate that a given combination of flags is not
supported on the current platform. Support is detected through a
compile-time check in the include logic of rte_stack_lf.h, which also
defines the RTE_STACK_LF_SUPPORTED flag that applications may use to
check for lock-free support at compile time.

Use the added RTE_STACK_LF_SUPPORTED flag to disable the lock-free stack
tests at compile time.
The perf test does not fail because rte_stack_create() succeeds; marking
the test as skipped, however, gives a better indication of what was
actually tested.
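
As an illustration (not part of this commit), an application could combine
the compile-time flag with the new errno value roughly as follows. The
create_app_stack() helper and its name are hypothetical; rte_stack_create(),
RTE_STACK_F_LF, rte_errno and RTE_STACK_LF_SUPPORTED are the existing API
plus the additions described above:

	#include <errno.h>
	#include <stdio.h>

	#include <rte_errno.h>
	#include <rte_lcore.h>
	#include <rte_stack.h>

	/* Hypothetical helper: prefer the lock-free stack where available. */
	static struct rte_stack *
	create_app_stack(const char *name, unsigned int count)
	{
		uint32_t flags = 0;
		struct rte_stack *s;

	#if defined(RTE_STACK_LF_SUPPORTED)
		flags |= RTE_STACK_F_LF;
	#endif

		s = rte_stack_create(name, count, rte_socket_id(), flags);
		if (s == NULL && rte_errno == ENOTSUP)
			printf("lock-free stack not supported on this platform\n");

		return s;
	}

With this change, requesting RTE_STACK_F_LF on an unsupported platform
makes rte_stack_create() fail with rte_errno set to ENOTSUP instead of
silently creating a stack whose push and pop operations are NOPs.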

Fixes: 7911ba0473 ("stack: enable lock-free implementation for aarch64")

Signed-off-by: Stanislaw Kardach <kda@semihalf.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
2021-05-03 18:46:15 +02:00


/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

static rte_atomic32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

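/* Find a pair of lcores that are hyperthread siblings (same physical core). */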
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
unsigned int socket[2];
unsigned int core[2];
unsigned int id[2];
RTE_LCORE_FOREACH(id[0]) {
RTE_LCORE_FOREACH(id[1]) {
if (id[0] == id[1])
continue;
core[0] = rte_lcore_to_cpu_id(id[0]);
core[1] = rte_lcore_to_cpu_id(id[1]);
socket[0] = rte_lcore_to_socket_id(id[0]);
socket[1] = rte_lcore_to_socket_id(id[1]);
if ((core[0] == core[1]) && (socket[0] == socket[1])) {
lcp->c1 = id[0];
lcp->c2 = id[1];
return 0;
}
}
}
return 1;
}

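/* Find a pair of lcores on different physical cores of the same socket. */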
static int
get_two_cores(struct lcore_pair *lcp)
{
unsigned int socket[2];
unsigned int core[2];
unsigned int id[2];
RTE_LCORE_FOREACH(id[0]) {
RTE_LCORE_FOREACH(id[1]) {
if (id[0] == id[1])
continue;
core[0] = rte_lcore_to_cpu_id(id[0]);
core[1] = rte_lcore_to_cpu_id(id[1]);
socket[0] = rte_lcore_to_socket_id(id[0]);
socket[1] = rte_lcore_to_socket_id(id[1]);
if ((core[0] != core[1]) && (socket[0] == socket[1])) {
lcp->c1 = id[0];
lcp->c2 = id[1];
return 0;
}
}
}
return 1;
}

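/* Find a pair of lcores residing on different NUMA sockets. */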
static int
get_two_sockets(struct lcore_pair *lcp)
{
unsigned int socket[2];
unsigned int id[2];
RTE_LCORE_FOREACH(id[0]) {
RTE_LCORE_FOREACH(id[1]) {
if (id[0] == id[1])
continue;
socket[0] = rte_lcore_to_socket_id(id[0]);
socket[1] = rte_lcore_to_socket_id(id[1]);
if (socket[0] != socket[1]) {
lcp->c1 = id[0];
lcp->c2 = id[1];
return 0;
}
}
}
return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
unsigned int iterations = 100000000;
void *objs[MAX_BURST];
unsigned int i;
uint64_t start = rte_rdtsc();
for (i = 0; i < iterations; i++)
rte_stack_pop(s, objs, bulk_sizes[0]);
uint64_t end = rte_rdtsc();
printf("Stack empty pop: %.2F\n",
(double)(end - start) / iterations);
}

struct thread_args {
struct rte_stack *s;
unsigned int sz;
double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop */
static int
bulk_push_pop(void *p)
{
unsigned int iterations = 1000000;
struct thread_args *args = p;
void *objs[MAX_BURST] = {0};
unsigned int size, i;
struct rte_stack *s;
s = args->s;
size = args->sz;
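	/* Signal readiness and spin until every participating lcore arrives. */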
rte_atomic32_sub(&lcore_barrier, 1);
while (rte_atomic32_read(&lcore_barrier) != 0)
rte_pause();
uint64_t start = rte_rdtsc();
for (i = 0; i < iterations; i++) {
rte_stack_push(s, objs, size);
rte_stack_pop(s, objs, size);
}
uint64_t end = rte_rdtsc();
args->avg = ((double)(end - start))/(iterations * size);
return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores to measure stack
 * performance between hyperthread siblings, between cores on the same
 * socket, and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
lcore_function_t fn)
{
struct thread_args args[2];
unsigned int i;
for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
rte_atomic32_set(&lcore_barrier, 2);
args[0].sz = args[1].sz = bulk_sizes[i];
args[0].s = args[1].s = s;
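		/* If the main lcore is one of the pair, run fn() on it directly. */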
if (cores->c1 == rte_get_main_lcore()) {
rte_eal_remote_launch(fn, &args[1], cores->c2);
fn(&args[0]);
rte_eal_wait_lcore(cores->c2);
} else {
rte_eal_remote_launch(fn, &args[0], cores->c1);
rte_eal_remote_launch(fn, &args[1], cores->c2);
rte_eal_wait_lcore(cores->c1);
rte_eal_wait_lcore(cores->c2);
}
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
struct thread_args args[RTE_MAX_LCORE];
unsigned int i;
for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
unsigned int lcore_id;
int cnt = 0;
double avg;
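		/* n participants: n - 1 worker lcores plus the current lcore. */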
rte_atomic32_set(&lcore_barrier, n);
RTE_LCORE_FOREACH_WORKER(lcore_id) {
if (++cnt >= n)
break;
args[lcore_id].s = s;
args[lcore_id].sz = bulk_sizes[i];
if (rte_eal_remote_launch(fn, &args[lcore_id],
lcore_id))
rte_panic("Failed to launch lcore %d\n",
lcore_id);
}
lcore_id = rte_lcore_id();
args[lcore_id].s = s;
args[lcore_id].sz = bulk_sizes[i];
fn(&args[lcore_id]);
rte_eal_mp_wait_lcore();
avg = args[rte_lcore_id()].avg;
cnt = 0;
RTE_LCORE_FOREACH_WORKER(lcore_id) {
if (++cnt >= n)
break;
avg += args[lcore_id].avg;
}
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
bulk_sizes[i], avg / n);
}
}

/*
* Measure the cycle cost of pushing and popping a single pointer on a single
* lcore.
*/
static void
test_single_push_pop(struct rte_stack *s)
{
unsigned int iterations = 16000000;
void *obj = NULL;
unsigned int i;
uint64_t start = rte_rdtsc();
for (i = 0; i < iterations; i++) {
rte_stack_push(s, &obj, 1);
rte_stack_pop(s, &obj, 1);
}
uint64_t end = rte_rdtsc();
printf("Average cycles per single object push/pop: %.2F\n",
((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
unsigned int iterations = 8000000;
void *objs[MAX_BURST];
unsigned int sz, i;
for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
uint64_t start = rte_rdtsc();
for (i = 0; i < iterations; i++) {
rte_stack_push(s, objs, bulk_sizes[sz]);
rte_stack_pop(s, objs, bulk_sizes[sz]);
}
uint64_t end = rte_rdtsc();
double avg = ((double)(end - start) /
(iterations * bulk_sizes[sz]));
printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
bulk_sizes[sz], avg);
}
}

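/* Create a stack with the given flags and run the full set of perf tests. */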
static int
__test_stack_perf(uint32_t flags)
{
struct lcore_pair cores;
struct rte_stack *s;
rte_atomic32_init(&lcore_barrier);
s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
if (s == NULL) {
printf("[%s():%u] failed to create a stack\n",
__func__, __LINE__);
return -1;
}
printf("### Testing single element push/pop ###\n");
test_single_push_pop(s);
printf("\n### Testing empty pop ###\n");
test_empty_pop(s);
printf("\n### Testing using a single lcore ###\n");
test_bulk_push_pop(s);
if (get_two_hyperthreads(&cores) == 0) {
printf("\n### Testing using two hyperthreads ###\n");
run_on_core_pair(&cores, s, bulk_push_pop);
}
if (get_two_cores(&cores) == 0) {
printf("\n### Testing using two physical cores ###\n");
run_on_core_pair(&cores, s, bulk_push_pop);
}
if (get_two_sockets(&cores) == 0) {
printf("\n### Testing using two NUMA nodes ###\n");
run_on_core_pair(&cores, s, bulk_push_pop);
}
printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
rte_stack_free(s);
return 0;
}

static int
test_stack_perf(void)
{
return __test_stack_perf(0);
}

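/* Runs only where RTE_STACK_LF_SUPPORTED is defined; skipped otherwise. */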
static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
return __test_stack_perf(RTE_STACK_F_LF);
#else
return TEST_SKIPPED;
#endif
}

REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);