a9de470cc7
Since all other apps have been moved to the "app" folder, the autotest app remains alone in the test folder. Rather than having an entire top-level folder for this, we can move it back to where it all started in early versions of DPDK - the "app/" folder. This move has a few advantages:

* It reduces clutter at the top level of the project by removing one folder.
* It eliminates the separate build task needed to build the autotests with make ("make test-build"), which means that developers are less likely to miss something in their own compilation tests.
* It re-aligns the final location of the test binary (the app folder, when building with make) with its location in the source tree.

For meson builds, the autotest app differs from the other apps in that it needs a series of test cases defined for it for use by "meson test". Therefore, it is not built as part of the main loop in the app folder, but is built separately at the end.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
402 lines
10 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>
#include <rte_ring.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>

#include "test.h"

/*
 * Ring
 * ====
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in 1 thread
 *  * Enqueue/dequeue of bursts in 2 threads
 */

#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32

/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

struct lcore_pair {
	unsigned c1, c2;
};
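/*
 * Used as a simple barrier: each worker lcore increments this and spins
 * until both the enqueue and dequeue lcores are ready to start timing.
 */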
static volatile unsigned lcore_count = 0;

/**** Functions to analyse our core mask to get cores for different tests ***/
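/* Find two lcores that are hyperthread siblings: same physical core, same socket. */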
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		/* inner loop just re-reads all ids. We could skip the first few
		 * elements, but since the number of cores is small there is
		 * little point
		 */
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = lcore_config[id1].core_id;
			c2 = lcore_config[id2].core_id;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if ((c1 == c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
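/* Find two lcores on different physical cores of the same socket. */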
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = lcore_config[id1].core_id;
			c2 = lcore_config[id2].core_id;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if ((c1 != c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
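/* Find two lcores on different sockets (NUMA nodes). */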
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			s1 = lcore_config[id1].socket_id;
			s2 = lcore_config[id2].socket_id;
			if (s1 != s2) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
static void
test_empty_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 26;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst[MAX_BURST];

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t mc_end = rte_rdtsc();

	printf("SC empty dequeue: %.2F\n",
			(double)(sc_end-sc_start) / iterations);
	printf("MC empty dequeue: %.2F\n",
			(double)(mc_end-mc_start) / iterations);
}

/*
 * The separate enqueue and dequeue threads each take one input parameter and
 * return two results. Input = burst size, output = cycle averages for sp/sc & mp/mc.
 */
struct thread_params {
	struct rte_ring *r;
	unsigned size;      /* input value, the burst size */
	double spsc, mpmc;  /* output value, the single or multi timings */
};

/*
 * Function that uses rdtsc to measure timing for ring enqueue. Needs a paired
 * thread running the dequeue_bulk function.
 */
static int
enqueue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};
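	/* wait until both the enqueue and dequeue lcores have arrived here */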
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sp_end = rte_rdtsc();

	const uint64_t mp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mp_end = rte_rdtsc();

	params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
	params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
	return 0;
}

/*
 * Function that uses rdtsc to measure timing for ring dequeue. Needs a paired
 * thread running the enqueue_bulk function.
 */
static int
dequeue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};
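	/* wait for the paired enqueue lcore, as in enqueue_bulk() */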
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mc_end = rte_rdtsc();

	params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
	params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
	return 0;
}

/*
 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
 * Used to measure ring perf between hyperthreads, cores and sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
		lcore_function_t f1, lcore_function_t f2)
{
	struct thread_params param1 = {0}, param2 = {0};
	unsigned i;
	for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
		lcore_count = 0;
		param1.size = param2.size = bulk_sizes[i];
		param1.r = param2.r = r;
		if (cores->c1 == rte_get_master_lcore()) {
			rte_eal_remote_launch(f2, &param2, cores->c2);
			f1(&param1);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(f1, &param1, cores->c1);
			rte_eal_remote_launch(f2, &param2, cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}
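		/*
		 * Each thread reports average cycles per object for its own
		 * side; the sum gives the cost of one enqueue plus one dequeue.
		 */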
printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
|
|
param1.spsc + param2.spsc);
|
|
printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
|
|
param1.mpmc + param2.mpmc);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Test function that determines how long an enqueue + dequeue of a single item
|
|
* takes on a single lcore. Result is for comparison with the bulk enq+deq.
|
|
*/
|
|
static void
|
|
test_single_enqueue_dequeue(struct rte_ring *r)
|
|
{
|
|
const unsigned iter_shift = 24;
|
|
const unsigned iterations = 1<<iter_shift;
|
|
unsigned i = 0;
|
|
void *burst = NULL;
|
|
|
|
const uint64_t sc_start = rte_rdtsc();
|
|
for (i = 0; i < iterations; i++) {
|
|
rte_ring_sp_enqueue(r, burst);
|
|
rte_ring_sc_dequeue(r, &burst);
|
|
}
|
|
const uint64_t sc_end = rte_rdtsc();
|
|
|
|
const uint64_t mc_start = rte_rdtsc();
|
|
for (i = 0; i < iterations; i++) {
|
|
rte_ring_mp_enqueue(r, burst);
|
|
rte_ring_mc_dequeue(r, &burst);
|
|
}
|
|
const uint64_t mc_end = rte_rdtsc();
|
|
|
|
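	/* iterations is 1<<iter_shift, so shifting right by iter_shift
	 * gives the average cycle count per enqueue+dequeue pair.
	 */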
printf("SP/SC single enq/dequeue: %"PRIu64"\n",
|
|
(sc_end-sc_start) >> iter_shift);
|
|
printf("MP/MC single enq/dequeue: %"PRIu64"\n",
|
|
(mc_end-mc_start) >> iter_shift);
|
|
}
|
|
|
|
/*
|
|
* Test that does both enqueue and dequeue on a core using the burst() API calls
|
|
* instead of the bulk() calls used in other tests. Results should be the same
|
|
* as for the bulk function called on a single lcore.
|
|
*/
|
|
static void
|
|
test_burst_enqueue_dequeue(struct rte_ring *r)
|
|
{
|
|
const unsigned iter_shift = 23;
|
|
const unsigned iterations = 1<<iter_shift;
|
|
unsigned sz, i = 0;
|
|
void *burst[MAX_BURST] = {0};
|
|
|
|
for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
|
|
const uint64_t sc_start = rte_rdtsc();
|
|
for (i = 0; i < iterations; i++) {
|
|
rte_ring_sp_enqueue_burst(r, burst,
|
|
bulk_sizes[sz], NULL);
|
|
rte_ring_sc_dequeue_burst(r, burst,
|
|
bulk_sizes[sz], NULL);
|
|
}
|
|
const uint64_t sc_end = rte_rdtsc();
|
|
|
|
const uint64_t mc_start = rte_rdtsc();
|
|
for (i = 0; i < iterations; i++) {
|
|
rte_ring_mp_enqueue_burst(r, burst,
|
|
bulk_sizes[sz], NULL);
|
|
rte_ring_mc_dequeue_burst(r, burst,
|
|
bulk_sizes[sz], NULL);
|
|
}
|
|
const uint64_t mc_end = rte_rdtsc();
|
|
|
|
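		/* average cycles per object: divide by iterations (via the
		 * shift) and then by the burst size
		 */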
		uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
		uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];

		printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				mc_avg);
	}
}

/* Times enqueue and dequeue on a single lcore */
static void
test_bulk_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};

	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();

		double sc_avg = ((double)(sc_end-sc_start) /
				(iterations * bulk_sizes[sz]));
		double mc_avg = ((double)(mc_end-mc_start) /
				(iterations * bulk_sizes[sz]));

		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				mc_avg);
	}
}

static int
test_ring_perf(void)
{
	struct lcore_pair cores;
	struct rte_ring *r = NULL;

	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
	if (r == NULL)
		return -1;

	printf("### Testing single element and burst enq/deq ###\n");
	test_single_enqueue_dequeue(r);
	test_burst_enqueue_dequeue(r);

	printf("\n### Testing empty dequeue ###\n");
	test_empty_dequeue(r);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_enqueue_dequeue(r);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	rte_ring_free(r);
	return 0;
}
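/* register so the test can be run as "ring_perf_autotest" from the test app */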
REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);