service: reduce statistics overhead for parallel services

Move the statistics from the service data structure to the per-lcore
struct. This eliminates contention for the counter cache lines, which
decreases the producer-side statistics overhead for services deployed
across many lcores.
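
The change amounts to keeping one counter pair per lcore instead of a
single shared pair. A minimal layout sketch (illustrative names only,
not the actual DPDK structures):

    #include <stdint.h>

    /* Before: one counter pair shared by every lcore running the
     * service; all writers contend for the same cache line.
     */
    struct shared_stats {
            uint64_t calls;
            uint64_t cycles;
    };

    /* After: one counter pair per lcore, embedded in the per-lcore
     * state, so each writer touches only its own cache line and
     * readers sum the per-lcore values on demand. The 64-byte
     * alignment stands in for the cache line size.
     */
    struct percore_stats {
            uint64_t calls;
            uint64_t cycles;
    };

    struct percore_state {
            /* ... other per-lcore fields ... */
            struct percore_stats stats[64]; /* one slot per service id */
    } __attribute__((aligned(64)));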

Prior to this patch, enabling statistics for a service with a
per-service function call latency of 1000 clock cycles deployed across
16 cores on an Intel Xeon 6230N @ 2.3 GHz would incur a cost of ~10000
core clock cycles per service call. After this patch, the statistics
overhead is reduced to 22 clock cycles per call.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Harry van Haaren <harry.van.haaren@intel.com>
Author: Mattias Rönnblom, 2022-10-05 11:16:10 +02:00
Committed by: David Marchand
commit eb111cbdc2 (parent 99e4e84047)

@@ -50,16 +50,12 @@ struct rte_service_spec_impl {
* on currently.
*/
uint32_t num_mapped_cores;
/* 32-bit builds won't naturally align a uint64_t, so force alignment,
* allowing regular reads to be atomic.
*/
uint64_t calls __rte_aligned(8);
uint64_t cycles_spent __rte_aligned(8);
} __rte_cache_aligned;
/* Mask used to ensure uint64_t 8 byte vars are naturally aligned. */
#define RTE_SERVICE_STAT_ALIGN_MASK (8 - 1)
struct service_stats {
uint64_t calls;
uint64_t cycles;
};
/* the internal values of a service core */
struct core_state {
@@ -70,7 +66,7 @@ struct core_state {
uint8_t is_service_core; /* set if core is currently a service core */
uint8_t service_active_on_lcore[RTE_SERVICE_NUM_MAX];
uint64_t loops;
uint64_t calls_per_service[RTE_SERVICE_NUM_MAX];
struct service_stats service_stats[RTE_SERVICE_NUM_MAX];
} __rte_cache_aligned;
static uint32_t rte_service_count;
@@ -138,13 +134,16 @@ rte_service_finalize(void)
rte_service_library_initialized = 0;
}
/* returns 1 if service is registered and has not been unregistered
* Returns 0 if service never registered, or has been unregistered
*/
static inline int
static inline bool
service_registered(uint32_t id)
{
return rte_services[id].internal_flags & SERVICE_F_REGISTERED;
}
static inline bool
service_valid(uint32_t id)
{
return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED);
return id < RTE_SERVICE_NUM_MAX && service_registered(id);
}
static struct rte_service_spec_impl *
@@ -155,7 +154,7 @@ service_get(uint32_t id)
/* validate ID and retrieve service pointer, or return error value */
#define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \
if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \
if (!service_valid(id)) \
return retval; \
service = &rte_services[id]; \
} while (0)
@@ -217,7 +216,7 @@ rte_service_get_by_name(const char *name, uint32_t *service_id)
int i;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (service_valid(i) &&
if (service_registered(i) &&
strcmp(name, rte_services[i].spec.name) == 0) {
*service_id = i;
return 0;
@@ -254,7 +253,7 @@ rte_service_component_register(const struct rte_service_spec *spec,
return -EINVAL;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i)) {
if (!service_registered(i)) {
free_slot = i;
break;
}
@@ -366,29 +365,25 @@ service_runner_do_callback(struct rte_service_spec_impl *s,
{
void *userdata = s->spec.callback_userdata;
/* Ensure the atomically stored variables are naturally aligned,
* as required for regular loads to be atomic.
*/
RTE_BUILD_BUG_ON((offsetof(struct rte_service_spec_impl, calls)
& RTE_SERVICE_STAT_ALIGN_MASK) != 0);
RTE_BUILD_BUG_ON((offsetof(struct rte_service_spec_impl, cycles_spent)
& RTE_SERVICE_STAT_ALIGN_MASK) != 0);
if (service_stats_enabled(s)) {
uint64_t start = rte_rdtsc();
s->spec.callback(userdata);
uint64_t end = rte_rdtsc();
uint64_t cycles = end - start;
cs->calls_per_service[service_idx]++;
if (service_mt_safe(s)) {
__atomic_fetch_add(&s->cycles_spent, cycles, __ATOMIC_RELAXED);
__atomic_fetch_add(&s->calls, 1, __ATOMIC_RELAXED);
} else {
uint64_t cycles_new = s->cycles_spent + cycles;
uint64_t calls_new = s->calls++;
__atomic_store_n(&s->cycles_spent, cycles_new, __ATOMIC_RELAXED);
__atomic_store_n(&s->calls, calls_new, __ATOMIC_RELAXED);
}
/* The lcore service worker thread is the only writer,
* and thus only a non-atomic load and an atomic store
* is needed, and not the more expensive atomic
* add.
*/
struct service_stats *service_stats =
&cs->service_stats[service_idx];
__atomic_store_n(&service_stats->calls,
service_stats->calls + 1, __ATOMIC_RELAXED);
__atomic_store_n(&service_stats->cycles,
service_stats->cycles + cycles, __ATOMIC_RELAXED);
} else
s->spec.callback(userdata);
}
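
The counter update above relies on there being exactly one writer per
counter, namely the lcore's service runner thread. A generic sketch of
that single-writer pattern, using the GCC __atomic builtins outside of
any DPDK context, for illustration only:

    #include <stdint.h>

    /* A single, known writer thread updates the counter; any thread
     * may read it.
     */
    static uint64_t counter;

    /* Writer side: a plain load plus a relaxed atomic store suffices,
     * avoiding the read-modify-write cost of __atomic_fetch_add().
     */
    static void
    counter_add(uint64_t delta)
    {
            __atomic_store_n(&counter, counter + delta, __ATOMIC_RELAXED);
    }

    /* Reader side: a relaxed atomic load returns a consistent (never
     * torn), if possibly slightly stale, value.
     */
    static uint64_t
    counter_read(void)
    {
            return __atomic_load_n(&counter, __ATOMIC_RELAXED);
    }
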
@@ -436,7 +431,7 @@ rte_service_may_be_active(uint32_t id)
int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE);
int i;
if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id))
if (!service_valid(id))
return -EINVAL;
for (i = 0; i < lcore_count; i++) {
@@ -483,16 +478,17 @@ service_runner_func(void *arg)
*/
while (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) ==
RUNSTATE_RUNNING) {
const uint64_t service_mask = cs->service_mask;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
/* return value ignored as no change to code flow */
service_run(i, cs, service_mask, service_get(i), 1);
}
cs->loops++;
__atomic_store_n(&cs->loops, cs->loops + 1, __ATOMIC_RELAXED);
}
/* Use SEQ CST memory ordering to avoid any re-ordering around
@@ -608,8 +604,8 @@ static int32_t
service_update(uint32_t sid, uint32_t lcore, uint32_t *set, uint32_t *enabled)
{
/* validate ID, or return error value */
if (sid >= RTE_SERVICE_NUM_MAX || !service_valid(sid) ||
lcore >= RTE_MAX_LCORE || !lcore_states[lcore].is_service_core)
if (!service_valid(sid) || lcore >= RTE_MAX_LCORE ||
!lcore_states[lcore].is_service_core)
return -EINVAL;
uint64_t sid_mask = UINT64_C(1) << sid;
@@ -813,21 +809,76 @@ rte_service_lcore_stop(uint32_t lcore)
return 0;
}
static uint64_t
lcore_attr_get_loops(unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->loops, __ATOMIC_RELAXED);
}
static uint64_t
lcore_attr_get_service_calls(uint32_t service_id, unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->service_stats[service_id].calls,
__ATOMIC_RELAXED);
}
static uint64_t
lcore_attr_get_service_cycles(uint32_t service_id, unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->service_stats[service_id].cycles,
__ATOMIC_RELAXED);
}
typedef uint64_t (*lcore_attr_get_fun)(uint32_t service_id,
unsigned int lcore);
static uint64_t
attr_get(uint32_t id, lcore_attr_get_fun lcore_attr_get)
{
unsigned int lcore;
uint64_t sum = 0;
for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
if (lcore_states[lcore].is_service_core)
sum += lcore_attr_get(id, lcore);
}
return sum;
}
static uint64_t
attr_get_service_calls(uint32_t service_id)
{
return attr_get(service_id, lcore_attr_get_service_calls);
}
static uint64_t
attr_get_service_cycles(uint32_t service_id)
{
return attr_get(service_id, lcore_attr_get_service_cycles);
}
int32_t
rte_service_attr_get(uint32_t id, uint32_t attr_id, uint64_t *attr_value)
{
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
if (!service_valid(id))
return -EINVAL;
if (!attr_value)
return -EINVAL;
switch (attr_id) {
case RTE_SERVICE_ATTR_CYCLES:
*attr_value = s->cycles_spent;
return 0;
case RTE_SERVICE_ATTR_CALL_COUNT:
*attr_value = s->calls;
*attr_value = attr_get_service_calls(id);
return 0;
case RTE_SERVICE_ATTR_CYCLES:
*attr_value = attr_get_service_cycles(id);
return 0;
default:
return -EINVAL;
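
From the application's point of view the attribute API is unchanged;
the totals are now simply aggregated across lcores at read time. A
usage sketch, assuming a valid service id and that statistics
collection has been enabled with rte_service_set_stats_enable():

    #include <inttypes.h>
    #include <stdio.h>

    #include <rte_service.h>

    /* Print the aggregate call and cycle counts for one service; the
     * values are summed over all service lcores at read time.
     */
    static void
    print_service_stats(uint32_t service_id)
    {
            uint64_t calls;
            uint64_t cycles;

            if (rte_service_attr_get(service_id, RTE_SERVICE_ATTR_CALL_COUNT,
                            &calls) != 0 ||
                rte_service_attr_get(service_id, RTE_SERVICE_ATTR_CYCLES,
                            &cycles) != 0) {
                    printf("invalid service id %u\n", service_id);
                    return;
            }

            printf("calls: %" PRIu64 " cycles: %" PRIu64 "\n",
                    calls, cycles);
    }
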
@@ -849,7 +900,7 @@ rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id,
switch (attr_id) {
case RTE_SERVICE_LCORE_ATTR_LOOPS:
*attr_value = cs->loops;
*attr_value = lcore_attr_get_loops(lcore);
return 0;
default:
return -EINVAL;
@@ -859,11 +910,17 @@ rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id,
int32_t
rte_service_attr_reset_all(uint32_t id)
{
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
unsigned int lcore;
if (!service_valid(id))
return -EINVAL;
for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
struct core_state *cs = &lcore_states[lcore];
cs->service_stats[id] = (struct service_stats) {};
}
s->cycles_spent = 0;
s->calls = 0;
return 0;
}
@@ -885,17 +942,25 @@ rte_service_lcore_attr_reset_all(uint32_t lcore)
}
static void
service_dump_one(FILE *f, struct rte_service_spec_impl *s)
service_dump_one(FILE *f, uint32_t id)
{
/* avoid divide by zero */
int calls = 1;
struct rte_service_spec_impl *s;
uint64_t service_calls;
uint64_t service_cycles;
service_calls = attr_get_service_calls(id);
service_cycles = attr_get_service_cycles(id);
/* avoid divide by zero */
if (service_calls == 0)
service_calls = 1;
s = service_get(id);
if (s->calls != 0)
calls = s->calls;
fprintf(f, " %s: stats %d\tcalls %"PRIu64"\tcycles %"
PRIu64"\tavg: %"PRIu64"\n",
s->spec.name, service_stats_enabled(s), s->calls,
s->cycles_spent, s->cycles_spent / calls);
PRIu64"\tavg: %"PRIu64"\n",
s->spec.name, service_stats_enabled(s), service_calls,
service_cycles, service_cycles / service_calls);
}
static void
@@ -906,9 +971,9 @@ service_dump_calls_per_lcore(FILE *f, uint32_t lcore)
fprintf(f, "%02d\t", lcore);
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
fprintf(f, "%"PRIu64"\t", cs->calls_per_service[i]);
fprintf(f, "%"PRIu64"\t", cs->service_stats[i].calls);
}
fprintf(f, "\n");
}
@@ -924,16 +989,16 @@ rte_service_dump(FILE *f, uint32_t id)
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
fprintf(f, "Service %s Summary\n", s->spec.name);
service_dump_one(f, s);
service_dump_one(f, id);
return 0;
}
/* print all services, as UINT32_MAX was passed as id */
fprintf(f, "Services Summary\n");
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
service_dump_one(f, &rte_services[i]);
service_dump_one(f, i);
}
fprintf(f, "Service Cores Summary\n");