service: reduce statistics overhead for parallel services

Move the statistics from the service data structure to the per-lcore
struct. This eliminates contention for the counter cache lines, which
decreases the producer-side statistics overhead for services deployed
across many lcores.
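
The change amounts to keeping one counter pair per lcore instead of a
single shared pair. A minimal layout sketch (illustrative names only,
not the actual DPDK structures):

    #include <stdint.h>

    /* Before: one counter pair shared by every lcore running the
     * service; all writers contend for the same cache line.
     */
    struct shared_stats {
            uint64_t calls;
            uint64_t cycles;
    };

    /* After: one counter pair per lcore, embedded in the per-lcore
     * state, so each writer touches only its own cache line and
     * readers sum the per-lcore values on demand. The 64-byte
     * alignment stands in for the cache line size.
     */
    struct percore_stats {
            uint64_t calls;
            uint64_t cycles;
    };

    struct percore_state {
            /* ... other per-lcore fields ... */
            struct percore_stats stats[64]; /* one slot per service id */
    } __attribute__((aligned(64)));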

Prior to this patch, enabling statistics for a service with a
per-service function call latency of 1000 clock cycles deployed across
16 cores on an Intel Xeon 6230N @ 2.3 GHz would incur a cost of ~10000
core clock cycles per service call. After this patch, the statistics
overhead is reduced to 22 clock cycles per call.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Harry van Haaren <harry.van.haaren@intel.com>
Author: Mattias Rönnblom, 2022-10-05 11:16:10 +02:00
Committed by: David Marchand
commit eb111cbdc2 (parent 99e4e84047)

@@ -50,16 +50,12 @@ struct rte_service_spec_impl {
* on currently.
*/
uint32_t num_mapped_cores;
/* 32-bit builds won't naturally align a uint64_t, so force alignment,
* allowing regular reads to be atomic.
*/
uint64_t calls __rte_aligned(8);
uint64_t cycles_spent __rte_aligned(8);
} __rte_cache_aligned;
/* Mask used to ensure uint64_t 8 byte vars are naturally aligned. */
#define RTE_SERVICE_STAT_ALIGN_MASK (8 - 1)
struct service_stats {
uint64_t calls;
uint64_t cycles;
};
/* the internal values of a service core */
struct core_state {
@@ -70,7 +66,7 @@ struct core_state {
uint8_t is_service_core; /* set if core is currently a service core */
uint8_t service_active_on_lcore[RTE_SERVICE_NUM_MAX];
uint64_t loops;
uint64_t calls_per_service[RTE_SERVICE_NUM_MAX];
struct service_stats service_stats[RTE_SERVICE_NUM_MAX];
} __rte_cache_aligned;
static uint32_t rte_service_count;
@@ -138,13 +134,16 @@ rte_service_finalize(void)
rte_service_library_initialized = 0;
}
/* returns 1 if service is registered and has not been unregistered
* Returns 0 if service never registered, or has been unregistered
*/
static inline int
static inline bool
service_registered(uint32_t id)
{
return rte_services[id].internal_flags & SERVICE_F_REGISTERED;
}
static inline bool
service_valid(uint32_t id)
{
return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED);
return id < RTE_SERVICE_NUM_MAX && service_registered(id);
}
static struct rte_service_spec_impl *
@@ -155,7 +154,7 @@ service_get(uint32_t id)
/* validate ID and retrieve service pointer, or return error value */
#define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \
if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \
if (!service_valid(id)) \
return retval; \
service = &rte_services[id]; \
} while (0)
@@ -217,7 +216,7 @@ rte_service_get_by_name(const char *name, uint32_t *service_id)
int i;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (service_valid(i) &&
if (service_registered(i) &&
strcmp(name, rte_services[i].spec.name) == 0) {
*service_id = i;
return 0;
@@ -254,7 +253,7 @@ rte_service_component_register(const struct rte_service_spec *spec,
return -EINVAL;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i)) {
if (!service_registered(i)) {
free_slot = i;
break;
}
@@ -366,29 +365,25 @@ service_runner_do_callback(struct rte_service_spec_impl *s,
{
void *userdata = s->spec.callback_userdata;
/* Ensure the atomically stored variables are naturally aligned,
* as required for regular loads to be atomic.
*/
RTE_BUILD_BUG_ON((offsetof(struct rte_service_spec_impl, calls)
& RTE_SERVICE_STAT_ALIGN_MASK) != 0);
RTE_BUILD_BUG_ON((offsetof(struct rte_service_spec_impl, cycles_spent)
& RTE_SERVICE_STAT_ALIGN_MASK) != 0);
if (service_stats_enabled(s)) {
uint64_t start = rte_rdtsc();
s->spec.callback(userdata);
uint64_t end = rte_rdtsc();
uint64_t cycles = end - start;
cs->calls_per_service[service_idx]++;
if (service_mt_safe(s)) {
__atomic_fetch_add(&s->cycles_spent, cycles, __ATOMIC_RELAXED);
__atomic_fetch_add(&s->calls, 1, __ATOMIC_RELAXED);
} else {
uint64_t cycles_new = s->cycles_spent + cycles;
uint64_t calls_new = s->calls++;
__atomic_store_n(&s->cycles_spent, cycles_new, __ATOMIC_RELAXED);
__atomic_store_n(&s->calls, calls_new, __ATOMIC_RELAXED);
}
/* The lcore service worker thread is the only writer,
* and thus only a non-atomic load and an atomic store
* is needed, and not the more expensive atomic
* add.
*/
struct service_stats *service_stats =
&cs->service_stats[service_idx];
__atomic_store_n(&service_stats->calls,
service_stats->calls + 1, __ATOMIC_RELAXED);
__atomic_store_n(&service_stats->cycles,
service_stats->cycles + cycles, __ATOMIC_RELAXED);
} else
s->spec.callback(userdata);
}
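
The counter update above relies on there being exactly one writer per
counter, namely the lcore's service runner thread. A generic sketch of
that single-writer pattern, using the GCC __atomic builtins outside of
any DPDK context, for illustration only:

    #include <stdint.h>

    /* A single, known writer thread updates the counter; any thread
     * may read it.
     */
    static uint64_t counter;

    /* Writer side: a plain load plus a relaxed atomic store suffices,
     * avoiding the read-modify-write cost of __atomic_fetch_add().
     */
    static void
    counter_add(uint64_t delta)
    {
            __atomic_store_n(&counter, counter + delta, __ATOMIC_RELAXED);
    }

    /* Reader side: a relaxed atomic load returns a consistent (never
     * torn), if possibly slightly stale, value.
     */
    static uint64_t
    counter_read(void)
    {
            return __atomic_load_n(&counter, __ATOMIC_RELAXED);
    }
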
@@ -436,7 +431,7 @@ rte_service_may_be_active(uint32_t id)
int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE);
int i;
if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id))
if (!service_valid(id))
return -EINVAL;
for (i = 0; i < lcore_count; i++) {
@@ -483,16 +478,17 @@ service_runner_func(void *arg)
*/
while (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) ==
RUNSTATE_RUNNING) {
const uint64_t service_mask = cs->service_mask;
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
/* return value ignored as no change to code flow */
service_run(i, cs, service_mask, service_get(i), 1);
}
cs->loops++;
__atomic_store_n(&cs->loops, cs->loops + 1, __ATOMIC_RELAXED);
}
/* Use SEQ CST memory ordering to avoid any re-ordering around
@@ -608,8 +604,8 @@ static int32_t
service_update(uint32_t sid, uint32_t lcore, uint32_t *set, uint32_t *enabled)
{
/* validate ID, or return error value */
if (sid >= RTE_SERVICE_NUM_MAX || !service_valid(sid) ||
lcore >= RTE_MAX_LCORE || !lcore_states[lcore].is_service_core)
if (!service_valid(sid) || lcore >= RTE_MAX_LCORE ||
!lcore_states[lcore].is_service_core)
return -EINVAL;
uint64_t sid_mask = UINT64_C(1) << sid;
@@ -813,21 +809,76 @@ rte_service_lcore_stop(uint32_t lcore)
return 0;
}
static uint64_t
lcore_attr_get_loops(unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->loops, __ATOMIC_RELAXED);
}
static uint64_t
lcore_attr_get_service_calls(uint32_t service_id, unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->service_stats[service_id].calls,
__ATOMIC_RELAXED);
}
static uint64_t
lcore_attr_get_service_cycles(uint32_t service_id, unsigned int lcore)
{
struct core_state *cs = &lcore_states[lcore];
return __atomic_load_n(&cs->service_stats[service_id].cycles,
__ATOMIC_RELAXED);
}
typedef uint64_t (*lcore_attr_get_fun)(uint32_t service_id,
unsigned int lcore);
static uint64_t
attr_get(uint32_t id, lcore_attr_get_fun lcore_attr_get)
{
unsigned int lcore;
uint64_t sum = 0;
for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
if (lcore_states[lcore].is_service_core)
sum += lcore_attr_get(id, lcore);
}
return sum;
}
static uint64_t
attr_get_service_calls(uint32_t service_id)
{
return attr_get(service_id, lcore_attr_get_service_calls);
}
static uint64_t
attr_get_service_cycles(uint32_t service_id)
{
return attr_get(service_id, lcore_attr_get_service_cycles);
}
int32_t
rte_service_attr_get(uint32_t id, uint32_t attr_id, uint64_t *attr_value)
{
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
if (!service_valid(id))
return -EINVAL;
if (!attr_value)
return -EINVAL;
switch (attr_id) {
case RTE_SERVICE_ATTR_CYCLES:
*attr_value = s->cycles_spent;
return 0;
case RTE_SERVICE_ATTR_CALL_COUNT:
*attr_value = s->calls;
*attr_value = attr_get_service_calls(id);
return 0;
case RTE_SERVICE_ATTR_CYCLES:
*attr_value = attr_get_service_cycles(id);
return 0;
default:
return -EINVAL;
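
From the application's point of view the attribute API is unchanged;
the totals are now simply aggregated across lcores at read time. A
usage sketch, assuming a valid service id and that statistics
collection has been enabled with rte_service_set_stats_enable():

    #include <inttypes.h>
    #include <stdio.h>

    #include <rte_service.h>

    /* Print the aggregate call and cycle counts for one service; the
     * values are summed over all service lcores at read time.
     */
    static void
    print_service_stats(uint32_t service_id)
    {
            uint64_t calls;
            uint64_t cycles;

            if (rte_service_attr_get(service_id, RTE_SERVICE_ATTR_CALL_COUNT,
                            &calls) != 0 ||
                rte_service_attr_get(service_id, RTE_SERVICE_ATTR_CYCLES,
                            &cycles) != 0) {
                    printf("invalid service id %u\n", service_id);
                    return;
            }

            printf("calls: %" PRIu64 " cycles: %" PRIu64 "\n",
                    calls, cycles);
    }
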
@@ -849,7 +900,7 @@ rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id,
switch (attr_id) {
case RTE_SERVICE_LCORE_ATTR_LOOPS:
*attr_value = cs->loops;
*attr_value = lcore_attr_get_loops(lcore);
return 0;
default:
return -EINVAL;
@@ -859,11 +910,17 @@ rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id,
int32_t
rte_service_attr_reset_all(uint32_t id)
{
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
unsigned int lcore;
if (!service_valid(id))
return -EINVAL;
for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
struct core_state *cs = &lcore_states[lcore];
cs->service_stats[id] = (struct service_stats) {};
}
s->cycles_spent = 0;
s->calls = 0;
return 0;
}
@@ -885,17 +942,25 @@ rte_service_lcore_attr_reset_all(uint32_t lcore)
}
static void
service_dump_one(FILE *f, struct rte_service_spec_impl *s)
service_dump_one(FILE *f, uint32_t id)
{
/* avoid divide by zero */
int calls = 1;
struct rte_service_spec_impl *s;
uint64_t service_calls;
uint64_t service_cycles;
service_calls = attr_get_service_calls(id);
service_cycles = attr_get_service_cycles(id);
/* avoid divide by zero */
if (service_calls == 0)
service_calls = 1;
s = service_get(id);
if (s->calls != 0)
calls = s->calls;
fprintf(f, " %s: stats %d\tcalls %"PRIu64"\tcycles %"
PRIu64"\tavg: %"PRIu64"\n",
s->spec.name, service_stats_enabled(s), s->calls,
s->cycles_spent, s->cycles_spent / calls);
PRIu64"\tavg: %"PRIu64"\n",
s->spec.name, service_stats_enabled(s), service_calls,
service_cycles, service_cycles / service_calls);
}
static void
@@ -906,9 +971,9 @@ service_dump_calls_per_lcore(FILE *f, uint32_t lcore)
fprintf(f, "%02d\t", lcore);
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
fprintf(f, "%"PRIu64"\t", cs->calls_per_service[i]);
fprintf(f, "%"PRIu64"\t", cs->service_stats[i].calls);
}
fprintf(f, "\n");
}
@@ -924,16 +989,16 @@ rte_service_dump(FILE *f, uint32_t id)
struct rte_service_spec_impl *s;
SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
fprintf(f, "Service %s Summary\n", s->spec.name);
service_dump_one(f, s);
service_dump_one(f, id);
return 0;
}
/* print all services, as UINT32_MAX was passed as id */
fprintf(f, "Services Summary\n");
for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
if (!service_valid(i))
if (!service_registered(i))
continue;
service_dump_one(f, &rte_services[i]);
service_dump_one(f, i);
}
fprintf(f, "Service Cores Summary\n");