mempool: use cache for frequently updated stats

When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the performance of mempools with caches is improved as follows. When accessing objects in the mempool, either the put_bulk and put_objs or the get_success_bulk and get_success_objs statistics counters are likely to be incremented. By adding an alternative set of these counters to the mempool cache structure, accessing the dedicated statistics structure is avoided in the likely cases where these counters are incremented. The trick here is that the cache line holding the mempool cache structure is accessed anyway, in order to access the 'len' or 'flushthresh' fields. Updating some statistics counters in the same cache line has lower performance cost than accessing the statistics counters in the dedicated statistics structure, which resides in another cache line. mempool_perf_autotest with this patch shows the following improvements in rate_persec. The cost of enabling mempool stats (without debug) after this patch: -6.8 % and -6.7 %, respectively without and with cache. Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru> Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
2022-11-09 19:18:52 +01:00 · 2022-11-09 19:18:52 +01:00 · 203dcc9cfe
commit 203dcc9cfe
parent 17749e4d64
2 changed files with 67 additions and 12 deletions
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2022 SmartShare Systems
 */

 #include <stdbool.h>
@ -1286,6 +1287,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2022 SmartShare Systems
 */

 #ifndef _RTE_MEMPOOL_H_
@ -86,6 +87,19 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	uint32_t unused;
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#endif
 	/**
 	 * Cache objects
 	 *
@ -319,6 +333,22 @@ struct rte_mempool {
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif

+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) ((cache)->stats.name += (n))
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
+
 /**
 * @internal Calculate the size of the mempool header.
 *
@ -1333,14 +1363,18 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;

-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
-
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
+	/* No cache provided */
+	if (unlikely(cache == NULL))
 		goto driver_enqueue;

+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
+
 	/*
 	 * The cache follows the following algorithm:
 	 *   1. If the objects cannot be added to the cache without crossing
@ -1364,6 +1398,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,

 driver_enqueue:

+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@ -1470,8 +1510,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */

-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);

 		return 0;
 	}
@ -1500,8 +1540,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,

 	cache->len = cache->size;

-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);

 	return 0;

@ -1523,8 +1563,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}

 	return ret;