Introduce write-mostly sums

wmsum counters are a reduced version of aggsum counters, optimized for
write-mostly scenarios.  They do not provide optimized read functions,
but instead allow a much cheaper add function.  The primary usage is
infrequently read statistic counters, not requiring exact precision.

The Linux implementation is directly mapped into percpu_counter KPI.
The FreeBSD implementation is directly mapped into counter(9) KPI.
In user space, for lack of a better implementation, it is mapped to aggsum.

Unfortunately neither Linux percpu_counter nor FreeBSD counter(9)
provides sufficient functionality to completely replace aggsum, so
aggsum still remains in use for several hot counters.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12114
This commit is contained in:
Alexander Motin 2021-05-27 16:27:29 -04:00 committed by GitHub
parent 2041d6eecd
commit 86706441a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 319 additions and 72 deletions

View File

@ -25,6 +25,31 @@ AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_INIT], [
])
])
dnl #
dnl # 4.13 API change,
dnl # __percpu_counter_add() was renamed to percpu_counter_add_batch().
dnl #
dnl # Test source: a snippet that includes <linux/percpu_counter.h> and
dnl # calls percpu_counter_add_batch() on a local struct percpu_counter.
dnl # It is registered under the test name "percpu_counter_add_batch".
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_ADD_BATCH], [
ZFS_LINUX_TEST_SRC([percpu_counter_add_batch], [
#include <linux/percpu_counter.h>
],[
struct percpu_counter counter;
percpu_counter_add_batch(&counter, 1, 1);
])
])
dnl #
dnl # Test result: reports "yes" and defines HAVE_PERCPU_COUNTER_ADD_BATCH
dnl # when the "percpu_counter_add_batch" test succeeded; otherwise reports
dnl # "no" and defines nothing.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_ADD_BATCH], [
AC_MSG_CHECKING([whether percpu_counter_add_batch() is defined])
ZFS_LINUX_TEST_RESULT([percpu_counter_add_batch], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_PERCPU_COUNTER_ADD_BATCH, 1,
[percpu_counter_add_batch() is defined])
],[
AC_MSG_RESULT(no)
])
])
dnl #
dnl # 5.10 API change,
dnl # The "count" was moved into ref->data, from ref
@ -51,10 +76,12 @@ AC_DEFUN([ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA], [
])
dnl #
dnl # Umbrella macro: expands each of the percpu-related *_SRC_* macros
dnl # listed below (counter init, counter add batch, ref count in data).
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU], [
ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT
ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_ADD_BATCH
ZFS_AC_KERNEL_SRC_PERCPU_REF_COUNT_IN_DATA
])
dnl #
dnl # Umbrella macro: expands each of the percpu-related result macros
dnl # listed below, the counterparts of the *_SRC_* macros grouped under
dnl # ZFS_AC_KERNEL_SRC_PERCPU.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_PERCPU], [
ZFS_AC_KERNEL_PERCPU_COUNTER_INIT
ZFS_AC_KERNEL_PERCPU_COUNTER_ADD_BATCH
ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA
])

View File

@ -68,6 +68,7 @@ KERNEL_H = \
vmsystm.h \
vnode_impl.h \
vnode.h \
wmsum.h \
zmod.h \
zone.h

View File

@ -0,0 +1,72 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* wmsum counters are a reduced version of aggsum counters, optimized for
* write-mostly scenarios. They do not provide optimized read functions,
* but instead allow much cheaper add function. The primary usage is
* infrequently read statistic counters, not requiring exact precision.
*
* The FreeBSD implementation is directly mapped into counter(9) KPI.
*/
#ifndef _SYS_WMSUM_H
#define _SYS_WMSUM_H
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/malloc.h>
#ifdef __cplusplus
extern "C" {
#endif
#define wmsum_t counter_u64_t
/*
 * Prepare a wmsum for use (FreeBSD).
 *
 * A counter(9) object is allocated with M_WAITOK, its handle is stored
 * in *ws, and the caller-supplied starting value is then added to it.
 */
static inline void
wmsum_init(wmsum_t *ws, uint64_t value)
{
	wmsum_t handle = counter_u64_alloc(M_WAITOK);

	*ws = handle;
	counter_u64_add(handle, value);
}
/*
 * Tear down a wmsum (FreeBSD): hand the counter(9) handle stored in *ws
 * back to counter_u64_free().  *ws itself is left untouched.
 */
static inline void
wmsum_fini(wmsum_t *ws)
{
	wmsum_t handle = *ws;

	counter_u64_free(handle);
}
/*
 * Read a wmsum (FreeBSD): return whatever counter_u64_fetch() reports
 * for the counter(9) handle stored in *ws.
 */
static inline uint64_t
wmsum_value(wmsum_t *ws)
{
	uint64_t total = counter_u64_fetch(*ws);

	return (total);
}
/*
 * Adjust a wmsum (FreeBSD): pass the signed delta to counter_u64_add()
 * for the counter(9) handle stored in *ws.
 */
static inline void
wmsum_add(wmsum_t *ws, int64_t delta)
{
	wmsum_t handle = *ws;

	counter_u64_add(handle, delta);
}
#ifdef __cplusplus
}
#endif
#endif /* _SYS_WMSUM_H */

View File

@ -54,6 +54,7 @@ KERNEL_H = \
vmsystm.h \
vnode.h \
wait.h \
wmsum.h \
zmod.h \
zone.h

View File

@ -0,0 +1,76 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* wmsum counters are a reduced version of aggsum counters, optimized for
* write-mostly scenarios. They do not provide optimized read functions,
* but instead allow much cheaper add function. The primary usage is
* infrequently read statistic counters, not requiring exact precision.
*
* The Linux implementation is directly mapped into percpu_counter KPI.
*/
#ifndef _SYS_WMSUM_H
#define _SYS_WMSUM_H
#include <linux/percpu_counter.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct percpu_counter wmsum_t;
/*
 * Prepare a wmsum for use (Linux) by initializing the underlying
 * percpu_counter with the given starting value.
 *
 * percpu_counter_init() takes an extra gfp argument on kernels where
 * configure defined HAVE_PERCPU_COUNTER_INIT_WITH_GFP; GFP_KERNEL is
 * passed in that case.
 *
 * NOTE(review): whatever percpu_counter_init() evaluates to is discarded
 * here, so an initialization failure (if the kernel can report one)
 * would go unnoticed -- confirm whether that needs handling.
 */
static inline void
wmsum_init(wmsum_t *ws, uint64_t value)
{
#ifndef HAVE_PERCPU_COUNTER_INIT_WITH_GFP
	percpu_counter_init(ws, value);
#else
	percpu_counter_init(ws, value, GFP_KERNEL);
#endif
}
/*
 * Tear down a wmsum (Linux) by destroying the underlying percpu_counter.
 */
static inline void
wmsum_fini(wmsum_t *ws)
{
	wmsum_t *counter = ws;

	percpu_counter_destroy(counter);
}
/*
 * Read a wmsum (Linux): return the result of percpu_counter_sum() on the
 * underlying counter, converted to uint64_t.
 */
static inline uint64_t
wmsum_value(wmsum_t *ws)
{
	uint64_t total = percpu_counter_sum(ws);

	return (total);
}
/*
 * Adjust a wmsum (Linux) by the signed delta.
 *
 * Kernels where configure found percpu_counter_add_batch() use that
 * name; otherwise the call goes to __percpu_counter_add(), the name the
 * function had before the 4.13 rename.  Both receive a batch argument
 * of INT_MAX / 2.
 *
 * NOTE(review): the very large batch value presumably keeps updates in
 * the per-CPU part of the counter as long as possible -- confirm against
 * the percpu_counter implementation.
 */
static inline void
wmsum_add(wmsum_t *ws, int64_t delta)
{
#ifndef HAVE_PERCPU_COUNTER_ADD_BATCH
	__percpu_counter_add(ws, delta, INT_MAX / 2);
#else
	percpu_counter_add_batch(ws, delta, INT_MAX / 2);
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* _SYS_WMSUM_H */

View File

@ -27,18 +27,18 @@
#ifndef _SYS_DATASET_KSTATS_H
#define _SYS_DATASET_KSTATS_H
#include <sys/aggsum.h>
#include <sys/wmsum.h>
#include <sys/dmu.h>
#include <sys/kstat.h>
typedef struct dataset_aggsum_stats_t {
aggsum_t das_writes;
aggsum_t das_nwritten;
aggsum_t das_reads;
aggsum_t das_nread;
aggsum_t das_nunlinks;
aggsum_t das_nunlinked;
} dataset_aggsum_stats_t;
/*
 * Per-dataset statistic counters, each kept as a wmsum_t.  They are
 * updated by the dataset_kstats_update_*() helpers and read back with
 * wmsum_value() when the kstat is refreshed.
 */
typedef struct dataset_sum_stats_t {
wmsum_t dss_writes;	/* +1 per write kstat update */
wmsum_t dss_nwritten;	/* + nwritten per write kstat update */
wmsum_t dss_reads;	/* +1 per read kstat update */
wmsum_t dss_nread;	/* + nread per read kstat update */
wmsum_t dss_nunlinks;	/* adjusted by the caller-supplied delta */
wmsum_t dss_nunlinked;	/* adjusted by the caller-supplied delta */
} dataset_sum_stats_t;
typedef struct dataset_kstat_values {
kstat_named_t dkv_ds_name;
@ -59,7 +59,7 @@ typedef struct dataset_kstat_values {
} dataset_kstat_values_t;
typedef struct dataset_kstats {
dataset_aggsum_stats_t dk_aggsums;
dataset_sum_stats_t dk_sums;
kstat_t *dk_kstats;
} dataset_kstats_t;

View File

@ -44,4 +44,5 @@ libspl_HEADERS = \
varargs.h \
vnode.h \
vtoc.h \
wmsum.h \
zone.h

View File

@ -0,0 +1,68 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* wmsum counters are a reduced version of aggsum counters, optimized for
* write-mostly scenarios. They do not provide optimized read functions,
* but instead allow much cheaper add function. The primary usage is
* infrequently read statistic counters, not requiring exact precision.
*
* In user space, for lack of a better implementation, it is mapped to aggsum.
*/
#ifndef _SYS_WMSUM_H
#define _SYS_WMSUM_H
#include <sys/aggsum.h>
#ifdef __cplusplus
extern "C" {
#endif
#define wmsum_t aggsum_t
/*
 * Prepare a wmsum for use (user space): forwards to aggsum_init() with
 * the same counter and starting value.
 */
static inline void
wmsum_init(wmsum_t *ws, uint64_t value)
{
	wmsum_t *sum = ws;

	aggsum_init(sum, value);
}
/*
 * Tear down a wmsum (user space): forwards to aggsum_fini().
 */
static inline void
wmsum_fini(wmsum_t *ws)
{
	wmsum_t *sum = ws;

	aggsum_fini(sum);
}
/*
 * Read a wmsum (user space): returns whatever aggsum_value() reports for
 * the underlying aggsum.
 */
static inline uint64_t
wmsum_value(wmsum_t *ws)
{
	uint64_t total = aggsum_value(ws);

	return (total);
}
/*
 * Adjust a wmsum (user space) by the signed delta: forwards to
 * aggsum_add().
 */
static inline void
wmsum_add(wmsum_t *ws, int64_t delta)
{
	wmsum_t *sum = ws;

	aggsum_add(sum, delta);
}
#ifdef __cplusplus
}
#endif
#endif /* _SYS_WMSUM_H */

View File

@ -305,6 +305,7 @@
#include <sys/arc_impl.h>
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>
#include <sys/wmsum.h>
#include <cityhash.h>
#include <sys/vdev_trim.h>
#include <sys/zfs_racct.h>
@ -692,14 +693,14 @@ arc_state_t *arc_mfu;
*/
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_dbuf_size;
wmsum_t astat_data_size;
wmsum_t astat_metadata_size;
wmsum_t astat_dbuf_size;
aggsum_t astat_dnode_size;
aggsum_t astat_bonus_size;
aggsum_t astat_hdr_size;
wmsum_t astat_bonus_size;
wmsum_t astat_hdr_size;
aggsum_t astat_l2_hdr_size;
aggsum_t astat_abd_chunk_waste_size;
wmsum_t astat_abd_chunk_waste_size;
hrtime_t arc_growtime;
list_t arc_prune_list;
@ -2645,22 +2646,22 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
default:
break;
case ARC_SPACE_DATA:
aggsum_add(&astat_data_size, space);
wmsum_add(&astat_data_size, space);
break;
case ARC_SPACE_META:
aggsum_add(&astat_metadata_size, space);
wmsum_add(&astat_metadata_size, space);
break;
case ARC_SPACE_BONUS:
aggsum_add(&astat_bonus_size, space);
wmsum_add(&astat_bonus_size, space);
break;
case ARC_SPACE_DNODE:
aggsum_add(&astat_dnode_size, space);
break;
case ARC_SPACE_DBUF:
aggsum_add(&astat_dbuf_size, space);
wmsum_add(&astat_dbuf_size, space);
break;
case ARC_SPACE_HDRS:
aggsum_add(&astat_hdr_size, space);
wmsum_add(&astat_hdr_size, space);
break;
case ARC_SPACE_L2HDRS:
aggsum_add(&astat_l2_hdr_size, space);
@ -2672,7 +2673,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
* scatter ABD's come from the ARC, because other users are
* very short-lived.
*/
aggsum_add(&astat_abd_chunk_waste_size, space);
wmsum_add(&astat_abd_chunk_waste_size, space);
break;
}
@ -2691,28 +2692,28 @@ arc_space_return(uint64_t space, arc_space_type_t type)
default:
break;
case ARC_SPACE_DATA:
aggsum_add(&astat_data_size, -space);
wmsum_add(&astat_data_size, -space);
break;
case ARC_SPACE_META:
aggsum_add(&astat_metadata_size, -space);
wmsum_add(&astat_metadata_size, -space);
break;
case ARC_SPACE_BONUS:
aggsum_add(&astat_bonus_size, -space);
wmsum_add(&astat_bonus_size, -space);
break;
case ARC_SPACE_DNODE:
aggsum_add(&astat_dnode_size, -space);
break;
case ARC_SPACE_DBUF:
aggsum_add(&astat_dbuf_size, -space);
wmsum_add(&astat_dbuf_size, -space);
break;
case ARC_SPACE_HDRS:
aggsum_add(&astat_hdr_size, -space);
wmsum_add(&astat_hdr_size, -space);
break;
case ARC_SPACE_L2HDRS:
aggsum_add(&astat_l2_hdr_size, -space);
break;
case ARC_SPACE_ABD_CHUNK_WASTE:
aggsum_add(&astat_abd_chunk_waste_size, -space);
wmsum_add(&astat_abd_chunk_waste_size, -space);
break;
}
@ -7275,21 +7276,21 @@ arc_kstat_update(kstat_t *ksp, int rw)
ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
ARCSTAT(arcstat_data_size) = wmsum_value(&astat_data_size);
ARCSTAT(arcstat_metadata_size) =
aggsum_value(&astat_metadata_size);
ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
wmsum_value(&astat_metadata_size);
ARCSTAT(arcstat_hdr_size) = wmsum_value(&astat_hdr_size);
ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
ARCSTAT(arcstat_dbuf_size) = wmsum_value(&astat_dbuf_size);
#if defined(COMPAT_FREEBSD11)
ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
ARCSTAT(arcstat_other_size) = wmsum_value(&astat_bonus_size) +
aggsum_value(&astat_dnode_size) +
aggsum_value(&astat_dbuf_size);
wmsum_value(&astat_dbuf_size);
#endif
ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
ARCSTAT(arcstat_bonus_size) = wmsum_value(&astat_bonus_size);
ARCSTAT(arcstat_abd_chunk_waste_size) =
aggsum_value(&astat_abd_chunk_waste_size);
wmsum_value(&astat_abd_chunk_waste_size);
as->arcstat_memory_all_bytes.value.ui64 =
arc_all_memory();
@ -7522,14 +7523,14 @@ arc_state_init(void)
aggsum_init(&arc_meta_used, 0);
aggsum_init(&arc_size, 0);
aggsum_init(&astat_data_size, 0);
aggsum_init(&astat_metadata_size, 0);
aggsum_init(&astat_hdr_size, 0);
wmsum_init(&astat_data_size, 0);
wmsum_init(&astat_metadata_size, 0);
wmsum_init(&astat_hdr_size, 0);
aggsum_init(&astat_l2_hdr_size, 0);
aggsum_init(&astat_bonus_size, 0);
wmsum_init(&astat_bonus_size, 0);
aggsum_init(&astat_dnode_size, 0);
aggsum_init(&astat_dbuf_size, 0);
aggsum_init(&astat_abd_chunk_waste_size, 0);
wmsum_init(&astat_dbuf_size, 0);
wmsum_init(&astat_abd_chunk_waste_size, 0);
arc_anon->arcs_state = ARC_STATE_ANON;
arc_mru->arcs_state = ARC_STATE_MRU;
@ -7575,14 +7576,14 @@ arc_state_fini(void)
aggsum_fini(&arc_meta_used);
aggsum_fini(&arc_size);
aggsum_fini(&astat_data_size);
aggsum_fini(&astat_metadata_size);
aggsum_fini(&astat_hdr_size);
wmsum_fini(&astat_data_size);
wmsum_fini(&astat_metadata_size);
wmsum_fini(&astat_hdr_size);
aggsum_fini(&astat_l2_hdr_size);
aggsum_fini(&astat_bonus_size);
wmsum_fini(&astat_bonus_size);
aggsum_fini(&astat_dnode_size);
aggsum_fini(&astat_dbuf_size);
aggsum_fini(&astat_abd_chunk_waste_size);
wmsum_fini(&astat_dbuf_size);
wmsum_fini(&astat_abd_chunk_waste_size);
}
uint64_t

View File

@ -50,17 +50,17 @@ dataset_kstats_update(kstat_t *ksp, int rw)
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
dkv->dkv_writes.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_writes);
wmsum_value(&dk->dk_sums.dss_writes);
dkv->dkv_nwritten.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nwritten);
wmsum_value(&dk->dk_sums.dss_nwritten);
dkv->dkv_reads.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_reads);
wmsum_value(&dk->dk_sums.dss_reads);
dkv->dkv_nread.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nread);
wmsum_value(&dk->dk_sums.dss_nread);
dkv->dkv_nunlinks.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nunlinks);
wmsum_value(&dk->dk_sums.dss_nunlinks);
dkv->dkv_nunlinked.value.ui64 =
aggsum_value(&dk->dk_aggsums.das_nunlinked);
wmsum_value(&dk->dk_sums.dss_nunlinked);
return (0);
}
@ -140,12 +140,12 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
kstat_install(kstat);
dk->dk_kstats = kstat;
aggsum_init(&dk->dk_aggsums.das_writes, 0);
aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
aggsum_init(&dk->dk_aggsums.das_reads, 0);
aggsum_init(&dk->dk_aggsums.das_nread, 0);
aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
wmsum_init(&dk->dk_sums.dss_writes, 0);
wmsum_init(&dk->dk_sums.dss_nwritten, 0);
wmsum_init(&dk->dk_sums.dss_reads, 0);
wmsum_init(&dk->dk_sums.dss_nread, 0);
wmsum_init(&dk->dk_sums.dss_nunlinks, 0);
wmsum_init(&dk->dk_sums.dss_nunlinked, 0);
}
void
@ -162,12 +162,12 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
kstat_delete(dk->dk_kstats);
dk->dk_kstats = NULL;
aggsum_fini(&dk->dk_aggsums.das_writes);
aggsum_fini(&dk->dk_aggsums.das_nwritten);
aggsum_fini(&dk->dk_aggsums.das_reads);
aggsum_fini(&dk->dk_aggsums.das_nread);
aggsum_fini(&dk->dk_aggsums.das_nunlinks);
aggsum_fini(&dk->dk_aggsums.das_nunlinked);
wmsum_fini(&dk->dk_sums.dss_writes);
wmsum_fini(&dk->dk_sums.dss_nwritten);
wmsum_fini(&dk->dk_sums.dss_reads);
wmsum_fini(&dk->dk_sums.dss_nread);
wmsum_fini(&dk->dk_sums.dss_nunlinks);
wmsum_fini(&dk->dk_sums.dss_nunlinked);
}
void
@ -179,8 +179,8 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
if (dk->dk_kstats == NULL)
return;
aggsum_add(&dk->dk_aggsums.das_writes, 1);
aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten);
wmsum_add(&dk->dk_sums.dss_writes, 1);
wmsum_add(&dk->dk_sums.dss_nwritten, nwritten);
}
void
@ -192,8 +192,8 @@ dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
if (dk->dk_kstats == NULL)
return;
aggsum_add(&dk->dk_aggsums.das_reads, 1);
aggsum_add(&dk->dk_aggsums.das_nread, nread);
wmsum_add(&dk->dk_sums.dss_reads, 1);
wmsum_add(&dk->dk_sums.dss_nread, nread);
}
void
@ -202,7 +202,7 @@ dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
if (dk->dk_kstats == NULL)
return;
aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
wmsum_add(&dk->dk_sums.dss_nunlinks, delta);
}
void
@ -211,5 +211,5 @@ dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
if (dk->dk_kstats == NULL)
return;
aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
wmsum_add(&dk->dk_sums.dss_nunlinked, delta);
}