OpenZFS 8484 - Implement aggregate sum and use for arc counters

In pursuit of improving performance on multi-core systems, we implement
fanned-out counters and use them to improve the performance of some of
the arc statistics. These stats are updated extremely frequently, and
can consume a significant amount of CPU time.

Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Paul Dagnelie <pcd@delphix.com>

OpenZFS-issue: https://www.illumos.org/issues/8484
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7028a8b92b7
Issue #3752
Closes #7462
Commit: 37fb3e4318 (parent f0ed6c7448)
Authored by Paul Dagnelie on 2017-05-25 11:32:40 -07:00; committed by Brian Behlendorf
12 changed files with 589 additions and 85 deletions


@@ -2,6 +2,7 @@ SUBDIRS = fm fs crypto lua sysevent
 COMMON_H = \
     $(top_srcdir)/include/sys/abd.h \
+    $(top_srcdir)/include/sys/aggsum.h \
     $(top_srcdir)/include/sys/arc.h \
     $(top_srcdir)/include/sys/arc_impl.h \
     $(top_srcdir)/include/sys/avl.h \
@@ -11,6 +12,7 @@ COMMON_H = \
     $(top_srcdir)/include/sys/bpobj.h \
     $(top_srcdir)/include/sys/bptree.h \
     $(top_srcdir)/include/sys/bqueue.h \
+    $(top_srcdir)/include/sys/cityhash.h \
     $(top_srcdir)/include/sys/dbuf.h \
     $(top_srcdir)/include/sys/ddt.h \
     $(top_srcdir)/include/sys/dmu.h \

include/sys/aggsum.h (new file, 59 lines)

@@ -0,0 +1,59 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_AGGSUM_H
#define _SYS_AGGSUM_H
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct aggsum_bucket aggsum_bucket_t;
struct aggsum_bucket {
kmutex_t asc_lock;
int64_t asc_delta;
uint64_t asc_borrowed;
} ____cacheline_aligned;
/*
* Fan out over FANOUT cpus.
*/
typedef struct aggsum {
kmutex_t as_lock;
int64_t as_lower_bound;
int64_t as_upper_bound;
uint64_t as_numbuckets;
aggsum_bucket_t *as_buckets;
} aggsum_t;
void aggsum_init(aggsum_t *, uint64_t);
void aggsum_fini(aggsum_t *);
int64_t aggsum_lower_bound(aggsum_t *);
int64_t aggsum_upper_bound(aggsum_t *);
int aggsum_compare(aggsum_t *, uint64_t);
uint64_t aggsum_value(aggsum_t *);
void aggsum_add(aggsum_t *, int64_t);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_AGGSUM_H */

include/sys/cityhash.h (new file, 41 lines)

@@ -0,0 +1,41 @@
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_CITYHASH_H
#define _SYS_CITYHASH_H
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_CITYHASH_H */


@@ -750,5 +750,7 @@ extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
 extern int __spl_pf_fstrans_check(void);

+#define ____cacheline_aligned
+
 #endif /* _KERNEL */
 #endif /* _SYS_ZFS_CONTEXT_H */


@@ -40,12 +40,14 @@ KERNEL_C = \
     zpool_prop.c \
     zprop_common.c \
     abd.c \
+    aggsum.c \
     arc.c \
     blkptr.c \
     bplist.c \
     bpobj.c \
     bptree.c \
     bqueue.c \
+    cityhash.c \
     dbuf.c \
     dbuf_stats.c \
     ddt.c \


@@ -17,10 +17,12 @@ endif
 ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE)

 $(MODULE)-objs += abd.o
+$(MODULE)-objs += aggsum.o
 $(MODULE)-objs += arc.o
 $(MODULE)-objs += blkptr.o
 $(MODULE)-objs += bplist.o
 $(MODULE)-objs += bpobj.o
+$(MODULE)-objs += cityhash.o
 $(MODULE)-objs += dbuf.o
 $(MODULE)-objs += dbuf_stats.o
 $(MODULE)-objs += bptree.o


@@ -0,0 +1,19 @@
Copyright (c) 2011 Google, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


@@ -0,0 +1 @@
CITYHASH CHECKSUM FUNCTIONALITY IN ZFS

module/zfs/aggsum.c (new file, 233 lines)

@@ -0,0 +1,233 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/aggsum.h>
/*
* Aggregate-sum counters are a form of fanned-out counter, used when atomic
* instructions on a single field cause enough CPU cache line contention to
* slow system performance. Due to their increased overhead and the expense
* involved with precisely reading from them, they should only be used in cases
* where the write rate (increment/decrement) is much higher than the read rate
* (get value).
*
* Aggregate sum counters are comprised of two basic parts, the core and the
* buckets. The core counter contains a lock for the entire counter, as well
* as the current upper and lower bounds on the value of the counter. The
* aggsum_bucket structure contains a per-bucket lock to protect the contents of
* the bucket, the current amount that this bucket has changed from the global
* counter (called the delta), and the amount of increment and decrement we have
* "borrowed" from the core counter.
*
* The basic operation of an aggsum is simple. Threads that wish to modify the
* counter will modify one bucket's counter (determined by their current CPU, to
* help minimize lock and cache contention). If the bucket already has
* sufficient capacity borrowed from the core structure to handle their request,
* they simply modify the delta and return. If the bucket does not, we clear
* the bucket's current state (to prevent the borrowed amounts from getting too
* large), and borrow more from the core counter. Borrowing is done by adding to
* the upper bound (or subtracting from the lower bound) of the core counter,
* and setting the borrow value for the bucket to the amount added (or
* subtracted). Clearing the bucket is the opposite; we add the current delta
* to both the lower and upper bounds of the core counter, subtract the borrowed
* increment from the upper bound, and add the borrowed decrement to the
* lower bound. Note that only borrowing and clearing require access to the
* core counter; since all other operations access CPU-local resources,
* performance can be much higher than a traditional counter.
*
* Threads that wish to read from the counter have a slightly more challenging
* task. It is fast to determine the upper and lower bounds of the aggsum; this
* does not require grabbing any locks. This suffices for cases where an
* approximation of the aggsum's value is acceptable. However, if one needs to
* know whether some specific value is above or below the current value in the
* aggsum, they invoke aggsum_compare(). This function operates by repeatedly
* comparing the target value to the upper and lower bounds of the aggsum, and
* then clearing a bucket. This proceeds until the target is outside of the
* upper and lower bounds and we return a response, or the last bucket has been
* cleared and we know that the target is equal to the aggsum's value. Finally,
* the most expensive operation is determining the precise value of the aggsum.
* To do this, we clear every bucket and then return the upper bound (which must
* be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
* expensive is clearing buckets. This involves grabbing the global lock
* (serializing against themselves and borrow operations), grabbing a bucket's
* lock (preventing threads on those CPUs from modifying their delta), and
* zeroing out the borrowed value (forcing that thread to borrow on its next
* request, which will also be expensive). This is what makes aggsums well
* suited for write-many read-rarely operations.
*/
/*
* We will borrow aggsum_borrow_multiplier times the current request, so we will
* have to get the as_lock approximately every aggsum_borrow_multiplier calls to
* aggsum_delta().
*/
static uint_t aggsum_borrow_multiplier = 10;
void
aggsum_init(aggsum_t *as, uint64_t value)
{
bzero(as, sizeof (*as));
as->as_lower_bound = as->as_upper_bound = value;
mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
as->as_numbuckets = boot_ncpus;
as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
KM_SLEEP);
for (int i = 0; i < as->as_numbuckets; i++) {
mutex_init(&as->as_buckets[i].asc_lock,
NULL, MUTEX_DEFAULT, NULL);
}
}
void
aggsum_fini(aggsum_t *as)
{
for (int i = 0; i < as->as_numbuckets; i++)
mutex_destroy(&as->as_buckets[i].asc_lock);
kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
mutex_destroy(&as->as_lock);
}
int64_t
aggsum_lower_bound(aggsum_t *as)
{
return (as->as_lower_bound);
}
int64_t
aggsum_upper_bound(aggsum_t *as)
{
return (as->as_upper_bound);
}
static void
aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
{
ASSERT(MUTEX_HELD(&as->as_lock));
ASSERT(MUTEX_HELD(&asb->asc_lock));
/*
* We use atomic instructions for this because we read the upper and
* lower bounds without the lock, so we need stores to be atomic.
*/
atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta);
atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta);
asb->asc_delta = 0;
atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-asb->asc_borrowed);
atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
asb->asc_borrowed);
asb->asc_borrowed = 0;
}
uint64_t
aggsum_value(aggsum_t *as)
{
int64_t rv;
mutex_enter(&as->as_lock);
if (as->as_lower_bound == as->as_upper_bound) {
rv = as->as_lower_bound;
for (int i = 0; i < as->as_numbuckets; i++) {
ASSERT0(as->as_buckets[i].asc_delta);
ASSERT0(as->as_buckets[i].asc_borrowed);
}
mutex_exit(&as->as_lock);
return (rv);
}
for (int i = 0; i < as->as_numbuckets; i++) {
struct aggsum_bucket *asb = &as->as_buckets[i];
mutex_enter(&asb->asc_lock);
aggsum_flush_bucket(as, asb);
mutex_exit(&asb->asc_lock);
}
VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
rv = as->as_lower_bound;
mutex_exit(&as->as_lock);
return (rv);
}
static void
aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb)
{
int64_t abs_delta = (delta < 0 ? -delta : delta);
mutex_enter(&as->as_lock);
mutex_enter(&asb->asc_lock);
aggsum_flush_bucket(as, asb);
atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta);
atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta);
asb->asc_borrowed = abs_delta;
mutex_exit(&asb->asc_lock);
mutex_exit(&as->as_lock);
}
void
aggsum_add(aggsum_t *as, int64_t delta)
{
struct aggsum_bucket *asb =
&as->as_buckets[CPU_SEQID % as->as_numbuckets];
for (;;) {
mutex_enter(&asb->asc_lock);
if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
asb->asc_delta += delta;
mutex_exit(&asb->asc_lock);
return;
}
mutex_exit(&asb->asc_lock);
aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb);
}
}
/*
* Compare the aggsum value to target efficiently. Returns -1 if the value
* represented by the aggsum is less than target, 1 if it's greater, and 0 if
* they are equal.
*/
int
aggsum_compare(aggsum_t *as, uint64_t target)
{
if (as->as_upper_bound < target)
return (-1);
if (as->as_lower_bound > target)
return (1);
mutex_enter(&as->as_lock);
for (int i = 0; i < as->as_numbuckets; i++) {
struct aggsum_bucket *asb = &as->as_buckets[i];
mutex_enter(&asb->asc_lock);
aggsum_flush_bucket(as, asb);
mutex_exit(&asb->asc_lock);
if (as->as_upper_bound < target) {
mutex_exit(&as->as_lock);
return (-1);
}
if (as->as_lower_bound > target) {
mutex_exit(&as->as_lock);
return (1);
}
}
VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
ASSERT3U(as->as_lower_bound, ==, target);
mutex_exit(&as->as_lock);
return (0);
}
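
To make the interface concrete, here is a brief usage sketch; it is illustrative only and not part of the commit. The counter name and byte values are invented, but each call matches the declarations in include/sys/aggsum.h and the cost model described in the block comment above.

    #include <sys/zfs_context.h>
    #include <sys/aggsum.h>

    static aggsum_t example_bytes;	/* hypothetical fanned-out counter */

    static void
    aggsum_usage_sketch(void)
    {
        aggsum_init(&example_bytes, 0);

        /* Hot path: cheap, normally touches only this CPU's bucket. */
        aggsum_add(&example_bytes, 8192);
        aggsum_add(&example_bytes, -4096);

        /*
         * Lock-free approximate read: the true value always lies in
         * [lower bound, upper bound], but the bounds may be loose.
         */
        (void) aggsum_lower_bound(&example_bytes);
        (void) aggsum_upper_bound(&example_bytes);

        /* Precise compare: flushes buckets only until decidable. */
        if (aggsum_compare(&example_bytes, 4096) == 0) {
            /* the counter is exactly 4096 here */
        }

        /* Most expensive: flushes every bucket for the exact value. */
        VERIFY3U(aggsum_value(&example_bytes), ==, 4096);

        aggsum_fini(&example_bytes);
    }

Note how the read side gets progressively more expensive (bounds, then compare, then exact value); that cost model is why arc.c below syncs the kstat copies only in arc_kstat_update().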


@@ -303,6 +303,8 @@
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_arc.h>
+#include <sys/aggsum.h>
+#include <sys/cityhash.h>

 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -475,6 +477,7 @@ typedef struct arc_stats {
     kstat_named_t arcstat_c;
     kstat_named_t arcstat_c_min;
     kstat_named_t arcstat_c_max;
+    /* Not updated directly; only synced in arc_kstat_update. */
     kstat_named_t arcstat_size;
     /*
      * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
@@ -503,12 +506,14 @@
      * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
      * caches), and arc_buf_t structures (allocated via arc_buf_t
      * cache).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_hdr_size;
     /*
      * Number of bytes consumed by ARC buffers of type equal to
      * ARC_BUFC_DATA. This is generally consumed by buffers backing
      * on disk user data (e.g. plain file contents).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_data_size;
     /*
@@ -516,18 +521,22 @@
      * ARC_BUFC_METADATA. This is generally consumed by buffers
      * backing on disk data that is used for internal ZFS
      * structures (e.g. ZAP, dnode, indirect blocks, etc).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_metadata_size;
     /*
      * Number of bytes consumed by dmu_buf_impl_t objects.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_dbuf_size;
     /*
      * Number of bytes consumed by dnode_t objects.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_dnode_size;
     /*
      * Number of bytes consumed by bonus buffers.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_bonus_size;
     /*
@@ -535,6 +544,7 @@
      * arc_anon state. This includes *all* buffers in the arc_anon
      * state; e.g. data, metadata, evictable, and unevictable buffers
      * are all included in this value.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_anon_size;
     /*
@@ -542,6 +552,7 @@
      * following criteria: backing buffers of type ARC_BUFC_DATA,
      * residing in the arc_anon state, and are eligible for eviction
      * (e.g. have no outstanding holds on the buffer).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_anon_evictable_data;
     /*
@@ -549,6 +560,7 @@
      * following criteria: backing buffers of type ARC_BUFC_METADATA,
      * residing in the arc_anon state, and are eligible for eviction
      * (e.g. have no outstanding holds on the buffer).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_anon_evictable_metadata;
     /*
@@ -556,6 +568,7 @@
      * arc_mru state. This includes *all* buffers in the arc_mru
      * state; e.g. data, metadata, evictable, and unevictable buffers
      * are all included in this value.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_size;
     /*
@@ -563,6 +576,7 @@
      * following criteria: backing buffers of type ARC_BUFC_DATA,
      * residing in the arc_mru state, and are eligible for eviction
      * (e.g. have no outstanding holds on the buffer).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_evictable_data;
     /*
@@ -570,6 +584,7 @@
      * following criteria: backing buffers of type ARC_BUFC_METADATA,
      * residing in the arc_mru state, and are eligible for eviction
      * (e.g. have no outstanding holds on the buffer).
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_evictable_metadata;
     /*
@@ -580,18 +595,21 @@
      * don't actually have ARC buffers linked off of these headers.
      * Thus, *if* the headers had associated ARC buffers, these
      * buffers *would have* consumed this number of bytes.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_ghost_size;
     /*
      * Number of bytes that *would have been* consumed by ARC
      * buffers that are eligible for eviction, of type
      * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_ghost_evictable_data;
     /*
      * Number of bytes that *would have been* consumed by ARC
      * buffers that are eligible for eviction, of type
      * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mru_ghost_evictable_metadata;
     /*
@@ -599,36 +617,42 @@
      * arc_mfu state. This includes *all* buffers in the arc_mfu
      * state; e.g. data, metadata, evictable, and unevictable buffers
      * are all included in this value.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_size;
     /*
      * Number of bytes consumed by ARC buffers that are eligible for
      * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
      * state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_evictable_data;
     /*
      * Number of bytes consumed by ARC buffers that are eligible for
      * eviction, of type ARC_BUFC_METADATA, and reside in the
      * arc_mfu state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_evictable_metadata;
     /*
      * Total number of bytes that *would have been* consumed by ARC
      * buffers in the arc_mfu_ghost state. See the comment above
      * arcstat_mru_ghost_size for more details.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_ghost_size;
     /*
      * Number of bytes that *would have been* consumed by ARC
      * buffers that are eligible for eviction, of type
      * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_ghost_evictable_data;
     /*
      * Number of bytes that *would have been* consumed by ARC
      * buffers that are eligible for eviction, of type
      * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+     * Not updated directly; only synced in arc_kstat_update.
      */
     kstat_named_t arcstat_mfu_ghost_evictable_metadata;
     kstat_named_t arcstat_l2_hits;
@@ -650,6 +674,7 @@
     kstat_named_t arcstat_l2_io_error;
     kstat_named_t arcstat_l2_lsize;
     kstat_named_t arcstat_l2_psize;
+    /* Not updated directly; only synced in arc_kstat_update. */
     kstat_named_t arcstat_l2_hdr_size;
     kstat_named_t arcstat_memory_throttle_count;
     kstat_named_t arcstat_memory_direct_count;
@@ -661,6 +686,7 @@
     kstat_named_t arcstat_tempreserve;
     kstat_named_t arcstat_loaned_bytes;
     kstat_named_t arcstat_prune;
+    /* Not updated directly; only synced in arc_kstat_update. */
     kstat_named_t arcstat_meta_used;
     kstat_named_t arcstat_meta_limit;
     kstat_named_t arcstat_dnode_limit;
@@ -829,7 +855,6 @@ static arc_state_t *arc_l2c_only;
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  */
-#define arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
 #define arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
 #define arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
@@ -840,11 +865,7 @@ static arc_state_t *arc_l2c_only;
 #define arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 #define arc_dnode_limit	ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 #define arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
 #define arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define arc_dbuf_size	ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
-#define arc_dnode_size	ARCSTAT(arcstat_dnode_size) /* dnode metadata */
-#define arc_bonus_size	ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
 #define arc_need_free	ARCSTAT(arcstat_need_free) /* bytes to be freed */
 #define arc_sys_free	ARCSTAT(arcstat_sys_free) /* target system free bytes */
@@ -857,6 +878,24 @@ static arc_state_t *arc_l2c_only;
 /* number of bytes in the arc from arc_buf_t's */
 #define arc_overhead_size	ARCSTAT(arcstat_overhead_size)

+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_l2_hdr_size;
+
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
 static taskq_t *arc_prune_taskq;
@@ -1050,21 +1089,15 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);

+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
-    uint8_t *vdva = (uint8_t *)dva;
-    uint64_t crc = -1ULL;
-    int i;
-
-    ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-    for (i = 0; i < sizeof (dva_t); i++)
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
-
-    crc ^= (spa>>8) ^ birth;
-
-    return (crc);
+    return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
 }

 #define HDR_EMPTY(hdr)	\
@@ -2676,32 +2709,32 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
     default:
         break;
     case ARC_SPACE_DATA:
-        ARCSTAT_INCR(arcstat_data_size, space);
+        aggsum_add(&astat_data_size, space);
         break;
     case ARC_SPACE_META:
-        ARCSTAT_INCR(arcstat_metadata_size, space);
+        aggsum_add(&astat_metadata_size, space);
         break;
     case ARC_SPACE_BONUS:
-        ARCSTAT_INCR(arcstat_bonus_size, space);
+        aggsum_add(&astat_bonus_size, space);
         break;
     case ARC_SPACE_DNODE:
-        ARCSTAT_INCR(arcstat_dnode_size, space);
+        aggsum_add(&astat_dnode_size, space);
         break;
     case ARC_SPACE_DBUF:
-        ARCSTAT_INCR(arcstat_dbuf_size, space);
+        aggsum_add(&astat_dbuf_size, space);
        break;
     case ARC_SPACE_HDRS:
-        ARCSTAT_INCR(arcstat_hdr_size, space);
+        aggsum_add(&astat_hdr_size, space);
         break;
     case ARC_SPACE_L2HDRS:
-        ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+        aggsum_add(&astat_l2_hdr_size, space);
         break;
     }

     if (type != ARC_SPACE_DATA)
-        ARCSTAT_INCR(arcstat_meta_used, space);
+        aggsum_add(&arc_meta_used, space);

-    atomic_add_64(&arc_size, space);
+    aggsum_add(&arc_size, space);
 }

 void
@@ -2713,37 +2746,42 @@ arc_space_return(uint64_t space, arc_space_type_t type)
     default:
         break;
     case ARC_SPACE_DATA:
-        ARCSTAT_INCR(arcstat_data_size, -space);
+        aggsum_add(&astat_data_size, -space);
         break;
     case ARC_SPACE_META:
-        ARCSTAT_INCR(arcstat_metadata_size, -space);
+        aggsum_add(&astat_metadata_size, -space);
         break;
     case ARC_SPACE_BONUS:
-        ARCSTAT_INCR(arcstat_bonus_size, -space);
+        aggsum_add(&astat_bonus_size, -space);
         break;
     case ARC_SPACE_DNODE:
-        ARCSTAT_INCR(arcstat_dnode_size, -space);
+        aggsum_add(&astat_dnode_size, -space);
         break;
     case ARC_SPACE_DBUF:
-        ARCSTAT_INCR(arcstat_dbuf_size, -space);
+        aggsum_add(&astat_dbuf_size, -space);
         break;
     case ARC_SPACE_HDRS:
-        ARCSTAT_INCR(arcstat_hdr_size, -space);
+        aggsum_add(&astat_hdr_size, -space);
         break;
     case ARC_SPACE_L2HDRS:
-        ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+        aggsum_add(&astat_l2_hdr_size, -space);
         break;
     }

     if (type != ARC_SPACE_DATA) {
-        ASSERT(arc_meta_used >= space);
-        if (arc_meta_max < arc_meta_used)
-            arc_meta_max = arc_meta_used;
-        ARCSTAT_INCR(arcstat_meta_used, -space);
+        ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+        /*
+         * We use the upper bound here rather than the precise value
+         * because the arc_meta_max value doesn't need to be
+         * precise. It's only consumed by humans via arcstats.
+         */
+        if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+            arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+        aggsum_add(&arc_meta_used, -space);
     }

-    ASSERT(arc_size >= space);
-    atomic_add_64(&arc_size, -space);
+    ASSERT(aggsum_compare(&arc_size, space) >= 0);
+    aggsum_add(&arc_size, -space);
 }

 /*
@@ -4073,9 +4111,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
     * Request that 10% of the LRUs be scanned by the superblock
     * shrinker.
     */
-    if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
-        arc_prune_async((arc_dnode_size - arc_dnode_limit) /
-            sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
+    if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+        arc_dnode_limit) > 0) {
+        arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+            arc_dnode_limit) / sizeof (dnode_t) /
+            zfs_arc_dnode_reduce_percent);
+    }

     /*
     * Start eviction using a randomly selected sublist,
@@ -4257,14 +4298,14 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
  *
  * Therefore, this function has been updated to make alternating passes
  * over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and arc_meta_used
+ * buffers. This ensures forward progress is maintained and meta_used
  * will decrease. Normally this is sufficient, but if required the ARC
  * will call the registered prune callbacks causing dentry and inodes to
  * be dropped from the VFS cache. This will make dnode meta data buffers
  * available for reclaim.
  */
 static uint64_t
-arc_adjust_meta_balanced(void)
+arc_adjust_meta_balanced(uint64_t meta_used)
 {
     int64_t delta, prune = 0, adjustmnt;
     uint64_t total_evicted = 0;
@@ -4280,7 +4321,7 @@ arc_adjust_meta_balanced(void)
     * metadata from the MFU. I think we probably need to implement a
     * "metadata arc_p" value to do this properly.
     */
-    adjustmnt = arc_meta_used - arc_meta_limit;
+    adjustmnt = meta_used - arc_meta_limit;

     if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
         delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
@@ -4305,7 +4346,7 @@ arc_adjust_meta_balanced(void)
         total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
     }

-    adjustmnt = arc_meta_used - arc_meta_limit;
+    adjustmnt = meta_used - arc_meta_limit;

     if (adjustmnt > 0 &&
         refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
@@ -4329,7 +4370,7 @@ arc_adjust_meta_balanced(void)
     * meta buffers. Requests to the upper layers will be made with
     * increasingly large scan sizes until the ARC is below the limit.
     */
-    if (arc_meta_used > arc_meta_limit) {
+    if (meta_used > arc_meta_limit) {
         if (type == ARC_BUFC_DATA) {
             type = ARC_BUFC_METADATA;
         } else {
@@ -4354,7 +4395,7 @@ arc_adjust_meta_balanced(void)
  * capped by the arc_meta_limit tunable.
  */
 static uint64_t
-arc_adjust_meta_only(void)
+arc_adjust_meta_only(uint64_t meta_used)
 {
     uint64_t total_evicted = 0;
     int64_t target;
@@ -4366,7 +4407,7 @@ arc_adjust_meta_only(void)
     * we're over the meta limit more than we're over arc_p, we
     * evict some from the MRU here, and some from the MFU below.
     */
-    target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+    target = MIN((int64_t)(meta_used - arc_meta_limit),
         (int64_t)(refcount_count(&arc_anon->arcs_size) +
         refcount_count(&arc_mru->arcs_size) - arc_p));
@@ -4377,8 +4418,9 @@ arc_adjust_meta_only(void)
     * below the meta limit, but not so much as to drop us below the
     * space allotted to the MFU (which is defined as arc_c - arc_p).
     */
-    target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
-        (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+    target = MIN((int64_t)(meta_used - arc_meta_limit),
+        (int64_t)(refcount_count(&arc_mfu->arcs_size) -
+        (arc_c - arc_p)));

     total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
@@ -4386,12 +4428,12 @@ arc_adjust_meta_only(void)
 }

 static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta(uint64_t meta_used)
 {
     if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
-        return (arc_adjust_meta_only());
+        return (arc_adjust_meta_only(meta_used));
     else
-        return (arc_adjust_meta_balanced());
+        return (arc_adjust_meta_balanced(meta_used));
 }

 /*
@@ -4478,12 +4520,14 @@ arc_adjust(void)
     uint64_t total_evicted = 0;
     uint64_t bytes;
     int64_t target;
+    uint64_t asize = aggsum_value(&arc_size);
+    uint64_t ameta = aggsum_value(&arc_meta_used);

     /*
     * If we're over arc_meta_limit, we want to correct that before
     * potentially evicting data buffers below.
     */
-    total_evicted += arc_adjust_meta();
+    total_evicted += arc_adjust_meta(ameta);

     /*
     * Adjust MRU size
@@ -4495,9 +4539,9 @@ arc_adjust(void)
     * the MRU is over arc_p, we'll evict enough to get back to
     * arc_p here, and then evict more from the MFU below.
     */
-    target = MIN((int64_t)(arc_size - arc_c),
+    target = MIN((int64_t)(asize - arc_c),
         (int64_t)(refcount_count(&arc_anon->arcs_size) +
-        refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
+        refcount_count(&arc_mru->arcs_size) + ameta - arc_p));

     /*
     * If we're below arc_meta_min, always prefer to evict data.
@@ -4508,7 +4552,7 @@ arc_adjust(void)
     * type, spill over into the next type.
     */
     if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
-        arc_meta_used > arc_meta_min) {
+        ameta > arc_meta_min) {
         bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
         total_evicted += bytes;
@@ -4541,10 +4585,10 @@ arc_adjust(void)
     * size back to arc_p, if we're still above the target cache
     * size, we evict the rest from the MFU.
     */
-    target = arc_size - arc_c;
+    target = asize - arc_c;

     if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
-        arc_meta_used > arc_meta_min) {
+        ameta > arc_meta_min) {
         bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
         total_evicted += bytes;
@@ -4645,13 +4689,14 @@ arc_flush(spa_t *spa, boolean_t retry)
 void
 arc_shrink(int64_t to_free)
 {
+    uint64_t asize = aggsum_value(&arc_size);
     uint64_t c = arc_c;

     if (c > to_free && c - to_free > arc_c_min) {
         arc_c = c - to_free;
         atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
-        if (arc_c > arc_size)
-            arc_c = MAX(arc_size, arc_c_min);
+        if (asize < arc_c)
+            arc_c = MAX(asize, arc_c_min);
         if (arc_p > arc_c)
             arc_p = (arc_c >> 1);
         ASSERT(arc_c >= arc_c_min);
@@ -4660,7 +4705,7 @@ arc_shrink(int64_t to_free)
         arc_c = arc_c_min;
     }

-    if (arc_size > arc_c)
+    if (asize > arc_c)
         (void) arc_adjust();
 }
@@ -4877,7 +4922,8 @@ arc_kmem_reap_now(void)
     extern kmem_cache_t *range_seg_cache;

 #ifdef _KERNEL
-    if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+    if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+        zfs_arc_meta_prune) {
         /*
         * We are exceeding our meta-data cache limit.
         * Prune some entries to release holds on meta-data.
@@ -5022,7 +5068,7 @@ arc_reclaim_thread(void *unused)
         * be helpful and could potentially cause us to enter an
         * infinite loop.
         */
-        if (arc_size <= arc_c || evicted == 0) {
+        if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) {
             /*
             * We're either no longer overflowing, or we
             * can't evict anything more, so we should wake
@@ -5101,12 +5147,13 @@
 static uint64_t
 arc_evictable_memory(void)
 {
+    int64_t asize = aggsum_value(&arc_size);
     uint64_t arc_clean =
         refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
         refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
         refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
         refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
-    uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+    uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);

     /*
     * Scale reported evictable memory in proportion to page cache, cap
@@ -5118,7 +5165,7 @@ arc_evictable_memory(void)
     if (arc_dirty >= min)
         return (arc_clean);

-    return (MAX((int64_t)arc_size - (int64_t)min, 0));
+    return (MAX((int64_t)asize - (int64_t)min, 0));
 }

 /*
@@ -5261,7 +5308,8 @@ arc_adapt(int bytes, arc_state_t *state)
     * cache size, increment the target cache size
     */
     ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
-    if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+    if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >=
+        0) {
         atomic_add_64(&arc_c, (int64_t)bytes);
         if (arc_c > arc_c_max)
             arc_c = arc_c_max;
@@ -5284,7 +5332,16 @@ arc_is_overflowing(void)
     uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
         arc_c >> zfs_arc_overflow_shift);

-    return (arc_size >= arc_c + overflow);
+    /*
+     * We just compare the lower bound here for performance reasons. Our
+     * primary goals are to make sure that the arc never grows without
+     * bound, and that it can reach its maximum size. This check
+     * accomplishes both goals. The maximum amount we could run over by is
+     * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+     * in the ARC. In practice, that's in the tens of MB, which is low
+     * enough to be safe.
+     */
+    return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
 }

 static abd_t *
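
As a worked example of that bound: with the default aggsum_borrow_multiplier of 10 (set in aggsum.c above) and a hypothetical machine with 32 CPUs and an average ARC block size of 128 KiB (neither figure is taken from the commit),

    2 * aggsum_borrow_multiplier * NUM_CPUS * average block size
        = 2 * 10 * 32 * 128 KiB
        = 80 MiB

which is indeed in the tens of MB, and negligible next to a typical multi-gigabyte arc_c.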
@@ -5399,7 +5456,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
     * If we are growing the cache, and we are adding anonymous
     * data, and we have outgrown arc_p, update arc_p
     */
-    if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+    if (aggsum_compare(&arc_size, arc_c) < 0 &&
+        hdr->b_l1hdr.b_state == arc_anon &&
         (refcount_count(&arc_anon->arcs_size) +
         refcount_count(&arc_mru->arcs_size) > arc_p))
         arc_p = MIN(arc_c, arc_p + size);
@@ -7213,6 +7271,17 @@ arc_kstat_update(kstat_t *ksp, int rw)
             &as->arcstat_mfu_ghost_evictable_data,
             &as->arcstat_mfu_ghost_evictable_metadata);

+        ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+        ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+        ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+        ARCSTAT(arcstat_metadata_size) =
+            aggsum_value(&astat_metadata_size);
+        ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+        ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+        ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+        ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+        ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+
         as->arcstat_memory_all_bytes.value.ui64 =
             arc_all_memory();
         as->arcstat_memory_free_bytes.value.ui64 =
@@ -7424,6 +7493,16 @@ arc_state_init(void)
     refcount_create(&arc_mfu_ghost->arcs_size);
     refcount_create(&arc_l2c_only->arcs_size);

+    aggsum_init(&arc_meta_used, 0);
+    aggsum_init(&arc_size, 0);
+    aggsum_init(&astat_data_size, 0);
+    aggsum_init(&astat_metadata_size, 0);
+    aggsum_init(&astat_hdr_size, 0);
+    aggsum_init(&astat_l2_hdr_size, 0);
+    aggsum_init(&astat_bonus_size, 0);
+    aggsum_init(&astat_dnode_size, 0);
+    aggsum_init(&astat_dbuf_size, 0);
+
     arc_anon->arcs_state = ARC_STATE_ANON;
     arc_mru->arcs_state = ARC_STATE_MRU;
     arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
@@ -7465,6 +7544,16 @@ arc_state_fini(void)
     multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
     multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
     multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+    aggsum_fini(&arc_meta_used);
+    aggsum_fini(&arc_size);
+    aggsum_fini(&astat_data_size);
+    aggsum_fini(&astat_metadata_size);
+    aggsum_fini(&astat_hdr_size);
+    aggsum_fini(&astat_l2_hdr_size);
+    aggsum_fini(&astat_bonus_size);
+    aggsum_fini(&astat_dnode_size);
+    aggsum_fini(&astat_dbuf_size);
 }

 uint64_t
@@ -7516,7 +7605,6 @@ arc_init(void)
     arc_c = arc_c_max;
     arc_p = (arc_c >> 1);
-    arc_size = 0;

     /* Set min to 1/2 of arc_c_min */
     arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;

module/zfs/cityhash.c (new file, 63 lines)

@@ -0,0 +1,63 @@
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#include <sys/cityhash.h>
#define HASH_K1 0xb492b66fbe98f273ULL
#define HASH_K2 0x9ae16a3b2f90404fULL
/*
* Bitwise right rotate. Normally this will compile to a single
* instruction.
*/
static inline uint64_t
rotate(uint64_t val, int shift)
{
// Avoid shifting by 64: doing so yields an undefined result.
return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
}
static inline uint64_t
cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
{
uint64_t a = (u ^ v) * mul;
a ^= (a >> 47);
uint64_t b = (v ^ a) * mul;
b ^= (b >> 47);
b *= mul;
return (b);
}
uint64_t
cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
{
uint64_t mul = HASH_K2 + 64;
uint64_t a = w1 * HASH_K1;
uint64_t b = w2;
uint64_t c = w4 * mul;
uint64_t d = w3 * HASH_K2;
return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
a + rotate(b + HASH_K2, 18) + c, mul));
}
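
The reworked buf_hash() in arc.c above and dbuf_hash() in dbuf.c below consume this function by hashing four 64-bit words and reducing the result to a hash-table index. A minimal sketch of that pattern follows; the table size here is a hypothetical power of two, not a value from the commit.

    #include <sys/cityhash.h>

    #define EXAMPLE_HT_SIZE	(1ULL << 12)	/* hypothetical, must be a power of two */
    #define EXAMPLE_HT_MASK	(EXAMPLE_HT_SIZE - 1)

    static uint64_t
    example_bucket(uint64_t objset, uint64_t object, uint64_t level,
        uint64_t blkid)
    {
        /* Hash four 64-bit inputs, then mask down to a bucket index. */
        return (cityhash4(objset, object, level, blkid) & EXAMPLE_HT_MASK);
    }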


@@ -48,6 +48,7 @@
 #include <sys/callb.h>
 #include <sys/abd.h>
 #include <sys/vdev.h>
+#include <sys/cityhash.h>

 kstat_t *dbuf_ksp;
@@ -270,23 +271,14 @@ static dbuf_hash_table_t dbuf_hash_table;
 static uint64_t dbuf_hash_count;

+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
-    uintptr_t osv = (uintptr_t)os;
-    uint64_t crc = -1ULL;
-
-    ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
-    crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
-    crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
-    return (crc);
+    return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
 }

 #define DBUF_EQUAL(dbuf, os, obj, level, blkid)	\