arc_read()/arc_access() refactoring and cleanup

The ARC code has been significantly modified many times over the years,
which created a significant amount of tangled and potentially broken code.
This should make the arc_access()/arc_read() code somewhat more readable.

 - Decouple prefetch status tracking from b_refcnt.  It made sense
originally, but became highly cryptic over the years.  Move all the
logic into arc_access().  While there, clean up and comment state
transitions in arc_access().  Some transitions were weird IMO.
 - Consolidate arc_access() calls into arc_read() instead of sometimes
making them from arc_read_done().  To avoid extra state changes and
checks, add one more b_refcnt for ARC_FLAG_IO_IN_PROGRESS.
 - Reimplement ARC_FLAG_WAIT in the case of ARC_FLAG_IO_IN_PROGRESS with
the same callback mechanism, so that such requests are not falsely
accounted as hits.  Count those as "iohits", an intermediate between
"hits" and "misses".  While there, call read callbacks in the original
request order, which should be good for fairness and random
speculations/allocations/aggregations.
 - Introduce additional statistic counters for prefetch, accounting for
predictive vs prescient and hits vs iohits vs misses.
 - Remove hash_lock argument from functions not needing it.
 - Remove ARC_FLAG_PREDICTIVE_PREFETCH, since it should be the opposite
of ARC_FLAG_PRESCIENT_PREFETCH if ARC_FLAG_PREFETCH is set.  We may
wish to add ARC_FLAG_PRESCIENT_PREFETCH to a few more places.
 - Fix a few false positive tests found in the process.

Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14123
This commit is contained in:
Alexander Motin 2022-12-22 15:10:24 -05:00 committed by GitHub
parent dc8c2f6158
commit c935fe2e92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 312 additions and 294 deletions

View File

@ -103,12 +103,12 @@ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \
TP_PROTO(arc_buf_hdr_t *ab), \
TP_ARGS(ab))
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
@ -387,12 +387,12 @@ DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction);
#else
DEFINE_DTRACE_PROBE1(arc__hit);
DEFINE_DTRACE_PROBE1(arc__iohit);
DEFINE_DTRACE_PROBE1(arc__evict);
DEFINE_DTRACE_PROBE1(arc__delete);
DEFINE_DTRACE_PROBE1(new_state__mru);
DEFINE_DTRACE_PROBE1(new_state__mfu);
DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch);
DEFINE_DTRACE_PROBE1(l2arc__hit);
DEFINE_DTRACE_PROBE1(l2arc__miss);
DEFINE_DTRACE_PROBE2(l2arc__read);

View File

@ -115,7 +115,6 @@ typedef enum arc_flags
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
/*

View File

@ -101,9 +101,14 @@ struct arc_callback {
boolean_t acb_compressed;
boolean_t acb_noauth;
boolean_t acb_nobuf;
boolean_t acb_wait;
int acb_wait_error;
kmutex_t acb_wait_lock;
kcondvar_t acb_wait_cv;
zbookmark_phys_t acb_zb;
zio_t *acb_zio_dummy;
zio_t *acb_zio_head;
arc_callback_t *acb_prev;
arc_callback_t *acb_next;
};
@ -511,15 +516,27 @@ struct arc_buf_hdr {
};
typedef struct arc_stats {
/* Number of requests that were satisfied without I/O. */
kstat_named_t arcstat_hits;
/* Number of requests for which I/O was already running. */
kstat_named_t arcstat_iohits;
/* Number of requests for which I/O has to be issued. */
kstat_named_t arcstat_misses;
/* Same three, but specifically for demand data. */
kstat_named_t arcstat_demand_data_hits;
kstat_named_t arcstat_demand_data_iohits;
kstat_named_t arcstat_demand_data_misses;
/* Same three, but specifically for demand metadata. */
kstat_named_t arcstat_demand_metadata_hits;
kstat_named_t arcstat_demand_metadata_iohits;
kstat_named_t arcstat_demand_metadata_misses;
/* Same three, but specifically for prefetch data. */
kstat_named_t arcstat_prefetch_data_hits;
kstat_named_t arcstat_prefetch_data_iohits;
kstat_named_t arcstat_prefetch_data_misses;
/* Same three, but specifically for prefetch metadata. */
kstat_named_t arcstat_prefetch_metadata_hits;
kstat_named_t arcstat_prefetch_metadata_iohits;
kstat_named_t arcstat_prefetch_metadata_misses;
kstat_named_t arcstat_mru_hits;
kstat_named_t arcstat_mru_ghost_hits;
@ -844,8 +861,18 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_async_upgrade_sync;
/* Number of predictive prefetch requests. */
kstat_named_t arcstat_predictive_prefetch;
/* Number of requests for which predictive prefetch has completed. */
kstat_named_t arcstat_demand_hit_predictive_prefetch;
/* Number of requests for which predictive prefetch was running. */
kstat_named_t arcstat_demand_iohit_predictive_prefetch;
/* Number of prescient prefetch requests. */
kstat_named_t arcstat_prescient_prefetch;
/* Number of requests for which prescient prefetch has completed. */
kstat_named_t arcstat_demand_hit_prescient_prefetch;
/* Number of requests for which prescient prefetch was running. */
kstat_named_t arcstat_demand_iohit_prescient_prefetch;
kstat_named_t arcstat_need_free;
kstat_named_t arcstat_sys_free;
kstat_named_t arcstat_raw_size;
@ -855,14 +882,19 @@ typedef struct arc_stats {
typedef struct arc_sums {
wmsum_t arcstat_hits;
wmsum_t arcstat_iohits;
wmsum_t arcstat_misses;
wmsum_t arcstat_demand_data_hits;
wmsum_t arcstat_demand_data_iohits;
wmsum_t arcstat_demand_data_misses;
wmsum_t arcstat_demand_metadata_hits;
wmsum_t arcstat_demand_metadata_iohits;
wmsum_t arcstat_demand_metadata_misses;
wmsum_t arcstat_prefetch_data_hits;
wmsum_t arcstat_prefetch_data_iohits;
wmsum_t arcstat_prefetch_data_misses;
wmsum_t arcstat_prefetch_metadata_hits;
wmsum_t arcstat_prefetch_metadata_iohits;
wmsum_t arcstat_prefetch_metadata_misses;
wmsum_t arcstat_mru_hits;
wmsum_t arcstat_mru_ghost_hits;
@ -936,8 +968,12 @@ typedef struct arc_sums {
wmsum_t arcstat_prune;
aggsum_t arcstat_meta_used;
wmsum_t arcstat_async_upgrade_sync;
wmsum_t arcstat_predictive_prefetch;
wmsum_t arcstat_demand_hit_predictive_prefetch;
wmsum_t arcstat_demand_iohit_predictive_prefetch;
wmsum_t arcstat_prescient_prefetch;
wmsum_t arcstat_demand_hit_prescient_prefetch;
wmsum_t arcstat_demand_iohit_prescient_prefetch;
wmsum_t arcstat_raw_size;
wmsum_t arcstat_cached_only_in_progress;
wmsum_t arcstat_abd_chunk_waste_size;

File diff suppressed because it is too large Load Diff

View File

@ -185,7 +185,8 @@ static boolean_t
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))

View File

@ -517,13 +517,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
issued = 0;
for (int64_t blk = pf_start; blk < pf_end; blk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
dmu_zfetch_done, zs);
ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
}
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
dmu_zfetch_done, zs);
ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
}
if (!have_lock)

View File

@ -163,6 +163,7 @@ before_clone=$(get_prop written $TESTPOOL/$TESTFS1)
log_must zfs clone $TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS1/snap1.clone
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/snap1.clone/testfile bs=1M \
count=40
sync_pool
after_clone=$(get_prop written $TESTPOOL/$TESTFS1)
within_percent $before_clone $after_clone 99.5 || \
log_fail "unexpected written for clone $before_clone $after_clone"

View File

@ -80,7 +80,7 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio
log_must zpool export $TESTPOOL
log_must zpool import -d $VDIR $TESTPOOL
log_must zpool import -N -d $VDIR $TESTPOOL
# Regardless of l2arc_noprefetch, some MFU buffers might be evicted
# from ARC, accessed later on as prefetches and transition to MRU as

View File

@ -95,6 +95,7 @@ for type in "" "mirror" "raidz2" "draid"; do
# Fill the pool, verify the vdevs are no longer sparse.
file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R
sync_pool $TESTPOOL
verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS
# Remove the file, wait for trim, verify the vdevs are now sparse.

View File

@ -94,6 +94,7 @@ for type in "" "mirror" "raidz2" "draid"; do
# Fill the pool, verify the vdevs are no longer sparse.
file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R
sync_pool $TESTPOOL
verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS
# Remove the file, issue trim, verify the vdevs are now sparse.

View File

@ -83,6 +83,7 @@ function do_test {
# Write to zvol
log_must dd if=$datafile1 of=$zvolpath conv=fsync
sync_pool
# Record how much space we've used (should be 5MB, with 128k
# of tolerance).