5987 zfs prefetch code needs work
illumos/illumos-gate@cf6106c8a0 https://www.illumos.org/issues/5987 The existing ZFS prefetch code (dmu_zfetch.c) has some problems: 1. It's nearly impossible to understand, e.g. there is an abundance of kstats but it's hard to know what they mean (see below). 2. For some workloads, it detects patterns that aren't really there (e.g. strided patterns, backwards scans), and generates needless i/os prefetching blocks that will never be referenced. 3. It has lock contention issues. These are caused primarily by dmu_zfetch_colinear() calling dmu_zfetch_dofetch() (which can block waiting for i/o) with the zf_rwlock held for writer, thus blocking all other threads accessing this file. I suggest that we rewrite this code to detect only forward, sequential streams. [... truncated ...] Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Approved by: Gordon Ross <gordon.ross@nexenta.com> Author: Matthew Ahrens <mahrens@delphix.com>
This commit is contained in:
parent
ec598d965c
commit
981f27b3d6
@ -495,6 +495,8 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_meta_limit;
|
||||
kstat_named_t arcstat_meta_max;
|
||||
kstat_named_t arcstat_meta_min;
|
||||
kstat_named_t arcstat_sync_wait_for_async;
|
||||
kstat_named_t arcstat_demand_hit_predictive_prefetch;
|
||||
} arc_stats_t;
|
||||
|
||||
static arc_stats_t arc_stats = {
|
||||
@ -580,7 +582,9 @@ static arc_stats_t arc_stats = {
|
||||
{ "arc_meta_used", KSTAT_DATA_UINT64 },
|
||||
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
|
||||
{ "arc_meta_max", KSTAT_DATA_UINT64 },
|
||||
{ "arc_meta_min", KSTAT_DATA_UINT64 }
|
||||
{ "arc_meta_min", KSTAT_DATA_UINT64 },
|
||||
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
|
||||
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
|
||||
};
|
||||
|
||||
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
|
||||
@ -4004,6 +4008,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
|
||||
if (HDR_IO_IN_PROGRESS(hdr)) {
|
||||
|
||||
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
|
||||
priority == ZIO_PRIORITY_SYNC_READ) {
|
||||
/*
|
||||
* This sync read must wait for an
|
||||
* in-progress async read (e.g. a predictive
|
||||
* prefetch). Async reads are queued
|
||||
* separately at the vdev_queue layer, so
|
||||
* this is a form of priority inversion.
|
||||
* Ideally, we would "inherit" the demand
|
||||
* i/o's priority by moving the i/o from
|
||||
* the async queue to the synchronous queue,
|
||||
* but there is currently no mechanism to do
|
||||
* so. Track this so that we can evaluate
|
||||
* the magnitude of this potential performance
|
||||
* problem.
|
||||
*
|
||||
* Note that if the prefetch i/o is already
|
||||
* active (has been issued to the device),
|
||||
* the prefetch improved performance, because
|
||||
* we issued it sooner than we would have
|
||||
* without the prefetch.
|
||||
*/
|
||||
DTRACE_PROBE1(arc__sync__wait__for__async,
|
||||
arc_buf_hdr_t *, hdr);
|
||||
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
|
||||
}
|
||||
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
|
||||
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
|
||||
}
|
||||
|
||||
if (*arc_flags & ARC_FLAG_WAIT) {
|
||||
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
|
||||
mutex_exit(hash_lock);
|
||||
@ -4012,7 +4046,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
|
||||
|
||||
if (done) {
|
||||
arc_callback_t *acb = NULL;
|
||||
arc_callback_t *acb = NULL;
|
||||
|
||||
acb = kmem_zalloc(sizeof (arc_callback_t),
|
||||
KM_SLEEP);
|
||||
@ -4037,6 +4071,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
hdr->b_l1hdr.b_state == arc_mfu);
|
||||
|
||||
if (done) {
|
||||
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
|
||||
/*
|
||||
* This is a demand read which does not have to
|
||||
* wait for i/o because we did a predictive
|
||||
* prefetch i/o for it, which has completed.
|
||||
*/
|
||||
DTRACE_PROBE1(
|
||||
arc__demand__hit__predictive__prefetch,
|
||||
arc_buf_hdr_t *, hdr);
|
||||
ARCSTAT_BUMP(
|
||||
arcstat_demand_hit_predictive_prefetch);
|
||||
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
|
||||
}
|
||||
add_reference(hdr, hash_lock, private);
|
||||
/*
|
||||
* If this block is already in use, create a new
|
||||
@ -4099,12 +4146,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
goto top; /* restart the IO request */
|
||||
}
|
||||
|
||||
/* if this is a prefetch, we don't have a reference */
|
||||
if (*arc_flags & ARC_FLAG_PREFETCH) {
|
||||
/*
|
||||
* If there is a callback, we pass our reference to
|
||||
* it; otherwise we remove our reference.
|
||||
*/
|
||||
if (done == NULL) {
|
||||
(void) remove_reference(hdr, hash_lock,
|
||||
private);
|
||||
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
||||
}
|
||||
if (*arc_flags & ARC_FLAG_PREFETCH)
|
||||
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
||||
if (*arc_flags & ARC_FLAG_L2COMPRESS)
|
||||
@ -4127,11 +4178,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
||||
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
|
||||
|
||||
/* if this is a prefetch, we don't have a reference */
|
||||
/*
|
||||
* If there is a callback, we pass a reference to it.
|
||||
*/
|
||||
if (done != NULL)
|
||||
add_reference(hdr, hash_lock, private);
|
||||
if (*arc_flags & ARC_FLAG_PREFETCH)
|
||||
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
||||
else
|
||||
add_reference(hdr, hash_lock, private);
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
||||
if (*arc_flags & ARC_FLAG_L2COMPRESS)
|
||||
@ -4149,6 +4202,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
arc_access(hdr, hash_lock);
|
||||
}
|
||||
|
||||
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
|
||||
hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
|
||||
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
|
||||
|
||||
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
|
||||
@ -4188,6 +4243,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
|
||||
data, metadata, misses);
|
||||
|
||||
if (priority == ZIO_PRIORITY_ASYNC_READ)
|
||||
hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
|
||||
else
|
||||
hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
|
||||
|
||||
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
|
||||
/*
|
||||
* Read from the L2ARC if the following are true:
|
||||
|
@ -618,7 +618,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
zbookmark_phys_t zb;
|
||||
@ -664,7 +664,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
|
||||
db->db.db_size, db, type));
|
||||
bzero(db->db.db_data, db->db.db_size);
|
||||
db->db_state = DB_CACHED;
|
||||
*flags |= DB_RF_CACHED;
|
||||
mutex_exit(&db->db_mtx);
|
||||
return;
|
||||
}
|
||||
@ -687,10 +686,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
|
||||
|
||||
(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
|
||||
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
|
||||
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
|
||||
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
|
||||
&aflags, &zb);
|
||||
if (aflags & ARC_FLAG_CACHED)
|
||||
*flags |= DB_RF_CACHED;
|
||||
}
|
||||
|
||||
int
|
||||
@ -723,8 +720,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||
if (db->db_state == DB_CACHED) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (prefetch)
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
||||
db->db.db_size, TRUE);
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
||||
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
DB_DNODE_EXIT(db);
|
||||
@ -733,13 +729,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||
|
||||
if (zio == NULL)
|
||||
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
dbuf_read_impl(db, zio, &flags);
|
||||
dbuf_read_impl(db, zio, flags);
|
||||
|
||||
/* dbuf_read_impl has dropped db_mtx for us */
|
||||
|
||||
if (prefetch)
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
||||
db->db.db_size, flags & DB_RF_CACHED);
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
||||
|
||||
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
@ -758,8 +753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||
*/
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (prefetch)
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
||||
db->db.db_size, TRUE);
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
||||
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
DB_DNODE_EXIT(db);
|
||||
@ -2059,6 +2053,9 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
||||
ASSERT(blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
||||
|
||||
if (blkid > dn->dn_maxblkid)
|
||||
return;
|
||||
|
||||
if (dnode_block_freed(dn, blkid))
|
||||
return;
|
||||
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
|
||||
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
|
||||
@ -386,7 +386,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
|
||||
*/
|
||||
static int
|
||||
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
|
||||
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
uint64_t blkid, nblks, i;
|
||||
@ -396,15 +396,19 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
|
||||
ASSERT(length <= DMU_MAX_ACCESS);
|
||||
|
||||
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
|
||||
if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
|
||||
dbuf_flags |= DB_RF_NOPREFETCH;
|
||||
/*
|
||||
* Note: We directly notify the prefetch code of this read, so that
|
||||
* we can tell it about the multi-block read. dbuf_read() only knows
|
||||
* about the one block it is accessing.
|
||||
*/
|
||||
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
|
||||
DB_RF_NOPREFETCH;
|
||||
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
if (dn->dn_datablkshift) {
|
||||
int blkshift = dn->dn_datablkshift;
|
||||
nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
|
||||
P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
|
||||
nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
|
||||
P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
|
||||
} else {
|
||||
if (offset + length > dn->dn_datablksz) {
|
||||
zfs_panic_recover("zfs: accessing past end of object "
|
||||
@ -423,19 +427,24 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
blkid = dbuf_whichblock(dn, 0, offset);
|
||||
for (i = 0; i < nblks; i++) {
|
||||
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
|
||||
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
|
||||
if (db == NULL) {
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dmu_buf_rele_array(dbp, nblks, tag);
|
||||
zio_nowait(zio);
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
|
||||
/* initiate async i/o */
|
||||
if (read) {
|
||||
if (read)
|
||||
(void) dbuf_read(db, zio, dbuf_flags);
|
||||
}
|
||||
dbp[i] = &db->db;
|
||||
}
|
||||
|
||||
if ((flags & DMU_READ_NO_PREFETCH) == 0 && read &&
|
||||
length < zfetch_array_rd_sz) {
|
||||
dmu_zfetch(&dn->dn_zfetch, blkid, nblks);
|
||||
}
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
/* wait for async i/o */
|
||||
@ -489,7 +498,8 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
|
||||
|
||||
int
|
||||
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
|
||||
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
|
||||
uint64_t length, boolean_t read, void *tag, int *numbufsp,
|
||||
dmu_buf_t ***dbpp)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
dnode_t *dn;
|
||||
@ -537,9 +547,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
|
||||
uint64_t blkid;
|
||||
int nblks, err;
|
||||
|
||||
if (zfs_prefetch_disable)
|
||||
return;
|
||||
|
||||
if (len == 0) { /* they're interested in the bonus buffer */
|
||||
dn = DMU_META_DNODE(os);
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -36,207 +36,42 @@
|
||||
#include <sys/kstat.h>
|
||||
|
||||
/*
|
||||
* I'm against tune-ables, but these should probably exist as tweakable globals
|
||||
* until we can get this working the way we want it to.
|
||||
* This tunable disables predictive prefetch. Note that it leaves "prescient"
|
||||
* prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
|
||||
* prescient prefetch never issues i/os that end up not being needed,
|
||||
* so it can't hurt performance.
|
||||
*/
|
||||
|
||||
int zfs_prefetch_disable = 0;
|
||||
boolean_t zfs_prefetch_disable = B_FALSE;
|
||||
|
||||
/* max # of streams per zfetch */
|
||||
uint32_t zfetch_max_streams = 8;
|
||||
/* min time before stream reclaim */
|
||||
uint32_t zfetch_min_sec_reap = 2;
|
||||
/* max number of blocks to fetch at a time */
|
||||
uint32_t zfetch_block_cap = 256;
|
||||
/* number of bytes in a array_read at which we stop prefetching (1Mb) */
|
||||
/* max bytes to prefetch per stream (default 8MB) */
|
||||
uint32_t zfetch_max_distance = 8 * 1024 * 1024;
|
||||
/* number of bytes in a array_read at which we stop prefetching (1MB) */
|
||||
uint64_t zfetch_array_rd_sz = 1024 * 1024;
|
||||
|
||||
/* forward decls for static routines */
|
||||
static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
|
||||
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
|
||||
static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
|
||||
static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
|
||||
static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
|
||||
static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
|
||||
static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
|
||||
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
|
||||
static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
|
||||
|
||||
typedef struct zfetch_stats {
|
||||
kstat_named_t zfetchstat_hits;
|
||||
kstat_named_t zfetchstat_misses;
|
||||
kstat_named_t zfetchstat_colinear_hits;
|
||||
kstat_named_t zfetchstat_colinear_misses;
|
||||
kstat_named_t zfetchstat_stride_hits;
|
||||
kstat_named_t zfetchstat_stride_misses;
|
||||
kstat_named_t zfetchstat_reclaim_successes;
|
||||
kstat_named_t zfetchstat_reclaim_failures;
|
||||
kstat_named_t zfetchstat_stream_resets;
|
||||
kstat_named_t zfetchstat_stream_noresets;
|
||||
kstat_named_t zfetchstat_bogus_streams;
|
||||
kstat_named_t zfetchstat_max_streams;
|
||||
} zfetch_stats_t;
|
||||
|
||||
static zfetch_stats_t zfetch_stats = {
|
||||
{ "hits", KSTAT_DATA_UINT64 },
|
||||
{ "misses", KSTAT_DATA_UINT64 },
|
||||
{ "colinear_hits", KSTAT_DATA_UINT64 },
|
||||
{ "colinear_misses", KSTAT_DATA_UINT64 },
|
||||
{ "stride_hits", KSTAT_DATA_UINT64 },
|
||||
{ "stride_misses", KSTAT_DATA_UINT64 },
|
||||
{ "reclaim_successes", KSTAT_DATA_UINT64 },
|
||||
{ "reclaim_failures", KSTAT_DATA_UINT64 },
|
||||
{ "streams_resets", KSTAT_DATA_UINT64 },
|
||||
{ "streams_noresets", KSTAT_DATA_UINT64 },
|
||||
{ "bogus_streams", KSTAT_DATA_UINT64 },
|
||||
{ "max_streams", KSTAT_DATA_UINT64 },
|
||||
};
|
||||
|
||||
#define ZFETCHSTAT_INCR(stat, val) \
|
||||
atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
|
||||
|
||||
#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1);
|
||||
#define ZFETCHSTAT_BUMP(stat) \
|
||||
atomic_inc_64(&zfetch_stats.stat.value.ui64);
|
||||
|
||||
kstat_t *zfetch_ksp;
|
||||
|
||||
/*
|
||||
* Given a zfetch structure and a zstream structure, determine whether the
|
||||
* blocks to be read are part of a co-linear pair of existing prefetch
|
||||
* streams. If a set is found, coalesce the streams, removing one, and
|
||||
* configure the prefetch so it looks for a strided access pattern.
|
||||
*
|
||||
* In other words: if we find two sequential access streams that are
|
||||
* the same length and distance N apart, and this read is N from the
|
||||
* last stream, then we are probably in a strided access pattern. So
|
||||
* combine the two sequential streams into a single strided stream.
|
||||
*
|
||||
* Returns whether co-linear streams were found.
|
||||
*/
|
||||
static boolean_t
|
||||
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
|
||||
{
|
||||
zstream_t *z_walk;
|
||||
zstream_t *z_comp;
|
||||
|
||||
if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
|
||||
return (0);
|
||||
|
||||
if (zh == NULL) {
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
return (0);
|
||||
}
|
||||
|
||||
for (z_walk = list_head(&zf->zf_stream); z_walk;
|
||||
z_walk = list_next(&zf->zf_stream, z_walk)) {
|
||||
for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
|
||||
z_comp = list_next(&zf->zf_stream, z_comp)) {
|
||||
int64_t diff;
|
||||
|
||||
if (z_walk->zst_len != z_walk->zst_stride ||
|
||||
z_comp->zst_len != z_comp->zst_stride) {
|
||||
continue;
|
||||
}
|
||||
|
||||
diff = z_comp->zst_offset - z_walk->zst_offset;
|
||||
if (z_comp->zst_offset + diff == zh->zst_offset) {
|
||||
z_walk->zst_offset = zh->zst_offset;
|
||||
z_walk->zst_direction = diff < 0 ? -1 : 1;
|
||||
z_walk->zst_stride =
|
||||
diff * z_walk->zst_direction;
|
||||
z_walk->zst_ph_offset =
|
||||
zh->zst_offset + z_walk->zst_stride;
|
||||
dmu_zfetch_stream_remove(zf, z_comp);
|
||||
mutex_destroy(&z_comp->zst_lock);
|
||||
kmem_free(z_comp, sizeof (zstream_t));
|
||||
|
||||
dmu_zfetch_dofetch(zf, z_walk);
|
||||
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
return (1);
|
||||
}
|
||||
|
||||
diff = z_walk->zst_offset - z_comp->zst_offset;
|
||||
if (z_walk->zst_offset + diff == zh->zst_offset) {
|
||||
z_walk->zst_offset = zh->zst_offset;
|
||||
z_walk->zst_direction = diff < 0 ? -1 : 1;
|
||||
z_walk->zst_stride =
|
||||
diff * z_walk->zst_direction;
|
||||
z_walk->zst_ph_offset =
|
||||
zh->zst_offset + z_walk->zst_stride;
|
||||
dmu_zfetch_stream_remove(zf, z_comp);
|
||||
mutex_destroy(&z_comp->zst_lock);
|
||||
kmem_free(z_comp, sizeof (zstream_t));
|
||||
|
||||
dmu_zfetch_dofetch(zf, z_walk);
|
||||
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
return (1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a zstream_t, determine the bounds of the prefetch. Then call the
|
||||
* routine that actually prefetches the individual blocks.
|
||||
*/
|
||||
static void
|
||||
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
|
||||
{
|
||||
uint64_t prefetch_tail;
|
||||
uint64_t prefetch_limit;
|
||||
uint64_t prefetch_ofst;
|
||||
uint64_t prefetch_len;
|
||||
uint64_t blocks_fetched;
|
||||
|
||||
zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
|
||||
zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
|
||||
|
||||
prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
|
||||
(int64_t)(zs->zst_offset + zs->zst_stride));
|
||||
/*
|
||||
* XXX: use a faster division method?
|
||||
*/
|
||||
prefetch_limit = zs->zst_offset + zs->zst_len +
|
||||
(zs->zst_cap * zs->zst_stride) / zs->zst_len;
|
||||
|
||||
while (prefetch_tail < prefetch_limit) {
|
||||
prefetch_ofst = zs->zst_offset + zs->zst_direction *
|
||||
(prefetch_tail - zs->zst_offset);
|
||||
|
||||
prefetch_len = zs->zst_len;
|
||||
|
||||
/*
|
||||
* Don't prefetch beyond the end of the file, if working
|
||||
* backwards.
|
||||
*/
|
||||
if ((zs->zst_direction == ZFETCH_BACKWARD) &&
|
||||
(prefetch_ofst > prefetch_tail)) {
|
||||
prefetch_len += prefetch_ofst;
|
||||
prefetch_ofst = 0;
|
||||
}
|
||||
|
||||
/* don't prefetch more than we're supposed to */
|
||||
if (prefetch_len > zs->zst_len)
|
||||
break;
|
||||
|
||||
blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
|
||||
prefetch_ofst, zs->zst_len);
|
||||
|
||||
prefetch_tail += zs->zst_stride;
|
||||
/* stop if we've run out of stuff to prefetch */
|
||||
if (blocks_fetched < zs->zst_len)
|
||||
break;
|
||||
}
|
||||
zs->zst_ph_offset = prefetch_tail;
|
||||
zs->zst_last = ddi_get_lbolt();
|
||||
}
|
||||
|
||||
void
|
||||
zfetch_init(void)
|
||||
{
|
||||
|
||||
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
|
||||
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
|
||||
KSTAT_FLAG_VIRTUAL);
|
||||
@ -264,273 +99,41 @@ zfetch_fini(void)
|
||||
void
|
||||
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
|
||||
{
|
||||
if (zf == NULL) {
|
||||
if (zf == NULL)
|
||||
return;
|
||||
}
|
||||
|
||||
zf->zf_dnode = dno;
|
||||
zf->zf_stream_cnt = 0;
|
||||
zf->zf_alloc_fail = 0;
|
||||
|
||||
list_create(&zf->zf_stream, sizeof (zstream_t),
|
||||
offsetof(zstream_t, zst_node));
|
||||
offsetof(zstream_t, zs_node));
|
||||
|
||||
rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function computes the actual size, in blocks, that can be prefetched,
|
||||
* and fetches it.
|
||||
*/
|
||||
static uint64_t
|
||||
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
|
||||
static void
|
||||
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
|
||||
{
|
||||
uint64_t fetchsz;
|
||||
uint64_t i;
|
||||
|
||||
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
|
||||
|
||||
for (i = 0; i < fetchsz; i++) {
|
||||
dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
|
||||
ARC_FLAG_PREFETCH);
|
||||
}
|
||||
|
||||
return (fetchsz);
|
||||
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
|
||||
list_remove(&zf->zf_stream, zs);
|
||||
mutex_destroy(&zs->zs_lock);
|
||||
kmem_free(zs, sizeof (*zs));
|
||||
}
|
||||
|
||||
/*
|
||||
* this function returns the number of blocks that would be prefetched, based
|
||||
* upon the supplied dnode, blockid, and nblks. This is used so that we can
|
||||
* update streams in place, and then prefetch with their old value after the
|
||||
* fact. This way, we can delay the prefetch, but subsequent accesses to the
|
||||
* stream won't result in the same data being prefetched multiple times.
|
||||
*/
|
||||
static uint64_t
|
||||
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
|
||||
{
|
||||
uint64_t fetchsz;
|
||||
|
||||
if (blkid > dn->dn_maxblkid) {
|
||||
return (0);
|
||||
}
|
||||
|
||||
/* compute fetch size */
|
||||
if (blkid + nblks + 1 > dn->dn_maxblkid) {
|
||||
fetchsz = (dn->dn_maxblkid - blkid) + 1;
|
||||
ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
|
||||
} else {
|
||||
fetchsz = nblks;
|
||||
}
|
||||
|
||||
|
||||
return (fetchsz);
|
||||
}
|
||||
|
||||
/*
|
||||
* given a zfetch and a zstream structure, see if there is an associated zstream
|
||||
* for this block read. If so, it starts a prefetch for the stream it
|
||||
* located and returns true, otherwise it returns false
|
||||
*/
|
||||
static boolean_t
|
||||
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
|
||||
{
|
||||
zstream_t *zs;
|
||||
int64_t diff;
|
||||
int reset = !prefetched;
|
||||
int rc = 0;
|
||||
|
||||
if (zh == NULL)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* XXX: This locking strategy is a bit coarse; however, its impact has
|
||||
* yet to be tested. If this turns out to be an issue, it can be
|
||||
* modified in a number of different ways.
|
||||
*/
|
||||
|
||||
rw_enter(&zf->zf_rwlock, RW_READER);
|
||||
top:
|
||||
|
||||
for (zs = list_head(&zf->zf_stream); zs;
|
||||
zs = list_next(&zf->zf_stream, zs)) {
|
||||
|
||||
/*
|
||||
* XXX - should this be an assert?
|
||||
*/
|
||||
if (zs->zst_len == 0) {
|
||||
/* bogus stream */
|
||||
ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We hit this case when we are in a strided prefetch stream:
|
||||
* we will read "len" blocks before "striding".
|
||||
*/
|
||||
if (zh->zst_offset >= zs->zst_offset &&
|
||||
zh->zst_offset < zs->zst_offset + zs->zst_len) {
|
||||
if (prefetched) {
|
||||
/* already fetched */
|
||||
ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
|
||||
rc = 1;
|
||||
goto out;
|
||||
} else {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the forward sequential read case: we increment
|
||||
* len by one each time we hit here, so we will enter this
|
||||
* case on every read.
|
||||
*/
|
||||
if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
|
||||
|
||||
reset = !prefetched && zs->zst_len > 1;
|
||||
|
||||
mutex_enter(&zs->zst_lock);
|
||||
|
||||
if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
|
||||
mutex_exit(&zs->zst_lock);
|
||||
goto top;
|
||||
}
|
||||
zs->zst_len += zh->zst_len;
|
||||
diff = zs->zst_len - zfetch_block_cap;
|
||||
if (diff > 0) {
|
||||
zs->zst_offset += diff;
|
||||
zs->zst_len = zs->zst_len > diff ?
|
||||
zs->zst_len - diff : 0;
|
||||
}
|
||||
zs->zst_direction = ZFETCH_FORWARD;
|
||||
|
||||
break;
|
||||
|
||||
/*
|
||||
* Same as above, but reading backwards through the file.
|
||||
*/
|
||||
} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
|
||||
/* backwards sequential access */
|
||||
|
||||
reset = !prefetched && zs->zst_len > 1;
|
||||
|
||||
mutex_enter(&zs->zst_lock);
|
||||
|
||||
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
|
||||
mutex_exit(&zs->zst_lock);
|
||||
goto top;
|
||||
}
|
||||
|
||||
zs->zst_offset = zs->zst_offset > zh->zst_len ?
|
||||
zs->zst_offset - zh->zst_len : 0;
|
||||
zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
|
||||
zs->zst_ph_offset - zh->zst_len : 0;
|
||||
zs->zst_len += zh->zst_len;
|
||||
|
||||
diff = zs->zst_len - zfetch_block_cap;
|
||||
if (diff > 0) {
|
||||
zs->zst_ph_offset = zs->zst_ph_offset > diff ?
|
||||
zs->zst_ph_offset - diff : 0;
|
||||
zs->zst_len = zs->zst_len > diff ?
|
||||
zs->zst_len - diff : zs->zst_len;
|
||||
}
|
||||
zs->zst_direction = ZFETCH_BACKWARD;
|
||||
|
||||
break;
|
||||
|
||||
} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
|
||||
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
|
||||
/* strided forward access */
|
||||
|
||||
mutex_enter(&zs->zst_lock);
|
||||
|
||||
if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
|
||||
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
|
||||
mutex_exit(&zs->zst_lock);
|
||||
goto top;
|
||||
}
|
||||
|
||||
zs->zst_offset += zs->zst_stride;
|
||||
zs->zst_direction = ZFETCH_FORWARD;
|
||||
|
||||
break;
|
||||
|
||||
} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
|
||||
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
|
||||
/* strided reverse access */
|
||||
|
||||
mutex_enter(&zs->zst_lock);
|
||||
|
||||
if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
|
||||
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
|
||||
mutex_exit(&zs->zst_lock);
|
||||
goto top;
|
||||
}
|
||||
|
||||
zs->zst_offset = zs->zst_offset > zs->zst_stride ?
|
||||
zs->zst_offset - zs->zst_stride : 0;
|
||||
zs->zst_ph_offset = (zs->zst_ph_offset >
|
||||
(2 * zs->zst_stride)) ?
|
||||
(zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
|
||||
zs->zst_direction = ZFETCH_BACKWARD;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (zs) {
|
||||
if (reset) {
|
||||
zstream_t *remove = zs;
|
||||
|
||||
ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
|
||||
rc = 0;
|
||||
mutex_exit(&zs->zst_lock);
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
rw_enter(&zf->zf_rwlock, RW_WRITER);
|
||||
/*
|
||||
* Relocate the stream, in case someone removes
|
||||
* it while we were acquiring the WRITER lock.
|
||||
*/
|
||||
for (zs = list_head(&zf->zf_stream); zs;
|
||||
zs = list_next(&zf->zf_stream, zs)) {
|
||||
if (zs == remove) {
|
||||
dmu_zfetch_stream_remove(zf, zs);
|
||||
mutex_destroy(&zs->zst_lock);
|
||||
kmem_free(zs, sizeof (zstream_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
|
||||
rc = 1;
|
||||
dmu_zfetch_dofetch(zf, zs);
|
||||
mutex_exit(&zs->zst_lock);
|
||||
}
|
||||
}
|
||||
out:
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean-up state associated with a zfetch structure. This frees allocated
|
||||
* structure members, empties the zf_stream tree, and generally makes things
|
||||
* nice. This doesn't free the zfetch_t itself, that's left to the caller.
|
||||
* Clean-up state associated with a zfetch structure (e.g. destroy the
|
||||
* streams). This doesn't free the zfetch_t itself, that's left to the caller.
|
||||
*/
|
||||
void
|
||||
dmu_zfetch_rele(zfetch_t *zf)
|
||||
dmu_zfetch_fini(zfetch_t *zf)
|
||||
{
|
||||
zstream_t *zs;
|
||||
zstream_t *zs_next;
|
||||
zstream_t *zs;
|
||||
|
||||
ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
|
||||
|
||||
for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
|
||||
zs_next = list_next(&zf->zf_stream, zs);
|
||||
|
||||
list_remove(&zf->zf_stream, zs);
|
||||
mutex_destroy(&zs->zst_lock);
|
||||
kmem_free(zs, sizeof (zstream_t));
|
||||
}
|
||||
rw_enter(&zf->zf_rwlock, RW_WRITER);
|
||||
while ((zs = list_head(&zf->zf_stream)) != NULL)
|
||||
dmu_zfetch_stream_remove(zf, zs);
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
list_destroy(&zf->zf_stream);
|
||||
rw_destroy(&zf->zf_rwlock);
|
||||
|
||||
@ -538,101 +141,55 @@ dmu_zfetch_rele(zfetch_t *zf)
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a zfetch and zstream structure, insert the zstream structure into the
|
||||
* AVL tree contained within the zfetch structure. Perform the appropriate
|
||||
* book-keeping. It is possible that another thread has inserted a stream which
|
||||
* matches one that we are about to insert, so we must be sure to check for this
|
||||
* case. If one is found, return failure, and let the caller cleanup the
|
||||
* duplicates.
|
||||
*/
|
||||
static int
|
||||
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
|
||||
{
|
||||
zstream_t *zs_walk;
|
||||
zstream_t *zs_next;
|
||||
|
||||
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
|
||||
|
||||
for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
|
||||
zs_next = list_next(&zf->zf_stream, zs_walk);
|
||||
|
||||
if (dmu_zfetch_streams_equal(zs_walk, zs)) {
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
|
||||
list_insert_head(&zf->zf_stream, zs);
|
||||
zf->zf_stream_cnt++;
|
||||
return (1);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Walk the list of zstreams in the given zfetch, find an old one (by time), and
|
||||
* reclaim it for use by the caller.
|
||||
*/
|
||||
static zstream_t *
|
||||
dmu_zfetch_stream_reclaim(zfetch_t *zf)
|
||||
{
|
||||
zstream_t *zs;
|
||||
|
||||
if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
|
||||
return (0);
|
||||
|
||||
for (zs = list_head(&zf->zf_stream); zs;
|
||||
zs = list_next(&zf->zf_stream, zs)) {
|
||||
|
||||
if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
|
||||
break;
|
||||
}
|
||||
|
||||
if (zs) {
|
||||
dmu_zfetch_stream_remove(zf, zs);
|
||||
mutex_destroy(&zs->zst_lock);
|
||||
bzero(zs, sizeof (zstream_t));
|
||||
} else {
|
||||
zf->zf_alloc_fail++;
|
||||
}
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
|
||||
return (zs);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a zfetch and zstream structure, remove the zstream structure from its
|
||||
* container in the zfetch structure. Perform the appropriate book-keeping.
|
||||
* If there aren't too many streams already, create a new stream.
|
||||
* The "blkid" argument is the next block that we expect this stream to access.
|
||||
* While we're here, clean up old streams (which haven't been
|
||||
* accessed for at least zfetch_min_sec_reap seconds).
|
||||
*/
|
||||
static void
|
||||
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
|
||||
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||
{
|
||||
zstream_t *zs_next;
|
||||
int numstreams = 0;
|
||||
|
||||
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
|
||||
|
||||
list_remove(&zf->zf_stream, zs);
|
||||
zf->zf_stream_cnt--;
|
||||
}
|
||||
/*
|
||||
* Clean up old streams.
|
||||
*/
|
||||
for (zstream_t *zs = list_head(&zf->zf_stream);
|
||||
zs != NULL; zs = zs_next) {
|
||||
zs_next = list_next(&zf->zf_stream, zs);
|
||||
if (((gethrtime() - zs->zs_atime) / NANOSEC) >
|
||||
zfetch_min_sec_reap)
|
||||
dmu_zfetch_stream_remove(zf, zs);
|
||||
else
|
||||
numstreams++;
|
||||
}
|
||||
|
||||
static int
|
||||
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
|
||||
{
|
||||
if (zs1->zst_offset != zs2->zst_offset)
|
||||
return (0);
|
||||
/*
|
||||
* The maximum number of streams is normally zfetch_max_streams,
|
||||
* but for small files we lower it such that it's at least possible
|
||||
* for all the streams to be non-overlapping.
|
||||
*
|
||||
* If we are already at the maximum number of streams for this file,
|
||||
* even after removing old streams, then don't create this stream.
|
||||
*/
|
||||
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
|
||||
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
|
||||
zfetch_max_distance));
|
||||
if (numstreams >= max_streams) {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
|
||||
return;
|
||||
}
|
||||
|
||||
if (zs1->zst_len != zs2->zst_len)
|
||||
return (0);
|
||||
zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
|
||||
zs->zs_blkid = blkid;
|
||||
zs->zs_pf_blkid = blkid;
|
||||
zs->zs_atime = gethrtime();
|
||||
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
if (zs1->zst_stride != zs2->zst_stride)
|
||||
return (0);
|
||||
|
||||
if (zs1->zst_ph_offset != zs2->zst_ph_offset)
|
||||
return (0);
|
||||
|
||||
if (zs1->zst_cap != zs2->zst_cap)
|
||||
return (0);
|
||||
|
||||
if (zs1->zst_direction != zs2->zst_direction)
|
||||
return (0);
|
||||
|
||||
return (1);
|
||||
list_insert_head(&zf->zf_stream, zs);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -640,91 +197,86 @@ dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
|
||||
* routines to create, delete, find, or operate upon prefetch streams.
|
||||
*/
|
||||
void
|
||||
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
|
||||
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
|
||||
{
|
||||
zstream_t zst;
|
||||
zstream_t *newstream;
|
||||
boolean_t fetched;
|
||||
int inserted;
|
||||
unsigned int blkshft;
|
||||
uint64_t blksz;
|
||||
zstream_t *zs;
|
||||
|
||||
if (zfs_prefetch_disable)
|
||||
return;
|
||||
|
||||
/* files that aren't ln2 blocksz are only one block -- nothing to do */
|
||||
if (!zf->zf_dnode->dn_datablkshift)
|
||||
/*
|
||||
* As a fast path for small (single-block) files, ignore access
|
||||
* to the first block.
|
||||
*/
|
||||
if (blkid == 0)
|
||||
return;
|
||||
|
||||
/* convert offset and size, into blockid and nblocks */
|
||||
blkshft = zf->zf_dnode->dn_datablkshift;
|
||||
blksz = (1 << blkshft);
|
||||
rw_enter(&zf->zf_rwlock, RW_READER);
|
||||
|
||||
bzero(&zst, sizeof (zstream_t));
|
||||
zst.zst_offset = offset >> blkshft;
|
||||
zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
|
||||
P2ALIGN(offset, blksz)) >> blkshft;
|
||||
|
||||
fetched = dmu_zfetch_find(zf, &zst, prefetched);
|
||||
if (fetched) {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
||||
} else {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_misses);
|
||||
fetched = dmu_zfetch_colinear(zf, &zst);
|
||||
if (fetched) {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
|
||||
} else {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
|
||||
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
||||
zs = list_next(&zf->zf_stream, zs)) {
|
||||
if (blkid == zs->zs_blkid) {
|
||||
mutex_enter(&zs->zs_lock);
|
||||
/*
|
||||
* zs_blkid could have changed before we
|
||||
* acquired zs_lock; re-check it here.
|
||||
*/
|
||||
if (blkid != zs->zs_blkid) {
|
||||
mutex_exit(&zs->zs_lock);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fetched) {
|
||||
newstream = dmu_zfetch_stream_reclaim(zf);
|
||||
|
||||
if (zs == NULL) {
|
||||
/*
|
||||
* we still couldn't find a stream, drop the lock, and allocate
|
||||
* one if possible. Otherwise, give up and go home.
|
||||
* This access is not part of any existing stream. Create
|
||||
* a new stream for it.
|
||||
*/
|
||||
if (newstream) {
|
||||
ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
|
||||
} else {
|
||||
uint64_t maxblocks;
|
||||
uint32_t max_streams;
|
||||
uint32_t cur_streams;
|
||||
|
||||
ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
|
||||
cur_streams = zf->zf_stream_cnt;
|
||||
maxblocks = zf->zf_dnode->dn_maxblkid;
|
||||
|
||||
max_streams = MIN(zfetch_max_streams,
|
||||
(maxblocks / zfetch_block_cap));
|
||||
if (max_streams == 0) {
|
||||
max_streams++;
|
||||
}
|
||||
|
||||
if (cur_streams >= max_streams) {
|
||||
return;
|
||||
}
|
||||
newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
|
||||
}
|
||||
|
||||
newstream->zst_offset = zst.zst_offset;
|
||||
newstream->zst_len = zst.zst_len;
|
||||
newstream->zst_stride = zst.zst_len;
|
||||
newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
|
||||
newstream->zst_cap = zst.zst_len;
|
||||
newstream->zst_direction = ZFETCH_FORWARD;
|
||||
newstream->zst_last = ddi_get_lbolt();
|
||||
|
||||
mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
rw_enter(&zf->zf_rwlock, RW_WRITER);
|
||||
inserted = dmu_zfetch_stream_insert(zf, newstream);
|
||||
ZFETCHSTAT_BUMP(zfetchstat_misses);
|
||||
if (rw_tryupgrade(&zf->zf_rwlock))
|
||||
dmu_zfetch_stream_create(zf, blkid + nblks);
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
|
||||
if (!inserted) {
|
||||
mutex_destroy(&newstream->zst_lock);
|
||||
kmem_free(newstream, sizeof (zstream_t));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* This access was to a block that we issued a prefetch for on
|
||||
* behalf of this stream. Issue further prefetches for this stream.
|
||||
*
|
||||
* Normally, we start prefetching where we stopped
|
||||
* prefetching last (zs_pf_blkid). But when we get our first
|
||||
* hit on this stream, zs_pf_blkid == zs_blkid, we don't
|
||||
* want to prefetch the block we just accessed. In this case,
|
||||
* start just after the block we just accessed.
|
||||
*/
|
||||
int64_t pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
|
||||
|
||||
/*
|
||||
* Double our amount of prefetched data, but don't let the
|
||||
* prefetch get further ahead than zfetch_max_distance.
|
||||
*/
|
||||
int pf_nblks =
|
||||
MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
|
||||
zs->zs_blkid + nblks +
|
||||
(zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
|
||||
|
||||
zs->zs_pf_blkid = pf_start + pf_nblks;
|
||||
zs->zs_atime = gethrtime();
|
||||
zs->zs_blkid = blkid + nblks;
|
||||
|
||||
/*
|
||||
* dbuf_prefetch() issues the prefetch i/o
|
||||
* asynchronously, but it may need to wait for an
|
||||
* indirect block to be read from disk. Therefore
|
||||
* we do not want to hold any locks while we call it.
|
||||
*/
|
||||
mutex_exit(&zs->zs_lock);
|
||||
rw_exit(&zf->zf_rwlock);
|
||||
for (int i = 0; i < pf_nblks; i++) {
|
||||
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
|
||||
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
|
||||
}
|
||||
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
||||
}
|
||||
|
@ -523,7 +523,7 @@ dnode_destroy(dnode_t *dn)
|
||||
dn->dn_id_flags = 0;
|
||||
dn->dn_unlisted_l0_blkid = 0;
|
||||
|
||||
dmu_zfetch_rele(&dn->dn_zfetch);
|
||||
dmu_zfetch_fini(&dn->dn_zfetch);
|
||||
kmem_cache_free(dnode_cache, dn);
|
||||
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
|
||||
|
||||
@ -771,8 +771,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
|
||||
dmu_zfetch_init(&ndn->dn_zfetch, NULL);
|
||||
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
|
||||
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
|
||||
ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
|
||||
ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
|
||||
|
||||
/*
|
||||
* Update back pointers. Updating the handle fixes the back pointer of
|
||||
|
@ -64,27 +64,31 @@ typedef enum arc_flags
|
||||
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
|
||||
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
|
||||
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
|
||||
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */
|
||||
|
||||
/*
|
||||
* Private ARC flags. These flags are private ARC only flags that
|
||||
* will show up in b_flags in the arc_hdr_buf_t. These flags should
|
||||
* only be set by ARC code.
|
||||
*/
|
||||
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
|
||||
ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
|
||||
ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
|
||||
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
|
||||
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
|
||||
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
|
||||
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
|
||||
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
|
||||
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
|
||||
ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */
|
||||
ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */
|
||||
ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */
|
||||
ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */
|
||||
ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */
|
||||
ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */
|
||||
/* Indicates that block was read with ASYNC priority. */
|
||||
ARC_FLAG_PRIO_ASYNC_READ = 1 << 14,
|
||||
ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */
|
||||
ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */
|
||||
ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */
|
||||
/* indicates that the buffer contains metadata (otherwise, data) */
|
||||
ARC_FLAG_BUFC_METADATA = 1 << 16,
|
||||
ARC_FLAG_BUFC_METADATA = 1 << 18,
|
||||
|
||||
/* Flags specifying whether optional hdr struct fields are defined */
|
||||
ARC_FLAG_HAS_L1HDR = 1 << 17,
|
||||
ARC_FLAG_HAS_L2HDR = 1 << 18,
|
||||
ARC_FLAG_HAS_L1HDR = 1 << 19,
|
||||
ARC_FLAG_HAS_L2HDR = 1 << 20,
|
||||
|
||||
|
||||
/*
|
||||
* The arc buffer's compression mode is stored in the top 7 bits of the
|
||||
|
@ -490,7 +490,8 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
|
||||
* individually with dmu_buf_rele.
|
||||
*/
|
||||
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
|
||||
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
|
||||
uint64_t length, boolean_t read, void *tag,
|
||||
int *numbufsp, dmu_buf_t ***dbpp);
|
||||
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
|
||||
|
||||
typedef void dmu_buf_evict_func_t(void *user_ptr);
|
||||
@ -734,7 +735,7 @@ void dmu_xuio_clear(struct xuio *uio, int i);
|
||||
void xuio_stat_wbuf_copied();
|
||||
void xuio_stat_wbuf_nocopy();
|
||||
|
||||
extern int zfs_prefetch_disable;
|
||||
extern boolean_t zfs_prefetch_disable;
|
||||
extern int zfs_max_recordsize;
|
||||
|
||||
/*
|
||||
|
@ -23,8 +23,12 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _DFETCH_H
|
||||
#define _DFETCH_H
|
||||
/*
|
||||
* Copyright (c) 2014 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _DMU_ZFETCH_H
|
||||
#define _DMU_ZFETCH_H
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
|
||||
@ -36,41 +40,30 @@ extern uint64_t zfetch_array_rd_sz;
|
||||
|
||||
struct dnode; /* so we can reference dnode */
|
||||
|
||||
typedef enum zfetch_dirn {
|
||||
ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
|
||||
ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
|
||||
} zfetch_dirn_t;
|
||||
|
||||
typedef struct zstream {
|
||||
uint64_t zst_offset; /* offset of starting block in range */
|
||||
uint64_t zst_len; /* length of range, in blocks */
|
||||
zfetch_dirn_t zst_direction; /* direction of prefetch */
|
||||
uint64_t zst_stride; /* length of stride, in blocks */
|
||||
uint64_t zst_ph_offset; /* prefetch offset, in blocks */
|
||||
uint64_t zst_cap; /* prefetch limit (cap), in blocks */
|
||||
kmutex_t zst_lock; /* protects stream */
|
||||
clock_t zst_last; /* lbolt of last prefetch */
|
||||
avl_node_t zst_node; /* embed avl node here */
|
||||
uint64_t zs_blkid; /* expect next access at this blkid */
|
||||
uint64_t zs_pf_blkid; /* next block to prefetch */
|
||||
kmutex_t zs_lock; /* protects stream */
|
||||
hrtime_t zs_atime; /* time last prefetch issued */
|
||||
list_node_t zs_node; /* link for zf_stream */
|
||||
} zstream_t;
|
||||
|
||||
typedef struct zfetch {
|
||||
krwlock_t zf_rwlock; /* protects zfetch structure */
|
||||
list_t zf_stream; /* AVL tree of zstream_t's */
|
||||
list_t zf_stream; /* list of zstream_t's */
|
||||
struct dnode *zf_dnode; /* dnode that owns this zfetch */
|
||||
uint32_t zf_stream_cnt; /* # of active streams */
|
||||
uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
|
||||
} zfetch_t;
|
||||
|
||||
void zfetch_init(void);
|
||||
void zfetch_fini(void);
|
||||
|
||||
void dmu_zfetch_init(zfetch_t *, struct dnode *);
|
||||
void dmu_zfetch_rele(zfetch_t *);
|
||||
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
|
||||
void dmu_zfetch_fini(zfetch_t *);
|
||||
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _DFETCH_H */
|
||||
#endif /* _DMU_ZFETCH_H */
|
||||
|
Loading…
Reference in New Issue
Block a user