- reduce contention by breaking up ARC state locks into 16 for data
  and 16 for metadata (the pattern is sketched below)
- export L2ARC tunables as sysctls
- add several kstats to track L2ARC state more precisely
- avoid holding a contended lock while atomically incrementing a
  hot counter (atomics need no lock protection)
commit 285738b6ad (parent fe03378831)
Kip Macy, 2010-03-16 22:17:21 +00:00
2 changed files with 379 additions and 127 deletions
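The first and last bullets describe a single pattern: stripe each hot lock across cache-line-padded buckets, and update shared counters with atomics instead of under a lock. A minimal standalone sketch of that pattern (illustrative names, not the ZFS code; it assumes the Solaris-style kmutex_t and atomic_add_64 primitives used in the diff below):

#define	STRIPE_COUNT	16	/* must be a power of two for the mask */
#define	STRIPE_PAD	128	/* assumed to be a cache-line multiple */

struct stripe {
	kmutex_t	lock;
	unsigned char	pad[STRIPE_PAD - sizeof (kmutex_t)];
};

static struct stripe	stripes[STRIPE_COUNT];
static uint64_t		bytes_moved;	/* hypothetical shared counter */

static kmutex_t *
stripe_lock(uint64_t hash)
{
	/* hash & (n - 1) equals hash % n only when n is a power of two */
	return (&stripes[hash & (STRIPE_COUNT - 1)].lock);
}

static void
stripe_account(uint64_t hash, uint64_t nbytes)
{
	mutex_enter(stripe_lock(hash));
	/* ... touch only the list this stripe protects ... */
	mutex_exit(stripe_lock(hash));

	/* shared counter: atomic update, no lock held */
	atomic_add_64(&bytes_moved, nbytes);
}

The padding keeps two stripes from sharing a cache line, so contention on one stripe does not bounce its neighbors' lines; the diff below applies the same idea in struct arcs_lock and the ARCS_LOCK() macro.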

File 1 of 2 (the ARC implementation)

@@ -131,6 +131,7 @@
#include <sys/kstat.h>
#include <sys/sdt.h>
#include <sys/ktr.h>
#include <vm/vm_pageout.h>
static kmutex_t arc_reclaim_thr_lock;
@@ -186,6 +187,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
#ifdef ZIO_USE_UMA
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
#endif
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
@@ -218,13 +224,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
* second level ARC benefit from these fast lookups.
*/
#define ARCS_LOCK_PAD 128
struct arcs_lock {
kmutex_t arcs_lock;
#ifdef _KERNEL
unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
/*
 * Must be a power of two for the mask-based list indexing to work.
 */
#define ARC_BUFC_NUMDATALISTS 16
#define ARC_BUFC_NUMMETADATALISTS 16
#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS+ARC_BUFC_NUMDATALISTS)
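/*
 * Worked example with a hypothetical hash value h = 677: a metadata
 * buffer maps to list 677 & (16 - 1) = 5, while a data buffer maps to
 * (677 & 15) + 16 = 21, because the 16 data lists follow the 16
 * metadata lists (see get_buf_info() below).
 */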
typedef struct arc_state {
list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(128);
} arc_state_t;
#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -248,7 +272,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_mru_ghost_hits;
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
kstat_named_t arcstat_stolen;
kstat_named_t arcstat_recycle_miss;
kstat_named_t arcstat_mutex_miss;
kstat_named_t arcstat_evict_skip;
@@ -280,6 +306,19 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_l2_write_trylock_fail;
kstat_named_t arcstat_l2_write_in_l2;
kstat_named_t arcstat_l2_write_passed_headroom;
kstat_named_t arcstat_l2_write_spa_mismatch;
kstat_named_t arcstat_l2_write_hdr_io_in_progress;
kstat_named_t arcstat_l2_write_not_cacheable;
kstat_named_t arcstat_l2_write_full;
kstat_named_t arcstat_l2_write_buffer_iter;
kstat_named_t arcstat_l2_write_pios;
kstat_named_t arcstat_l2_write_bytes_written;
kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
kstat_named_t arcstat_l2_write_buffer_list_iter;
kstat_named_t arcstat_l2_write_buffer_list_null_iter;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -297,7 +336,9 @@ static arc_stats_t arc_stats = {
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
{ "stolen", KSTAT_DATA_UINT64 },
{ "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
@@ -328,7 +369,20 @@ static arc_stats_t arc_stats = {
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 }
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
{ "l2_write_full", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
{ "l2_write_pios", KSTAT_DATA_UINT64 },
{ "l2_write_bytes_written", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
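/*
 * Context not visible in this hunk: ARCSTAT_BUMP() and ARCSTAT_INCR()
 * are built on atomic_add_64() of the ui64 value accessed above, so
 * the counters added by this commit are updated without holding any
 * lock, per the last bullet of the commit message.
 */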
@@ -550,9 +604,10 @@ extern kmem_cache_t *zio_data_buf_cache[];
* Level 2 ARC
*/
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 4 /* num of writes */
#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 128 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval */
#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@@ -564,7 +619,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */
boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
&l2arc_write_max, 0, "max write size");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
&l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
&l2arc_headroom, 0, "number of dev writes");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
&l2arc_feed_secs, 0, "interval seconds");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
&l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
&l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
&ARC_anon.arcs_size, 0, "size of anonymous state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in anonymous state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
&ARC_mru.arcs_size, 0, "size of mru state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
&ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mru ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
"size of data in mru ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
&ARC_mfu.arcs_size, 0, "size of mfu state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mfu ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
"size of data in mfu ghost state");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
&ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
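The tunables above are ordinary sysctl nodes under vfs.zfs, so they can be read, and the CTLFLAG_RW ones set, from userland. A minimal sketch assuming FreeBSD's sysctlbyname(3) and the node names declared above (error handling trimmed):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cur, next;
	size_t len = sizeof (cur);

	if (sysctlbyname("vfs.zfs.l2arc_write_max", &cur, &len, NULL, 0) == -1)
		return (1);
	printf("l2arc_write_max: %ju\n", (uintmax_t)cur);

	/* Only the CTLFLAG_RW nodes accept writes, and only as root. */
	next = cur * 2;
	if (sysctlbyname("vfs.zfs.l2arc_write_max", NULL, NULL,
	    &next, sizeof (next)) == -1)
		return (1);
	return (0);
}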
/*
* L2ARC Internals
@@ -957,21 +1071,43 @@ arc_buf_freeze(arc_buf_t *buf)
arc_cksum_compute(buf, B_FALSE);
}
static void
get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
{
uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
if (ab->b_type == ARC_BUFC_METADATA)
buf_hashid &= (ARC_BUFC_NUMMETADATALISTS-1);
else {
buf_hashid &= (ARC_BUFC_NUMDATALISTS-1);
buf_hashid += ARC_BUFC_NUMMETADATALISTS;
}
*list = &state->arcs_lists[buf_hashid];
*lock = ARCS_LOCK(state, buf_hashid);
}
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
ASSERT(MUTEX_HELD(hash_lock));
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
list_t *list;
kmutex_t *lock;
uint64_t delta = ab->b_size * ab->b_datacnt;
list_t *list = &ab->b_state->arcs_list[ab->b_type];
uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
mutex_enter(&ab->b_state->arcs_mtx);
get_buf_info(ab, ab->b_state, &list, &lock);
ASSERT(!MUTEX_HELD(lock));
mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
mutex_exit(lock);
if (GHOST_STATE(ab->b_state)) {
ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL);
@@ -980,7 +1116,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
mutex_exit(&ab->b_state->arcs_mtx);
/* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
@@ -999,14 +1134,19 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
uint64_t *size = &state->arcs_lsize[ab->b_type];
list_t *list;
kmutex_t *lock;
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
get_buf_info(ab, state, &list, &lock);
ASSERT(!MUTEX_HELD(lock));
mutex_enter(lock);
ASSERT(!list_link_active(&ab->b_arc_node));
list_insert_head(&state->arcs_list[ab->b_type], ab);
list_insert_head(list, ab);
mutex_exit(lock);
ASSERT(ab->b_datacnt > 0);
atomic_add_64(size, ab->b_size * ab->b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
}
@@ -1021,6 +1161,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
arc_state_t *old_state = ab->b_state;
int64_t refcnt = refcount_count(&ab->b_refcnt);
uint64_t from_delta, to_delta;
list_t *list;
kmutex_t *lock;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(new_state != old_state);
@@ -1035,14 +1177,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
*/
if (refcnt == 0) {
if (old_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
int use_mutex;
uint64_t *size = &old_state->arcs_lsize[ab->b_type];
get_buf_info(ab, old_state, &list, &lock);
use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(&old_state->arcs_list[ab->b_type], ab);
list_remove(list, ab);
/*
* If prefetching out of the ghost cache,
@@ -1057,16 +1202,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(size, -from_delta);
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
mutex_exit(lock);
}
if (new_state != arc_anon) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
int use_mutex;
uint64_t *size = &new_state->arcs_lsize[ab->b_type];
get_buf_info(ab, new_state, &list, &lock);
use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
mutex_enter(lock);
list_insert_head(&new_state->arcs_list[ab->b_type], ab);
list_insert_head(list, ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -1077,7 +1226,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(size, to_delta);
if (use_mutex)
mutex_exit(&new_state->arcs_mtx);
mutex_exit(lock);
}
}
@@ -1467,21 +1616,49 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
int64_t bytes_remaining;
arc_buf_hdr_t *ab, *ab_prev = NULL;
list_t *list = &state->arcs_list[type];
list_t *evicted_list, *list, *evicted_list_start, *list_start;
kmutex_t *lock, *evicted_lock;
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
static int evict_metadata_offset, evict_data_offset;
int i, idx, offset, list_count, count;
ASSERT(state == arc_mru || state == arc_mfu);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
if (type == ARC_BUFC_METADATA) {
offset = 0;
list_count = ARC_BUFC_NUMMETADATALISTS;
list_start = &state->arcs_lists[0];
evicted_list_start = &evicted_state->arcs_lists[0];
idx = evict_metadata_offset;
} else {
offset = ARC_BUFC_NUMMETADATALISTS;
mutex_enter(&state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
list_start = &state->arcs_lists[offset];
evicted_list_start = &evicted_state->arcs_lists[offset];
list_count = ARC_BUFC_NUMDATALISTS;
idx = evict_data_offset;
}
bytes_remaining = evicted_state->arcs_lsize[type];
count = 0;
evict_start:
list = &list_start[idx];
evicted_list = &evicted_list_start[idx];
lock = ARCS_LOCK(state, (offset + idx));
evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
mutex_enter(lock);
mutex_enter(evicted_lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
bytes_remaining -= (ab->b_size * ab->b_datacnt);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
@@ -1541,18 +1718,36 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
break;
if (bytes_remaining > 0) {
mutex_exit(evicted_lock);
mutex_exit(lock);
idx = ((idx + 1)&(list_count-1));
count++;
goto evict_start;
}
} else {
missed += 1;
}
}
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&state->arcs_mtx);
if (bytes_evicted < bytes)
dprintf("only evicted %lld bytes from %x",
(longlong_t)bytes_evicted, state);
mutex_exit(evicted_lock);
mutex_exit(lock);
idx = ((idx + 1)&(list_count-1));
count++;
if (bytes_evicted < bytes) {
if (count < list_count)
goto evict_start;
else
dprintf("only evicted %lld bytes from %x",
(longlong_t)bytes_evicted, state);
}
if (type == ARC_BUFC_METADATA)
evict_metadata_offset = idx;
else
evict_data_offset = idx;
if (skipped)
ARCSTAT_INCR(arcstat_evict_skip, skipped);
@@ -1579,6 +1774,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
if (stolen)
ARCSTAT_BUMP(arcstat_stolen);
return (stolen);
}
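/*
 * Note on evict_metadata_offset/evict_data_offset above: being
 * function-local statics, they persist across calls, so each eviction
 * pass resumes at the list after the one it last touched instead of
 * always draining bucket 0 first, spreading pressure over all 16
 * lists.
 */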
@@ -1591,14 +1788,28 @@ static void
arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
list_t *list, *list_start;
kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
static int evict_offset;
int list_count, idx = evict_offset;
int offset, count = 0;
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
/*
* data lists come after metadata lists
*/
list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
list_count = ARC_BUFC_NUMDATALISTS;
offset = ARC_BUFC_NUMMETADATALISTS;
evict_start:
list = &list_start[idx];
lock = ARCS_LOCK(state, idx + offset);
mutex_enter(lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
@@ -1628,20 +1839,31 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
break;
} else {
if (bytes < 0) {
mutex_exit(&state->arcs_mtx);
/*
* we're draining the ARC, retry
*/
mutex_exit(lock);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
goto evict_start;
}
bufs_skipped += 1;
}
}
mutex_exit(&state->arcs_mtx);
if (list == &state->arcs_list[ARC_BUFC_DATA] &&
mutex_exit(lock);
idx = ((idx + 1)&(ARC_BUFC_NUMDATALISTS-1));
count++;
if (count < list_count)
goto evict_start;
evict_offset = idx;
if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
(bytes < 0 || bytes_deleted < bytes)) {
list = &state->arcs_list[ARC_BUFC_METADATA];
goto top;
list_start = &state->arcs_lists[0];
list_count = ARC_BUFC_NUMMETADATALISTS;
offset = count = 0;
goto evict_start;
}
if (bufs_skipped) {
@@ -1755,22 +1977,22 @@ arc_do_user_evicts(void)
void
arc_flush(spa_t *spa)
{
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
@@ -2206,6 +2428,7 @@ arc_get_data_buf(arc_buf_t *buf)
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
}
/*
@@ -2391,7 +2614,10 @@ arc_read_done(zio_t *zio)
hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
hdr->b_flags &= ~ARC_L2CACHE;
#if 0
else if ((hdr->b_flags & ARC_PREFETCH) == 0)
hdr->b_flags |= ARC_L2CACHE;
#endif
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
@@ -2505,6 +2731,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
arc_buf_hdr_t *hdr = pbuf->b_hdr;
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2513,8 +2740,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
ASSERT3P(hdr, ==, pbuf->b_hdr);
rw_exit(&pbuf->b_lock);
return (err);
}
@@ -2825,7 +3052,9 @@ arc_buf_evict(arc_buf_t *buf)
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
arc_buf_t **bufp;
list_t *list, *evicted_list;
kmutex_t *lock, *evicted_lock;
rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
if (hdr == NULL) {
@@ -2873,16 +3102,18 @@ arc_buf_evict(arc_buf_t *buf)
evicted_state =
(old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
mutex_enter(&old_state->arcs_mtx);
mutex_enter(&evicted_state->arcs_mtx);
get_buf_info(hdr, old_state, &list, &lock);
get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
mutex_enter(lock);
mutex_enter(evicted_lock);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdr->b_flags |= ARC_IN_HASH_TABLE;
hdr->b_flags &= ~ARC_BUF_AVAILABLE;
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&old_state->arcs_mtx);
mutex_exit(evicted_lock);
mutex_exit(lock);
}
mutex_exit(hash_lock);
rw_exit(&buf->b_lock);
@@ -3428,7 +3659,8 @@ void
arc_init(void)
{
int prefetch_tunable_set = 0;
int i;
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3496,33 +3728,34 @@ arc_init(void)
arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
NULL, MUTEX_DEFAULT, NULL);
list_create(&arc_mru->arcs_lists[i],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mru_ghost->arcs_lists[i],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu->arcs_lists[i],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu_ghost->arcs_lists[i],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_l2c_only->arcs_lists[i],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
}
buf_init();
@@ -3596,7 +3829,8 @@ arc_init(void)
void
arc_fini(void)
{
int i;
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
cv_signal(&arc_reclaim_thr_cv);
@@ -3617,21 +3851,19 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
mutex_destroy(&arc_anon->arcs_mtx);
mutex_destroy(&arc_mru->arcs_mtx);
mutex_destroy(&arc_mru_ghost->arcs_mtx);
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
list_destroy(&arc_mru->arcs_lists[i]);
list_destroy(&arc_mru_ghost->arcs_lists[i]);
list_destroy(&arc_mfu->arcs_lists[i]);
list_destroy(&arc_mfu_ghost->arcs_lists[i]);
mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
}
mutex_destroy(&zfs_write_limit_lock);
buf_fini();
@@ -4026,28 +4258,31 @@ static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
list_t *list;
int idx;
ASSERT(list_num >= 0 && list_num < 2*ARC_BUFC_NUMLISTS);
ASSERT(list_num >= 0 && list_num <= 3);
switch (list_num) {
case 0:
list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
*lock = &arc_mfu->arcs_mtx;
break;
case 1:
list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
*lock = &arc_mru->arcs_mtx;
break;
case 2:
list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
*lock = &arc_mfu->arcs_mtx;
break;
case 3:
list = &arc_mru->arcs_list[ARC_BUFC_DATA];
*lock = &arc_mru->arcs_mtx;
break;
if (list_num < ARC_BUFC_NUMMETADATALISTS) {
idx = list_num;
list = &arc_mfu->arcs_lists[idx];
*lock = ARCS_LOCK(arc_mfu, idx);
} else if (list_num < ARC_BUFC_NUMMETADATALISTS*2) {
idx = list_num - ARC_BUFC_NUMMETADATALISTS;
list = &arc_mru->arcs_lists[idx];
*lock = ARCS_LOCK(arc_mru, idx);
} else if (list_num < (ARC_BUFC_NUMMETADATALISTS*2 +
ARC_BUFC_NUMDATALISTS)) {
idx = list_num - ARC_BUFC_NUMMETADATALISTS;
list = &arc_mfu->arcs_lists[idx];
*lock = ARCS_LOCK(arc_mfu, idx);
} else {
idx = list_num - ARC_BUFC_NUMLISTS;
list = &arc_mru->arcs_lists[idx];
*lock = ARCS_LOCK(arc_mru, idx);
}
CTR3(KTR_SPARE2, "list=%p list_num=%d idx=%d",
list, list_num, idx);
ASSERT(!(MUTEX_HELD(*lock)));
mutex_enter(*lock);
return (list);
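/*
 * For reference, the mapping implemented above, with 16 metadata
 * lists followed by 16 data lists per state:
 *
 *	list_num  0..15: arc_mfu metadata (idx = list_num)
 *	list_num 16..31: arc_mru metadata (idx = list_num - 16)
 *	list_num 32..47: arc_mfu data     (idx = list_num - 16)
 *	list_num 48..63: arc_mru data     (idx = list_num - 32)
 */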
@@ -4212,13 +4447,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
for (try = 0; try <= 3; try++) {
for (try = 0; try < 2*ARC_BUFC_NUMLISTS; try++) {
list = l2arc_list_locked(try, &list_lock);
passed_sz = 0;
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
* L2ARC fast warmup.
@@ -4231,52 +4468,66 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ab = list_head(list);
else
ab = list_tail(list);
if (ab == NULL) {
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
}
for (; ab; ab = ab_prev) {
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (!have_lock && !mutex_tryenter(hash_lock)) {
ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
/*
* Skip this buffer rather than waiting.
*/
continue;
}
if (ab->b_l2hdr != NULL) {
/*
* Already in L2ARC.
*/
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_in_l2);
continue;
}
passed_sz += ab->b_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
break;
}
if (ab->b_spa != spa) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
continue;
}
if (ab->b_l2hdr != NULL) {
/*
* Already in L2ARC.
*/
if (HDR_IO_IN_PROGRESS(ab)) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
continue;
}
if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
if (!HDR_L2CACHE(ab)) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
continue;
}
if ((write_sz + ab->b_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_full);
break;
}
@@ -4300,8 +4551,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
ARCSTAT_BUMP(arcstat_l2_write_pios);
}
ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size);
/*
* Create and add a new L2ARC header.
*/
@@ -4309,7 +4562,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
hdrl2->b_dev = dev;
hdrl2->b_daddr = dev->l2ad_hand;
ab->b_flags |= ARC_L2_WRITING;
ab->b_l2hdr = hdrl2;
list_insert_head(dev->l2ad_buflist, ab);
buf_data = ab->b_buf->b_data;
@@ -4397,7 +4649,7 @@ l2arc_feed_thread(void *dummy __unused)
*/
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
hz * l2arc_feed_secs);
hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
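/*
 * Precedence note on the timeout above: '*' binds tighter than '>>',
 * so the wait is (hz * l2arc_feed_secs) >> l2arc_feed_secs_shift
 * ticks, i.e. the feed interval divided by 2^l2arc_feed_secs_shift.
 */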
/*

File 2 of 2 (the ARC header)

@@ -55,8 +55,8 @@ struct arc_buf {
};
typedef enum arc_buf_contents {
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
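/*
 * After the swap above, ARC_BUFC_METADATA == 0 and ARC_BUFC_DATA == 1,
 * presumably to mirror the new arcs_lists layout in which the 16
 * metadata lists precede the 16 data lists.
 */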
/*