Update L2ARC code and fix several bugs.

- improve ARC memory consumption (Bug ID 6488341)
- ARC/L2ARC metadata accounting (Bug ID 6748019)
- L2ARC turbo warmup (Bug ID 6748023)
- kstats for ARC content (Bug ID 6748023)
- kstats for evicted bytes from ARC by L2ARC state (Bug ID 6871680)
- fix panic on i386 systems (Bug ID 6821260)

OpenSolaris onnv revisions:
8582:df9361868dbe, 8628:97dcded6e556, 9215:7c4584f76b47,
9274:a10f8bd993c1, 10357:29060492b29d

OpenSolaris Bug IDs:
6748019, 6748023, 6748030, 6488341, 6798268, 6821260, 6790261, 6871680

Approved by:	pjd, delphij (mentor)
Obtained from:	OpenSolaris (multiple bug IDs)
MFC after:	3 days
This commit is contained in:
mm 2010-05-21 09:52:49 +00:00
parent 71313e529c
commit 013e2a2184
4 changed files with 311 additions and 126 deletions

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -152,14 +152,20 @@ typedef enum arc_reclaim_strategy {
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 5;
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static int arc_min_prefetch_lifespan;
extern int zfs_prefetch_disable;
static int arc_dead;
extern int zfs_prefetch_disable;
/*
* The arc has filled available memory and has now warmed up.
@ -173,6 +179,9 @@ uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_mdcomp_disable = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
@ -277,6 +286,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_recycle_miss;
kstat_named_t arcstat_mutex_miss;
kstat_named_t arcstat_evict_skip;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
@ -288,10 +300,14 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash;
kstat_named_t arcstat_l2_read_bytes;
kstat_named_t arcstat_l2_write_bytes;
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
@ -341,6 +357,9 @@ static arc_stats_t arc_stats = {
{ "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
@ -352,10 +371,14 @@ static arc_stats_t arc_stats = {
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
{ "l2_write_bytes", KSTAT_DATA_UINT64 },
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
@ -519,6 +542,8 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
static boolean_t l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab);
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
@ -546,6 +571,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
@ -603,10 +629,10 @@ extern kmem_cache_t *zio_data_buf_cache[];
* Level 2 ARC
*/
#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 128 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval */
#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 2 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@ -618,9 +644,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
&l2arc_write_max, 0, "max write size");
@ -630,12 +657,15 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
&l2arc_headroom, 0, "number of dev writes");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
&l2arc_feed_secs, 0, "interval seconds");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
&l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
&l2arc_feed_min_ms, 0, "min interval milliseconds");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
&l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
&l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
&l2arc_norw, 0, "no reads during writes");
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
&ARC_anon.arcs_size, 0, "size of anonymous state");
@ -692,6 +722,7 @@ typedef struct l2arc_dev {
uint64_t l2ad_end; /* last addr on device */
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
@ -722,7 +753,7 @@ typedef struct l2arc_write_callback {
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
daddr_t b_daddr; /* disk address, offset byte */
uint64_t b_daddr; /* disk address, offset byte */
};
typedef struct l2arc_data_free {
@ -890,8 +921,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
return (0);
}
@ -903,6 +934,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
bzero(buf, sizeof (arc_buf_t));
rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
}
@ -919,8 +952,7 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}
/* ARGSUSED */
@ -930,6 +962,7 @@ buf_dest(void *vbuf, void *unused)
arc_buf_t *buf = vbuf;
rw_destroy(&buf->b_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
/*
@ -1245,15 +1278,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
void
arc_space_consume(uint64_t space)
arc_space_consume(uint64_t space, arc_space_type_t type)
{
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
break;
}
atomic_add_64(&arc_meta_used, space);
atomic_add_64(&arc_size, space);
}
void
arc_space_return(uint64_t space)
arc_space_return(uint64_t space, arc_space_type_t type)
{
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, -space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
break;
}
ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used;
@ -1353,6 +1420,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
@ -1400,11 +1468,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
buf->b_data, size);
arc_space_return(size);
arc_space_return(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf->b_hdr,
zio_data_buf_free, buf->b_data, size);
ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size);
}
}
@ -1699,6 +1768,21 @@ evict_start:
buf->b_data == stolen, TRUE);
}
}
if (ab->b_l2hdr) {
ARCSTAT_INCR(arcstat_evict_l2_cached,
ab->b_size);
} else {
if (l2arc_write_eligible(ab->b_spa, ab)) {
ARCSTAT_INCR(arcstat_evict_l2_eligible,
ab->b_size);
} else {
ARCSTAT_INCR(
arcstat_evict_l2_ineligible,
ab->b_size);
}
}
if (ab->b_datacnt == 0) {
arc_change_state(evicted_state, ab, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(ab));
@ -1871,61 +1955,63 @@ evict_start:
static void
arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over, todelete;
int64_t adjustment, delta;
top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
/*
* Adjust MRU size
*/
if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
int64_t toevict =
MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
adjustment = MIN(arc_size - arc_c,
arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
adjustment -= delta;
}
if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t toevict =
MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
(void) arc_evict(arc_mru, NULL, toevict, FALSE,
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
(void) arc_evict(arc_mru, NULL, delta, FALSE,
ARC_BUFC_METADATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
}
mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
/*
* Adjust MFU size
*/
if (mru_over > 0) {
if (arc_mru_ghost->arcs_size > 0) {
todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
arc_evict_ghost(arc_mru_ghost, NULL, todelete);
}
adjustment = arc_size - arc_c;
if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
adjustment -= delta;
}
if ((arc_over = arc_size - arc_c) > 0) {
int64_t tbl_over;
if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t delta = MIN(adjustment,
arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
(void) arc_evict(arc_mfu, NULL, delta, FALSE,
ARC_BUFC_METADATA);
}
if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
int64_t toevict =
MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
ARC_BUFC_DATA);
arc_over = arc_size - arc_c;
}
/*
* Adjust ghost lists
*/
if (arc_over > 0 &&
arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t toevict =
MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
arc_over);
(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
ARC_BUFC_METADATA);
}
adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
tbl_over = arc_size + arc_mru_ghost->arcs_size +
arc_mfu_ghost->arcs_size - arc_c * 2;
if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
delta = MIN(arc_mru_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mru_ghost, NULL, delta);
}
if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
adjustment =
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mfu_ghost, NULL, delta);
}
}
@ -1999,8 +2085,6 @@ arc_flush(spa_t *spa)
ASSERT(spa || arc_eviction_list == NULL);
}
int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
void
arc_shrink(void)
{
@ -2237,6 +2321,7 @@ static void
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
if (state == arc_l2c_only)
return;
@ -2254,12 +2339,15 @@ arc_adapt(int bytes, arc_state_t *state)
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
arc_p = MIN(arc_c, arc_p + bytes * mult);
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
delta = MIN(bytes * mult, arc_p);
arc_p = MAX(arc_p_min, arc_p - delta);
}
ASSERT((int64_t)arc_p >= 0);
@ -2359,10 +2447,11 @@ arc_get_data_buf(arc_buf_t *buf)
if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size);
arc_space_consume(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
goto out;
@ -2379,21 +2468,22 @@ arc_get_data_buf(arc_buf_t *buf)
if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
state = (arc_mfu->arcs_lsize[type] > 0 &&
state = (arc_mfu->arcs_lsize[type] >= size &&
arc_p > mru_used) ? arc_mfu : arc_mru;
} else {
/* MFU cases */
uint64_t mfu_space = arc_c - arc_p;
state = (arc_mru->arcs_lsize[type] > 0 &&
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size);
arc_space_consume(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
ARCSTAT_BUMP(arcstat_recycle_miss);
@ -2816,7 +2906,8 @@ top:
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
daddr_t addr;
uint64_t addr;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
/* this block is not in the cache */
@ -2895,6 +2986,7 @@ top:
if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr;
/*
* Lock out device removal.
@ -2914,7 +3006,7 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
if (vd != NULL) {
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
* 1. The L2ARC vdev was previously cached.
@ -2922,9 +3014,11 @@ top:
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
* 5. This isn't prefetch and l2arc_noprefetch is set.
*/
if (hdr->b_l2hdr != NULL &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
@ -2950,6 +3044,7 @@ top:
ZIO_FLAG_DONT_RETRY, B_FALSE);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
@ -2969,6 +3064,14 @@ top:
ARCSTAT_BUMP(arcstat_l2_rw_clash);
spa_config_exit(spa, SCL_L2ARC, vd);
}
} else {
if (vd != NULL)
spa_config_exit(spa, SCL_L2ARC, vd);
if (l2arc_ndev != 0) {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_misses);
}
}
rzio = zio_read(pio, spa, bp, buf->b_data, size,
@ -3125,6 +3228,7 @@ arc_release(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock;
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size;
boolean_t released = B_FALSE;
rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
@ -3140,12 +3244,12 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf);
rw_exit(&buf->b_lock);
return;
released = B_TRUE;
} else {
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
l2hdr = hdr->b_l2hdr;
if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx);
@ -3153,6 +3257,9 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size;
}
if (released)
goto out;
/*
* Do we have more than one buf?
*/
@ -3220,6 +3327,7 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_efunc = NULL;
buf->b_private = NULL;
out:
if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@ -3697,6 +3805,15 @@ arc_init(void)
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
if (zfs_arc_shrink_shift > 0)
arc_shrink_shift = zfs_arc_shrink_shift;
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@ -3987,8 +4104,82 @@ arc_fini(void)
*
* Tunables may be removed or added as future performance improvements are
* integrated, and also may become zpool properties.
*
* There are three key functions that control how the L2ARC warms up:
*
* l2arc_write_eligible() check if a buffer is eligible to cache
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
*/
/*
 * Decide whether the ARC buffer header `ab' may be written to the L2ARC
 * on behalf of `spa'.
 *
 * Returns B_TRUE when the buffer passes every check below, B_FALSE
 * otherwise.  Each rejection bumps the arcstat_l2_write_* counter that
 * names the reason.
 */
static boolean_t
l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab)
{
	boolean_t eligible = B_FALSE;

	/*
	 * The checks run in this order and stop at the first one that
	 * rejects the buffer, so at most one counter is bumped per call.
	 * A buffer is *not* eligible for the L2ARC if it:
	 * 1. belongs to a different spa.
	 * 2. is already cached on the L2ARC.
	 * 3. has an I/O in progress (it may be an incomplete read).
	 * 4. is flagged not eligible (zfs property).
	 */
	if (ab->b_spa != spa) {
		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
	} else if (ab->b_l2hdr != NULL) {
		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
	} else if (HDR_IO_IN_PROGRESS(ab)) {
		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
	} else if (!HDR_L2CACHE(ab)) {
		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
	} else {
		eligible = B_TRUE;
	}

	return (eligible);
}
/*
 * Return how many bytes the next write pass to `dev' should attempt:
 * the device's l2ad_write, plus its l2ad_boost for as long as arc_warm
 * is still B_FALSE.
 */
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
	if (arc_warm == B_FALSE)
		return (dev->l2ad_write + dev->l2ad_boost);

	return (dev->l2ad_write);
}
/*
 * Compute the clock tick at which the next L2ARC write pass should start.
 *
 *	began	tick value sampled by the caller when the pass started
 *	wanted	number of bytes the pass was asked to write
 *	wrote	number of bytes the pass actually wrote
 *
 * Returns a tick value that is never earlier than the current time and
 * never later than one interval from now.
 */
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
	clock_t interval, next, now;

	/*
	 * If the ARC lists are busy, increase our write rate; if the
	 * lists are stale, idle back. This is achieved by checking
	 * how much we previously wrote - if it was more than half of
	 * what we wanted, schedule the next write much sooner.
	 */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	/*
	 * Sample the clock exactly once.  MAX() and MIN() are presumably
	 * the usual function-like macros, which may evaluate an argument
	 * more than once; passing LBOLT to them directly could compare
	 * against several different readings of "now" if the clock
	 * advances in between.
	 */
	now = LBOLT;
	next = MAX(now, MIN(now + interval, began + interval));

	return (next);
}
static void
l2arc_hdr_stat_add(void)
{
@ -4410,7 +4601,7 @@ top:
* An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
*/
static void
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
@ -4473,15 +4664,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}
if (ab->b_l2hdr != NULL) {
/*
* Already in L2ARC.
*/
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_in_l2);
continue;
}
passed_sz += ab->b_size;
if (passed_sz > headroom) {
/*
@ -4492,22 +4674,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
if (ab->b_spa != spa) {
if (!l2arc_write_eligible(spa, ab)) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
continue;
}
if (HDR_IO_IN_PROGRESS(ab)) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
continue;
}
if (!HDR_L2CACHE(ab)) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
continue;
}
if ((write_sz + ab->b_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
@ -4515,12 +4686,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
if (ab->b_buf == NULL) {
DTRACE_PROBE1(l2arc__buf__null, void *, ab);
mutex_exit(hash_lock);
continue;
}
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
@ -4589,11 +4754,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
if (pio == NULL) {
ASSERT3U(write_sz, ==, 0);
kmem_cache_free(hdr_cache, head);
return;
return (0);
}
ASSERT3U(write_sz, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
@ -4609,7 +4775,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
dev->l2ad_first = B_FALSE;
}
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
return (write_sz);
}
/*
@ -4622,20 +4792,19 @@ l2arc_feed_thread(void *dummy __unused)
callb_cpr_t cpr;
l2arc_dev_t *dev;
spa_t *spa;
uint64_t size;
uint64_t size, wrote;
clock_t begin, next = LBOLT;
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&l2arc_feed_thr_lock);
while (l2arc_thread_exit == 0) {
/*
* Pause for l2arc_feed_secs seconds between writes.
*/
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
next - LBOLT);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = LBOLT + hz;
/*
* Quick check for L2ARC devices.
@ -4646,6 +4815,7 @@ l2arc_feed_thread(void *dummy __unused)
continue;
}
mutex_exit(&l2arc_dev_mtx);
begin = LBOLT;
/*
* This selects the next l2arc device to write to, and in
@ -4674,9 +4844,7 @@ l2arc_feed_thread(void *dummy __unused)
ARCSTAT_BUMP(arcstat_l2_feeds);
size = dev->l2ad_write;
if (arc_warm == B_FALSE)
size += dev->l2ad_boost;
size = l2arc_write_size(dev);
/*
* Evict L2ARC buffers that will be overwritten.
@ -4686,7 +4854,12 @@ l2arc_feed_thread(void *dummy __unused)
/*
* Write ARC buffers.
*/
l2arc_write_buffers(spa, dev, size);
wrote = l2arc_write_buffers(spa, dev, size);
/*
* Calculate interval between writes.
*/
next = l2arc_write_interval(begin, size, wrote);
spa_config_exit(spa, SCL_L2ARC, dev);
}
@ -4736,6 +4909,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
ASSERT3U(adddev->l2ad_write, >, 0);
/*

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -466,7 +466,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
@ -657,7 +657,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DB_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
@ -1314,7 +1314,7 @@ dbuf_clear(dmu_buf_impl_t *db)
ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
@ -1436,7 +1436,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t));
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
} else {
int blocksize =
@ -1463,7 +1463,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
list_insert_head(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t));
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
@ -1532,7 +1532,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t));
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
void
@ -1951,7 +1951,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -301,7 +301,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
list_insert_head(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t));
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
@ -336,7 +336,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_bonus = NULL;
}
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t));
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
}
void

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -68,8 +68,19 @@ typedef enum arc_buf_contents {
#define ARC_CACHED (1 << 4) /* I/O was already in cache */
#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
void arc_space_consume(uint64_t space);
void arc_space_return(uint64_t space);
/*
* The following breakdows of arc_size exist for kstat only.
*/
typedef enum arc_space_type {
ARC_SPACE_DATA,
ARC_SPACE_HDRS,
ARC_SPACE_L2HDRS,
ARC_SPACE_OTHER,
ARC_SPACE_NUMTYPES
} arc_space_type_t;
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
void *arc_data_buf_alloc(uint64_t space);
void arc_data_buf_free(void *buf, uint64_t space);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,