MFV r302991: 6950 ARC should cache compressed data
illumos/illumos-gate@dcbf3bd6a1
https://www.illumos.org/issues/6950
When reading compressed data from disk, the ARC should keep the compressed
block cached and only decompress it when consumers access the block. The
uncompressed data should be short-lived, allowing the ARC to cache a much
larger amount of data. The DMU would also maintain a smaller cache of
uncompressed blocks to minimize the impact of decompressing frequently
accessed blocks.
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: George Wilson <george.wilson@delphix.com>
commit efa0867fb0
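The design above reduces to two mechanisms: keep only the compressed bytes
resident and decompress per access, and bound a secondary cache of
uncompressed buffers between fixed water marks. The standalone C sketch
below is not part of this patch; every name in it (cached_block,
toy_decompress, cached_block_access) is hypothetical. It only illustrates
decompress-on-access plus the default 90%/110% low/high water arithmetic
that the new dbuf cache in the diff below uses.

    /* toy_compressed_cache.c: illustrative only, not illumos/FreeBSD code. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* A cached block holds only the compressed bytes, as the ARC now does. */
    struct cached_block {
        const void *cb_pdata;   /* compressed ("physical") bytes */
        size_t cb_psize;        /* compressed (physical) size */
        size_t cb_lsize;        /* uncompressed (logical) size */
    };

    /* Stand-in for the real decompressor (lz4, etc.); here just a copy. */
    static void
    toy_decompress(const void *src, size_t psize, void *dst, size_t lsize)
    {
        memcpy(dst, src, psize < lsize ? psize : lsize);
    }

    /*
     * Consumers get a short-lived uncompressed copy on access; the cache
     * itself continues to hold only cb_pdata.
     */
    static void *
    cached_block_access(const struct cached_block *cb)
    {
        void *buf = malloc(cb->cb_lsize);

        if (buf != NULL)
            toy_decompress(cb->cb_pdata, cb->cb_psize, buf, cb->cb_lsize);
        return (buf);
    }

    int
    main(void)
    {
        /*
         * Water marks as in the new dbuf cache: with the default
         * dbuf_cache_hiwater_pct/lowater_pct of 10, the eviction thread
         * operates between 90% and 110% of dbuf_cache_max_bytes.
         */
        uint64_t max_bytes = 100ULL * 1024 * 1024;
        uint64_t hiwater = max_bytes + (max_bytes * 10) / 100;
        uint64_t lowater = max_bytes - (max_bytes * 10) / 100;

        printf("signal eviction thread above: %llu bytes\n",
            (unsigned long long)max_bytes);
        printf("evict in caller's context above: %llu bytes\n",
            (unsigned long long)hiwater);
        printf("eviction thread stops below: %llu bytes\n",
            (unsigned long long)lowater);

        struct cached_block cb = { "compressed bytes", 16, 32 };
        char *data = cached_block_access(&cb); /* decompress on access */

        free(data); /* the uncompressed copy is short-lived */
        return (0);
    }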
@@ -1289,7 +1289,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
         }
         if (!err)
             ASSERT3U(fill, ==, BP_GET_FILL(bp));
-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);
     }

     return (err);

@@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;

 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5355,6 +5356,12 @@ ztest_resume_thread(void *arg)
         if (spa_suspended(spa))
             ztest_resume(spa);
         (void) poll(NULL, 0, 100);
+
+        /*
+         * Periodically change the zfs_compressed_arc_enabled setting.
+         */
+        if (ztest_random(10) == 0)
+            zfs_compressed_arc_enabled = ztest_random(2);
     }
     return (NULL);
 }

(One file's diff suppressed because it is too large.)
@@ -45,6 +45,9 @@
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
+#include <sys/callb.h>
+
+uint_t zfs_dbuf_evict_key;

 /*
  * Number of times that zfs_free_range() took the slow path while doing
@@ -52,7 +55,6 @@
  */
 uint64_t zfs_free_range_recv_miss;

-static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

@@ -64,9 +66,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
 /*
  * Global data structures and functions for the dbuf cache.
  */
-static kmem_cache_t *dbuf_cache;
+static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;

+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs are added to the dbuf cache once the last hold is released. If a
+ * dbuf is later accessed and still exists in the dbuf cache, then it will
+ * be removed from the cache and later re-added to the head of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ */
+static multilist_t dbuf_cache;
+static refcount_t dbuf_cache_size;
+uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+
+/* Cap the size of the dbuf cache to log2 fraction of arc size. */
+int dbuf_cache_max_shift = 5;
+
+/*
+ * The dbuf cache uses a three-stage eviction policy:
+ *    - A low water marker designates when the dbuf eviction thread
+ *    should stop evicting from the dbuf cache.
+ *    - When we reach the maximum size (aka mid water mark), we
+ *    signal the eviction thread to run.
+ *    - The high water mark indicates when the eviction thread
+ *    is unable to keep up with the incoming load and eviction must
+ *    happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ *                                                 (max size)
+ *                                      low water   mid water   hi water
+ * +----------------------------------------+----------+----------+
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * +----------------------------------------+----------+----------+
+ *                                        stop        signal     evict
+ *                                        evicting    eviction   directly
+ *                                        thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e. max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
@@ -76,6 +145,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)

     mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
     cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+    multilist_link_init(&db->db_cache_link);
     refcount_create(&db->db_holds);

     return (0);
@@ -88,6 +158,7 @@ dbuf_dest(void *vdb, void *unused)
     dmu_buf_impl_t *db = vdb;
     mutex_destroy(&db->db_mtx);
     cv_destroy(&db->db_changed);
+    ASSERT(!multilist_link_active(&db->db_cache_link));
     refcount_destroy(&db->db_holds);
 }

@@ -117,8 +188,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
     return (crc);
 }

-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
     ((dbuf)->db.db_object == (obj) && \
     (dbuf)->db_objset == (os) && \
@@ -129,7 +198,7 @@ dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
     dbuf_hash_table_t *h = &dbuf_hash_table;
-    uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+    uint64_t hv = dbuf_hash(os, obj, level, blkid);
     uint64_t idx = hv & h->hash_table_mask;
     dmu_buf_impl_t *db;

@@ -180,7 +249,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
     uint64_t obj = db->db.db_object;
     int level = db->db_level;
     uint64_t blkid = db->db_blkid;
-    uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+    uint64_t hv = dbuf_hash(os, obj, level, blkid);
     uint64_t idx = hv & h->hash_table_mask;
     dmu_buf_impl_t *dbf;

@@ -212,7 +281,7 @@ static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
     dbuf_hash_table_t *h = &dbuf_hash_table;
-    uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+    uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
         db->db_level, db->db_blkid);
     uint64_t idx = hv & h->hash_table_mask;
     dmu_buf_impl_t *dbf, **dbp;
@@ -237,8 +306,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
     atomic_dec_64(&dbuf_hash_count);
 }

-static arc_evict_func_t dbuf_do_evict;
-
 typedef enum {
     DBVU_EVICTING,
     DBVU_NOT_EVICTING
@@ -323,15 +390,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
     }
 }

-void
-dbuf_evict(dmu_buf_impl_t *db)
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
-    ASSERT(MUTEX_HELD(&db->db_mtx));
-    ASSERT(db->db_buf == NULL);
-    ASSERT(db->db_data_pending == NULL);
+    dmu_buf_impl_t *db = obj;

-    dbuf_clear(db);
-    dbuf_destroy(db);
+    /*
+     * The assumption here, is the hash value for a given
+     * dmu_buf_impl_t will remain constant throughout its lifetime
+     * (i.e. its objset, object, level and blkid fields don't change).
+     * Thus, we don't need to store the dbuf's sublist index
+     * on insertion, as this index can be recalculated on removal.
+     *
+     * Also, the low order bits of the hash value are thought to be
+     * distributed evenly. Otherwise, in the case that the multilist
+     * has a power of two number of sublists, each sublists' usage
+     * would not be evenly distributed.
+     */
+    return (dbuf_hash(db->db_objset, db->db.db_object,
+        db->db_level, db->db_blkid) %
+        multilist_get_num_sublists(ml));
+}
+
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+    uint64_t dbuf_cache_hiwater_bytes =
+        (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+
+    return (refcount_count(&dbuf_cache_size) >
+        dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+    uint64_t dbuf_cache_lowater_bytes =
+        (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+
+    return (refcount_count(&dbuf_cache_size) >
+        dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+    int idx = multilist_get_random_index(&dbuf_cache);
+    multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
+
+    ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+    /*
+     * Set the thread's tsd to indicate that it's processing evictions.
+     * Once a thread stops evicting from the dbuf cache it will
+     * reset its tsd to NULL.
+     */
+    ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
+    (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
+
+    dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+    while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+        db = multilist_sublist_prev(mls, db);
+    }
+
+    DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+        multilist_sublist_t *, mls);
+
+    if (db != NULL) {
+        multilist_sublist_remove(mls, db);
+        multilist_sublist_unlock(mls);
+        (void) refcount_remove_many(&dbuf_cache_size,
+            db->db.db_size, db);
+        dbuf_destroy(db);
+    } else {
+        multilist_sublist_unlock(mls);
+    }
+    (void) tsd_set(zfs_dbuf_evict_key, NULL);
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+static void
+dbuf_evict_thread(void *dummy __unused)
+{
+    callb_cpr_t cpr;
+
+    CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+    mutex_enter(&dbuf_evict_lock);
+    while (!dbuf_evict_thread_exit) {
+        while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+            CALLB_CPR_SAFE_BEGIN(&cpr);
+            (void) cv_timedwait_hires(&dbuf_evict_cv,
+                &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+            CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+        }
+        mutex_exit(&dbuf_evict_lock);
+
+        /*
+         * Keep evicting as long as we're above the low water mark
+         * for the cache. We do this without holding the locks to
+         * minimize lock contention.
+         */
+        while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+            dbuf_evict_one();
+        }
+
+        mutex_enter(&dbuf_evict_lock);
+    }
+
+    dbuf_evict_thread_exit = B_FALSE;
+    cv_broadcast(&dbuf_evict_cv);
+    CALLB_CPR_EXIT(&cpr);   /* drops dbuf_evict_lock */
+    thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+
+    /*
+     * We use thread specific data to track when a thread has
+     * started processing evictions. This allows us to avoid deeply
+     * nested stacks that would have a call flow similar to this:
+     *
+     * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+     *    ^                                               |
+     *    |                                               |
+     *    +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+     *
+     * The dbuf_eviction_thread will always have its tsd set until
+     * that thread exits. All other threads will only set their tsd
+     * if they are participating in the eviction process. This only
+     * happens if the eviction thread is unable to process evictions
+     * fast enough. To keep the dbuf cache size in check, other threads
+     * can evict from the dbuf cache directly. Those threads will set
+     * their tsd values so that we ensure that they only evict one dbuf
+     * from the dbuf cache.
+     */
+    if (tsd_get(zfs_dbuf_evict_key) != NULL)
+        return;
+
+    if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+        boolean_t evict_now = B_FALSE;
+
+        mutex_enter(&dbuf_evict_lock);
+        if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+            evict_now = dbuf_cache_above_hiwater();
+            cv_signal(&dbuf_evict_cv);
+        }
+        mutex_exit(&dbuf_evict_lock);
+
+        if (evict_now) {
+            dbuf_evict_one();
+        }
+    }
+}

 void
@@ -359,18 +592,38 @@ dbuf_init(void)
         goto retry;
     }

-    dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+    dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
         sizeof (dmu_buf_impl_t),
         0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

     for (i = 0; i < DBUF_MUTEXES; i++)
         mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

+    /*
+     * Setup the parameters for the dbuf cache. We cap the size of the
+     * dbuf cache to 1/32nd (default) of the size of the ARC.
+     */
+    dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
+        arc_max_bytes() >> dbuf_cache_max_shift);
+
     /*
      * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
      * configuration is not required.
      */
     dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
+
+    multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
+        offsetof(dmu_buf_impl_t, db_cache_link),
+        zfs_arc_num_sublists_per_state,
+        dbuf_cache_multilist_index_func);
+    refcount_create(&dbuf_cache_size);
+
+    tsd_create(&zfs_dbuf_evict_key, NULL);
+    dbuf_evict_thread_exit = B_FALSE;
+    mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+    cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+    dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+        NULL, 0, &p0, TS_RUN, minclsyspri);
 }

 void
@@ -382,8 +635,23 @@ dbuf_fini(void)
     for (i = 0; i < DBUF_MUTEXES; i++)
         mutex_destroy(&h->hash_mutexes[i]);
     kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-    kmem_cache_destroy(dbuf_cache);
+    kmem_cache_destroy(dbuf_kmem_cache);
     taskq_destroy(dbu_evict_taskq);
+
+    mutex_enter(&dbuf_evict_lock);
+    dbuf_evict_thread_exit = B_TRUE;
+    while (dbuf_evict_thread_exit) {
+        cv_signal(&dbuf_evict_cv);
+        cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+    }
+    mutex_exit(&dbuf_evict_lock);
+    tsd_destroy(&zfs_dbuf_evict_key);
+
+    mutex_destroy(&dbuf_evict_lock);
+    cv_destroy(&dbuf_evict_cv);
+
+    refcount_destroy(&dbuf_cache_size);
+    multilist_destroy(&dbuf_cache);
 }

 /*
@@ -541,7 +809,7 @@ dbuf_clear_data(dmu_buf_impl_t *db)
 {
     ASSERT(MUTEX_HELD(&db->db_mtx));
     dbuf_evict_user(db);
-    db->db_buf = NULL;
+    ASSERT3P(db->db_buf, ==, NULL);
     db->db.db_data = NULL;
     if (db->db_state != DB_NOFILL)
         db->db_state = DB_UNCACHED;
@@ -556,8 +824,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
     db->db_buf = buf;
     ASSERT(buf->b_data != NULL);
     db->db.db_data = buf->b_data;
-    if (!arc_released(buf))
-        arc_set_callback(buf, dbuf_do_evict, db);
 }

 /*
@@ -568,6 +834,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
     arc_buf_t *abuf;

+    ASSERT(db->db_blkid != DMU_BONUS_BLKID);
     mutex_enter(&db->db_mtx);
     if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
         int blksz = db->db.db_size;
@@ -579,6 +846,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
     } else {
         abuf = db->db_buf;
         arc_loan_inuse_buf(abuf, db);
+        db->db_buf = NULL;
         dbuf_clear_data(db);
         mutex_exit(&db->db_mtx);
     }
@@ -647,7 +915,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
     } else {
         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
         ASSERT3P(db->db_buf, ==, NULL);
-        VERIFY(arc_buf_remove_ref(buf, db));
+        arc_buf_destroy(buf, db);
         db->db_state = DB_UNCACHED;
     }
     cv_broadcast(&db->db_changed);
@@ -696,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         BP_IS_HOLE(db->db_blkptr)))) {
         arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

-        dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
+        dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
             db->db.db_size, db, type));
         bzero(db->db.db_data, db->db.db_size);

@@ -733,8 +1001,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)

     if (DBUF_IS_L2CACHEABLE(db))
         aflags |= ARC_FLAG_L2CACHE;
-    if (DBUF_IS_L2COMPRESSIBLE(db))
-        aflags |= ARC_FLAG_L2COMPRESS;

     SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
         db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -851,7 +1117,7 @@ dbuf_noread(dmu_buf_impl_t *db)

         ASSERT(db->db_buf == NULL);
         ASSERT(db->db.db_data == NULL);
-        dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
+        dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
         db->db_state = DB_FILL;
     } else if (db->db_state == DB_NOFILL) {
         dbuf_clear_data(db);
@@ -907,9 +1173,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
         arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
         spa_t *spa = db->db_objset->os_spa;

-        dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
+        dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
         bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
     } else {
+        db->db_buf = NULL;
         dbuf_clear_data(db);
     }
 }
@@ -1033,7 +1300,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
         }
         if (refcount_count(&db->db_holds) == 0) {
             ASSERT(db->db_buf);
-            dbuf_clear(db);
+            dbuf_destroy(db);
             continue;
         }
         /* The dbuf is referenced */
@@ -1138,7 +1405,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
     dmu_buf_will_dirty(&db->db, tx);

     /* create the data buffer for the new block */
-    buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
+    buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);

     /* copy old block data to the new block */
     obuf = db->db_buf;
@@ -1149,7 +1416,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)

     mutex_enter(&db->db_mtx);
     dbuf_set_data(db, buf);
-    VERIFY(arc_buf_remove_ref(obuf, db));
+    arc_buf_destroy(obuf, db);
     db->db.db_size = size;

     if (db->db_level == 0) {
@@ -1547,7 +1814,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         ASSERT(db->db_buf != NULL);
         ASSERT(dr->dt.dl.dr_data != NULL);
         if (dr->dt.dl.dr_data != db->db_buf)
-            VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
+            arc_buf_destroy(dr->dt.dl.dr_data, db);
     }

     kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -1556,12 +1823,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
     db->db_dirtycnt -= 1;

     if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-        arc_buf_t *buf = db->db_buf;
-
-        ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
-        dbuf_clear_data(db);
-        VERIFY(arc_buf_remove_ref(buf, db));
-        dbuf_evict(db);
+        ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+        dbuf_destroy(db);
         return (B_TRUE);
     }

@@ -1725,7 +1988,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
         mutex_exit(&db->db_mtx);
         (void) dbuf_dirty(db, tx);
         bcopy(buf->b_data, db->db.db_data, db->db.db_size);
-        VERIFY(arc_buf_remove_ref(buf, db));
+        arc_buf_destroy(buf, db);
         xuio_stat_wbuf_copied();
         return;
     }
@@ -1743,10 +2006,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                 arc_release(db->db_buf, db);
             }
             dr->dt.dl.dr_data = buf;
-            VERIFY(arc_buf_remove_ref(db->db_buf, db));
+            arc_buf_destroy(db->db_buf, db);
         } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
             arc_release(db->db_buf, db);
-            VERIFY(arc_buf_remove_ref(db->db_buf, db));
+            arc_buf_destroy(db->db_buf, db);
         }
         db->db_buf = NULL;
     }
@@ -1758,59 +2021,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
     dmu_buf_fill_done(&db->db, tx);
 }

-/*
- * "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunately,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
- * in this case. For callers from the DMU we will usually see:
- *    dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- *    dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- *    DMU: dbuf_clear()->arc_clear_callback()
- *    ARC: dbuf_do_evict()->dbuf_destroy()
- *
- * This routine will dissociate the dbuf from the arc, by calling
- * arc_clear_callback(), but will not evict the data from the ARC.
- */
 void
-dbuf_clear(dmu_buf_impl_t *db)
+dbuf_destroy(dmu_buf_impl_t *db)
 {
     dnode_t *dn;
     dmu_buf_impl_t *parent = db->db_parent;
     dmu_buf_impl_t *dndb;
-    boolean_t dbuf_gone = B_FALSE;

     ASSERT(MUTEX_HELD(&db->db_mtx));
     ASSERT(refcount_is_zero(&db->db_holds));

-    dbuf_evict_user(db);
+    if (db->db_buf != NULL) {
+        arc_buf_destroy(db->db_buf, db);
+        db->db_buf = NULL;
+    }

-    if (db->db_state == DB_CACHED) {
+    if (db->db_blkid == DMU_BONUS_BLKID) {
         ASSERT(db->db.db_data != NULL);
-        if (db->db_blkid == DMU_BONUS_BLKID) {
-            zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
-            arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
-        }
-        db->db.db_data = NULL;
+        zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+        arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
         db->db_state = DB_UNCACHED;
     }

+    dbuf_clear_data(db);
+
+    if (multilist_link_active(&db->db_cache_link)) {
+        multilist_remove(&dbuf_cache, db);
+        (void) refcount_remove_many(&dbuf_cache_size,
+            db->db.db_size, db);
+    }
+
     ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
     ASSERT(db->db_data_pending == NULL);

     db->db_state = DB_EVICTING;
     db->db_blkptr = NULL;

     /*
      * Now that db_state is DB_EVICTING, nobody else can find this via
      * the hash table. We can now drop db_mtx, which allows us to
      * acquire the dn_dbufs_mtx.
      */
     mutex_exit(&db->db_mtx);

     DB_DNODE_ENTER(db);
     dn = DB_DNODE(db);
     dndb = dn->dn_dbuf;
-    if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+    if (db->db_blkid != DMU_BONUS_BLKID) {
+        boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+        if (needlock)
+            mutex_enter(&dn->dn_dbufs_mtx);
         avl_remove(&dn->dn_dbufs, db);
         atomic_dec_32(&dn->dn_dbufs_count);
         membar_producer();
         DB_DNODE_EXIT(db);
+        if (needlock)
+            mutex_exit(&dn->dn_dbufs_mtx);
         /*
          * Decrementing the dbuf count means that the hold corresponding
          * to the removed dbuf is no longer discounted in dnode_move(),
@@ -1821,15 +2087,25 @@ dbuf_clear(dmu_buf_impl_t *db)
          */
         dnode_rele(dn, db);
         db->db_dnode_handle = NULL;
+
+        dbuf_hash_remove(db);
     } else {
         DB_DNODE_EXIT(db);
     }

-    if (db->db_buf)
-        dbuf_gone = arc_clear_callback(db->db_buf);
+    ASSERT(refcount_is_zero(&db->db_holds));

-    if (!dbuf_gone)
-        mutex_exit(&db->db_mtx);
+    db->db_parent = NULL;
+
+    ASSERT(db->db_buf == NULL);
+    ASSERT(db->db.db_data == NULL);
+    ASSERT(db->db_hash_next == NULL);
+    ASSERT(db->db_blkptr == NULL);
+    ASSERT(db->db_data_pending == NULL);
+    ASSERT(!multilist_link_active(&db->db_cache_link));
+
+    kmem_cache_free(dbuf_kmem_cache, db);
+    arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

     /*
      * If this dbuf is referenced from an indirect dbuf,
@@ -1922,7 +2198,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
     ASSERT(dn->dn_type != DMU_OT_NONE);

-    db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+    db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);

     db->db_objset = os;
     db->db.db_object = dn->dn_object;
@@ -1971,7 +2247,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     db->db_state = DB_EVICTING;
     if ((odb = dbuf_hash_insert(db)) != NULL) {
         /* someone else inserted it first */
-        kmem_cache_free(dbuf_cache, db);
+        kmem_cache_free(dbuf_kmem_cache, db);
         mutex_exit(&dn->dn_dbufs_mtx);
         return (odb);
     }
@@ -1996,76 +2272,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     return (db);
 }

-static int
-dbuf_do_evict(void *private)
-{
-    dmu_buf_impl_t *db = private;
-
-    if (!MUTEX_HELD(&db->db_mtx))
-        mutex_enter(&db->db_mtx);
-
-    ASSERT(refcount_is_zero(&db->db_holds));
-
-    if (db->db_state != DB_EVICTING) {
-        ASSERT(db->db_state == DB_CACHED);
-        DBUF_VERIFY(db);
-        db->db_buf = NULL;
-        dbuf_evict(db);
-    } else {
-        mutex_exit(&db->db_mtx);
-        dbuf_destroy(db);
-    }
-    return (0);
-}
-
-static void
-dbuf_destroy(dmu_buf_impl_t *db)
-{
-    ASSERT(refcount_is_zero(&db->db_holds));
-
-    if (db->db_blkid != DMU_BONUS_BLKID) {
-        /*
-         * If this dbuf is still on the dn_dbufs list,
-         * remove it from that list.
-         */
-        if (db->db_dnode_handle != NULL) {
-            dnode_t *dn;
-
-            DB_DNODE_ENTER(db);
-            dn = DB_DNODE(db);
-            mutex_enter(&dn->dn_dbufs_mtx);
-            avl_remove(&dn->dn_dbufs, db);
-            atomic_dec_32(&dn->dn_dbufs_count);
-            mutex_exit(&dn->dn_dbufs_mtx);
-            DB_DNODE_EXIT(db);
-            /*
-             * Decrementing the dbuf count means that the hold
-             * corresponding to the removed dbuf is no longer
-             * discounted in dnode_move(), so the dnode cannot be
-             * moved until after we release the hold.
-             */
-            dnode_rele(dn, db);
-            db->db_dnode_handle = NULL;
-        }
-        dbuf_hash_remove(db);
-    }
-    db->db_parent = NULL;
-    db->db_buf = NULL;
-
-    ASSERT(db->db.db_data == NULL);
-    ASSERT(db->db_hash_next == NULL);
-    ASSERT(db->db_blkptr == NULL);
-    ASSERT(db->db_data_pending == NULL);
-
-    kmem_cache_free(dbuf_cache, db);
-    arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
-}
-
 typedef struct dbuf_prefetch_arg {
     spa_t *dpa_spa;    /* The spa to issue the prefetch in. */
     zbookmark_phys_t dpa_zb;    /* The target block to prefetch. */
     int dpa_epbs;    /* Entries (blkptr_t's) Per Block Shift. */
     int dpa_curlevel;    /* The current level that we're reading */
+    dnode_t *dpa_dnode;    /* The dnode associated with the prefetch */
     zio_priority_t dpa_prio;    /* The priority I/Os should be issued at. */
     zio_t *dpa_zio;    /* The parent zio_t for all prefetches. */
     arc_flags_t dpa_aflags;    /* Flags to pass to the final prefetch. */
@@ -2103,10 +2315,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)

     ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
     ASSERT3S(dpa->dpa_curlevel, >, 0);
+
+    /*
+     * The dpa_dnode is only valid if we are called with a NULL
+     * zio. This indicates that the arc_read() returned without
+     * first calling zio_read() to issue a physical read. Once
+     * a physical read is made the dpa_dnode must be invalidated
+     * as the locks guarding it may have been dropped. If the
+     * dpa_dnode is still valid, then we want to add it to the dbuf
+     * cache. To do so, we must hold the dbuf associated with the block
+     * we just prefetched, read its contents so that we associate it
+     * with an arc_buf_t, and then release it.
+     */
     if (zio != NULL) {
         ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
-        ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+        if (zio->io_flags & ZIO_FLAG_RAW) {
+            ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+        } else {
+            ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+        }
         ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+        dpa->dpa_dnode = NULL;
+    } else if (dpa->dpa_dnode != NULL) {
+        uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+            (dpa->dpa_epbs * (dpa->dpa_curlevel -
+            dpa->dpa_zb.zb_level));
+        dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+            dpa->dpa_curlevel, curblkid, FTAG);
+        (void) dbuf_read(db, NULL,
+            DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+        dbuf_rele(db, FTAG);
     }

     dpa->dpa_curlevel--;
@@ -2135,7 +2374,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
             &iter_aflags, &zb);
     }
-    (void) arc_buf_remove_ref(abuf, private);
+
+    arc_buf_destroy(abuf, private);
 }

 /*
@@ -2229,6 +2469,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
     dpa->dpa_prio = prio;
     dpa->dpa_aflags = aflags;
     dpa->dpa_spa = dn->dn_objset->os_spa;
+    dpa->dpa_dnode = dn;
     dpa->dpa_epbs = epbs;
     dpa->dpa_zio = pio;

@@ -2309,18 +2550,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
         return (SET_ERROR(ENOENT));
     }

-    if (db->db_buf && refcount_is_zero(&db->db_holds)) {
-        arc_buf_add_ref(db->db_buf, db);
-        if (db->db_buf->b_data == NULL) {
-            dbuf_clear(db);
-            if (parent) {
-                dbuf_rele(parent, NULL);
-                parent = NULL;
-            }
-            goto top;
-        }
+    if (db->db_buf != NULL)
         ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
-    }

     ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

@@ -2338,13 +2569,19 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
             arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

             dbuf_set_data(db,
-                arc_buf_alloc(dn->dn_objset->os_spa,
+                arc_alloc_buf(dn->dn_objset->os_spa,
                 db->db.db_size, db, type));
             bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
                 db->db.db_size);
         }
     }

+    if (multilist_link_active(&db->db_cache_link)) {
+        ASSERT(refcount_is_zero(&db->db_holds));
+        multilist_remove(&dbuf_cache, db);
+        (void) refcount_remove_many(&dbuf_cache_size,
+            db->db.db_size, db);
+    }
     (void) refcount_add(&db->db_holds, tag);
     DBUF_VERIFY(db);
     mutex_exit(&db->db_mtx);
@@ -2418,7 +2655,7 @@ void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
     int64_t holds = refcount_add(&db->db_holds, tag);
-    ASSERT(holds > 1);
+    ASSERT3S(holds, >, 1);
 }

 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
@@ -2489,8 +2726,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
      * We can't freeze indirects if there is a possibility that they
      * may be modified in the current syncing context.
      */
-    if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+    if (db->db_buf != NULL &&
+        holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
         arc_buf_freeze(db->db_buf);
+    }

     if (holds == db->db_dirtycnt &&
         db->db_level == 0 && db->db_user_immediate_evict)
@@ -2535,55 +2774,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
              */
             ASSERT(db->db_state == DB_UNCACHED ||
                 db->db_state == DB_NOFILL);
-            dbuf_evict(db);
+            dbuf_destroy(db);
         } else if (arc_released(db->db_buf)) {
-            arc_buf_t *buf = db->db_buf;
             /*
              * This dbuf has anonymous data associated with it.
              */
-            dbuf_clear_data(db);
-            VERIFY(arc_buf_remove_ref(buf, db));
-            dbuf_evict(db);
+            dbuf_destroy(db);
         } else {
-            VERIFY(!arc_buf_remove_ref(db->db_buf, db));
+            boolean_t do_arc_evict = B_FALSE;
+            blkptr_t bp;
+            spa_t *spa = dmu_objset_spa(db->db_objset);

-            /*
-             * A dbuf will be eligible for eviction if either the
-             * 'primarycache' property is set or a duplicate
-             * copy of this buffer is already cached in the arc.
-             *
-             * In the case of the 'primarycache' a buffer
-             * is considered for eviction if it matches the
-             * criteria set in the property.
-             *
-             * To decide if our buffer is considered a
-             * duplicate, we must call into the arc to determine
-             * if multiple buffers are referencing the same
-             * block on-disk. If so, then we simply evict
-             * ourselves.
-             */
-            if (!DBUF_IS_CACHEABLE(db)) {
-                if (db->db_blkptr != NULL &&
-                    !BP_IS_HOLE(db->db_blkptr) &&
-                    !BP_IS_EMBEDDED(db->db_blkptr)) {
-                    spa_t *spa =
-                        dmu_objset_spa(db->db_objset);
-                    blkptr_t bp = *db->db_blkptr;
-                    dbuf_clear(db);
-                    arc_freed(spa, &bp);
-                } else {
-                    dbuf_clear(db);
-                }
-            } else if (db->db_pending_evict ||
-                arc_buf_eviction_needed(db->db_buf)) {
-                dbuf_clear(db);
-            } else {
-                mutex_exit(&db->db_mtx);
+            if (!DBUF_IS_CACHEABLE(db) &&
+                db->db_blkptr != NULL &&
+                !BP_IS_HOLE(db->db_blkptr) &&
+                !BP_IS_EMBEDDED(db->db_blkptr)) {
+                do_arc_evict = B_TRUE;
+                bp = *db->db_blkptr;
+            }
+
+            if (!DBUF_IS_CACHEABLE(db) ||
+                db->db_pending_evict) {
+                dbuf_destroy(db);
+            } else if (!multilist_link_active(&db->db_cache_link)) {
+                multilist_insert(&dbuf_cache, db);
+                (void) refcount_add_many(&dbuf_cache_size,
+                    db->db.db_size, db);
+                mutex_exit(&db->db_mtx);
+
+                dbuf_evict_notify();
             }
+
+            if (do_arc_evict)
+                arc_freed(spa, &bp);
         }
     } else {
         mutex_exit(&db->db_mtx);
     }

 }

 #pragma weak dmu_buf_refcount = dbuf_refcount
@@ -2871,7 +3099,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
          */
         int blksz = arc_buf_size(*datap);
         arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-        *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+        *datap = arc_alloc_buf(os->os_spa, blksz, db, type);
         bcopy(db->db.db_data, (*datap)->b_data, blksz);
     }
     db->db_data_pending = dr;
@@ -3137,10 +3365,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
             ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
             if (db->db_state != DB_NOFILL) {
                 if (dr->dt.dl.dr_data != db->db_buf)
-                    VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-                        db));
-                else if (!arc_released(db->db_buf))
-                    arc_set_callback(db->db_buf, dbuf_do_evict, db);
+                    arc_buf_destroy(dr->dt.dl.dr_data, db);
             }
         } else {
             dnode_t *dn;
@@ -3156,8 +3381,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
             ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                 db->db.db_size);
-            if (!arc_released(db->db_buf))
-                arc_set_callback(db->db_buf, dbuf_do_evict, db);
         }
         DB_DNODE_EXIT(db);
         mutex_destroy(&dr->dt.di.dr_mtx);
@@ -3334,8 +3557,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)

         dr->dr_zio = arc_write(zio, os->os_spa, txg,
             &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
-            DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-            children_ready_cb,
+            &zp, dbuf_write_ready, children_ready_cb,
             dbuf_write_physdone, dbuf_write_done, db,
             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
     }

@@ -1407,7 +1407,7 @@ void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
     arc_return_buf(buf, FTAG);
-    VERIFY(arc_buf_remove_ref(buf, FTAG));
+    arc_buf_destroy(buf, FTAG);
 }

 /*
@@ -1763,8 +1763,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)

     zio_nowait(arc_write(pio, os->os_spa, txg,
         bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-        DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
-        NULL, NULL, dmu_sync_done, dsa,
+        &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
         ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

     return (0);
@@ -2137,11 +2136,11 @@ dmu_init(void)
     xuio_stat_init();
     dmu_objset_init();
     dnode_init();
-    dbuf_init();
     zfetch_init();
     zio_compress_init();
     l2arc_init();
     arc_init();
+    dbuf_init();
 }

 void

@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */

 #include <sys/dmu.h>
@@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
             if (err)
                 break;
         }
-        (void) arc_buf_remove_ref(abuf, &abuf);
+        arc_buf_destroy(abuf, &abuf);
         if (err)
             return (err);
         /* Don't care about the data blocks */

@@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,

         if (DMU_OS_IS_L2CACHEABLE(os))
             aflags |= ARC_FLAG_L2CACHE;
-        if (DMU_OS_IS_L2COMPRESSIBLE(os))
-            aflags |= ARC_FLAG_L2COMPRESS;

         dprintf_bp(os->os_rootbp, "reading %s", "");
         err = arc_read(NULL, spa, os->os_rootbp,
@@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
         /* Increase the blocksize if we are permitted. */
         if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
             arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
-            arc_buf_t *buf = arc_buf_alloc(spa,
+            arc_buf_t *buf = arc_alloc_buf(spa,
                 sizeof (objset_phys_t), &os->os_phys_buf,
                 ARC_BUFC_METADATA);
             bzero(buf->b_data, sizeof (objset_phys_t));
             bcopy(os->os_phys_buf->b_data, buf->b_data,
                 arc_buf_size(os->os_phys_buf));
-            (void) arc_buf_remove_ref(os->os_phys_buf,
-                &os->os_phys_buf);
+            arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
             os->os_phys_buf = buf;
         }

@@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     } else {
         int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
             sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
-        os->os_phys_buf = arc_buf_alloc(spa, size,
+        os->os_phys_buf = arc_alloc_buf(spa, size,
             &os->os_phys_buf, ARC_BUFC_METADATA);
         os->os_phys = os->os_phys_buf->b_data;
         bzero(os->os_phys, size);
@@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     if (needlock)
         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
     if (err != 0) {
-        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
-            &os->os_phys_buf));
+        arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
         kmem_free(os, sizeof (objset_t));
         return (err);
     }
@@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os)
     }
     zil_free(os->os_zil);

-    VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
+    arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

     /*
      * This is a barrier to prevent the objset from going away in
@@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)

     zio = arc_write(pio, os->os_spa, tx->tx_txg,
         os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
-        DMU_OS_IS_L2COMPRESSIBLE(os),
         &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
         os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

@@ -634,7 +634,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
             if (err != 0)
                 break;
         }
-        (void) arc_buf_remove_ref(abuf, &abuf);
+        arc_buf_destroy(abuf, &abuf);
     } else if (type == DMU_OT_SA) {
         arc_flags_t aflags = ARC_FLAG_WAIT;
         arc_buf_t *abuf;
@@ -646,7 +646,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
             return (SET_ERROR(EIO));

         err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
-        (void) arc_buf_remove_ref(abuf, &abuf);
+        arc_buf_destroy(abuf, &abuf);
     } else if (backup_do_embed(dsa, bp)) {
         /* it's an embedded level-0 block of a regular object */
         int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
@@ -670,7 +670,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
             &aflags, zb) != 0) {
             if (zfs_send_corrupt_data) {
                 /* Send a block filled with 0x"zfs badd bloc" */
-                abuf = arc_buf_alloc(spa, blksz, &abuf,
+                abuf = arc_alloc_buf(spa, blksz, &abuf,
                     ARC_BUFC_DATA);
                 uint64_t *ptr;
                 for (ptr = abuf->b_data;
@@ -700,7 +700,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
             err = dump_write(dsa, type, zb->zb_object,
                 offset, blksz, bp, abuf->b_data);
         }
-        (void) arc_buf_remove_ref(abuf, &abuf);
+        arc_buf_destroy(abuf, &abuf);
     }

     ASSERT(err == 0 || err == EINTR);

@@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     }

     if (buf)
-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);

 post:
     if (err == 0 && (td->td_flags & TRAVERSE_POST))
@@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,

         osp = buf->b_data;
         traverse_zil(&td, &osp->os_zil_header);
-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);
     }

     if (!(flags & TRAVERSE_PREFETCH_DATA) ||

@@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn)
     }
     if (dn->dn_bonus != NULL) {
         mutex_enter(&dn->dn_bonus->db_mtx);
-        dbuf_evict(dn->dn_bonus);
+        dbuf_destroy(dn->dn_bonus);
         dn->dn_bonus = NULL;
     }
     dn->dn_zio = NULL;

@@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn)
             avl_insert_here(&dn->dn_dbufs, &db_marker, db,
                 AVL_BEFORE);

-            dbuf_clear(db);
+            dbuf_destroy(db);

             db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
             avl_remove(&dn->dn_dbufs, &db_marker);
@@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn)
     if (dn->dn_bonus != NULL) {
         if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
             mutex_enter(&dn->dn_bonus->db_mtx);
-            dbuf_evict(dn->dn_bonus);
+            dbuf_destroy(dn->dn_bonus);
             dn->dn_bonus = NULL;
         } else {
             dn->dn_bonus->db_pending_evict = TRUE;

@@ -679,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
             dsl_scan_visitbp(cbp, &czb, dnp,
                 ds, scn, ostype, tx);
         }
-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);
     } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
         arc_flags_t flags = ARC_FLAG_WAIT;
         dnode_phys_t *cdnp;
@@ -705,7 +705,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
             cdnp, zb->zb_blkid * epb + i, tx);
         }

-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);
     } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
         arc_flags_t flags = ARC_FLAG_WAIT;
         objset_phys_t *osp;
@@ -737,7 +737,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
             &osp->os_userused_dnode,
             DMU_USERUSED_OBJECT, tx);
         }
-        (void) arc_buf_remove_ref(buf, &buf);
+        arc_buf_destroy(buf, &buf);
     }

     return (0);

@@ -231,4 +231,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src)
     list_destroy(&removed);
 }

+void
+refcount_transfer_ownership(refcount_t *rc, void *current_holder,
+    void *new_holder)
+{
+    reference_t *ref;
+    boolean_t found = B_FALSE;
+
+    mutex_enter(&rc->rc_mtx);
+    if (!rc->rc_tracked) {
+        mutex_exit(&rc->rc_mtx);
+        return;
+    }
+
+    for (ref = list_head(&rc->rc_list); ref;
+        ref = list_next(&rc->rc_list, ref)) {
+        if (ref->ref_holder == current_holder) {
+            ref->ref_holder = new_holder;
+            found = B_TRUE;
+            break;
+        }
+    }
+    ASSERT(found);
+    mutex_exit(&rc->rc_mtx);
+}
 #endif    /* ZFS_DEBUG */

@@ -43,51 +43,83 @@ extern "C" {
  */
 #define ARC_EVICT_ALL    -1ULL

+#define HDR_SET_LSIZE(hdr, x) do { \
+    ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
+    (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_SET_PSIZE(hdr, x) do { \
+    ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+    (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_GET_LSIZE(hdr)    ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
+#define HDR_GET_PSIZE(hdr)    ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
+
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
 typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
-typedef int arc_evict_func_t(void *priv);

 /* generic arc_done_func_t's which you can use */
 arc_done_func_t arc_bcopy_func;
 arc_done_func_t arc_getbuf_func;

+extern int zfs_arc_num_sublists_per_state;
+
 typedef enum arc_flags
 {
     /*
      * Public flags that can be passed into the ARC by external consumers.
      */
-    ARC_FLAG_NONE            = 1 << 0,    /* No flags set */
-    ARC_FLAG_WAIT            = 1 << 1,    /* perform sync I/O */
-    ARC_FLAG_NOWAIT            = 1 << 2,    /* perform async I/O */
-    ARC_FLAG_PREFETCH        = 1 << 3,    /* I/O is a prefetch */
-    ARC_FLAG_CACHED            = 1 << 4,    /* I/O was in cache */
-    ARC_FLAG_L2CACHE        = 1 << 5,    /* cache in L2ARC */
-    ARC_FLAG_L2COMPRESS        = 1 << 6,    /* compress in L2ARC */
-    ARC_FLAG_PREDICTIVE_PREFETCH    = 1 << 7,    /* I/O from zfetch */
+    ARC_FLAG_WAIT            = 1 << 0,    /* perform sync I/O */
+    ARC_FLAG_NOWAIT            = 1 << 1,    /* perform async I/O */
+    ARC_FLAG_PREFETCH        = 1 << 2,    /* I/O is a prefetch */
+    ARC_FLAG_CACHED            = 1 << 3,    /* I/O was in cache */
+    ARC_FLAG_L2CACHE        = 1 << 4,    /* cache in L2ARC */
+    ARC_FLAG_PREDICTIVE_PREFETCH    = 1 << 5,    /* I/O from zfetch */

     /*
      * Private ARC flags. These flags are private ARC only flags that
      * will show up in b_flags in the arc_hdr_buf_t. These flags should
      * only be set by ARC code.
      */
-    ARC_FLAG_IN_HASH_TABLE        = 1 << 8,    /* buffer is hashed */
-    ARC_FLAG_IO_IN_PROGRESS        = 1 << 9,    /* I/O in progress */
-    ARC_FLAG_IO_ERROR        = 1 << 10,    /* I/O failed for buf */
-    ARC_FLAG_FREED_IN_READ        = 1 << 11,    /* freed during read */
-    ARC_FLAG_BUF_AVAILABLE        = 1 << 12,    /* block not in use */
-    ARC_FLAG_INDIRECT        = 1 << 13,    /* indirect block */
+    ARC_FLAG_IN_HASH_TABLE        = 1 << 6,    /* buffer is hashed */
+    ARC_FLAG_IO_IN_PROGRESS        = 1 << 7,    /* I/O in progress */
+    ARC_FLAG_IO_ERROR        = 1 << 8,    /* I/O failed for buf */
+    ARC_FLAG_INDIRECT        = 1 << 9,    /* indirect block */
     /* Indicates that block was read with ASYNC priority. */
-    ARC_FLAG_PRIO_ASYNC_READ    = 1 << 14,
-    ARC_FLAG_L2_WRITING        = 1 << 15,    /* write in progress */
-    ARC_FLAG_L2_EVICTED        = 1 << 16,    /* evicted during I/O */
-    ARC_FLAG_L2_WRITE_HEAD        = 1 << 17,    /* head of write list */
+    ARC_FLAG_PRIO_ASYNC_READ    = 1 << 10,
+    ARC_FLAG_L2_WRITING        = 1 << 11,    /* write in progress */
+    ARC_FLAG_L2_EVICTED        = 1 << 12,    /* evicted during I/O */
+    ARC_FLAG_L2_WRITE_HEAD        = 1 << 13,    /* head of write list */
     /* indicates that the buffer contains metadata (otherwise, data) */
-    ARC_FLAG_BUFC_METADATA        = 1 << 18,
+    ARC_FLAG_BUFC_METADATA        = 1 << 14,

     /* Flags specifying whether optional hdr struct fields are defined */
-    ARC_FLAG_HAS_L1HDR        = 1 << 19,
-    ARC_FLAG_HAS_L2HDR        = 1 << 20,
+    ARC_FLAG_HAS_L1HDR        = 1 << 15,
+    ARC_FLAG_HAS_L2HDR        = 1 << 16,
+
+    /*
+     * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
+     * This allows the l2arc to use the blkptr's checksum to verify
+     * the data without having to store the checksum in the hdr.
+     */
+    ARC_FLAG_COMPRESSED_ARC        = 1 << 17,
+    ARC_FLAG_SHARED_DATA        = 1 << 18,
+
+    /*
+     * The arc buffer's compression mode is stored in the top 7 bits of the
+     * flags field, so these dummy flags are included so that MDB can
+     * interpret the enum properly.
+     */
+    ARC_FLAG_COMPRESS_0        = 1 << 24,
+    ARC_FLAG_COMPRESS_1        = 1 << 25,
+    ARC_FLAG_COMPRESS_2        = 1 << 26,
+    ARC_FLAG_COMPRESS_3        = 1 << 27,
+    ARC_FLAG_COMPRESS_4        = 1 << 28,
+    ARC_FLAG_COMPRESS_5        = 1 << 29,
+    ARC_FLAG_COMPRESS_6        = 1 << 30
+
 } arc_flags_t;

 struct arc_buf {
@@ -95,11 +127,10 @@ struct arc_buf {
     arc_buf_t    *b_next;
     kmutex_t    b_evict_lock;
    void        *b_data;
-    arc_evict_func_t    *b_efunc;
-    void        *b_private;
 };

 typedef enum arc_buf_contents {
+    ARC_BUFC_INVALID,            /* invalid type */
     ARC_BUFC_DATA,                /* buffer contains data */
     ARC_BUFC_METADATA,            /* buffer contains metadata */
     ARC_BUFC_NUMTYPES
@@ -119,19 +150,17 @@ typedef enum arc_space_type {

 void arc_space_consume(uint64_t space, arc_space_type_t type);
 void arc_space_return(uint64_t space, arc_space_type_t type);
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+void arc_buf_destroy(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
-boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
@@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
     arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
     arc_done_func_t *ready, arc_done_func_t *child_ready,
     arc_done_func_t *physdone, arc_done_func_t *done,
     void *priv, zio_priority_t priority, int zio_flags,
     const zbookmark_phys_t *zb);
 void arc_freed(spa_t *spa, const blkptr_t *bp);

-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
-boolean_t arc_clear_callback(arc_buf_t *buf);
-
 void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(uint64_t reserve, uint64_t txg);

+uint64_t arc_max_bytes(void);
 void arc_init(void);
 void arc_fini(void);

@ -36,6 +36,7 @@
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/zrlock.h>
|
||||
#include <sys/multilist.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -228,6 +229,11 @@ typedef struct dmu_buf_impl {
|
||||
*/
|
||||
avl_node_t db_link;
|
||||
|
||||
/*
|
||||
* Link in dbuf_cache.
|
||||
*/
|
||||
multilist_node_t db_cache_link;
|
||||
|
||||
/* Data which is unique to data (leaf) blocks: */
|
||||
|
||||
/* User callback information. */
|
||||
@@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
+void dbuf_destroy(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
@@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
-#define	DBUF_IS_L2COMPRESSIBLE(_db)					\
-	((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF ||		\
-	(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
-
 #ifdef ZFS_DEBUG
 
 /*
@@ -73,6 +73,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag);
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
 void refcount_transfer(refcount_t *dst, refcount_t *src);
+void refcount_transfer_ownership(refcount_t *, void *, void *);
 
 void refcount_sysinit(void);
 void refcount_fini(void);
@@ -100,6 +101,7 @@ typedef struct refcount {
 	atomic_add_64(&(src)->rc_count, -__tmp);			\
 	atomic_add_64(&(dst)->rc_count, __tmp);				\
 }
+#define	refcount_transfer_ownership(rc, current_holder, new_holder)
 
 #define	refcount_sysinit()
 #define	refcount_fini()
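refcount_transfer_ownership() changes who is recorded as holding a reference without touching the count; the empty macro variant keeps non-debug builds zero-cost. An illustrative use (the b_refcnt field path is an assumption about a caller, not part of this header):

	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, old_tag);
	/* ... hand the existing hold to a new owner ... */
	refcount_transfer_ownership(&hdr->b_l1hdr.b_refcnt, old_tag, new_tag);
	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, new_tag);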
@@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0)
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512) */
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger */
 
+#define	SPA_COMPRESSBITS	7
+
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
@@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0)
 	16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x);			\
 _NOTE(CONSTCOND) } while (0)
 
-#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 7)
-#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 7, x)
+#define	BP_GET_COMPRESS(bp) \
+	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define	BP_SET_COMPRESS(bp, x) \
+	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
 #define	BP_IS_EMBEDDED(bp)	BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)	BF64_SET((bp)->blk_prop, 39, 1, x)
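The macros only gain a symbolic width; the packing is unchanged: the compression function occupies bits 32-38 of blk_prop. A stand-alone demonstration of the same bitfield math (the BF64 macros are re-derived here so the example compiles in user space; ZIO_COMPRESS_LZ4 == 15 in the on-disk format):

	#include <stdio.h>
	#include <stdint.h>

	/* Same semantics as the SPA's BF64_GET/BF64_SET, reduced to essentials. */
	#define	BF64_GET(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))
	#define	BF64_SET(x, low, len, val) \
		((x) = ((x) & ~(((1ULL << (len)) - 1) << (low))) | \
		    (((uint64_t)(val) & ((1ULL << (len)) - 1)) << (low)))

	int
	main(void)
	{
		uint64_t blk_prop = 0;

		BF64_SET(blk_prop, 32, 7, 15);		/* store LZ4 (15) */
		printf("compress = %llu\n",		/* prints 15 */
		    (unsigned long long)BF64_GET(blk_prop, 32, 7));
		return (0);
	}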
@@ -564,6 +564,10 @@ extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
+extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+    uint64_t bufsize, zio_transform_func_t *transform);
+extern void zio_pop_transforms(zio_t *zio);
+
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
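Exporting zio_push_transform()/zio_pop_transforms() lets code outside zio.c interpose a buffer that receives the raw (e.g. still-compressed) bytes before the pipeline's transform stage undoes them. A schematic caller under that assumption; the function and callback names are placeholders:

	static void
	example_untransform(zio_t *zio, void *data, uint64_t size)
	{
		/* e.g. decompress zio->io_data into the pushed buffer */
	}

	static void
	example_push(zio_t *zio, uint64_t psize)
	{
		void *cbuf = zio_buf_alloc(psize);

		/* I/O now lands in cbuf; the transform restores the old buffer. */
		zio_push_transform(zio, cbuf, psize, psize, example_untransform);
	}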
@@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
 extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
 #endif
 
+extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
+extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
 extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 extern void zio_checksum_templates_free(spa_t *spa);
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
 */
@@ -253,7 +254,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
 			}
 		}
 
-		VERIFY(arc_buf_remove_ref(abuf, &abuf));
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -290,7 +291,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 	if (error == 0) {
 		if (wbuf != NULL)
 			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -329,7 +329,7 @@ zio_data_buf_free(void *buf, size_t size)
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
-static void
+void
 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
@@ -347,7 +347,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 	zio->io_size = size;
 }
 
-static void
+void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
@@ -1022,8 +1022,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-	void *data, uint64_t size, int type, zio_priority_t priority,
-	enum zio_flag flags, zio_done_func_t *done, void *private)
+    void *data, uint64_t size, int type, zio_priority_t priority,
+    enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
@@ -2356,7 +2356,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 		    bcmp(abuf->b_data, zio->io_orig_data,
 		    zio->io_orig_size) != 0)
 			error = SET_ERROR(EEXIST);
-		VERIFY(arc_buf_remove_ref(abuf, &abuf));
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	ddt_enter(ddt);
@@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 }
 
 int
-zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
+    void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
 {
-	blkptr_t *bp = zio->io_bp;
-	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
-	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
-	int byteswap;
-	int error;
-	uint64_t size = (bp == NULL ? zio->io_size :
-	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
-	uint64_t offset = zio->io_offset;
-	void *data = zio->io_data;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum, verifier;
-	spa_t *spa = zio->io_spa;
+	zio_cksum_t actual_cksum, expected_cksum;
+	int byteswap;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (SET_ERROR(EINVAL));
@@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
+		zio_cksum_t verifier;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = data;
@@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 		eck->zec_cksum = expected_cksum;
 
-		if (byteswap)
+		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
+		}
 	} else {
 		ASSERT(!BP_IS_GANG(bp));
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[byteswap](data, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
-	info->zbc_expected = expected_cksum;
-	info->zbc_actual = actual_cksum;
-	info->zbc_checksum_name = ci->ci_name;
-	info->zbc_byteswapped = byteswap;
-	info->zbc_injected = 0;
-	info->zbc_has_cksum = 1;
+	if (info != NULL) {
+		info->zbc_expected = expected_cksum;
+		info->zbc_actual = actual_cksum;
+		info->zbc_checksum_name = ci->ci_name;
+		info->zbc_byteswapped = byteswap;
+		info->zbc_injected = 0;
+		info->zbc_has_cksum = 1;
+	}
 
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (SET_ERROR(ECKSUM));
 
-	if (zio_injection_enabled && !zio->io_error &&
+	return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+	blkptr_t *bp = zio->io_bp;
+	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+	int error;
+	uint64_t size = (bp == NULL ? zio->io_size :
+	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t offset = zio->io_offset;
+	void *data = zio->io_data;
+	spa_t *spa = zio->io_spa;
+
+	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+	    offset, info);
+	if (error != 0 && zio_injection_enabled && !zio->io_error &&
 	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
 
 		info->zbc_injected = 1;
 		return (error);
 	}
 
-	return (0);
+	return (error);
 }
 
 /*
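The refactor splits policy from mechanism: zio_checksum_error() keeps the zio-specific plumbing (field extraction, fault injection), while zio_checksum_error_impl() verifies any (spa, bp, buffer) triple and tolerates a NULL info. That is what allows a caller with no zio in hand, such as the L2ARC validating a block it just read back, to check a checksum directly. A schematic use; the variables are placeholders:

	zio_bad_cksum_t zbc;

	if (zio_checksum_error_impl(spa, bp, BP_GET_CHECKSUM(bp),
	    data, BP_GET_PSIZE(bp), offset, &zbc) != 0) {
		/* checksum mismatch: treat as a cache miss, reissue the read */
	}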