MFV r302991: 6950 ARC should cache compressed data

illumos/illumos-gate@dcbf3bd6a1

https://www.illumos.org/issues/6950
  When reading compressed data from disk, the ARC should keep the compressed
  block cached and only decompress it when consumers access the block. The
  uncompressed data should be short-lived allowing the ARC to cache a much larger
  amount of data. The DMU would also maintain a smaller cache of uncompressed
  blocks to minimize the impact of decompressing frequently accessed blocks.

Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: George Wilson <george.wilson@delphix.com>
Committed by: Alexander Motin, 2016-09-03 08:30:51 +00:00
Commit: efa0867fb0
22 changed files with 2441 additions and 1976 deletions
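
Aside (not part of the commit): a minimal sketch in C of the idea described above. It reuses names that do appear in this change (b_pdata, HDR_GET_PSIZE()/HDR_GET_LSIZE(), zio_decompress_data()), but the function itself and the HDR_GET_COMPRESS() accessor are hypothetical illustrations, and the field layout is simplified. The point is that the ARC keeps the physical (possibly compressed) copy of the block in the header and only materializes an uncompressed, short-lived buffer when a consumer actually accesses the data.

/*
 * Hedged sketch, not the commit's code: fill a consumer-visible
 * arc_buf_t from the cached physical copy kept in the header.
 */
static int
arc_buf_fill_sketch(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
		/* Block is cached uncompressed; psize == lsize, just copy. */
		bcopy(hdr->b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
		return (0);
	}

	/*
	 * Decompress the cached on-disk bytes into the uncompressed copy
	 * handed to the consumer; the compressed copy stays in the ARC.
	 */
	return (zio_decompress_data(HDR_GET_COMPRESS(hdr),
	    hdr->b_pdata, buf->b_data,
	    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)));
}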

View File

@ -1289,7 +1289,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
}
if (!err)
ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (err);

View File

@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@ -5355,6 +5356,12 @@ ztest_resume_thread(void *arg)
if (spa_suspended(spa))
ztest_resume(spa);
(void) poll(NULL, 0, 100);
/*
* Periodically change the zfs_compressed_arc_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
}
return (NULL);
}

File diff suppressed because it is too large.

View File

@ -45,6 +45,9 @@
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/callb.h>
uint_t zfs_dbuf_evict_key;
/*
* Number of times that zfs_free_range() took the slow path while doing
@ -52,7 +55,6 @@
*/
uint64_t zfs_free_range_recv_miss;
static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@ -64,9 +66,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
/*
* Global data structures and functions for the dbuf cache.
*/
static kmem_cache_t *dbuf_cache;
static kmem_cache_t *dbuf_kmem_cache;
static taskq_t *dbu_evict_taskq;
static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;
/*
* LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
* are not currently held but have been recently released. These dbufs
* are not eligible for arc eviction until they are aged out of the cache.
* Dbufs are added to the dbuf cache once the last hold is released. If a
* dbuf is later accessed and still exists in the dbuf cache, then it will
* be removed from the cache and later re-added to the head of the cache.
* Dbufs that are aged out of the cache will be immediately destroyed and
* become eligible for arc eviction.
*/
static multilist_t dbuf_cache;
static refcount_t dbuf_cache_size;
uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
/* Cap the size of the dbuf cache to log2 fraction of arc size. */
int dbuf_cache_max_shift = 5;
/*
* The dbuf cache uses a three-stage eviction policy:
* - A low water marker designates when the dbuf eviction thread
* should stop evicting from the dbuf cache.
* - When we reach the maximum size (aka mid water mark), we
* signal the eviction thread to run.
* - The high water mark indicates when the eviction thread
* is unable to keep up with the incoming load and eviction must
* happen in the context of the calling thread.
*
* The dbuf cache:
* (max size)
* low water mid water hi water
* +----------------------------------------+----------+----------+
* | | | |
* | | | |
* | | | |
* | | | |
* +----------------------------------------+----------+----------+
* stop signal evict
* evicting eviction directly
* thread
*
* The high and low water marks indicate the operating range for the eviction
* thread. The low water mark is, by default, 90% of the total size of the
* cache and the high water mark is at 110% (both of these percentages can be
* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
* respectively). The eviction thread will try to ensure that the cache remains
* within this range by waking up every second and checking if the cache is
* above the low water mark. The thread can also be woken up by callers adding
* elements into the cache if the cache is larger than the mid water (i.e. max
* cache size). Once the eviction thread is woken up and eviction is required,
* it will continue evicting buffers until it's able to reduce the cache size
* to the low water mark. If the cache size continues to grow and hits the high
* water mark, then callers adding elements to the cache will begin to evict
* directly from the cache until the cache is no longer above the high water
* mark.
*/
/*
* The percentage above and below the maximum cache size.
*/
uint_t dbuf_cache_hiwater_pct = 10;
uint_t dbuf_cache_lowater_pct = 10;
/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
@ -76,6 +145,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
multilist_link_init(&db->db_cache_link);
refcount_create(&db->db_holds);
return (0);
@ -88,6 +158,7 @@ dbuf_dest(void *vdb, void *unused)
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
cv_destroy(&db->db_changed);
ASSERT(!multilist_link_active(&db->db_cache_link));
refcount_destroy(&db->db_holds);
}
@ -117,8 +188,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
return (crc);
}
#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
((dbuf)->db.db_object == (obj) && \
(dbuf)->db_objset == (os) && \
@ -129,7 +198,7 @@ dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv = DBUF_HASH(os, obj, level, blkid);
uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *db;
@ -180,7 +249,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
uint64_t obj = db->db.db_object;
int level = db->db_level;
uint64_t blkid = db->db_blkid;
uint64_t hv = DBUF_HASH(os, obj, level, blkid);
uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf;
@ -212,7 +281,7 @@ static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf, **dbp;
@ -237,8 +306,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
atomic_dec_64(&dbuf_hash_count);
}
static arc_evict_func_t dbuf_do_evict;
typedef enum {
DBVU_EVICTING,
DBVU_NOT_EVICTING
@ -323,15 +390,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
}
}
void
dbuf_evict(dmu_buf_impl_t *db)
/*
* This function *must* return indices evenly distributed between all
* sublists of the multilist. This is needed due to how the dbuf eviction
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
* distributed between all sublists and uses this assumption when
* deciding which sublist to evict from and how much to evict from it.
*/
unsigned int
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_buf == NULL);
ASSERT(db->db_data_pending == NULL);
dmu_buf_impl_t *db = obj;
dbuf_clear(db);
dbuf_destroy(db);
/*
* The assumption here is that the hash value for a given
* dmu_buf_impl_t will remain constant throughout its lifetime
* (i.e. its objset, object, level and blkid fields don't change).
* Thus, we don't need to store the dbuf's sublist index
* on insertion, as this index can be recalculated on removal.
*
* Also, the low order bits of the hash value are thought to be
* distributed evenly. Otherwise, in the case that the multilist
* has a power of two number of sublists, each sublist's usage
* would not be evenly distributed.
*/
return (dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid) %
multilist_get_num_sublists(ml));
}
static inline boolean_t
dbuf_cache_above_hiwater(void)
{
uint64_t dbuf_cache_hiwater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
}
static inline boolean_t
dbuf_cache_above_lowater(void)
{
uint64_t dbuf_cache_lowater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
}
/*
* Evict the oldest eligible dbuf from the dbuf cache.
*/
static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_cache);
multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
/*
* Set the thread's tsd to indicate that it's processing evictions.
* Once a thread stops evicting from the dbuf cache it will
* reset its tsd to NULL.
*/
ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
dbuf_destroy(db);
} else {
multilist_sublist_unlock(mls);
}
(void) tsd_set(zfs_dbuf_evict_key, NULL);
}
/*
* The dbuf evict thread is responsible for aging out dbufs from the
* cache. Once the cache has reached its maximum size, dbufs are removed
* and destroyed. The eviction thread will continue running until the size
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
* out of the cache it is destroyed and becomes eligible for arc eviction.
*/
static void
dbuf_evict_thread(void *dummy __unused)
{
callb_cpr_t cpr;
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
mutex_enter(&dbuf_evict_lock);
while (!dbuf_evict_thread_exit) {
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait_hires(&dbuf_evict_cv,
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
}
mutex_exit(&dbuf_evict_lock);
/*
* Keep evicting as long as we're above the low water mark
* for the cache. We do this without holding the locks to
* minimize lock contention.
*/
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
dbuf_evict_one();
}
mutex_enter(&dbuf_evict_lock);
}
dbuf_evict_thread_exit = B_FALSE;
cv_broadcast(&dbuf_evict_cv);
CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
thread_exit();
}
/*
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
* If the dbuf cache is at its high water mark, then evict a dbuf from the
* dbuf cache using the caller's context.
*/
static void
dbuf_evict_notify(void)
{
/*
* We use thread specific data to track when a thread has
* started processing evictions. This allows us to avoid deeply
* nested stacks that would have a call flow similar to this:
*
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
*
* The dbuf_eviction_thread will always have its tsd set until
* that thread exits. All other threads will only set their tsd
* if they are participating in the eviction process. This only
* happens if the eviction thread is unable to process evictions
* fast enough. To keep the dbuf cache size in check, other threads
* can evict from the dbuf cache directly. Those threads will set
* their tsd values so that we ensure that they only evict one dbuf
* from the dbuf cache.
*/
if (tsd_get(zfs_dbuf_evict_key) != NULL)
return;
if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
boolean_t evict_now = B_FALSE;
mutex_enter(&dbuf_evict_lock);
if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
evict_now = dbuf_cache_above_hiwater();
cv_signal(&dbuf_evict_cv);
}
mutex_exit(&dbuf_evict_lock);
if (evict_now) {
dbuf_evict_one();
}
}
}
void
@ -359,18 +592,38 @@ dbuf_init(void)
goto retry;
}
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
/*
* Setup the parameters for the dbuf cache. We cap the size of the
* dbuf cache to 1/32nd (default) of the size of the ARC.
*/
dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
arc_max_bytes() >> dbuf_cache_max_shift);
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
* configuration is not required.
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_cache_link),
zfs_arc_num_sublists_per_state,
dbuf_cache_multilist_index_func);
refcount_create(&dbuf_cache_size);
tsd_create(&zfs_dbuf_evict_key, NULL);
dbuf_evict_thread_exit = B_FALSE;
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
@ -382,8 +635,23 @@ dbuf_fini(void)
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
kmem_cache_destroy(dbuf_cache);
kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);
mutex_enter(&dbuf_evict_lock);
dbuf_evict_thread_exit = B_TRUE;
while (dbuf_evict_thread_exit) {
cv_signal(&dbuf_evict_cv);
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
}
mutex_exit(&dbuf_evict_lock);
tsd_destroy(&zfs_dbuf_evict_key);
mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv);
refcount_destroy(&dbuf_cache_size);
multilist_destroy(&dbuf_cache);
}
/*
@ -541,7 +809,7 @@ dbuf_clear_data(dmu_buf_impl_t *db)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
dbuf_evict_user(db);
db->db_buf = NULL;
ASSERT3P(db->db_buf, ==, NULL);
db->db.db_data = NULL;
if (db->db_state != DB_NOFILL)
db->db_state = DB_UNCACHED;
@ -556,8 +824,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
db->db_buf = buf;
ASSERT(buf->b_data != NULL);
db->db.db_data = buf->b_data;
if (!arc_released(buf))
arc_set_callback(buf, dbuf_do_evict, db);
}
/*
@ -568,6 +834,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
arc_buf_t *abuf;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
@ -579,6 +846,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
} else {
abuf = db->db_buf;
arc_loan_inuse_buf(abuf, db);
db->db_buf = NULL;
dbuf_clear_data(db);
mutex_exit(&db->db_mtx);
}
@ -647,7 +915,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
VERIFY(arc_buf_remove_ref(buf, db));
arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
@ -696,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
@ -733,8 +1001,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_FLAG_L2CACHE;
if (DBUF_IS_L2COMPRESSIBLE(db))
aflags |= ARC_FLAG_L2COMPRESS;
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@ -851,7 +1117,7 @@ dbuf_noread(dmu_buf_impl_t *db)
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_clear_data(db);
@ -907,9 +1173,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
db->db_buf = NULL;
dbuf_clear_data(db);
}
}
@ -1033,7 +1300,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
}
if (refcount_count(&db->db_holds) == 0) {
ASSERT(db->db_buf);
dbuf_clear(db);
dbuf_destroy(db);
continue;
}
/* The dbuf is referenced */
@ -1138,7 +1405,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@ -1149,7 +1416,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dbuf_set_data(db, buf);
VERIFY(arc_buf_remove_ref(obuf, db));
arc_buf_destroy(obuf, db);
db->db.db_size = size;
if (db->db_level == 0) {
@ -1547,7 +1814,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_buf != NULL);
ASSERT(dr->dt.dl.dr_data != NULL);
if (dr->dt.dl.dr_data != db->db_buf)
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@ -1556,12 +1823,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
arc_buf_t *buf = db->db_buf;
ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
dbuf_clear_data(db);
VERIFY(arc_buf_remove_ref(buf, db));
dbuf_evict(db);
ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
@ -1725,7 +1988,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db));
arc_buf_destroy(buf, db);
xuio_stat_wbuf_copied();
return;
}
@ -1743,10 +2006,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
arc_release(db->db_buf, db);
}
dr->dt.dl.dr_data = buf;
VERIFY(arc_buf_remove_ref(db->db_buf, db));
arc_buf_destroy(db->db_buf, db);
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
arc_release(db->db_buf, db);
VERIFY(arc_buf_remove_ref(db->db_buf, db));
arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
}
@ -1758,59 +2021,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
dmu_buf_fill_done(&db->db, tx);
}
/*
* "Clear" the contents of this dbuf. This will mark the dbuf
* EVICTING and clear *most* of its references. Unfortunately,
* when we are not holding the dn_dbufs_mtx, we can't clear the
* entry in the dn_dbufs list. We have to wait until dbuf_destroy()
* in this case. For callers from the DMU we will usually see:
* dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
* For the arc callback, we will usually see:
* dbuf_do_evict()->dbuf_clear();dbuf_destroy()
* Sometimes, though, we will get a mix of these two:
* DMU: dbuf_clear()->arc_clear_callback()
* ARC: dbuf_do_evict()->dbuf_destroy()
*
* This routine will dissociate the dbuf from the arc, by calling
* arc_clear_callback(), but will not evict the data from the ARC.
*/
void
dbuf_clear(dmu_buf_impl_t *db)
dbuf_destroy(dmu_buf_impl_t *db)
{
dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
dmu_buf_impl_t *dndb;
boolean_t dbuf_gone = B_FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(refcount_is_zero(&db->db_holds));
dbuf_evict_user(db);
if (db->db_buf != NULL) {
arc_buf_destroy(db->db_buf, db);
db->db_buf = NULL;
}
if (db->db_state == DB_CACHED) {
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
db->db.db_data = NULL;
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
db->db_state = DB_UNCACHED;
}
dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) {
multilist_remove(&dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
/*
* Now that db_state is DB_EVICTING, nobody else can find this via
* the hash table. We can now drop db_mtx, which allows us to
* acquire the dn_dbufs_mtx.
*/
mutex_exit(&db->db_mtx);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dndb = dn->dn_dbuf;
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
if (db->db_blkid != DMU_BONUS_BLKID) {
boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
if (needlock)
mutex_enter(&dn->dn_dbufs_mtx);
avl_remove(&dn->dn_dbufs, db);
atomic_dec_32(&dn->dn_dbufs_count);
membar_producer();
DB_DNODE_EXIT(db);
if (needlock)
mutex_exit(&dn->dn_dbufs_mtx);
/*
* Decrementing the dbuf count means that the hold corresponding
* to the removed dbuf is no longer discounted in dnode_move(),
@ -1821,15 +2087,25 @@ dbuf_clear(dmu_buf_impl_t *db)
*/
dnode_rele(dn, db);
db->db_dnode_handle = NULL;
dbuf_hash_remove(db);
} else {
DB_DNODE_EXIT(db);
}
if (db->db_buf)
dbuf_gone = arc_clear_callback(db->db_buf);
ASSERT(refcount_is_zero(&db->db_holds));
if (!dbuf_gone)
mutex_exit(&db->db_mtx);
db->db_parent = NULL;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
ASSERT(!multilist_link_active(&db->db_cache_link));
kmem_cache_free(dbuf_kmem_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
/*
* If this dbuf is referenced from an indirect dbuf,
@ -1922,7 +2198,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_type != DMU_OT_NONE);
db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
db->db_objset = os;
db->db.db_object = dn->dn_object;
@ -1971,7 +2247,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_state = DB_EVICTING;
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
kmem_cache_free(dbuf_cache, db);
kmem_cache_free(dbuf_kmem_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
@ -1996,76 +2272,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
return (db);
}
static int
dbuf_do_evict(void *private)
{
dmu_buf_impl_t *db = private;
if (!MUTEX_HELD(&db->db_mtx))
mutex_enter(&db->db_mtx);
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_state != DB_EVICTING) {
ASSERT(db->db_state == DB_CACHED);
DBUF_VERIFY(db);
db->db_buf = NULL;
dbuf_evict(db);
} else {
mutex_exit(&db->db_mtx);
dbuf_destroy(db);
}
return (0);
}
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
if (db->db_dnode_handle != NULL) {
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
avl_remove(&dn->dn_dbufs, db);
atomic_dec_32(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
DB_DNODE_EXIT(db);
/*
* Decrementing the dbuf count means that the hold
* corresponding to the removed dbuf is no longer
* discounted in dnode_move(), so the dnode cannot be
* moved until after we release the hold.
*/
dnode_rele(dn, db);
db->db_dnode_handle = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
db->db_buf = NULL;
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
typedef struct dbuf_prefetch_arg {
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
int dpa_curlevel; /* The current level that we're reading */
dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
@ -2103,10 +2315,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
/*
* The dpa_dnode is only valid if we are called with a NULL
* zio. This indicates that the arc_read() returned without
* first calling zio_read() to issue a physical read. Once
* a physical read is made the dpa_dnode must be invalidated
* as the locks guarding it may have been dropped. If the
* dpa_dnode is still valid, then we want to add it to the dbuf
* cache. To do so, we must hold the dbuf associated with the block
* we just prefetched, read its contents so that we associate it
* with an arc_buf_t, and then release it.
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
if (zio->io_flags & ZIO_FLAG_RAW) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
}
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
dpa->dpa_dnode = NULL;
} else if (dpa->dpa_dnode != NULL) {
uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel -
dpa->dpa_zb.zb_level));
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
}
dpa->dpa_curlevel--;
@ -2135,7 +2374,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
(void) arc_buf_remove_ref(abuf, private);
arc_buf_destroy(abuf, private);
}
/*
@ -2229,6 +2469,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dpa->dpa_prio = prio;
dpa->dpa_aflags = aflags;
dpa->dpa_spa = dn->dn_objset->os_spa;
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
@ -2309,18 +2550,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
return (SET_ERROR(ENOENT));
}
if (db->db_buf && refcount_is_zero(&db->db_holds)) {
arc_buf_add_ref(db->db_buf, db);
if (db->db_buf->b_data == NULL) {
dbuf_clear(db);
if (parent) {
dbuf_rele(parent, NULL);
parent = NULL;
}
goto top;
}
if (db->db_buf != NULL)
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
}
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
@ -2338,13 +2569,19 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
arc_buf_alloc(dn->dn_objset->os_spa,
arc_alloc_buf(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
}
}
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(refcount_is_zero(&db->db_holds));
multilist_remove(&dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
(void) refcount_add(&db->db_holds, tag);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
@ -2418,7 +2655,7 @@ void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
int64_t holds = refcount_add(&db->db_holds, tag);
ASSERT(holds > 1);
ASSERT3S(holds, >, 1);
}
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
@ -2489,8 +2726,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
* We can't freeze indirects if there is a possibility that they
* may be modified in the current syncing context.
*/
if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
if (db->db_buf != NULL &&
holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
arc_buf_freeze(db->db_buf);
}
if (holds == db->db_dirtycnt &&
db->db_level == 0 && db->db_user_immediate_evict)
@ -2535,55 +2774,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
*/
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
dbuf_evict(db);
dbuf_destroy(db);
} else if (arc_released(db->db_buf)) {
arc_buf_t *buf = db->db_buf;
/*
* This dbuf has anonymous data associated with it.
*/
dbuf_clear_data(db);
VERIFY(arc_buf_remove_ref(buf, db));
dbuf_evict(db);
dbuf_destroy(db);
} else {
VERIFY(!arc_buf_remove_ref(db->db_buf, db));
boolean_t do_arc_evict = B_FALSE;
blkptr_t bp;
spa_t *spa = dmu_objset_spa(db->db_objset);
/*
* A dbuf will be eligible for eviction if either the
* 'primarycache' property is set or a duplicate
* copy of this buffer is already cached in the arc.
*
* In the case of the 'primarycache' a buffer
* is considered for eviction if it matches the
* criteria set in the property.
*
* To decide if our buffer is considered a
* duplicate, we must call into the arc to determine
* if multiple buffers are referencing the same
* block on-disk. If so, then we simply evict
* ourselves.
*/
if (!DBUF_IS_CACHEABLE(db)) {
if (db->db_blkptr != NULL &&
!BP_IS_HOLE(db->db_blkptr) &&
!BP_IS_EMBEDDED(db->db_blkptr)) {
spa_t *spa =
dmu_objset_spa(db->db_objset);
blkptr_t bp = *db->db_blkptr;
dbuf_clear(db);
arc_freed(spa, &bp);
} else {
dbuf_clear(db);
}
} else if (db->db_pending_evict ||
arc_buf_eviction_needed(db->db_buf)) {
dbuf_clear(db);
} else {
mutex_exit(&db->db_mtx);
if (!DBUF_IS_CACHEABLE(db) &&
db->db_blkptr != NULL &&
!BP_IS_HOLE(db->db_blkptr) &&
!BP_IS_EMBEDDED(db->db_blkptr)) {
do_arc_evict = B_TRUE;
bp = *db->db_blkptr;
}
if (!DBUF_IS_CACHEABLE(db) ||
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
multilist_insert(&dbuf_cache, db);
(void) refcount_add_many(&dbuf_cache_size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
dbuf_evict_notify();
}
if (do_arc_evict)
arc_freed(spa, &bp);
}
} else {
mutex_exit(&db->db_mtx);
}
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@ -2871,7 +3099,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
int blksz = arc_buf_size(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
*datap = arc_alloc_buf(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
db->db_data_pending = dr;
@ -3137,10 +3365,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
if (dr->dt.dl.dr_data != db->db_buf)
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
db));
else if (!arc_released(db->db_buf))
arc_set_callback(db->db_buf, dbuf_do_evict, db);
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
dnode_t *dn;
@ -3156,8 +3381,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
if (!arc_released(db->db_buf))
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
@ -3334,8 +3557,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(zio, os->os_spa, txg,
&dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
children_ready_cb,
&zp, dbuf_write_ready, children_ready_cb,
dbuf_write_physdone, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}

View File

@ -1407,7 +1407,7 @@ void
dmu_return_arcbuf(arc_buf_t *buf)
{
arc_return_buf(buf, FTAG);
VERIFY(arc_buf_remove_ref(buf, FTAG));
arc_buf_destroy(buf, FTAG);
}
/*
@ -1763,8 +1763,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
NULL, NULL, dmu_sync_done, dsa,
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
@ -2137,11 +2136,11 @@ dmu_init(void)
xuio_stat_init();
dmu_objset_init();
dnode_init();
dbuf_init();
zfetch_init();
zio_compress_init();
l2arc_init();
arc_init();
dbuf_init();
}
void

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (err)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
if (err)
return (err);
/* Don't care about the data blocks */

View File

@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
if (DMU_OS_IS_L2COMPRESSIBLE(os))
aflags |= ARC_FLAG_L2COMPRESS;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_buf_alloc(spa,
arc_buf_t *buf = arc_alloc_buf(spa,
sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
(void) arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf);
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
}
@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
os->os_phys_buf = arc_buf_alloc(spa, size,
os->os_phys_buf = arc_alloc_buf(spa, size,
&os->os_phys_buf, ARC_BUFC_METADATA);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
return (err);
}
@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os)
}
zil_free(os->os_zil);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
/*
* This is a barrier to prevent the objset from going away in
@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zio = arc_write(pio, os->os_spa, tx->tx_txg,
os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
DMU_OS_IS_L2COMPRESSIBLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

View File

@ -634,7 +634,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
if (err != 0)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (type == DMU_OT_SA) {
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
@ -646,7 +646,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
@ -670,7 +670,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
&aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
abuf = arc_buf_alloc(spa, blksz, &abuf,
abuf = arc_alloc_buf(spa, blksz, &abuf,
ARC_BUFC_DATA);
uint64_t *ptr;
for (ptr = abuf->b_data;
@ -700,7 +700,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
err = dump_write(dsa, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
ASSERT(err == 0 || err == EINTR);

View File

@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
post:
if (err == 0 && (td->td_flags & TRAVERSE_POST))
@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
osp = buf->b_data;
traverse_zil(&td, &osp->os_zil_header);
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
if (!(flags & TRAVERSE_PREFETCH_DATA) ||

View File

@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn)
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;

View File

@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn)
avl_insert_here(&dn->dn_dbufs, &db_marker, db,
AVL_BEFORE);
dbuf_clear(db);
dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
avl_remove(&dn->dn_dbufs, &db_marker);
@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn)
if (dn->dn_bonus != NULL) {
if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
} else {
dn->dn_bonus->db_pending_evict = TRUE;

View File

@ -679,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
dsl_scan_visitbp(cbp, &czb, dnp,
ds, scn, ostype, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
@ -705,7 +705,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
@ -737,7 +737,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
&osp->os_userused_dnode,
DMU_USERUSED_OBJECT, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (0);

View File

@ -231,4 +231,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src)
list_destroy(&removed);
}
void
refcount_transfer_ownership(refcount_t *rc, void *current_holder,
void *new_holder)
{
reference_t *ref;
boolean_t found = B_FALSE;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return;
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == current_holder) {
ref->ref_holder = new_holder;
found = B_TRUE;
break;
}
}
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}
#endif /* ZFS_DEBUG */
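
Aside (not part of the diff): a hedged usage sketch of the new refcount_transfer_ownership() helper; the function and tag names below are illustrative, not taken from the commit. The helper re-tags an existing tracked reference so that a hold taken under one owner tag can later be released under another, without changing the count:

/* Illustrative only: re-tag a tracked hold when an object changes owner. */
static void
refcount_retag_example(refcount_t *rc, void *old_owner, void *new_owner)
{
	(void) refcount_add(rc, old_owner);	/* hold tagged with old_owner */
	/* ... the object is handed off to new_owner ... */
	refcount_transfer_ownership(rc, old_owner, new_owner);
	(void) refcount_remove(rc, new_owner);	/* release under the new tag */
}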

View File

@ -43,51 +43,83 @@ extern "C" {
*/
#define ARC_EVICT_ALL -1ULL
#define HDR_SET_LSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_SET_PSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
typedef int arc_evict_func_t(void *priv);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
extern int zfs_arc_num_sublists_per_state;
typedef enum arc_flags
{
/*
* Public flags that can be passed into the ARC by external consumers.
*/
ARC_FLAG_NONE = 1 << 0, /* No flags set */
ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */
ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_buf_hdr_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */
ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 14,
ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 18,
ARC_FLAG_BUFC_METADATA = 1 << 14,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 19,
ARC_FLAG_HAS_L2HDR = 1 << 20,
ARC_FLAG_HAS_L1HDR = 1 << 15,
ARC_FLAG_HAS_L2HDR = 1 << 16,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
ARC_FLAG_COMPRESSED_ARC = 1 << 17,
ARC_FLAG_SHARED_DATA = 1 << 18,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30
} arc_flags_t;
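
Aside (not part of the diff): the ARC_FLAG_COMPRESS_* placeholders above describe a 7-bit compression field packed into b_flags starting at bit 24. Below is a hedged sketch of accessors for that field, built from the BF32_GET()/BF32_SET() bitfield macros and SPA_COMPRESSBITS in spa.h; the macro names and offset are assumptions inferred from the dummy flags, not necessarily the commit's exact definitions.

#define	HDR_COMPRESS_OFFSET	24
#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define	HDR_SET_COMPRESS(hdr, cmp)	BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))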
struct arc_buf {
@ -95,11 +127,10 @@ struct arc_buf {
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
};
typedef enum arc_buf_contents {
ARC_BUFC_INVALID, /* invalid type */
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
@ -119,19 +150,17 @@ typedef enum arc_space_type {
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf);
boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
arc_done_func_t *ready, arc_done_func_t *child_ready,
arc_done_func_t *physdone, arc_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
boolean_t arc_clear_callback(arc_buf_t *buf);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
uint64_t arc_max_bytes(void);
void arc_init(void);
void arc_fini(void);

View File

@ -36,6 +36,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@ -228,6 +229,11 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;
/*
* Link in dbuf_cache.
*/
multilist_node_t db_cache_link;
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_destroy(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#define DBUF_IS_L2COMPRESSIBLE(_db) \
((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \
(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
#ifdef ZFS_DEBUG
/*

View File

@ -73,6 +73,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *, void *, void *);
void refcount_sysinit(void);
void refcount_fini(void);
@ -100,6 +101,7 @@ typedef struct refcount {
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define refcount_transfer_ownership(rc, current_holder, new_holder)
#define refcount_sysinit()
#define refcount_fini()

View File

@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0)
#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
#define SPA_COMPRESSBITS 7
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
* The members of the dva_t should be considered opaque outside the SPA.
@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0)
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_COMPRESS(bp) \
BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
#define BP_SET_COMPRESS(bp, x) \
BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)

View File

@ -564,6 +564,10 @@ extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,

View File

@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
#endif
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -253,7 +254,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
}
}
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
return (error);
@ -290,7 +291,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (error == 0) {
if (wbuf != NULL)
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
return (error);

View File

@ -329,7 +329,7 @@ zio_data_buf_free(void *buf, size_t size)
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
@ -347,7 +347,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio->io_size = size;
}
static void
void
zio_pop_transforms(zio_t *zio)
{
zio_transform_t *zt;
@ -1022,8 +1022,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@ -2356,7 +2356,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
bcmp(abuf->b_data, zio->io_orig_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
ddt_enter(ddt);

View File

@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int byteswap;
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
spa_t *spa = zio->io_spa;
zio_cksum_t actual_cksum, expected_cksum;
int byteswap;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (SET_ERROR(EINVAL));
@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
if (byteswap)
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
}
} else {
ASSERT(!BP_IS_GANG(bp));
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
ci->ci_func[byteswap](data, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
}
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (SET_ERROR(ECKSUM));
if (zio_injection_enabled && !zio->io_error &&
return (0);
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
if (error != 0 && zio_injection_enabled && !zio->io_error &&
(error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
info->zbc_injected = 1;
return (error);
}
return (0);
return (error);
}
/*