diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 8414ad6df74b..1ae92c01ddbd 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -1289,7 +1289,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (err); diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index f655c3d3c9c6..488e1fde9f08 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5355,6 +5356,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } return (NULL); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index c889e92276e7..1e6b6b685fa0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -120,9 +120,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. 
If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. + * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. 
The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. 
+ * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled, the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. 
+ */ + #include #include +#include #include +#include #include #include #include @@ -155,10 +280,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - static kmutex_t arc_dnlc_evicts_lock; static kcondvar_t arc_dnlc_evicts_cv; static boolean_t arc_dnlc_evicts_thread_exit; @@ -234,13 +355,14 @@ uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. */ static uint64_t arc_abs_min = 16 << 20; +boolean_t zfs_compressed_arc_enabled = B_TRUE; + static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); @@ -270,6 +392,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, + &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); /* * We don't have a tunable for arc_free_target due to the dependency on @@ -351,7 +475,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. @@ -416,6 +540,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. 
+ * Note that the compressed bytes may match the uncompressed bytes + * if either the block is not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled, then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually @@ -561,16 +705,12 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_l2_padding_needed; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; @@ -585,9 +725,6 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; 
kstat_named_t arcstat_meta_max; @@ -630,6 +767,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -663,16 +803,12 @@ static arc_stats_t arc_stats = { { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "l2_padding_needed", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, @@ -687,9 +823,6 @@ static arc_stats_t arc_stats = { { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, @@ -762,8 +895,12 @@ static arc_state_t *arc_l2c_only; #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define 
arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; @@ -823,6 +960,7 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing @@ -833,9 +971,10 @@ typedef struct l1arc_buf_hdr { #endif arc_buf_t *b_buf; - uint32_t b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; @@ -848,8 +987,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed or padded data */ - void *b_tmp_cdata; + void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; @@ -858,9 +996,6 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -869,20 +1004,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. 
- */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. 
*/ l2arc_buf_hdr_t b_l2hdr; @@ -986,9 +1138,6 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) } #endif -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -997,25 +1146,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) 
BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -1068,16 +1227,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). - */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -1112,41 +1261,47 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, - &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, - &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, - &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, - &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data 
in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, - &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, - &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, 
mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, @@ -1179,12 +1334,10 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ void *l2rcb_data; /* temporary buffer */ } l2arc_read_callback_t; @@ -1197,7 +1350,7 @@ typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -1205,21 +1358,22 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, 
arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static void l2arc_trim(const arc_buf_hdr_t *hdr) { @@ -1228,13 +1382,9 @@ l2arc_trim(const arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) - return; - if (hdr->b_l2hdr.b_asize != 0) { + if (HDR_GET_PSIZE(hdr) != 0) { trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, - hdr->b_l2hdr.b_asize, 0); - } else { - ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY); + HDR_GET_PSIZE(hdr), 0); } } @@ -1255,14 +1405,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -1284,7 +1434,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -1322,13 +1472,13 @@ 
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -1356,12 +1506,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1447,7 +1597,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1461,7 +1611,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1534,166 +1684,138 @@ buf_init(void) } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. 
- */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) { - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; - } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. 
- */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - 
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - return (equal); + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; + + void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. 
+ */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. + */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - NULL, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + hdr->b_l1hdr.b_freeze_cksum); + 
mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif @@ -1735,7 +1857,7 @@ arc_buf_watch(arc_buf_t *buf) procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = buf->b_hdr->b_size; + ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); @@ -1747,11 +1869,14 @@ arc_buf_watch(arc_buf_t *buf) static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1773,29 +1898,29 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_thawed != NULL) - kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); - buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); + if (hdr->b_l1hdr.b_thawed != NULL) + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); 
@@ -1805,53 +1930,246 @@ arc_buf_thaw(arc_buf_t *buf) void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in a + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 
+ * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. + */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. 
+ */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. 
+ */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_remove(list, hdr); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { @@ -1868,15 +2186,9 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_insert(list, hdr); - - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } @@ -1891,8 +2203,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, { arc_state_t *old_state; int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; + uint32_t bufcnt; + boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* @@ -1905,20 +2217,20 @@ arc_change_state(arc_state_t *new_state, 
arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1926,25 +2238,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. 
- */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; /* * An L1 header always exists here, since if we're @@ -1955,38 +2259,38 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
*/ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1995,34 +2299,53 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. 
*/ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3P(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2031,9 +2354,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3P(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * remove from the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -2111,39 +2454,85 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr; subsequent buffers will + * use arc_buf_clone(). 
+ */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing its + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. 
+ */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} +/* + * Used when allocating additional buffers. + */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -2160,7 +2549,7 @@ arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -2174,12 +2563,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - 
atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -2188,179 +2577,106 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - atomic_add_64(&arc_loaned_bytes, hdr->b_size); -} - -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - - /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); - } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); -} - -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. 
- */ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); - return; - } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - l2arc_data_free_t *df; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; - df->l2df_func = free_func; + df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); + } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); +} + /* - * Free the arc data buffer. 
If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_hdr_t *hdr = buf->b_hdr; + arc_state_t *state = hdr->b_l1hdr.b_state; - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); - } + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. + */ + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. 
+ */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - size_t align, asize, len; + arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. */ - if (!HDR_HAS_L1HDR(hdr)) - return; + refcount_transfer_ownership(&state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - - /* - * The bufer has been chosen for writing to L2ARC, but it's - * not being written just yet. In other words, - * b_tmp_cdata points to exactly the same buffer as b_data, - * l2arc_transform_buf hasn't been called. - */ - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, - hdr->b_l1hdr.b_buf->b_data); - ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF); - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; - } - - /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. 
- */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - - /* - * Nothing to do if the temporary buffer was not required. - */ - if (hdr->b_l1hdr.b_tmp_cdata == NULL) - return; - - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - len = hdr->b_size; - align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize, - zio_data_buf_free); - hdr->b_l1hdr.b_tmp_cdata = NULL; + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* @@ -2368,56 +2684,43 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); - /* free up data associated with the buf */ + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); } - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; - - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } - - (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; - /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. 
- */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ @@ -2425,68 +2728,278 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; - buf->b_next = NULL; + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; - ASSERT(buf->b_efunc == NULL); + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } +static void +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. 
+ */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. 
+ */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. + */ + + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); + + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. 
+ */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); +} + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); - /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() - */ - arc_buf_l2_cdata_free(hdr); + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. - * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). - */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); - /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. Thus, we must be careful - * not to decrement them for this header either. 
- */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2494,13 +3007,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2526,40 +3042,22 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); - - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; + arc_cksum_free(hdr); + + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } - } #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { 
kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); + } } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -2573,133 +3071,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. 
- */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } int32_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. 
Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. - */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2726,11 +3126,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. it's b_tmp_cdata field) during it's write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. @@ -2741,11 +3141,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. 
@@ -2758,6 +3160,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2777,7 +3180,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2785,37 +3187,39 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. 
+ * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). + */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -3059,12 +3463,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. 
if the ARC isn't shutting down), this function might @@ -3076,7 +3480,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -3100,8 +3504,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -3364,36 +3768,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. 
*/ @@ -3413,9 +3794,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3620,7 +3998,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t @@ -3713,6 +4091,20 @@ arc_reclaim_thread(void *dummy __unused) int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. 
+ */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3784,57 +4176,12 @@ arc_reclaim_thread(void *dummy __unused) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } -static void -arc_user_evicts_thread(void *dummy __unused) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_user_evicts_cv, - &arc_user_evicts_lock, hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - thread_exit(); -} - static u_int arc_dnlc_evicts_arg; extern struct vfsops zfs_vfsops; @@ -3970,18 +4317,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. 
If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. */ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -4021,12 +4367,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -4034,11 +4381,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
*/ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -4051,9 +4396,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -4064,6 +4410,37 @@ arc_get_data_buf(arc_buf_t *buf) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); + return (datap); +} + +/* + * Free the arc data buffer. + */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -4107,7 +4484,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } 
hdr->b_l1hdr.b_arc_access = now; @@ -4141,7 +4518,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4210,8 +4587,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -4220,7 +4597,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -4228,18 +4605,30 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } } +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -4259,33 +4648,32 @@ arc_read_done(zio_t *zio) arc_buf_hdr_t *found 
= buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } + } + + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot_byteswap[bswap].ob_func; - func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); -#ifdef illumos - arc_buf_watch(buf); -#endif + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -4299,31 +4687,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. 
This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. + */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -4393,7 +4800,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4410,8 +4816,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= 
ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4443,7 +4849,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4464,10 +4871,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } @@ -4490,34 +4896,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4527,20 +4935,21 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4550,26 +4959,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, /* somebody beat us to the hash insert */ mutex_exit(hash_lock); 
buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4580,54 +4972,60 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). 
*/ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + + /* + * If compression is enabled on the hdr, then we will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; } + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal.
*/ @@ -4636,6 +5034,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4643,9 +5046,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), @@ -4662,11 +5066,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, curthread->td_ru.ru_inblock++; #endif - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4688,23 +5087,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; - if (b_asize > hdr->b_size) { - ASSERT3U(b_compress, ==, - ZIO_COMPRESS_OFF); - b_data = zio_data_buf_alloc(b_asize); + uint64_t asize = vdev_psize_to_asize(vd, size); + if (asize != size) { + b_data = zio_data_buf_alloc(asize); cb->l2rcb_data = b_data; } else { - b_data = buf->b_data; + b_data = hdr->b_l1hdr.b_pdata; } ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + asize < vd->vdev_psize - 
VDEV_LABEL_END_SIZE); /* @@ -4713,27 +5109,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ - if (b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3U(b_asize, ==, 0); - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + asize, b_data, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4763,8 +5151,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -4775,20 +5163,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, return (0); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = 
func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. */ @@ -4804,87 +5178,40 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); + mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. 
- */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); - } - - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); -} - /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. @@ -4916,15 +5243,18 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4967,48 +5297,87 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? 
*/ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); + arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; - buf->b_next = NULL; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. 
+ */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); - uint64_t *size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); - } - hdr->b_l1hdr.b_datacnt -= 1; + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); @@ -5016,25 +5385,25 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_datacnt = 1; - nhdr->b_l1hdr.b_state = arc_anon; - nhdr->b_l1hdr.b_arc_access = 0; - nhdr->b_l1hdr.b_tmp_cdata = NULL; - nhdr->b_freeze_cksum = NULL; - + nhdr->b_l1hdr.b_bufcnt = 1; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); - (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -5048,8 +5417,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; } int @@ -5083,28 +5450,85 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. - * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. + * If we're reexecuting this zio because the pool suspended, then + * clean up any state that was previously set the first time the + * callback was invoked.
*/ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + enum zio_compress compress; + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -5135,9 +5559,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -5145,7 +5571,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -5154,7 +5580,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). 
*/ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -5188,19 +5614,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5210,9 +5636,8 @@ arc_write_done(zio_t *zio) } zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, arc_done_func_t *ready, +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) @@ -5221,16 +5646,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), 
KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5239,7 +5662,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? 
arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -5336,12 +5782,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); @@ -5353,8 +5801,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -5407,7 +5857,7 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. 
*/ - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given @@ -5449,6 +5899,117 @@ arc_lowmem(void *arg __unused, int howto __unused) } #endif +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, 
arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + 
refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + void arc_init(void) { @@ -5458,9 +6019,6 @@ arc_init(void) cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); @@ -5512,6 +6070,7 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; @@ -5556,69 +6115,11 @@ arc_init(void) zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - 
multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, 
arc_state_multilist_index_func); - - refcount_create(&arc_anon->arcs_size); - refcount_create(&arc_mru->arcs_size); - refcount_create(&arc_mru_ghost->arcs_size); - refcount_create(&arc_mfu->arcs_size); - refcount_create(&arc_mfu_ghost->arcs_size); - refcount_create(&arc_l2c_only->arcs_size); - + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; arc_dnlc_evicts_thread_exit = FALSE; - arc_eviction_list = NULL; - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -5637,13 +6138,10 @@ arc_init(void) EVENTHANDLER_PRI_FIRST); #endif - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, TS_RUN, minclsyspri); - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5702,10 +6200,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5713,17 +6211,8 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); mutex_enter(&arc_dnlc_evicts_lock); arc_dnlc_evicts_thread_exit = TRUE; @@ -5737,10 +6226,7 @@ arc_fini(void) } mutex_exit(&arc_dnlc_evicts_lock); - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); - - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5751,30 +6237,10 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - mutex_destroy(&arc_dnlc_evicts_lock); cv_destroy(&arc_dnlc_evicts_cv); - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -6081,9 +6547,13 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - 
df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6106,13 +6576,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -6170,33 +6640,27 @@ l2arc_write_done(zio_t *zio) */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); l2arc_trim(hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. 
*/ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -6223,23 +6687,21 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* @@ -6247,10 +6709,11 @@ l2arc_read_done(zio_t *zio) * move it and free the buffer. */ if (cb->l2rcb_data != NULL) { - ASSERT3U(hdr->b_size, <, zio->io_size); - ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF); - if (zio->io_error == 0) - bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size); + ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); + if (zio->io_error == 0) { + bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr)); + } /* * The following must be done regardless of whether @@ -6264,28 +6727,23 @@ l2arc_read_done(zio_t *zio) * needs real data. */ zio_data_buf_free(cb->l2rcb_data, zio->io_size); - zio->io_size = zio->io_orig_size = hdr->b_size; - zio->io_data = zio->io_orig_data = buf->b_data; + zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata; } - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -6298,7 +6756,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -6311,9 +6769,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -6463,12 +6922,11 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -6489,39 +6947,25 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * the delta by which the device hand has changed due to alignment). 
*/ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, - buf_compress_minsz; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); - const boolean_t do_headroom_boost = *headroom_boost; int try; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. */ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); - /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; - /* * Copy buffers for L2ARC writing. 
*/ @@ -6545,20 +6989,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; - size_t align; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, + HDR_GET_LSIZE(hdr)); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { @@ -6569,7 +7011,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. @@ -6584,16 +7026,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - align = (size_t)1 << dev->l2ad_vdev->vdev_ashift; - buf_a_sz = P2ROUNDUP(buf_sz, align); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); @@ -6619,63 +7052,75 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_write_pios); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. 
This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing). - */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; - - /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). - * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. - */ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
*/ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t size = arc_hdr_size(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + size); + + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); + + /* + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. + */ + void *to_write; + if (!HDR_SHARED_DATA(hdr) && size == asize) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(asize); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(asize); + } + + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + if (asize != size) + bzero(to_write + size, asize - size); + l2arc_free_data_on_write(to_write, asize, type); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, asize, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + write_sz += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + + write_asize += size; + write_psize += asize; + dev->l2ad_hand += asize; mutex_exit(hash_lock); - write_sz += buf_sz; - write_asize += buf_a_sz; + (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -6692,89 +7137,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (0); } - mutex_enter(&dev->l2ad_mtx); - - /* - * Now start writing the buffers. 
We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. - */ - write_asize = 0; - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - boolean_t compress; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - /* - * Save a pointer to the original buffer data we had previously - * stashed away. - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - - compress = HDR_L2COMPRESS(hdr) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz; - if (l2arc_transform_buf(hdr, compress)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. - */ - *headroom_boost = B_TRUE; - } - - /* - * Get the new buffer size that accounts for compression - * and padding. - */ - buf_sz = hdr->b_l2hdr.b_asize; - - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); - - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - /* - * If the data was padded or compressed, then it - * it is in a new buffer. 
- */ - if (hdr->b_l1hdr.b_tmp_cdata != NULL) - buf_data = hdr->b_l1hdr.b_tmp_cdata; - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - (void) zio_nowait(wzio); - - write_asize += buf_sz; - dev->l2ad_hand += buf_sz; - } - } - - mutex_exit(&dev->l2ad_mtx); - ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); @@ -6798,203 +7160,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (write_asize); } -/* - * Transforms, possibly compresses and pads, an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The buffer size was already ashift aligned. - * The original hdr contents were left untouched except for b_tmp_cdata, - * which is reset to NULL. The caller must keep a pointer to the original - * data. - * *) The buffer was incompressible. The buffer size was not ashift aligned. - * b_tmp_cdata was replaced with a temporary data buffer which holds a padded - * (aligned) copy of the data. Once writing is done, invoke - * l2arc_release_cdata_buf on this hdr to free the temporary buffer. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. 
Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). - */ -static boolean_t -l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress) -{ - void *cdata; - size_t align, asize, csize, len, rounded; - - ASSERT(HDR_HAS_L2HDR(hdr)); - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - cdata = zio_data_buf_alloc(asize); - ASSERT3P(cdata, !=, NULL); - if (compress) - csize = zio_compress_data(ZIO_COMPRESS_LZ4, - hdr->b_l1hdr.b_tmp_cdata, cdata, len); - else - csize = len; - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, asize); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } - - rounded = P2ROUNDUP(csize, align); - ASSERT3U(rounded, <=, asize); - if (rounded < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression did not save space. - */ - if (P2PHASE(len, align) != 0) { - /* - * Use compression buffer for a copy of data padded to - * the proper size. Compression algorithm remains set - * to ZIO_COMPRESS_OFF. 
- */ - ASSERT3U(len, <, asize); - bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len); - bzero((char *)cdata + len, asize - len); - l2hdr->b_asize = asize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_padding_needed); - } else { - ASSERT3U(len, ==, asize); - /* - * The original buffer is good as is, - * release the compressed buffer. - * l2hdr will be left unmodified except for b_tmp_cdata. - */ - zio_data_buf_free(cdata, asize); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - if (compress) - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. - * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. 
- */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - uint64_t csize; - void *cdata; - - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. */ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. - * This buffer serves as a temporary holder of compressed or padded data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. 
- */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - size_t align, asize, len; - enum zio_compress comp = hdr->b_l2hdr.b_compress; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (hdr->b_l1hdr.b_tmp_cdata != NULL) { - ASSERT(comp != ZIO_COMPRESS_EMPTY); - len = hdr->b_size; - align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else { - ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY); - } -} - /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -7007,7 +7172,6 @@ l2arc_feed_thread(void *dummy __unused) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -7045,7 +7209,7 @@ l2arc_feed_thread(void *dummy __unused) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -7078,7 +7242,7 @@ l2arc_feed_thread(void *dummy __unused) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. 
@@ -7173,7 +7337,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 56c93d3a242e..a89d507481c5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -45,6 +45,9 @@ #include #include #include +#include + +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing @@ -52,7 +55,6 @@ */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -64,9 +66,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs are added to the dbuf cache once the last hold is released. If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. 
+ */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. */ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. 
If the cache size continues to grow and hits the high + * water mark, then callers adding elements to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. + */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) @@ -76,6 +145,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); @@ -88,6 +158,7 @@ dbuf_dest(void *vdb, void *unused) dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -117,8 +188,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -129,7 +198,7 @@ dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; @@ -180,7 +249,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; @@ -212,7 +281,7 @@ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, 
db->db.db_object, + uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; @@ -237,8 +306,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db) atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t dbuf_do_evict; - typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING @@ -323,15 +390,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } -void -dbuf_evict(dmu_buf_impl_t *db) +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); + dmu_buf_impl_t *db = obj; - dbuf_clear(db); - dbuf_destroy(db); + /* + * The assumption here is that the hash value for a given + * dmu_buf_impl_t will remain constant throughout its lifetime + * (i.e. its objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublist's usage + * would not be evenly distributed. 
+ */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + dmu_buf_impl_t *db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached its maximum size, dbufs are removed + * and destroyed. The eviction thread will continue running until the size + * of the dbuf cache is at or below the low water mark. 
Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. + */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the caller's context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. 
Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; + + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } void @@ -359,18 +592,38 @@ dbuf_init(void) goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -382,8 +635,23 @@ dbuf_fini(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -541,7 +809,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -556,8 +824,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -568,6 +834,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -579,6 +846,7 @@ 
dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -647,7 +915,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -696,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -733,8 +1001,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -851,7 +1117,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -907,9 +1173,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1033,7 +1300,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -1138,7 +1405,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1149,7 +1416,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1547,7 +1814,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1556,12 +1823,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 
db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); return (B_TRUE); } @@ -1725,7 +1988,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1743,10 +2006,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1758,59 +2021,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. 
- */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1821,15 +2087,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* * If this dbuf is referenced from an indirect dbuf, @@ -1922,7 +2198,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -1971,7 +2247,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -1996,76 +2272,12 @@ dbuf_create(dnode_t *dn, uint8_t 
level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. - */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. 
*/ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2103,10 +2315,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2135,7 +2374,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2229,6 +2469,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + 
dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2309,18 +2550,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, return (SET_ERROR(ENOENT)); } - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } + if (db->db_buf != NULL) ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); - } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); @@ -2338,13 +2569,19 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(dn->dn_objset->os_spa, + arc_alloc_buf(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(refcount_is_zero(&db->db_holds)); + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -2418,7 +2655,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); + ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2489,8 +2726,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2535,55 +2774,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; } + + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); + mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); + } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -2871,7 +3099,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3137,10 +3365,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3156,8 +3381,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if 
(!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3334,8 +3557,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, + &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index e5dae5dff44d..83caa5b9a35a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1407,7 +1407,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1763,8 +1763,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2137,11 +2136,11 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); zio_compress_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c index e88968b7b00f..e7bfdaa90e97 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. 
All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index fe0c0db5ff1d..b6ae9680ca6e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 874a1ca89730..dfecfc9b29ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -634,7 +634,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -646,7 +646,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if 
(backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -670,7 +670,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; @@ -700,7 +700,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index dd0644a37ff6..a76e74b872ce 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 39bef756733c..d599ed32b0d5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = 
NULL; } dn->dn_zio = NULL; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 7179c41cbfe3..daf539ec5cbe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); - dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); @@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 89c8eba7dcef..f68a6ba62b3a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -679,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -705,7 +705,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -737,7 +737,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + 
arc_buf_destroy(buf, &buf); } return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index ade681c1de8e..8f7914f87eb2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -231,4 +231,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} #endif /* ZFS_DEBUG */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index 714c5288fc36..5bf6ddd2d1ee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -43,51 +43,83 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef int arc_evict_func_t(void *priv); /* generic
arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. 
*/ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { @@ -95,11 +127,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -119,19 +150,17 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t 
*child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv); -boolean_t arc_clear_callback(arc_buf_t *buf); - void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 496412614b16..6862599a6540 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. 
*/ @@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index 3423645c3d92..917444e404ed 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -73,6 +73,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); void refcount_sysinit(void); void refcount_fini(void); @@ -100,6 +101,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) #define refcount_sysinit() #define refcount_fini() diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 105f8897bccd..984c17415964 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. @@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 7517e045eaba..f8c4bd6f6197 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -564,6 +564,10 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index 6cba764f24e1..4f0ed6473dc3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; #endif +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 31ff73440353..8cf274115b2f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -253,7 +254,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -290,7 +291,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 0b4f7d83310c..dbd55e291aa9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -329,7 +329,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -347,7 +347,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -1022,8 +1022,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + void *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -2356,7 +2356,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + 
arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c index 997e3133f3d8..751f825e9191 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; - spa_t *spa = zio->io_spa; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); @@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) spa->spa_cksum_tmpls[checksum], &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } - 
info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error == 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); } /*