2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-28 13:45:14 -07:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com>
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 11:37:06 -07:00
|
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
2015-07-02 18:23:20 +02:00
|
|
|
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
2013-08-01 13:02:10 -07:00
|
|
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
2015-04-02 14:44:32 +11:00
|
|
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
2010-08-26 11:49:16 -07:00
|
|
|
#include <sys/arc.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
#include <sys/dmu.h>
|
2013-07-29 10:58:53 -08:00
|
|
|
#include <sys/dmu_send.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
#include <sys/dmu_impl.h>
|
|
|
|
#include <sys/dbuf.h>
|
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/dsl_dataset.h>
|
|
|
|
#include <sys/dsl_dir.h>
|
|
|
|
#include <sys/dmu_tx.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/dmu_zfetch.h>
|
2010-05-28 13:45:14 -07:00
|
|
|
#include <sys/sa.h>
|
|
|
|
#include <sys/sa_impl.h>
|
2014-06-05 13:19:08 -08:00
|
|
|
#include <sys/zfeature.h>
|
|
|
|
#include <sys/blkptr.h>
|
2014-04-15 19:40:22 -08:00
|
|
|
#include <sys/range_tree.h>
|
2014-12-12 18:07:39 -08:00
|
|
|
#include <sys/trace_dbuf.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
struct dbuf_hold_impl_data {
|
|
|
|
/* Function arguments */
|
|
|
|
dnode_t *dh_dn;
|
|
|
|
uint8_t dh_level;
|
|
|
|
uint64_t dh_blkid;
|
2015-12-22 02:31:57 +01:00
|
|
|
boolean_t dh_fail_sparse;
|
|
|
|
boolean_t dh_fail_uncached;
|
2010-08-26 10:52:00 -07:00
|
|
|
void *dh_tag;
|
|
|
|
dmu_buf_impl_t **dh_dbp;
|
|
|
|
/* Local variables */
|
|
|
|
dmu_buf_impl_t *dh_db;
|
|
|
|
dmu_buf_impl_t *dh_parent;
|
|
|
|
blkptr_t *dh_bp;
|
|
|
|
int dh_err;
|
|
|
|
dbuf_dirty_record_t *dh_dr;
|
|
|
|
arc_buf_contents_t dh_type;
|
|
|
|
int dh_depth;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
|
2015-12-22 02:31:57 +01:00
|
|
|
dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
|
|
|
|
boolean_t fail_uncached,
|
|
|
|
void *tag, dmu_buf_impl_t **dbp, int depth);
|
2010-08-26 10:52:00 -07:00
|
|
|
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
|
|
|
|
|
2013-08-20 20:11:52 -08:00
|
|
|
/*
 * Counts how often zfs_free_range() had to take its slow path during a
 * zfs receive.  Any nonzero value points at a possible performance
 * problem.
 */
uint64_t zfs_free_range_recv_miss;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void dbuf_destroy(dmu_buf_impl_t *db);
|
2013-09-04 07:00:57 -05:00
|
|
|
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
2008-12-03 12:09:06 -08:00
|
|
|
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
#ifndef __lint
|
|
|
|
extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
|
|
|
|
dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
|
|
|
|
#endif /* ! __lint */
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Global data structures and functions for the dbuf cache.
|
|
|
|
*/
|
|
|
|
static kmem_cache_t *dbuf_cache;
|
2015-04-02 14:44:32 +11:00
|
|
|
static taskq_t *dbu_evict_taskq;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
dbuf_cons(void *vdb, void *unused, int kmflag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
bzero(db, sizeof (dmu_buf_impl_t));
|
|
|
|
|
|
|
|
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
|
|
|
|
refcount_create(&db->db_holds);
|
2015-04-03 14:14:28 +11:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_dest(void *vdb, void *unused)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
mutex_destroy(&db->db_mtx);
|
|
|
|
cv_destroy(&db->db_changed);
|
|
|
|
refcount_destroy(&db->db_holds);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dbuf hash table routines
|
|
|
|
*/
|
|
|
|
static dbuf_hash_table_t dbuf_hash_table;
|
|
|
|
|
|
|
|
static uint64_t dbuf_hash_count;
|
|
|
|
|
|
|
|
static uint64_t
|
|
|
|
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
|
|
|
|
{
|
|
|
|
uintptr_t osv = (uintptr_t)os;
|
|
|
|
uint64_t crc = -1ULL;
|
|
|
|
|
|
|
|
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
|
|
|
|
|
|
|
|
crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
|
|
|
|
|
|
|
|
return (crc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Hash of a dbuf identity; thin wrapper around dbuf_hash().
 *
 * The expansion is a plain expression with no trailing semicolon, so the
 * macro can be used inside larger expressions and does not leave a stray
 * empty statement behind when used as a statement.
 */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
|
|
|
|
|
|
|
|
/*
 * True when 'dbuf' has exactly the given identity.  The fields are compared
 * in the order object number, objset, level, block id.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj)			\
	&& (dbuf)->db_objset == (os)			\
	&& (dbuf)->db_level == (level)			\
	&& (dbuf)->db_blkid == (blkid))
|
|
|
|
|
|
|
|
dmu_buf_impl_t *
|
2015-04-02 22:59:15 +11:00
|
|
|
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-08-26 09:52:39 -07:00
|
|
|
uint64_t hv;
|
|
|
|
uint64_t idx;
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
|
2010-08-26 09:52:39 -07:00
|
|
|
hv = DBUF_HASH(os, obj, level, blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
|
|
|
|
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (db->db_state != DB_EVICTING) {
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (db);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2015-04-02 22:59:15 +11:00
|
|
|
static dmu_buf_impl_t *
|
|
|
|
dbuf_find_bonus(objset_t *os, uint64_t object)
|
|
|
|
{
|
|
|
|
dnode_t *dn;
|
|
|
|
dmu_buf_impl_t *db = NULL;
|
|
|
|
|
|
|
|
if (dnode_hold(os, object, FTAG, &dn) == 0) {
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
|
|
if (dn->dn_bonus != NULL) {
|
|
|
|
db = dn->dn_bonus;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
}
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
dnode_rele(dn, FTAG);
|
|
|
|
}
|
|
|
|
return (db);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Insert an entry into the hash table. If there is already an element
|
|
|
|
* equal to elem in the hash table, then the already existing element
|
|
|
|
* will be returned and the new element will not be inserted.
|
|
|
|
* Otherwise returns NULL.
|
|
|
|
*/
|
|
|
|
static dmu_buf_impl_t *
|
|
|
|
dbuf_hash_insert(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-05-28 13:45:14 -07:00
|
|
|
objset_t *os = db->db_objset;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t obj = db->db.db_object;
|
|
|
|
int level = db->db_level;
|
2010-08-26 09:52:39 -07:00
|
|
|
uint64_t blkid, hv, idx;
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *dbf;
|
|
|
|
|
2010-08-26 09:52:39 -07:00
|
|
|
blkid = db->db_blkid;
|
|
|
|
hv = DBUF_HASH(os, obj, level, blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
|
|
|
|
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
|
|
|
|
mutex_enter(&dbf->db_mtx);
|
|
|
|
if (dbf->db_state != DB_EVICTING) {
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (dbf);
|
|
|
|
}
|
|
|
|
mutex_exit(&dbf->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
db->db_hash_next = h->hash_table[idx];
|
|
|
|
h->hash_table[idx] = db;
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
2016-01-13 16:37:41 -08:00
|
|
|
atomic_inc_64(&dbuf_hash_count);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-07-15 03:43:18 -04:00
|
|
|
* Remove an entry from the hash table. It must be in the EVICTING state.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_hash_remove(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-08-26 09:52:39 -07:00
|
|
|
uint64_t hv, idx;
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *dbf, **dbp;
|
|
|
|
|
2010-08-26 09:52:39 -07:00
|
|
|
hv = DBUF_HASH(db->db_objset, db->db.db_object,
|
|
|
|
db->db_level, db->db_blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2014-07-15 03:43:18 -04:00
|
|
|
* We musn't hold db_mtx to maintain lock ordering:
|
2008-11-20 12:01:55 -08:00
|
|
|
* DBUF_HASH_MUTEX > db_mtx.
|
|
|
|
*/
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
ASSERT(db->db_state == DB_EVICTING);
|
|
|
|
ASSERT(!MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
dbp = &h->hash_table[idx];
|
|
|
|
while ((dbf = *dbp) != db) {
|
|
|
|
dbp = &dbf->db_hash_next;
|
|
|
|
ASSERT(dbf != NULL);
|
|
|
|
}
|
|
|
|
*dbp = db->db_hash_next;
|
|
|
|
db->db_hash_next = NULL;
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
2016-01-13 16:37:41 -08:00
|
|
|
atomic_dec_64(&dbuf_hash_count);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Forward declaration; dbuf_set_data() registers this as an ARC callback. */
static arc_evict_func_t dbuf_do_evict;
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
/*
 * Tells dbuf_verify_user() which hold-count invariant to check: the one
 * that applies while the user data is being evicted, or the one that
 * applies otherwise.
 */
typedef enum {
	DBVU_EVICTING,		/* user data is being evicted */
	DBVU_NOT_EVICTING	/* any other time */
} dbvu_verify_type_t;
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
|
|
|
|
{
|
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
int64_t holds;
|
|
|
|
|
|
|
|
if (db->db_user == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Only data blocks support the attachment of user data. */
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
|
|
|
|
/* Clients must resolve a dbuf before attaching user data. */
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
|
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
|
|
|
|
holds = refcount_count(&db->db_holds);
|
|
|
|
if (verify_type == DBVU_EVICTING) {
|
|
|
|
/*
|
|
|
|
* Immediate eviction occurs when holds == dirtycnt.
|
|
|
|
* For normal eviction buffers, holds is zero on
|
|
|
|
* eviction, except when dbuf_fix_old_data() calls
|
|
|
|
* dbuf_clear_data(). However, the hold count can grow
|
|
|
|
* during eviction even though db_mtx is held (see
|
|
|
|
* dmu_bonus_hold() for an example), so we can only
|
|
|
|
* test the generic invariant that holds >= dirtycnt.
|
|
|
|
*/
|
|
|
|
ASSERT3U(holds, >=, db->db_dirtycnt);
|
|
|
|
} else {
|
2015-10-13 14:09:45 -07:00
|
|
|
if (db->db_user_immediate_evict == TRUE)
|
2015-04-02 14:44:32 +11:00
|
|
|
ASSERT3U(holds, >=, db->db_dirtycnt);
|
|
|
|
else
|
|
|
|
ASSERT3U(holds, >, 0);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
dbuf_evict_user(dmu_buf_impl_t *db)
|
|
|
|
{
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_user_t *dbu = db->db_user;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
if (dbu == NULL)
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_verify_user(db, DBVU_EVICTING);
|
|
|
|
db->db_user = NULL;
|
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
if (dbu->dbu_clear_on_evict_dbufp != NULL)
|
|
|
|
*dbu->dbu_clear_on_evict_dbufp = NULL;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Invoke the callback from a taskq to avoid lock order reversals
|
|
|
|
* and limit stack depth.
|
|
|
|
*/
|
|
|
|
taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
|
|
|
|
&dbu->dbu_tqent);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
boolean_t
|
|
|
|
dbuf_is_metadata(dmu_buf_impl_t *db)
|
|
|
|
{
|
2014-05-02 12:26:47 -07:00
|
|
|
/*
|
|
|
|
* Consider indirect blocks and spill blocks to be meta data.
|
|
|
|
*/
|
|
|
|
if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
|
2010-08-26 14:24:34 -07:00
|
|
|
return (B_TRUE);
|
|
|
|
} else {
|
|
|
|
boolean_t is_metadata;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
2012-12-13 15:24:15 -08:00
|
|
|
is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
|
|
|
return (is_metadata);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
dbuf_evict(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
dbuf_clear(db);
|
|
|
|
dbuf_destroy(db);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_init(void)
|
|
|
|
{
|
|
|
|
uint64_t hsize = 1ULL << 16;
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash table is big enough to fill all of physical memory
|
2015-08-30 20:59:23 -05:00
|
|
|
* with an average block size of zfs_arc_average_blocksize (default 8K).
|
|
|
|
* By default, the table will take up
|
|
|
|
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-08-30 20:59:23 -05:00
|
|
|
while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
|
2008-11-20 12:01:55 -08:00
|
|
|
hsize <<= 1;
|
|
|
|
|
|
|
|
retry:
|
|
|
|
h->hash_table_mask = hsize - 1;
|
2010-08-26 11:46:09 -07:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_alloc() in the linux kernel
|
|
|
|
*/
|
2014-11-20 19:09:39 -05:00
|
|
|
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
|
2010-08-26 11:46:09 -07:00
|
|
|
#else
|
2008-11-20 12:01:55 -08:00
|
|
|
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
|
2010-08-26 11:46:09 -07:00
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
if (h->hash_table == NULL) {
|
|
|
|
/* XXX - we should really return an error instead of assert */
|
|
|
|
ASSERT(hsize > (1ULL << 10));
|
|
|
|
hsize >>= 1;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
|
|
|
|
sizeof (dmu_buf_impl_t),
|
|
|
|
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
|
|
|
|
|
|
|
|
for (i = 0; i < DBUF_MUTEXES; i++)
|
2015-03-30 22:43:29 -05:00
|
|
|
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
|
2013-10-02 17:11:19 -07:00
|
|
|
|
|
|
|
dbuf_stats_init(h);
|
2015-04-02 14:44:32 +11:00
|
|
|
|
|
|
|
/*
|
|
|
|
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
|
|
|
|
* configuration is not required.
|
|
|
|
*/
|
2015-07-24 10:08:31 -07:00
|
|
|
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_fini(void)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
int i;
|
|
|
|
|
2013-10-02 17:11:19 -07:00
|
|
|
dbuf_stats_destroy();
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
for (i = 0; i < DBUF_MUTEXES; i++)
|
|
|
|
mutex_destroy(&h->hash_mutexes[i]);
|
2010-08-26 11:46:09 -07:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_free() in the linux kernel
|
|
|
|
*/
|
2010-08-26 11:46:09 -07:00
|
|
|
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
|
|
|
#else
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
2010-08-26 11:46:09 -07:00
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_cache_destroy(dbuf_cache);
|
2015-04-02 14:44:32 +11:00
|
|
|
taskq_destroy(dbu_evict_taskq);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Other stuff.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf against its dnode, its parent and
 * its dirty records.  The caller must hold db_mtx.  Nothing is checked
 * beyond the lock assertion unless ZFS_DEBUG_DBUF_VERIFY is set in
 * zfs_flags.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *rec;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* Identity: the dbuf must agree with the dnode it belongs to. */
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}

	/* Offset: depends on whether this is a bonus, spill or plain block. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record reachable from this dbuf must point back at it. */
	for (rec = db->db_data_pending; rec != NULL; rec = rec->dr_next) {
		ASSERT(rec->dr_dbuf == db);
	}

	for (rec = db->db_last_dirty; rec != NULL; rec = rec->dr_next) {
		ASSERT(rec->dr_dbuf == db);
	}

	/*
	 * db_size cannot be compared against dn_datablksz here: another
	 * thread running dnode_set_blksz() can make them differ for a
	 * moment.
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		rec = db->db_data_pending;
		/*
		 * The meta-dnode should only be modified in syncing
		 * context, so there must be just one copy of the data.
		 */
		ASSERT(rec == NULL || rec->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) {
				ASSERT(db->db_parent == NULL);
			} else {
				ASSERT(db->db_parent != NULL);
			}
			if (db->db_blkid != DMU_SPILL_BLKID) {
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
			}
		} else {
			/* db is pointed to by an indirect block */
			ASSERTV(int epb = db->db_parent->db.db_size >>
			    SPA_BLKPTRSHIFT);
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}

	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * A buffer with no block pointer but nonzero contents has
		 * to be dirty, or its data would be lost when the buffer
		 * is evicted.
		 *
		 * Indirect blocks are an exception: when the indirect
		 * block is a hole, a few fields (importantly the birth
		 * time) are filled in on each child block pointer so that
		 * hole birth times survive a partial fill of the hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *words = db->db.db_data;
				int w;

				for (w = 0; w < db->db.db_size >> 3; w++) {
					ASSERT(words[w] == 0);
				}
			} else {
				int b;
				blkptr_t *bparr = db->db.db_data;

				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * Every block pointer in the indirect block
				 * must be a hole, but a few of its fields
				 * may have been set up automatically.  Walk
				 * the pointers and check that nothing else
				 * is set.
				 */
				for (b = 0;
				    b < db->db.db_size / sizeof (blkptr_t);
				    b++) {
					blkptr_t *bp = &bparr[b];

					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
static void
|
|
|
|
dbuf_clear_data(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
dbuf_evict_user(db);
|
|
|
|
db->db_buf = NULL;
|
|
|
|
db->db.db_data = NULL;
|
|
|
|
if (db->db_state != DB_NOFILL)
|
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2015-04-02 14:44:32 +11:00
|
|
|
ASSERT(buf != NULL);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_buf = buf;
|
2015-04-02 14:44:32 +11:00
|
|
|
ASSERT(buf->b_data != NULL);
|
|
|
|
db->db.db_data = buf->b_data;
|
|
|
|
if (!arc_released(buf))
|
|
|
|
arc_set_callback(buf, dbuf_do_evict, db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
|
|
|
|
* Loan out an arc_buf for read. Return the loaned arc_buf.
|
|
|
|
*/
|
|
|
|
arc_buf_t *
|
|
|
|
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
arc_buf_t *abuf;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
|
|
|
|
int blksz = db->db.db_size;
|
2013-12-09 10:37:51 -08:00
|
|
|
spa_t *spa = db->db_objset->os_spa;
|
2010-08-26 14:24:34 -07:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&db->db_mtx);
|
2010-08-26 14:24:34 -07:00
|
|
|
abuf = arc_loan_buf(spa, blksz);
|
2010-05-28 13:45:14 -07:00
|
|
|
bcopy(db->db.db_data, abuf->b_data, blksz);
|
|
|
|
} else {
|
|
|
|
abuf = db->db_buf;
|
|
|
|
arc_loan_inuse_buf(abuf, db);
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_clear_data(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
return (abuf);
|
|
|
|
}
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
/*
|
|
|
|
* Calculate which level n block references the data at the level 0 offset
|
|
|
|
* provided.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t
|
2015-12-22 02:31:57 +01:00
|
|
|
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-12-22 02:31:57 +01:00
|
|
|
if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
|
|
|
|
/*
|
|
|
|
* The level n blkid is equal to the level 0 blkid divided by
|
|
|
|
* the number of level 0s in a level n block.
|
|
|
|
*
|
|
|
|
* The level 0 blkid is offset >> datablkshift =
|
|
|
|
* offset / 2^datablkshift.
|
|
|
|
*
|
|
|
|
* The number of level 0s in a level n is the number of block
|
|
|
|
* pointers in an indirect block, raised to the power of level.
|
|
|
|
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
|
|
|
|
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
|
|
|
|
*
|
|
|
|
* Thus, the level n blkid is: offset /
|
|
|
|
* ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
|
|
|
|
* = offset / 2^(datablkshift + level *
|
|
|
|
* (indblkshift - SPA_BLKPTRSHIFT))
|
|
|
|
* = offset >> (datablkshift + level *
|
|
|
|
* (indblkshift - SPA_BLKPTRSHIFT))
|
|
|
|
*/
|
|
|
|
return (offset >> (dn->dn_datablkshift + level *
|
|
|
|
(dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
ASSERT3U(offset, <, dn->dn_datablksz);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
ASSERT3U(db->db_state, ==, DB_READ);
|
|
|
|
/*
|
|
|
|
* All reads are synchronous, so we must have a hold on the dbuf
|
|
|
|
*/
|
|
|
|
ASSERT(refcount_count(&db->db_holds) > 0);
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
if (db->db_level == 0 && db->db_freed_in_flight) {
|
|
|
|
/* we were freed in flight; disregard any error */
|
|
|
|
arc_release(buf, db);
|
|
|
|
bzero(buf->b_data, db->db.db_size);
|
|
|
|
arc_buf_freeze(buf);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
} else if (zio == NULL || zio->io_error == 0) {
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
} else {
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
}
|
|
|
|
cv_broadcast(&db->db_changed);
|
2010-05-28 13:45:14 -07:00
|
|
|
dbuf_rele_and_unlock(db, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
static int
|
2015-12-26 22:10:31 +01:00
|
|
|
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2014-06-25 10:37:59 -08:00
|
|
|
zbookmark_phys_t zb;
|
2014-12-06 09:24:32 -08:00
|
|
|
uint32_t aflags = ARC_FLAG_NOWAIT;
|
2014-09-10 11:59:03 -07:00
|
|
|
int err;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
/* We need the struct_rwlock to prevent db_blkptr from changing. */
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db_state == DB_UNCACHED);
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
/*
|
|
|
|
* The bonus length stored in the dnode may be less than
|
|
|
|
* the maximum available space in the bonus buffer.
|
|
|
|
*/
|
2009-07-02 15:44:48 -07:00
|
|
|
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT3U(bonuslen, <=, db->db.db_size);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
db->db.db_data = zio_buf_alloc(max_bonuslen);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
if (bonuslen < max_bonuslen)
|
|
|
|
bzero(db->db.db_data, max_bonuslen);
|
2009-07-02 15:44:48 -07:00
|
|
|
if (bonuslen)
|
|
|
|
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
mutex_exit(&db->db_mtx);
|
2014-09-10 11:59:03 -07:00
|
|
|
return (0);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
|
|
|
|
* processes the delete record and clears the bp while we are waiting
|
|
|
|
* for the dn_mtx (resulting in a "no" from block_freed).
|
|
|
|
*/
|
|
|
|
if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
|
|
|
|
(db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
|
|
|
|
BP_IS_HOLE(db->db_blkptr)))) {
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
|
|
|
2013-12-09 10:37:51 -08:00
|
|
|
dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
|
|
|
|
db->db.db_size, db, type));
|
2008-11-20 12:01:55 -08:00
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
2016-05-15 08:02:28 -07:00
|
|
|
|
|
|
|
if (db->db_blkptr != NULL && db->db_level > 0 &&
|
|
|
|
BP_IS_HOLE(db->db_blkptr) &&
|
|
|
|
db->db_blkptr->blk_birth != 0) {
|
|
|
|
blkptr_t *bps = db->db.db_data;
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < ((1 <<
|
|
|
|
DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
|
|
|
|
i++) {
|
|
|
|
blkptr_t *bp = &bps[i];
|
|
|
|
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
|
|
|
|
1 << dn->dn_indblkshift);
|
|
|
|
BP_SET_LSIZE(bp,
|
|
|
|
BP_GET_LEVEL(db->db_blkptr) == 1 ?
|
|
|
|
dn->dn_datablksz :
|
|
|
|
BP_GET_LSIZE(db->db_blkptr));
|
|
|
|
BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
|
|
|
|
BP_SET_LEVEL(bp,
|
|
|
|
BP_GET_LEVEL(db->db_blkptr) - 1);
|
|
|
|
BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
mutex_exit(&db->db_mtx);
|
2014-09-10 11:59:03 -07:00
|
|
|
return (0);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_READ;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (DBUF_IS_L2CACHEABLE(db))
|
2014-12-06 09:24:32 -08:00
|
|
|
aflags |= ARC_FLAG_L2CACHE;
|
2013-08-01 13:02:10 -07:00
|
|
|
if (DBUF_IS_L2COMPRESSIBLE(db))
|
2014-12-06 09:24:32 -08:00
|
|
|
aflags |= ARC_FLAG_L2COMPRESS;
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
|
|
|
|
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
|
|
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
dbuf_add_ref(db, NULL);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
|
2015-12-26 22:10:31 +01:00
|
|
|
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
|
2008-11-20 12:01:55 -08:00
|
|
|
&aflags, &zb);
|
2014-09-10 11:59:03 -07:00
|
|
|
|
|
|
|
return (SET_ERROR(err));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
|
|
|
{
|
|
|
|
int err = 0;
|
2013-12-09 10:37:51 -08:00
|
|
|
boolean_t havepzio = (zio != NULL);
|
|
|
|
boolean_t prefetch;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't have to hold the mutex to check db_state because it
|
|
|
|
* can't be freed while we have a hold on the buffer.
|
|
|
|
*/
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (db->db_state == DB_NOFILL)
|
2013-03-08 10:41:28 -08:00
|
|
|
return (SET_ERROR(EIO));
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-26 14:24:34 -07:00
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
2010-08-26 14:24:34 -07:00
|
|
|
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
|
2008-12-03 12:09:06 -08:00
|
|
|
DBUF_IS_CACHEABLE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (prefetch)
|
2015-12-26 22:10:31 +01:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
2008-11-20 12:01:55 -08:00
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-26 14:24:34 -07:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (db->db_state == DB_UNCACHED) {
|
2010-08-26 14:24:34 -07:00
|
|
|
spa_t *spa = dn->dn_objset->os_spa;
|
|
|
|
|
|
|
|
if (zio == NULL)
|
|
|
|
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
2014-09-10 11:59:03 -07:00
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
err = dbuf_read_impl(db, zio, flags);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* dbuf_read_impl has dropped db_mtx for us */
|
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
if (!err && prefetch)
|
2015-12-26 22:10:31 +01:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-26 14:24:34 -07:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
if (!err && !havepzio)
|
2008-11-20 12:01:55 -08:00
|
|
|
err = zio_wait(zio);
|
|
|
|
} else {
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* Another reader came in while the dbuf was in flight
|
|
|
|
* between UNCACHED and CACHED. Either a writer will finish
|
|
|
|
* writing the buffer (sending the dbuf to CACHED) or the
|
|
|
|
* first reader's request will reach the read_done callback
|
|
|
|
* and send the dbuf to CACHED. Otherwise, a failure
|
|
|
|
* occurred and the dbuf went to UNCACHED.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (prefetch)
|
2015-12-26 22:10:31 +01:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
|
2008-11-20 12:01:55 -08:00
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-26 14:24:34 -07:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Skip the wait per the caller's request. */
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if ((flags & DB_RF_NEVERWAIT) == 0) {
|
|
|
|
while (db->db_state == DB_READ ||
|
|
|
|
db->db_state == DB_FILL) {
|
|
|
|
ASSERT(db->db_state == DB_READ ||
|
|
|
|
(flags & DB_RF_HAVESTRUCT) == 0);
|
2014-09-17 08:53:02 +02:00
|
|
|
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
|
|
|
|
db, zio_t *, zio);
|
2008-11-20 12:01:55 -08:00
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
}
|
|
|
|
if (db->db_state == DB_UNCACHED)
|
2013-03-08 10:41:28 -08:00
|
|
|
err = SET_ERROR(EIO);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(err || havepzio || db->db_state == DB_CACHED);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_noread(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
if (db->db_state == DB_UNCACHED) {
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2013-12-09 10:37:51 -08:00
|
|
|
spa_t *spa = db->db_objset->os_spa;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
2010-08-26 14:24:34 -07:00
|
|
|
dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_FILL;
|
2008-12-03 12:09:06 -08:00
|
|
|
} else if (db->db_state == DB_NOFILL) {
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_clear_data(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is our just-in-time copy function. It makes a copy of
|
|
|
|
* buffers that have been modified in a previous transaction
|
|
|
|
* group, before we modify them in the current active group.
|
|
|
|
*
|
|
|
|
* This function is used in two places: when we are dirtying a
|
|
|
|
* buffer for the first time in a txg, and when we are freeing
|
|
|
|
* a range in a dnode that includes this buffer.
|
|
|
|
*
|
|
|
|
* Note that when we are called from dbuf_free_range() we do
|
|
|
|
* not put a hold on the buffer, we just traverse the active
|
|
|
|
* dbuf list for the dnode.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
|
|
|
|
|
|
|
if (dr == NULL ||
|
|
|
|
(dr->dt.dl.dr_data !=
|
2010-05-28 13:45:14 -07:00
|
|
|
((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the last dirty record for this dbuf has not yet synced
|
|
|
|
* and its referencing the dbuf data, either:
|
2010-08-26 14:24:34 -07:00
|
|
|
* reset the reference to point to a new copy,
|
2008-11-20 12:01:55 -08:00
|
|
|
* or (if there a no active holders)
|
|
|
|
* just null out the current db_data pointer.
|
|
|
|
*/
|
|
|
|
ASSERT(dr->dr_txg >= txg - 2);
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/* Note that the data bufs here are zio_bufs */
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
dnode_t *dn = DB_DNODE(db);
|
|
|
|
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
|
|
|
dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
|
|
|
|
int size = db->db.db_size;
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2013-12-09 10:37:51 -08:00
|
|
|
spa_t *spa = db->db_objset->os_spa;
|
2010-08-26 14:24:34 -07:00
|
|
|
|
|
|
|
dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
|
2008-11-20 12:01:55 -08:00
|
|
|
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
|
|
|
|
} else {
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_clear_data(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-05-28 13:45:14 -07:00
|
|
|
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t txg = dr->dr_txg;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
2008-11-20 12:01:55 -08:00
|
|
|
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
|
|
|
|
return;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_data_pending != dr);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* free this block */
|
2013-12-09 10:37:51 -08:00
|
|
|
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
|
|
|
|
zio_free(db->db_objset->os_spa, txg, bp);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
2013-05-10 12:47:54 -07:00
|
|
|
dr->dt.dl.dr_nopwrite = B_FALSE;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Release the already-written buffer, so we leave it in
|
|
|
|
* a consistent dirty state. Note that all callers are
|
|
|
|
* modifying the buffer, so they will immediately do
|
|
|
|
* another (redundant) arc_release(). Therefore, leave
|
|
|
|
* the buf thawed to save the effort of freezing &
|
|
|
|
* immediately re-thawing it.
|
|
|
|
*/
|
|
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Evict (if its unreferenced) or clear (if its referenced) any level-0
|
|
|
|
* data blocks in the free range, so that any future readers will find
|
2013-12-09 10:37:51 -08:00
|
|
|
* empty blocks.
|
2013-07-29 10:58:53 -08:00
|
|
|
*
|
|
|
|
* This is a no-op if the dataset is in the middle of an incremental
|
|
|
|
* receive; see comment below for details.
|
2008-12-03 12:09:06 -08:00
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
2015-04-03 14:14:28 +11:00
|
|
|
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
|
|
|
|
dmu_tx_t *tx)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_impl_t *db_search;
|
|
|
|
dmu_buf_impl_t *db, *db_next;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t txg = tx->tx_txg;
|
2015-04-03 14:14:28 +11:00
|
|
|
avl_index_t where;
|
2014-11-10 23:26:33 -06:00
|
|
|
boolean_t freespill =
|
2015-04-03 14:14:28 +11:00
|
|
|
(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
|
|
|
|
|
|
|
|
if (end_blkid > dn->dn_maxblkid && !freespill)
|
|
|
|
end_blkid = dn->dn_maxblkid;
|
|
|
|
dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
|
2015-04-03 14:14:28 +11:00
|
|
|
db_search->db_level = 0;
|
|
|
|
db_search->db_blkid = start_blkid;
|
2015-04-02 02:10:58 +11:00
|
|
|
db_search->db_state = DB_SEARCH;
|
2013-07-29 10:58:53 -08:00
|
|
|
|
2013-08-20 20:11:52 -08:00
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
2015-04-03 14:14:28 +11:00
|
|
|
if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
|
2013-08-20 20:11:52 -08:00
|
|
|
/* There can't be any dbufs in this range; no need to search. */
|
2015-04-03 14:14:28 +11:00
|
|
|
#ifdef DEBUG
|
|
|
|
db = avl_find(&dn->dn_dbufs, db_search, &where);
|
|
|
|
ASSERT3P(db, ==, NULL);
|
|
|
|
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
|
|
|
|
ASSERT(db == NULL || db->db_level > 0);
|
|
|
|
#endif
|
|
|
|
goto out;
|
2013-08-20 20:11:52 -08:00
|
|
|
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
|
2013-07-29 10:58:53 -08:00
|
|
|
/*
|
2013-08-20 20:11:52 -08:00
|
|
|
* If we are receiving, we expect there to be no dbufs in
|
|
|
|
* the range to be freed, because receive modifies each
|
|
|
|
* block at most once, and in offset order. If this is
|
|
|
|
* not the case, it can lead to performance problems,
|
|
|
|
* so note that we unexpectedly took the slow path.
|
2013-07-29 10:58:53 -08:00
|
|
|
*/
|
2013-08-20 20:11:52 -08:00
|
|
|
atomic_inc_64(&zfs_free_range_recv_miss);
|
2013-07-29 10:58:53 -08:00
|
|
|
}
|
|
|
|
|
2015-04-03 14:14:28 +11:00
|
|
|
db = avl_find(&dn->dn_dbufs, db_search, &where);
|
|
|
|
ASSERT3P(db, ==, NULL);
|
|
|
|
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
|
|
|
|
|
|
|
|
for (; db != NULL; db = db_next) {
|
|
|
|
db_next = AVL_NEXT(&dn->dn_dbufs, db);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2015-04-03 14:14:28 +11:00
|
|
|
if (db->db_level != 0 || db->db_blkid > end_blkid) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ASSERT3U(db->db_blkid, >=, start_blkid);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* found a level 0 buffer in the range */
|
2013-09-04 07:00:57 -05:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (dbuf_undirty(db, tx)) {
|
|
|
|
/* mutex has been dropped and dbuf destroyed */
|
2008-11-20 12:01:55 -08:00
|
|
|
continue;
|
2013-09-04 07:00:57 -05:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (db->db_state == DB_UNCACHED ||
|
2008-12-03 12:09:06 -08:00
|
|
|
db->db_state == DB_NOFILL ||
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state == DB_EVICTING) {
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
|
|
|
/* will be handled in dbuf_read_done or dbuf_rele */
|
|
|
|
db->db_freed_in_flight = TRUE;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (refcount_count(&db->db_holds) == 0) {
|
|
|
|
ASSERT(db->db_buf);
|
|
|
|
dbuf_clear(db);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* The dbuf is referenced */
|
|
|
|
|
|
|
|
if (db->db_last_dirty != NULL) {
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
if (dr->dr_txg == txg) {
|
|
|
|
/*
|
|
|
|
* This buffer is "in-use", re-adjust the file
|
|
|
|
* size to reflect that this buffer may
|
|
|
|
* contain new data when we sync.
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID &&
|
|
|
|
db->db_blkid > dn->dn_maxblkid)
|
2008-11-20 12:01:55 -08:00
|
|
|
dn->dn_maxblkid = db->db_blkid;
|
|
|
|
dbuf_unoverride(dr);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This dbuf is not dirty in the open context.
|
|
|
|
* Either uncache it (if its not referenced in
|
|
|
|
* the open context) or reset its contents to
|
|
|
|
* empty.
|
|
|
|
*/
|
|
|
|
dbuf_fix_old_data(db, txg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* clear the contents if its cached */
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
arc_buf_freeze(db->db_buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
2015-04-03 14:14:28 +11:00
|
|
|
|
|
|
|
out:
|
|
|
|
kmem_free(db_search, sizeof (dmu_buf_impl_t));
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
dbuf_block_freeable(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
|
|
|
|
uint64_t birth_txg = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't need any locking to protect db_blkptr:
|
|
|
|
* If it's syncing, then db_last_dirty will be set
|
|
|
|
* so we'll ignore db_blkptr.
|
2013-12-09 10:37:51 -08:00
|
|
|
*
|
|
|
|
* This logic ensures that only block births for
|
|
|
|
* filled blocks are considered.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2013-12-09 10:37:51 -08:00
|
|
|
if (db->db_last_dirty && (db->db_blkptr == NULL ||
|
|
|
|
!BP_IS_HOLE(db->db_blkptr))) {
|
2008-11-20 12:01:55 -08:00
|
|
|
birth_txg = db->db_last_dirty->dr_txg;
|
2013-12-09 10:37:51 -08:00
|
|
|
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
birth_txg = db->db_blkptr->blk_birth;
|
2013-12-09 10:37:51 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
2013-12-09 10:37:51 -08:00
|
|
|
* If this block don't exist or is in a snapshot, it can't be freed.
|
2010-08-26 14:24:34 -07:00
|
|
|
* Don't pass the bp to dsl_dataset_block_freeable() since we
|
|
|
|
* are holding the db_mtx lock and might deadlock if we are
|
|
|
|
* prefetching a dedup-ed block.
|
|
|
|
*/
|
2013-12-09 10:37:51 -08:00
|
|
|
if (birth_txg != 0)
|
2008-11-20 12:01:55 -08:00
|
|
|
return (ds == NULL ||
|
2010-08-26 14:24:34 -07:00
|
|
|
dsl_dataset_block_freeable(ds, NULL, birth_txg));
|
2008-11-20 12:01:55 -08:00
|
|
|
else
|
2013-12-09 10:37:51 -08:00
|
|
|
return (B_FALSE);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
arc_buf_t *buf, *obuf;
|
|
|
|
int osize = db->db.db_size;
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* XXX does *this* func really need the lock? */
|
2010-08-26 14:24:34 -07:00
|
|
|
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
2013-12-09 10:37:51 -08:00
|
|
|
* This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
|
2008-11-20 12:01:55 -08:00
|
|
|
* is OK, because there can be no other references to the db
|
|
|
|
* when we are changing its size, so no concurrent DB_FILL can
|
|
|
|
* be happening.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* XXX we should be doing a dbuf_read, checking the return
|
|
|
|
* value and returning that up to our callers
|
|
|
|
*/
|
2013-12-09 10:37:51 -08:00
|
|
|
dmu_buf_will_dirty(&db->db, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* create the data buffer for the new block */
|
2010-08-26 14:24:34 -07:00
|
|
|
buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* copy old block data to the new block */
|
|
|
|
obuf = db->db_buf;
|
|
|
|
bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
|
|
|
|
/* zero the remainder */
|
|
|
|
if (size > osize)
|
|
|
|
bzero((uint8_t *)buf->b_data + osize, size - osize);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dbuf_set_data(db, buf);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(obuf, db));
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db.db_size = size;
|
|
|
|
|
|
|
|
if (db->db_level == 0) {
|
|
|
|
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
|
|
|
|
db->db_last_dirty->dt.dl.dr_data = buf;
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_willuse_space(dn, size-osize, tx);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
void
|
|
|
|
dbuf_release_bp(dmu_buf_impl_t *db)
|
|
|
|
{
|
2013-12-09 10:37:51 -08:00
|
|
|
ASSERTV(objset_t *os = db->db_objset);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
|
|
|
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
|
|
|
ASSERT(arc_released(os->os_phys_buf) ||
|
|
|
|
list_link_active(&os->os_dsl_dataset->ds_synced_link));
|
|
|
|
ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
|
|
|
|
|
2013-07-02 13:26:24 -07:00
|
|
|
(void) arc_release(db->db_buf, db);
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
|
|
|
|
2015-11-04 21:37:33 +01:00
|
|
|
/*
|
|
|
|
* We already have a dirty record for this TXG, and we are being
|
|
|
|
* dirtied again.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_redirty(dbuf_dirty_record_t *dr)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
|
|
|
|
/*
|
|
|
|
* If this buffer has already been written out,
|
|
|
|
* we now need to reset its state.
|
|
|
|
*/
|
|
|
|
dbuf_unoverride(dr);
|
|
|
|
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
db->db_state != DB_NOFILL) {
|
|
|
|
/* Already released on initial dirty, so just thaw. */
|
|
|
|
ASSERT(arc_released(db->db_buf));
|
|
|
|
arc_buf_thaw(db->db_buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_dirty_record_t *
|
|
|
|
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_dirty_record_t **drp, *dr;
|
|
|
|
int drop_struct_lock = FALSE;
|
2008-12-03 12:09:06 -08:00
|
|
|
boolean_t do_free_accounting = B_FALSE;
|
2008-11-20 12:01:55 -08:00
|
|
|
int txgoff = tx->tx_txg & TXG_MASK;
|
|
|
|
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
DMU_TX_DIRTY_BUF(tx, db);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Shouldn't dirty a regular buffer in syncing context. Private
|
|
|
|
* objects may be dirtied in syncing context, but only if they
|
|
|
|
* were already pre-dirtied in open context.
|
|
|
|
*/
|
|
|
|
ASSERT(!dmu_tx_is_syncing(tx) ||
|
|
|
|
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
|
2009-07-02 15:44:48 -07:00
|
|
|
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
|
|
dn->dn_objset->os_dsl_dataset == NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* We make this assert for private objects as well, but after we
|
|
|
|
* check if we're already dirty. They are allowed to re-dirty
|
|
|
|
* in syncing context.
|
|
|
|
*/
|
|
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
/*
|
|
|
|
* XXX make this true for indirects too? The problem is that
|
|
|
|
* transactions created with dmu_tx_create_assigned() from
|
|
|
|
* syncing context don't bother holding ahead.
|
|
|
|
*/
|
|
|
|
ASSERT(db->db_level != 0 ||
|
2008-12-03 12:09:06 -08:00
|
|
|
db->db_state == DB_CACHED || db->db_state == DB_FILL ||
|
|
|
|
db->db_state == DB_NOFILL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
/*
|
|
|
|
* Don't set dirtyctx to SYNC if we're just modifying this as we
|
|
|
|
* initialize the objset.
|
|
|
|
*/
|
|
|
|
if (dn->dn_dirtyctx == DN_UNDIRTIED &&
|
|
|
|
!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
|
|
|
|
dn->dn_dirtyctx =
|
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
|
|
|
|
ASSERT(dn->dn_dirtyctx_firstset == NULL);
|
2014-11-20 19:09:39 -05:00
|
|
|
dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
|
|
dn->dn_have_spill = B_TRUE;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* If this buffer is already dirty, we're done.
|
|
|
|
*/
|
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
|
|
|
|
db->db.db_object == DMU_META_DNODE_OBJECT);
|
|
|
|
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
|
|
|
|
drp = &dr->dr_next;
|
|
|
|
if (dr && dr->dr_txg == tx->tx_txg) {
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2015-11-04 21:37:33 +01:00
|
|
|
dbuf_redirty(dr);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return (dr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only valid if not already dirty.
|
|
|
|
*/
|
2009-07-02 15:44:48 -07:00
|
|
|
ASSERT(dn->dn_object == 0 ||
|
|
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
2008-11-20 12:01:55 -08:00
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
|
|
|
|
ASSERT3U(dn->dn_nlevels, >, db->db_level);
|
|
|
|
ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
|
|
|
|
dn->dn_phys->dn_nlevels > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[txgoff] > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We should only be dirtying in syncing context if it's the
|
2009-07-02 15:44:48 -07:00
|
|
|
* mos or we're initializing the os or it's a special object.
|
|
|
|
* However, we are allowed to dirty in syncing context provided
|
|
|
|
* we already dirtied it in open context. Hence we must make
|
|
|
|
* this assertion only if we're not already dirty.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2010-08-26 14:24:34 -07:00
|
|
|
os = dn->dn_objset;
|
2009-07-02 15:44:48 -07:00
|
|
|
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
|
|
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
|
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Update the accounting.
|
2008-12-03 12:09:06 -08:00
|
|
|
* Note: we delay "free accounting" until after we drop
|
|
|
|
* the db_mtx. This keeps us from grabbing other locks
|
2010-05-28 13:45:14 -07:00
|
|
|
* (and possibly deadlocking) in bp_get_dsize() while
|
2008-12-03 12:09:06 -08:00
|
|
|
* also holding the db_mtx.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
dnode_willuse_space(dn, db->db.db_size, tx);
|
2008-12-03 12:09:06 -08:00
|
|
|
do_free_accounting = dbuf_block_freeable(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is dirty in an old transaction group we need
|
|
|
|
* to make a copy of it so that the changes we make in this
|
|
|
|
* transaction group won't leak out when we sync the older txg.
|
|
|
|
*/
|
2014-11-20 19:09:39 -05:00
|
|
|
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
|
2010-08-26 10:26:44 -07:00
|
|
|
list_link_init(&dr->dr_dirty_node);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
void *data_old = db->db_buf;
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-12-03 12:09:06 -08:00
|
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
|
|
data_old = db->db.db_data;
|
|
|
|
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
|
|
|
|
/*
|
|
|
|
* Release the data buffer from the cache so
|
|
|
|
* that we can modify it without impacting
|
|
|
|
* possible other users of this cached data
|
|
|
|
* block. Note that indirect blocks and
|
|
|
|
* private objects are not released until the
|
|
|
|
* syncing state (since they are only modified
|
|
|
|
* then).
|
|
|
|
*/
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
|
|
data_old = db->db_buf;
|
|
|
|
}
|
|
|
|
ASSERT(data_old != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
dr->dt.dl.dr_data = data_old;
|
|
|
|
} else {
|
Identify locks flagged by lockdep
When running a kernel with CONFIG_LOCKDEP=y, lockdep reports possible
recursive locking in some cases and possible circular locking dependency
in others, within the SPL and ZFS modules.
This patch uses a mutex type defined in SPL, MUTEX_NOLOCKDEP, to mark
such mutexes when they are initialized. This mutex type causes
attempts to take or release those locks to be wrapped in lockdep_off()
and lockdep_on() calls to silence the dependency checker and allow the
use of lock_stats to examine contention.
For RW locks, it uses an analogous lock type, RW_NOLOCKDEP.
The goal is that these locks are ultimately changed back to type
MUTEX_DEFAULT or RW_DEFAULT, after the locks are annotated to reflect
their relationship (e.g. z_name_lock below) or any real problems with the
lock dependencies are fixed.
Some of the affected locks are:
tc_open_lock:
=============
This is an array of locks, all with same name, which txg_quiesce must
take all of in order to move txg to next state. All default to the same
lockdep class, and so to lockdep appears recursive.
zp->z_name_lock:
================
In zfs_rmdir,
dzp = znode for the directory (input to zfs_dirent_lock)
zp = znode for the entry being removed (output of zfs_dirent_lock)
zfs_rmdir()->zfs_dirent_lock() takes z_name_lock in dzp
zfs_rmdir() takes z_name_lock in zp
Since both dzp and zp are type znode_t, the locks have the same default
class, and lockdep considers it a possible recursive lock attempt.
l->l_rwlock:
============
zap_expand_leaf() sometimes creates two new zap leaf structures, via
these call paths:
zap_deref_leaf()->zap_get_leaf_byblk()->zap_leaf_open()
zap_expand_leaf()->zap_create_leaf()->zap_expand_leaf()->zap_create_leaf()
Because both zap_leaf_open() and zap_create_leaf() initialize
l->l_rwlock in their (separate) leaf structures, the lockdep class is
the same, and the linux kernel believes these might both be the same
lock, and emits a possible recursive lock warning.
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3895
2015-10-15 13:08:27 -07:00
|
|
|
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
list_create(&dr->dt.di.dr_children,
|
|
|
|
sizeof (dbuf_dirty_record_t),
|
|
|
|
offsetof(dbuf_dirty_record_t, dr_dirty_node));
|
|
|
|
}
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
|
|
|
|
dr->dr_accounted = db->db.db_size;
|
2008-11-20 12:01:55 -08:00
|
|
|
dr->dr_dbuf = db;
|
|
|
|
dr->dr_txg = tx->tx_txg;
|
|
|
|
dr->dr_next = *drp;
|
|
|
|
*drp = dr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We could have been freed_in_flight between the dbuf_noread
|
|
|
|
* and dbuf_dirty. We win, as though the dbuf_noread() had
|
|
|
|
* happened after the free.
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
2014-04-15 19:40:22 -08:00
|
|
|
if (dn->dn_free_ranges[txgoff] != NULL) {
|
|
|
|
range_tree_clear(dn->dn_free_ranges[txgoff],
|
|
|
|
db->db_blkid, 1);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This buffer is now part of this txg
|
|
|
|
*/
|
|
|
|
dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
|
|
|
|
db->db_dirtycnt += 1;
|
|
|
|
ASSERT3U(db->db_dirtycnt, <=, 3);
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
|
|
|
db->db_blkid == DMU_SPILL_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
dnode_setdirty(dn, tx);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
return (dr);
|
2008-12-03 12:09:06 -08:00
|
|
|
} else if (do_free_accounting) {
|
|
|
|
blkptr_t *bp = db->db_blkptr;
|
|
|
|
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
|
2010-05-28 13:45:14 -07:00
|
|
|
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* This is only a guess -- if the dbuf is dirty
|
|
|
|
* in a previous txg, we don't know how much
|
|
|
|
* space it will use on disk yet. We should
|
|
|
|
* really have the struct_rwlock to access
|
|
|
|
* db_blkptr, but since this is just a guess,
|
|
|
|
* it's OK if we get an odd answer.
|
|
|
|
*/
|
2010-08-26 14:24:34 -07:00
|
|
|
ddt_prefetch(os->os_spa, bp);
|
2008-12-03 12:09:06 -08:00
|
|
|
dnode_willuse_space(dn, -willfree, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
|
|
drop_struct_lock = TRUE;
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
|
|
|
|
ASSERT(dn->dn_maxblkid >= db->db_blkid);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_level+1 < dn->dn_nlevels) {
|
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
dbuf_dirty_record_t *di;
|
|
|
|
int parent_held = FALSE;
|
|
|
|
|
|
|
|
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
|
|
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
parent = dbuf_hold_level(dn, db->db_level+1,
|
|
|
|
db->db_blkid >> epbs, FTAG);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(parent != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
parent_held = TRUE;
|
|
|
|
}
|
|
|
|
if (drop_struct_lock)
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
ASSERT3U(db->db_level+1, ==, parent->db_level);
|
|
|
|
di = dbuf_dirty(parent, tx);
|
|
|
|
if (parent_held)
|
|
|
|
dbuf_rele(parent, FTAG);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
|
|
|
|
* Since we've dropped the mutex, it's possible that
|
|
|
|
* dbuf_undirty() might have changed this out from under us.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_last_dirty == dr ||
|
|
|
|
dn->dn_object == DMU_META_DNODE_OBJECT) {
|
|
|
|
mutex_enter(&di->dt.di.dr_mtx);
|
|
|
|
ASSERT3U(di->dr_txg, ==, tx->tx_txg);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&di->dt.di.dr_children, dr);
|
|
|
|
mutex_exit(&di->dt.di.dr_mtx);
|
|
|
|
dr->dr_parent = di;
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
} else {
|
|
|
|
ASSERT(db->db_level+1 == dn->dn_nlevels);
|
|
|
|
ASSERT(db->db_blkid < dn->dn_nblkptr);
|
2010-08-26 14:24:34 -07:00
|
|
|
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
if (drop_struct_lock)
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
}
|
|
|
|
|
|
|
|
dnode_setdirty(dn, tx);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
return (dr);
|
|
|
|
}
|
|
|
|
|
2013-09-04 07:00:57 -05:00
|
|
|
/*
|
2013-06-11 09:12:34 -08:00
|
|
|
* Undirty a buffer in the transaction group referenced by the given
|
|
|
|
* transaction. Return whether this evicted the dbuf.
|
2013-09-04 07:00:57 -05:00
|
|
|
*/
|
|
|
|
static boolean_t
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
dbuf_dirty_record_t *dr, **drp;
|
|
|
|
|
|
|
|
ASSERT(txg != 0);
|
2015-07-02 18:23:20 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to our use of dn_nlevels below, this can only be called
|
|
|
|
* in open context, unless we are operating on the MOS.
|
|
|
|
* From syncing context, dn_nlevels may be different from the
|
|
|
|
* dn_nlevels used when dbuf was dirtied.
|
|
|
|
*/
|
|
|
|
ASSERT(db->db_objset ==
|
|
|
|
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
|
|
|
|
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2013-09-04 07:00:57 -05:00
|
|
|
ASSERT0(db->db_level);
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is not dirty, we're done.
|
|
|
|
*/
|
|
|
|
for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
|
|
|
|
if (dr->dr_txg <= txg)
|
|
|
|
break;
|
2013-09-04 07:00:57 -05:00
|
|
|
if (dr == NULL || dr->dr_txg < txg)
|
|
|
|
return (B_FALSE);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(dr->dr_txg == txg);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
|
|
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
|
2015-07-02 18:23:20 +02:00
|
|
|
dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
|
|
|
|
dr->dr_accounted, txg);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
*drp = dr->dr_next;
|
|
|
|
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 11:37:06 -07:00
|
|
|
/*
|
|
|
|
* Note that there are three places in dbuf_dirty()
|
|
|
|
* where this dirty record may be put on a list.
|
|
|
|
* Make sure to do a list_remove corresponding to
|
|
|
|
* every one of those list_insert calls.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
if (dr->dr_parent) {
|
|
|
|
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
|
|
|
|
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
|
|
|
|
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 11:37:06 -07:00
|
|
|
} else if (db->db_blkid == DMU_SPILL_BLKID ||
|
2015-07-02 18:23:20 +02:00
|
|
|
db->db_level + 1 == dn->dn_nlevels) {
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
}
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-09-04 07:00:57 -05:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
dbuf_unoverride(dr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(db->db_buf != NULL);
|
2013-09-04 07:00:57 -05:00
|
|
|
ASSERT(dr->dt.dl.dr_data != NULL);
|
|
|
|
if (dr->dt.dl.dr_data != db->db_buf)
|
|
|
|
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-04-02 00:49:14 +11:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
|
|
|
|
|
|
|
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
|
|
|
|
arc_buf_t *buf = db->db_buf;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_clear_data(db);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_evict(db);
|
2013-09-04 07:00:57 -05:00
|
|
|
return (B_TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2013-09-04 07:00:57 -05:00
|
|
|
return (B_FALSE);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2013-12-09 10:37:51 -08:00
|
|
|
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2013-12-09 10:37:51 -08:00
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
2008-11-20 12:01:55 -08:00
|
|
|
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
|
2015-11-04 21:37:33 +01:00
|
|
|
dbuf_dirty_record_t *dr;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
2015-11-04 21:37:33 +01:00
|
|
|
/*
|
|
|
|
* Quick check for dirtyness. For already dirty blocks, this
|
|
|
|
* reduces runtime of this function by >90%, and overall performance
|
|
|
|
* by 50% for some workloads (e.g. file deletion with indirect blocks
|
|
|
|
* cached).
|
|
|
|
*/
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
for (dr = db->db_last_dirty;
|
|
|
|
dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
|
|
|
|
/*
|
|
|
|
* It's possible that it is already dirty but not cached,
|
|
|
|
* because there are some calls to dbuf_dirty() that don't
|
|
|
|
* go through dmu_buf_will_dirty().
|
|
|
|
*/
|
|
|
|
if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
|
|
|
|
/* This dbuf is already dirty and cached. */
|
|
|
|
dbuf_redirty(dr);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
|
2008-11-20 12:01:55 -08:00
|
|
|
rf |= DB_RF_HAVESTRUCT;
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) dbuf_read(db, NULL, rf);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
void
|
|
|
|
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
|
|
|
db->db_state = DB_NOFILL;
|
|
|
|
|
|
|
|
dmu_buf_will_fill(db_fake, tx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
|
|
|
|
dmu_tx_private_ok(tx));
|
|
|
|
|
|
|
|
dbuf_noread(db);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_fill_done = dbuf_fill_done
|
|
|
|
/* ARGSUSED */
|
|
|
|
void
|
|
|
|
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
|
|
|
if (db->db_state == DB_FILL) {
|
|
|
|
if (db->db_level == 0 && db->db_freed_in_flight) {
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
/* we were freed while filling */
|
|
|
|
/* XXX dbuf_undirty? */
|
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
}
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
cv_broadcast(&db->db_changed);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
void
|
|
|
|
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
|
|
|
|
bp_embedded_type_t etype, enum zio_compress comp,
|
|
|
|
int uncompressed_size, int compressed_size, int byteorder,
|
|
|
|
dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
|
|
struct dirty_leaf *dl;
|
|
|
|
dmu_object_type_t type;
|
|
|
|
|
2015-07-24 09:53:55 -07:00
|
|
|
if (etype == BP_EMBEDDED_TYPE_DATA) {
|
|
|
|
ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
|
|
|
|
SPA_FEATURE_EMBEDDED_DATA));
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
type = DB_DNODE(db)->dn_type;
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
|
|
|
ASSERT0(db->db_level);
|
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
|
|
|
|
|
|
dmu_buf_will_not_fill(dbuf, tx);
|
|
|
|
|
|
|
|
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
|
|
|
|
dl = &db->db_last_dirty->dt.dl;
|
|
|
|
encode_embedded_bp_compressed(&dl->dr_overridden_by,
|
|
|
|
data, comp, uncompressed_size, compressed_size);
|
|
|
|
BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
|
|
|
|
BP_SET_TYPE(&dl->dr_overridden_by, type);
|
|
|
|
BP_SET_LEVEL(&dl->dr_overridden_by, 0);
|
|
|
|
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
|
|
|
|
|
|
|
|
dl->dr_override_state = DR_OVERRIDDEN;
|
|
|
|
dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
|
|
|
|
}
|
|
|
|
|
2009-07-02 15:44:48 -07:00
|
|
|
/*
|
|
|
|
* Directly assign a provided arc buf to a given dbuf if it's not referenced
|
|
|
|
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2009-07-02 15:44:48 -07:00
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
|
|
|
|
ASSERT(buf != NULL);
|
|
|
|
ASSERT(arc_buf_size(buf) == db->db.db_size);
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
|
|
|
|
arc_return_buf(buf, db);
|
|
|
|
ASSERT(arc_released(buf));
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
|
|
|
|
|
|
|
|
if (db->db_state == DB_CACHED &&
|
|
|
|
refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2010-05-28 13:45:14 -07:00
|
|
|
xuio_stat_wbuf_copied();
|
2009-07-02 15:44:48 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
xuio_stat_wbuf_nocopy();
|
2009-07-02 15:44:48 -07:00
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
ASSERT(db->db_buf != NULL);
|
|
|
|
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
|
|
|
|
ASSERT(dr->dt.dl.dr_data == db->db_buf);
|
|
|
|
if (!arc_released(db->db_buf)) {
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state ==
|
|
|
|
DR_OVERRIDDEN);
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
}
|
|
|
|
dr->dt.dl.dr_data = buf;
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(db->db_buf, db));
|
2009-07-02 15:44:48 -07:00
|
|
|
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
|
|
|
|
arc_release(db->db_buf, db);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(db->db_buf, db));
|
2009-07-02 15:44:48 -07:00
|
|
|
}
|
|
|
|
db->db_buf = NULL;
|
|
|
|
}
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_FILL;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
2013-12-09 10:37:51 -08:00
|
|
|
dmu_buf_fill_done(&db->db, tx);
|
2009-07-02 15:44:48 -07:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* "Clear" the contents of this dbuf. This will mark the dbuf
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
* EVICTING and clear *most* of its references. Unfortunately,
|
2008-11-20 12:01:55 -08:00
|
|
|
* when we are not holding the dn_dbufs_mtx, we can't clear the
|
|
|
|
* entry in the dn_dbufs list. We have to wait until dbuf_destroy()
|
|
|
|
* in this case. For callers from the DMU we will usually see:
|
2014-07-15 03:43:18 -04:00
|
|
|
* dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
|
2008-11-20 12:01:55 -08:00
|
|
|
* For the arc callback, we will usually see:
|
2010-08-26 14:24:34 -07:00
|
|
|
* dbuf_do_evict()->dbuf_clear();dbuf_destroy()
|
2008-11-20 12:01:55 -08:00
|
|
|
* Sometimes, though, we will get a mix of these two:
|
2014-07-15 03:43:18 -04:00
|
|
|
* DMU: dbuf_clear()->arc_clear_callback()
|
2008-11-20 12:01:55 -08:00
|
|
|
* ARC: dbuf_do_evict()->dbuf_destroy()
|
2014-07-15 03:43:18 -04:00
|
|
|
*
|
|
|
|
* This routine will dissociate the dbuf from the arc, by calling
|
|
|
|
* arc_clear_callback(), but will not evict the data from the ARC.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_clear(dmu_buf_impl_t *db)
|
|
|
|
{
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
2010-08-26 14:24:34 -07:00
|
|
|
dmu_buf_impl_t *dndb;
|
2014-07-15 03:43:18 -04:00
|
|
|
boolean_t dbuf_gone = B_FALSE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
dbuf_evict_user(db);
|
|
|
|
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
int slots = DB_DNODE(db)->dn_num_slots;
|
|
|
|
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
|
|
|
|
zio_buf_free(db->db.db_data, bonuslen);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_return(bonuslen, ARC_SPACE_BONUS);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
db->db.db_data = NULL;
|
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
db->db_state = DB_EVICTING;
|
|
|
|
db->db_blkptr = NULL;
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
dndb = dn->dn_dbuf;
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
|
2015-04-03 14:14:28 +11:00
|
|
|
avl_remove(&dn->dn_dbufs, db);
|
2015-03-12 11:03:31 +11:00
|
|
|
atomic_dec_32(&dn->dn_dbufs_count);
|
2010-08-26 14:24:34 -07:00
|
|
|
membar_producer();
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
|
|
* Decrementing the dbuf count means that the hold corresponding
|
|
|
|
* to the removed dbuf is no longer discounted in dnode_move(),
|
|
|
|
* so the dnode cannot be moved until after we release the hold.
|
|
|
|
* The membar_producer() ensures visibility of the decremented
|
|
|
|
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
|
|
|
|
* release any lock.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
dnode_rele(dn, db);
|
2010-08-26 14:24:34 -07:00
|
|
|
db->db_dnode_handle = NULL;
|
|
|
|
} else {
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (db->db_buf)
|
2014-07-15 03:43:18 -04:00
|
|
|
dbuf_gone = arc_clear_callback(db->db_buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (!dbuf_gone)
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
|
|
|
/*
|
2010-08-26 14:24:34 -07:00
|
|
|
* If this dbuf is referenced from an indirect dbuf,
|
2008-11-20 12:01:55 -08:00
|
|
|
* decrement the ref count on the indirect dbuf.
|
|
|
|
*/
|
|
|
|
if (parent && parent != dndb)
|
|
|
|
dbuf_rele(parent, db);
|
|
|
|
}
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
/*
|
|
|
|
* Note: While bpp will always be updated if the function returns success,
|
|
|
|
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
|
|
|
|
* this happens when the dnode is the meta-dnode, or a userused or groupused
|
|
|
|
* object.
|
|
|
|
*/
|
2010-08-26 10:58:00 -07:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline int
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
2010-08-26 10:52:00 -07:00
|
|
|
dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
int nlevels, epbs;
|
|
|
|
|
|
|
|
*parentp = NULL;
|
|
|
|
*bpp = NULL;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
|
|
|
|
|
|
|
if (blkid == DMU_SPILL_BLKID) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
if (dn->dn_have_spill &&
|
|
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
|
2010-05-28 13:45:14 -07:00
|
|
|
else
|
|
|
|
*bpp = NULL;
|
|
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
|
|
*parentp = dn->dn_dbuf;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
return (0);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (dn->dn_phys->dn_nlevels == 0)
|
|
|
|
nlevels = 1;
|
|
|
|
else
|
|
|
|
nlevels = dn->dn_phys->dn_nlevels;
|
|
|
|
|
|
|
|
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
ASSERT3U(level * epbs, <, 64);
|
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
if (level >= nlevels ||
|
|
|
|
(blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
|
|
|
|
/* the buffer has no parent yet */
|
2013-03-08 10:41:28 -08:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (level < nlevels-1) {
|
|
|
|
/* this block is referenced from an indirect block */
|
2010-08-26 10:52:00 -07:00
|
|
|
int err;
|
|
|
|
if (dh == NULL) {
|
2015-12-22 02:31:57 +01:00
|
|
|
err = dbuf_hold_impl(dn, level+1,
|
|
|
|
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
|
2013-11-01 20:26:11 +01:00
|
|
|
} else {
|
2010-08-26 10:52:00 -07:00
|
|
|
__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
|
2015-12-22 02:31:57 +01:00
|
|
|
blkid >> epbs, fail_sparse, FALSE, NULL,
|
|
|
|
parentp, dh->dh_depth + 1);
|
2010-08-26 10:52:00 -07:00
|
|
|
err = __dbuf_hold_impl(dh + 1);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
if (err)
|
|
|
|
return (err);
|
|
|
|
err = dbuf_read(*parentp, NULL,
|
|
|
|
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
|
|
|
if (err) {
|
|
|
|
dbuf_rele(*parentp, NULL);
|
|
|
|
*parentp = NULL;
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
|
|
|
|
(blkid & ((1ULL << epbs) - 1));
|
|
|
|
return (0);
|
|
|
|
} else {
|
|
|
|
/* the block is referenced from the dnode */
|
|
|
|
ASSERT3U(level, ==, nlevels-1);
|
|
|
|
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
|
|
|
|
blkid < dn->dn_phys->dn_nblkptr);
|
|
|
|
if (dn->dn_dbuf) {
|
|
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
|
|
*parentp = dn->dn_dbuf;
|
|
|
|
}
|
|
|
|
*bpp = &dn->dn_phys->dn_blkptr[blkid];
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static dmu_buf_impl_t *
|
|
|
|
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
|
|
dmu_buf_impl_t *parent, blkptr_t *blkptr)
|
|
|
|
{
|
2010-05-28 13:45:14 -07:00
|
|
|
objset_t *os = dn->dn_objset;
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *db, *odb;
|
|
|
|
|
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
ASSERT(dn->dn_type != DMU_OT_NONE);
|
|
|
|
|
2014-11-20 19:09:39 -05:00
|
|
|
db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
db->db_objset = os;
|
|
|
|
db->db.db_object = dn->dn_object;
|
|
|
|
db->db_level = level;
|
|
|
|
db->db_blkid = blkid;
|
|
|
|
db->db_last_dirty = NULL;
|
|
|
|
db->db_dirtycnt = 0;
|
2010-08-26 14:24:34 -07:00
|
|
|
db->db_dnode_handle = dn->dn_handle;
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_parent = parent;
|
|
|
|
db->db_blkptr = blkptr;
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
db->db_user = NULL;
|
2015-10-13 14:09:45 -07:00
|
|
|
db->db_user_immediate_evict = FALSE;
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
db->db_pending_evict = FALSE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (blkid == DMU_BONUS_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3P(parent, ==, dn->dn_dbuf);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
|
2008-11-20 12:01:55 -08:00
|
|
|
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
|
|
|
|
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
|
2010-05-28 13:45:14 -07:00
|
|
|
db->db.db_offset = DMU_BONUS_BLKID;
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
/* the bonus dbuf is not placed in the hash table */
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
2008-11-20 12:01:55 -08:00
|
|
|
return (db);
|
2010-05-28 13:45:14 -07:00
|
|
|
} else if (blkid == DMU_SPILL_BLKID) {
|
|
|
|
db->db.db_size = (blkptr != NULL) ?
|
|
|
|
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
|
|
|
|
db->db.db_offset = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
int blocksize =
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db.db_size = blocksize;
|
|
|
|
db->db.db_offset = db->db_blkid * blocksize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hold the dn_dbufs_mtx while we get the new dbuf
|
|
|
|
* in the hash table *and* added to the dbufs list.
|
|
|
|
* This prevents a possible deadlock with someone
|
|
|
|
* trying to look up this dbuf before its added to the
|
|
|
|
* dn_dbufs list.
|
|
|
|
*/
|
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
|
|
db->db_state = DB_EVICTING;
|
|
|
|
if ((odb = dbuf_hash_insert(db)) != NULL) {
|
|
|
|
/* someone else inserted it first */
|
|
|
|
kmem_cache_free(dbuf_cache, db);
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
|
|
return (odb);
|
|
|
|
}
|
2015-04-03 14:14:28 +11:00
|
|
|
avl_add(&dn->dn_dbufs, db);
|
2013-08-20 20:11:52 -08:00
|
|
|
if (db->db_level == 0 && db->db_blkid >=
|
|
|
|
dn->dn_unlisted_l0_blkid)
|
|
|
|
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (parent && parent != dn->dn_dbuf)
|
|
|
|
dbuf_add_ref(parent, db);
|
|
|
|
|
|
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
|
|
refcount_count(&dn->dn_holds) > 0);
|
|
|
|
(void) refcount_add(&dn->dn_holds, db);
|
2015-03-12 11:03:31 +11:00
|
|
|
atomic_inc_32(&dn->dn_dbufs_count);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
dprintf_dbuf(db, "db=%p\n", db);
|
|
|
|
|
|
|
|
return (db);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
dbuf_do_evict(void *private)
|
|
|
|
{
|
2014-07-15 03:43:18 -04:00
|
|
|
dmu_buf_impl_t *db = private;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (!MUTEX_HELD(&db->db_mtx))
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
if (db->db_state != DB_EVICTING) {
|
|
|
|
ASSERT(db->db_state == DB_CACHED);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
db->db_buf = NULL;
|
|
|
|
dbuf_evict(db);
|
|
|
|
} else {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_destroy(db);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_destroy(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* If this dbuf is still on the dn_dbufs list,
|
|
|
|
* remove it from that list.
|
|
|
|
*/
|
2010-08-26 14:24:34 -07:00
|
|
|
if (db->db_dnode_handle != NULL) {
|
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
2015-04-03 14:14:28 +11:00
|
|
|
avl_remove(&dn->dn_dbufs, db);
|
2015-03-12 11:03:31 +11:00
|
|
|
atomic_dec_32(&dn->dn_dbufs_count);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
|
|
* Decrementing the dbuf count means that the hold
|
|
|
|
* corresponding to the removed dbuf is no longer
|
|
|
|
* discounted in dnode_move(), so the dnode cannot be
|
|
|
|
* moved until after we release the hold.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
dnode_rele(dn, db);
|
2010-08-26 14:24:34 -07:00
|
|
|
db->db_dnode_handle = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
dbuf_hash_remove(db);
|
|
|
|
}
|
|
|
|
db->db_parent = NULL;
|
|
|
|
db->db_buf = NULL;
|
|
|
|
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
ASSERT(db->db_hash_next == NULL);
|
|
|
|
ASSERT(db->db_blkptr == NULL);
|
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
kmem_cache_free(dbuf_cache, db);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
typedef struct dbuf_prefetch_arg {
|
|
|
|
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
|
|
|
|
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
|
|
|
|
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
|
|
|
|
int dpa_curlevel; /* The current level that we're reading */
|
|
|
|
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
|
|
|
|
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
|
|
|
|
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
|
|
|
|
} dbuf_prefetch_arg_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Actually issue the prefetch read for the block given.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
|
|
|
{
|
|
|
|
arc_flags_t aflags;
|
|
|
|
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
|
|
|
|
|
|
|
|
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
|
|
|
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
|
|
|
|
ASSERT(dpa->dpa_zio != NULL);
|
|
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
|
|
|
|
dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&aflags, &dpa->dpa_zb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called when an indirect block above our prefetch target is read in. This
|
|
|
|
* will either read in the next indirect block down the tree or issue the actual
|
|
|
|
* prefetch if the next block down is our target.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
|
|
|
|
{
|
|
|
|
dbuf_prefetch_arg_t *dpa = private;
|
|
|
|
uint64_t nextblkid;
|
|
|
|
blkptr_t *bp;
|
|
|
|
|
|
|
|
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
|
|
|
|
ASSERT3S(dpa->dpa_curlevel, >, 0);
|
|
|
|
if (zio != NULL) {
|
|
|
|
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
|
|
|
|
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
|
|
|
|
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
|
|
|
|
}
|
|
|
|
|
|
|
|
dpa->dpa_curlevel--;
|
|
|
|
|
|
|
|
nextblkid = dpa->dpa_zb.zb_blkid >>
|
|
|
|
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
|
|
|
|
bp = ((blkptr_t *)abuf->b_data) +
|
|
|
|
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
|
|
|
|
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
|
|
|
|
kmem_free(dpa, sizeof (*dpa));
|
|
|
|
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
|
|
|
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
|
|
|
dbuf_issue_final_prefetch(dpa, bp);
|
|
|
|
kmem_free(dpa, sizeof (*dpa));
|
|
|
|
} else {
|
|
|
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
|
|
|
zbookmark_phys_t zb;
|
|
|
|
|
|
|
|
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
|
|
|
|
|
|
|
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
|
|
|
|
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
|
|
|
|
|
|
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
|
|
|
|
bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&iter_aflags, &zb);
|
|
|
|
}
|
|
|
|
(void) arc_buf_remove_ref(abuf, private);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Issue prefetch reads for the given block on the given level. If the indirect
|
|
|
|
* blocks above that block are not in memory, we will read them in
|
|
|
|
* asynchronously. As a result, this call never blocks waiting for a read to
|
|
|
|
* complete.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
2015-12-22 02:31:57 +01:00
|
|
|
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|
|
|
arc_flags_t aflags)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-12-22 02:31:57 +01:00
|
|
|
blkptr_t bp;
|
|
|
|
int epbs, nlevels, curlevel;
|
|
|
|
uint64_t curblkid;
|
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
zio_t *pio;
|
|
|
|
dbuf_prefetch_arg_t *dpa;
|
|
|
|
dsl_dataset_t *ds;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
if (blkid > dn->dn_maxblkid)
|
|
|
|
return;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (dnode_block_freed(dn, blkid))
|
|
|
|
return;
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
/*
|
|
|
|
* This dnode hasn't been written to disk yet, so there's nothing to
|
|
|
|
* prefetch.
|
|
|
|
*/
|
|
|
|
nlevels = dn->dn_phys->dn_nlevels;
|
|
|
|
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
|
|
|
|
return;
|
|
|
|
|
|
|
|
db = dbuf_find(dn->dn_objset, dn->dn_object,
|
|
|
|
level, blkid);
|
|
|
|
if (db != NULL) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
2015-12-22 02:31:57 +01:00
|
|
|
* This dbuf already exists. It is either CACHED, or
|
|
|
|
* (we assume) about to be read or filled.
|
2010-08-26 14:24:34 -07:00
|
|
|
*/
|
|
|
|
return;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
/*
|
|
|
|
* Find the closest ancestor (indirect block) of the target block
|
|
|
|
* that is present in the cache. In this indirect block, we will
|
|
|
|
* find the bp that is at curlevel, curblkid.
|
|
|
|
*/
|
|
|
|
curlevel = level;
|
|
|
|
curblkid = blkid;
|
|
|
|
while (curlevel < nlevels - 1) {
|
|
|
|
int parent_level = curlevel + 1;
|
|
|
|
uint64_t parent_blkid = curblkid >> epbs;
|
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
|
|
|
|
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
|
|
|
|
FALSE, TRUE, FTAG, &db) == 0) {
|
|
|
|
blkptr_t *bpp = db->db_buf->b_data;
|
|
|
|
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
|
|
|
|
dbuf_rele(db, FTAG);
|
|
|
|
break;
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
curlevel = parent_level;
|
|
|
|
curblkid = parent_blkid;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
if (curlevel == nlevels - 1) {
|
|
|
|
/* No cached indirect blocks found. */
|
|
|
|
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
|
|
|
|
bp = dn->dn_phys->dn_blkptr[curblkid];
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-12-22 02:31:57 +01:00
|
|
|
if (BP_IS_HOLE(&bp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
|
|
|
|
|
|
|
|
pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
|
|
|
|
ZIO_FLAG_CANFAIL);
|
|
|
|
|
|
|
|
dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
|
|
|
|
ds = dn->dn_objset->os_dsl_dataset;
|
|
|
|
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
|
|
|
|
dn->dn_object, level, blkid);
|
|
|
|
dpa->dpa_curlevel = curlevel;
|
|
|
|
dpa->dpa_prio = prio;
|
|
|
|
dpa->dpa_aflags = aflags;
|
|
|
|
dpa->dpa_spa = dn->dn_objset->os_spa;
|
|
|
|
dpa->dpa_epbs = epbs;
|
|
|
|
dpa->dpa_zio = pio;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have the indirect just above us, no need to do the asynchronous
|
|
|
|
* prefetch chain; we'll just run the last step ourselves. If we're at
|
|
|
|
* a higher level, though, we want to issue the prefetches for all the
|
|
|
|
* indirect blocks asynchronously, so we can go on with whatever we were
|
|
|
|
* doing.
|
|
|
|
*/
|
|
|
|
if (curlevel == level) {
|
|
|
|
ASSERT3U(curblkid, ==, blkid);
|
|
|
|
dbuf_issue_final_prefetch(dpa, &bp);
|
|
|
|
kmem_free(dpa, sizeof (*dpa));
|
|
|
|
} else {
|
|
|
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
|
|
|
zbookmark_phys_t zb;
|
|
|
|
|
|
|
|
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
|
|
|
|
dn->dn_object, curlevel, curblkid);
|
|
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
|
|
|
|
&bp, dbuf_prefetch_indirect_done, dpa, prio,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&iter_aflags, &zb);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We use pio here instead of dpa_zio since it's possible that
|
|
|
|
* dpa may have already been freed.
|
|
|
|
*/
|
|
|
|
zio_nowait(pio);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
 * Number of dbuf_hold_impl_data slots that dbuf_hold_impl() allocates for one
 * hold operation; __dbuf_hold_impl() asserts that dh_depth stays below this.
 */
#define DBUF_HOLD_IMPL_MAX_DEPTH 20
|
2010-08-26 10:52:00 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Returns with db_holds incremented, and db_mtx not held.
|
|
|
|
* Note: dn_struct_rwlock must be held.
|
|
|
|
*/
|
2010-08-26 10:52:00 -07:00
|
|
|
static int
|
|
|
|
__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
|
|
|
|
dh->dh_parent = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
|
|
|
|
ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
|
|
|
|
ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
*(dh->dh_dbp) = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
top:
|
|
|
|
/* dbuf_find() returns with db_mtx held */
|
2015-04-02 22:59:15 +11:00
|
|
|
dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
|
|
|
|
dh->dh_level, dh->dh_blkid);
|
2010-08-26 10:52:00 -07:00
|
|
|
|
|
|
|
if (dh->dh_db == NULL) {
|
|
|
|
dh->dh_bp = NULL;
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
if (dh->dh_fail_uncached)
|
|
|
|
return (SET_ERROR(ENOENT));
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT3P(dh->dh_parent, ==, NULL);
|
|
|
|
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
|
|
|
dh->dh_fail_sparse, &dh->dh_parent,
|
|
|
|
&dh->dh_bp, dh);
|
|
|
|
if (dh->dh_fail_sparse) {
|
2013-11-01 20:26:11 +01:00
|
|
|
if (dh->dh_err == 0 &&
|
|
|
|
dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
|
2013-03-08 10:41:28 -08:00
|
|
|
dh->dh_err = SET_ERROR(ENOENT);
|
2010-08-26 10:52:00 -07:00
|
|
|
if (dh->dh_err) {
|
|
|
|
if (dh->dh_parent)
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
|
|
|
return (dh->dh_err);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
2010-08-26 10:52:00 -07:00
|
|
|
if (dh->dh_err && dh->dh_err != ENOENT)
|
|
|
|
return (dh->dh_err);
|
|
|
|
dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
|
|
|
dh->dh_parent, dh->dh_bp);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-12-22 02:31:57 +01:00
|
|
|
if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
|
|
|
|
mutex_exit(&dh->dh_db->db_mtx);
|
|
|
|
return (SET_ERROR(ENOENT));
|
|
|
|
}
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
|
|
|
|
arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
|
|
|
|
if (dh->dh_db->db_buf->b_data == NULL) {
|
|
|
|
dbuf_clear(dh->dh_db);
|
|
|
|
if (dh->dh_parent) {
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
|
|
|
dh->dh_parent = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
goto top;
|
|
|
|
}
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is currently syncing out, and we are are
|
|
|
|
* still referencing it from db_data, we need to make a copy
|
|
|
|
* of it in case we decide we want to dirty it again in this txg.
|
|
|
|
*/
|
2010-08-26 10:52:00 -07:00
|
|
|
if (dh->dh_db->db_level == 0 &&
|
|
|
|
dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
|
|
|
|
dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
|
|
|
|
dh->dh_dr = dh->dh_db->db_data_pending;
|
|
|
|
|
|
|
|
if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
|
|
|
|
dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
|
|
|
|
|
|
|
|
dbuf_set_data(dh->dh_db,
|
|
|
|
arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
|
|
|
|
dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
|
|
|
|
bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
|
|
|
|
dh->dh_db->db.db_data, dh->dh_db->db.db_size);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
|
|
|
|
DBUF_VERIFY(dh->dh_db);
|
|
|
|
mutex_exit(&dh->dh_db->db_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* NOTE: we can't rele the parent until after we drop the db_mtx */
|
2010-08-26 10:52:00 -07:00
|
|
|
if (dh->dh_parent)
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
|
|
|
|
ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
|
|
|
|
ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
|
|
|
|
*(dh->dh_dbp) = dh->dh_db;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
/*
|
|
|
|
* The following code preserves the recursive function dbuf_hold_impl()
|
|
|
|
* but moves the local variables AND function arguments to the heap to
|
|
|
|
* minimize the stack frame size. Enough space is initially allocated
|
|
|
|
* on the stack for 20 levels of recursion.
|
|
|
|
*/
|
|
|
|
int
|
2015-12-22 02:31:57 +01:00
|
|
|
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
|
|
boolean_t fail_sparse, boolean_t fail_uncached,
|
2010-08-26 10:52:00 -07:00
|
|
|
void *tag, dmu_buf_impl_t **dbp)
|
|
|
|
{
|
|
|
|
struct dbuf_hold_impl_data *dh;
|
|
|
|
int error;
|
|
|
|
|
2016-07-20 22:50:26 -07:00
|
|
|
dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) *
|
2014-11-20 19:09:39 -05:00
|
|
|
DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
|
2015-12-22 02:31:57 +01:00
|
|
|
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
|
|
|
|
fail_uncached, tag, dbp, 0);
|
2010-08-26 10:52:00 -07:00
|
|
|
|
|
|
|
error = __dbuf_hold_impl(dh);
|
|
|
|
|
2013-11-01 20:26:11 +01:00
|
|
|
kmem_free(dh, sizeof (struct dbuf_hold_impl_data) *
|
2010-08-26 10:52:00 -07:00
|
|
|
DBUF_HOLD_IMPL_MAX_DEPTH);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
|
2015-12-22 02:31:57 +01:00
|
|
|
dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
|
|
boolean_t fail_sparse, boolean_t fail_uncached,
|
|
|
|
void *tag, dmu_buf_impl_t **dbp, int depth)
|
2010-08-26 10:52:00 -07:00
|
|
|
{
|
|
|
|
dh->dh_dn = dn;
|
|
|
|
dh->dh_level = level;
|
|
|
|
dh->dh_blkid = blkid;
|
2015-12-22 02:31:57 +01:00
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
dh->dh_fail_sparse = fail_sparse;
|
2015-12-22 02:31:57 +01:00
|
|
|
dh->dh_fail_uncached = fail_uncached;
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
dh->dh_tag = tag;
|
|
|
|
dh->dh_dbp = dbp;
|
2016-07-20 22:50:26 -07:00
|
|
|
|
|
|
|
dh->dh_db = NULL;
|
|
|
|
dh->dh_parent = NULL;
|
|
|
|
dh->dh_bp = NULL;
|
|
|
|
dh->dh_err = 0;
|
|
|
|
dh->dh_dr = NULL;
|
|
|
|
dh->dh_type = 0;
|
|
|
|
|
2010-08-26 10:52:00 -07:00
|
|
|
dh->dh_depth = depth;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
dmu_buf_impl_t *
|
|
|
|
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
|
|
|
|
{
|
2015-12-22 02:31:57 +01:00
|
|
|
return (dbuf_hold_level(dn, 0, blkid, tag));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
dmu_buf_impl_t *
|
|
|
|
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db;
|
2015-12-22 02:31:57 +01:00
|
|
|
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
|
2008-11-20 12:01:55 -08:00
|
|
|
return (err ? NULL : db);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_create_bonus(dnode_t *dn)
|
|
|
|
{
|
|
|
|
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
|
|
|
|
|
|
|
ASSERT(dn->dn_bonus == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
2013-03-08 10:41:28 -08:00
|
|
|
return (SET_ERROR(ENOTSUP));
|
2010-05-28 13:45:14 -07:00
|
|
|
if (blksz == 0)
|
|
|
|
blksz = SPA_MINBLOCKSIZE;
|
2014-11-03 12:15:08 -08:00
|
|
|
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
|
|
|
|
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
2010-05-28 13:45:14 -07:00
|
|
|
dbuf_new_size(db, blksz, tx);
|
2010-08-26 14:24:34 -07:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_add_ref = dbuf_add_ref
|
|
|
|
void
|
|
|
|
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
|
|
|
|
{
|
2010-08-26 09:53:00 -07:00
|
|
|
VERIFY(refcount_add(&db->db_holds, tag) > 1);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-04-02 22:59:15 +11:00
|
|
|
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
|
|
|
|
boolean_t
|
|
|
|
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
|
|
|
|
void *tag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
dmu_buf_impl_t *found_db;
|
|
|
|
boolean_t result = B_FALSE;
|
|
|
|
|
2015-05-28 16:14:19 -07:00
|
|
|
if (blkid == DMU_BONUS_BLKID)
|
2015-04-02 22:59:15 +11:00
|
|
|
found_db = dbuf_find_bonus(os, obj);
|
|
|
|
else
|
|
|
|
found_db = dbuf_find(os, obj, 0, blkid);
|
|
|
|
|
|
|
|
if (found_db != NULL) {
|
|
|
|
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
|
|
|
|
(void) refcount_add(&db->db_holds, tag);
|
|
|
|
result = B_TRUE;
|
|
|
|
}
|
2015-05-28 16:14:19 -07:00
|
|
|
mutex_exit(&found_db->db_mtx);
|
2015-04-02 22:59:15 +11:00
|
|
|
}
|
|
|
|
return (result);
|
|
|
|
}
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
|
|
|
* If you call dbuf_rele() you had better not be referencing the dnode handle
|
|
|
|
* unless you have some other direct or indirect hold on the dnode. (An indirect
|
|
|
|
* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
|
|
|
|
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
|
|
|
|
* dnode's parent dbuf evicting its dnode handles.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
dbuf_rele(dmu_buf_impl_t *db, void *tag)
|
2010-05-28 13:45:14 -07:00
|
|
|
{
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dbuf_rele_and_unlock(db, tag);
|
|
|
|
}
|
|
|
|
|
2013-12-09 10:37:51 -08:00
|
|
|
void
|
|
|
|
dmu_buf_rele(dmu_buf_t *db, void *tag)
|
|
|
|
{
|
|
|
|
dbuf_rele((dmu_buf_impl_t *)db, tag);
|
|
|
|
}
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
|
|
|
|
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
|
|
|
|
* db_dirtycnt and db_holds to be updated atomically.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
int64_t holds;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2008-11-20 12:01:55 -08:00
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
|
|
|
* Remove the reference to the dbuf before removing its hold on the
|
|
|
|
* dnode so we can guarantee in dnode_move() that a referenced bonus
|
|
|
|
* buffer has a corresponding dnode hold.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
holds = refcount_remove(&db->db_holds, tag);
|
|
|
|
ASSERT(holds >= 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can't freeze indirects if there is a possibility that they
|
|
|
|
* may be modified in the current syncing context.
|
|
|
|
*/
|
|
|
|
if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
|
|
|
|
arc_buf_freeze(db->db_buf);
|
|
|
|
|
|
|
|
if (holds == db->db_dirtycnt &&
|
2015-10-13 14:09:45 -07:00
|
|
|
db->db_level == 0 && db->db_user_immediate_evict)
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_evict_user(db);
|
|
|
|
|
|
|
|
if (holds == 0) {
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2015-03-12 11:10:35 +11:00
|
|
|
dnode_t *dn;
|
2015-10-13 14:09:45 -07:00
|
|
|
boolean_t evict_dbuf = db->db_pending_evict;
|
2010-08-26 14:24:34 -07:00
|
|
|
|
|
|
|
/*
|
2015-03-12 11:10:35 +11:00
|
|
|
* If the dnode moves here, we cannot cross this
|
|
|
|
* barrier until the move completes.
|
2010-08-26 14:24:34 -07:00
|
|
|
*/
|
|
|
|
DB_DNODE_ENTER(db);
|
2015-03-12 11:10:35 +11:00
|
|
|
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
atomic_dec_32(&dn->dn_dbufs_count);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decrementing the dbuf count means that the bonus
|
|
|
|
* buffer's dnode hold is no longer discounted in
|
|
|
|
* dnode_move(). The dnode cannot move until after
|
2015-10-13 14:09:45 -07:00
|
|
|
* the dnode_rele() below.
|
2015-03-12 11:10:35 +11:00
|
|
|
*/
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2015-03-12 11:10:35 +11:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not reference db after its lock is dropped.
|
|
|
|
* Another thread may evict it.
|
|
|
|
*/
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2015-10-13 14:09:45 -07:00
|
|
|
if (evict_dbuf)
|
2015-03-12 11:10:35 +11:00
|
|
|
dnode_evict_bonus(dn);
|
2015-10-13 14:09:45 -07:00
|
|
|
|
|
|
|
dnode_rele(dn, db);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (db->db_buf == NULL) {
|
|
|
|
/*
|
|
|
|
* This is a special case: we never associated this
|
|
|
|
* dbuf with any data allocated from the ARC.
|
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(db->db_state == DB_UNCACHED ||
|
|
|
|
db->db_state == DB_NOFILL);
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_evict(db);
|
|
|
|
} else if (arc_released(db->db_buf)) {
|
|
|
|
arc_buf_t *buf = db->db_buf;
|
|
|
|
/*
|
|
|
|
* This dbuf has anonymous data associated with it.
|
|
|
|
*/
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_clear_data(db);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_evict(db);
|
|
|
|
} else {
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(!arc_buf_remove_ref(db->db_buf, db));
|
2012-12-21 14:57:09 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A dbuf will be eligible for eviction if either the
|
|
|
|
* 'primarycache' property is set or a duplicate
|
|
|
|
* copy of this buffer is already cached in the arc.
|
|
|
|
*
|
|
|
|
* In the case of the 'primarycache' a buffer
|
|
|
|
* is considered for eviction if it matches the
|
|
|
|
* criteria set in the property.
|
|
|
|
*
|
|
|
|
* To decide if our buffer is considered a
|
|
|
|
* duplicate, we must call into the arc to determine
|
|
|
|
* if multiple buffers are referencing the same
|
|
|
|
* block on-disk. If so, then we simply evict
|
|
|
|
* ourselves.
|
|
|
|
*/
|
2014-07-15 03:43:18 -04:00
|
|
|
if (!DBUF_IS_CACHEABLE(db)) {
|
|
|
|
if (db->db_blkptr != NULL &&
|
|
|
|
!BP_IS_HOLE(db->db_blkptr) &&
|
|
|
|
!BP_IS_EMBEDDED(db->db_blkptr)) {
|
|
|
|
spa_t *spa =
|
|
|
|
dmu_objset_spa(db->db_objset);
|
|
|
|
blkptr_t bp = *db->db_blkptr;
|
|
|
|
dbuf_clear(db);
|
|
|
|
arc_freed(spa, &bp);
|
|
|
|
} else {
|
|
|
|
dbuf_clear(db);
|
|
|
|
}
|
2015-10-13 14:09:45 -07:00
|
|
|
} else if (db->db_pending_evict ||
|
2015-04-02 14:44:32 +11:00
|
|
|
arc_buf_eviction_needed(db->db_buf)) {
|
2008-12-03 12:09:06 -08:00
|
|
|
dbuf_clear(db);
|
2014-07-15 03:43:18 -04:00
|
|
|
} else {
|
2008-12-03 12:09:06 -08:00
|
|
|
mutex_exit(&db->db_mtx);
|
2014-07-15 03:43:18 -04:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_refcount = dbuf_refcount
|
|
|
|
uint64_t
|
|
|
|
dbuf_refcount(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
return (refcount_count(&db->db_holds));
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
|
|
|
|
dmu_buf_user_t *new_user)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
|
|
if (db->db_user == old_user)
|
|
|
|
db->db_user = new_user;
|
|
|
|
else
|
|
|
|
old_user = db->db_user;
|
|
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
|
|
|
return (old_user);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-04-02 14:44:32 +11:00
|
|
|
return (dmu_buf_replace_user(db_fake, NULL, user));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
2015-04-02 14:44:32 +11:00
|
|
|
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
2015-10-13 14:09:45 -07:00
|
|
|
db->db_user_immediate_evict = TRUE;
|
2015-04-02 14:44:32 +11:00
|
|
|
return (dmu_buf_set_user(db_fake, user));
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
void *
|
|
|
|
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
|
|
|
{
|
|
|
|
return (dmu_buf_replace_user(db_fake, user, NULL));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
dmu_buf_get_user(dmu_buf_t *db_fake)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
2015-04-02 14:44:32 +11:00
|
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
|
|
return (db->db_user);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dmu_buf_user_evict_wait()
|
|
|
|
{
|
|
|
|
taskq_wait(dbu_evict_taskq);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2009-07-02 15:44:48 -07:00
|
|
|
boolean_t
|
|
|
|
dmu_buf_freeable(dmu_buf_t *dbuf)
|
|
|
|
{
|
|
|
|
boolean_t res = B_FALSE;
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
|
|
|
|
|
|
if (db->db_blkptr)
|
|
|
|
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
|
2010-05-28 13:45:14 -07:00
|
|
|
db->db_blkptr, db->db_blkptr->blk_birth);
|
2009-07-02 15:44:48 -07:00
|
|
|
|
|
|
|
return (res);
|
|
|
|
}
|
|
|
|
|
2013-05-10 12:47:54 -07:00
|
|
|
blkptr_t *
|
|
|
|
dmu_buf_get_blkptr(dmu_buf_t *db)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
|
|
|
return (dbi->db_blkptr);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
/* ASSERT(dmu_tx_is_syncing(tx) */
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
if (db->db_blkptr != NULL)
|
|
|
|
return;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
|
2010-05-28 13:45:14 -07:00
|
|
|
BP_ZERO(db->db_blkptr);
|
|
|
|
return;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
|
|
|
|
/*
|
|
|
|
* This buffer was allocated at a time when there was
|
|
|
|
* no available blkptrs from the dnode, or it was
|
|
|
|
* inappropriate to hook it in (i.e., nlevels mis-match).
|
|
|
|
*/
|
|
|
|
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
|
|
|
|
ASSERT(db->db_parent == NULL);
|
|
|
|
db->db_parent = dn->dn_dbuf;
|
|
|
|
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
} else {
|
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
ASSERT(dn->dn_phys->dn_nlevels > 1);
|
|
|
|
if (parent == NULL) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
2015-12-22 02:31:57 +01:00
|
|
|
parent = dbuf_hold_level(dn, db->db_level + 1,
|
|
|
|
db->db_blkid >> epbs, db);
|
2008-11-20 12:01:55 -08:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
db->db_parent = parent;
|
|
|
|
}
|
|
|
|
db->db_blkptr = (blkptr_t *)parent->db.db_data +
|
|
|
|
(db->db_blkid & ((1ULL << epbs) - 1));
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
|
2010-08-26 10:58:36 -07:00
|
|
|
* is critical the we not allow the compiler to inline this function in to
|
|
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
|
|
*/
|
|
|
|
noinline static void
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-11-20 12:01:55 -08:00
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(db->db_level > 0);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Read the block if it hasn't been read yet. */
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_buf == NULL) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
}
|
|
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
ASSERT(db->db_buf != NULL);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Indirect block size must match what the dnode thinks it is. */
|
2010-08-26 14:24:34 -07:00
|
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_check_blkptr(dn, db);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Provide the pending dirty record to child dbufs */
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_data_pending = dr;
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
2008-12-03 12:09:06 -08:00
|
|
|
dbuf_write(dr, db->db_buf, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
zio = dr->dr_zio;
|
|
|
|
mutex_enter(&dr->dt.di.dr_mtx);
|
2015-07-02 18:23:20 +02:00
|
|
|
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
|
|
|
mutex_exit(&dr->dt.di.dr_mtx);
|
|
|
|
zio_nowait(zio);
|
|
|
|
}
|
|
|
|
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
|
2010-08-26 10:58:36 -07:00
|
|
|
* critical that we not allow the compiler to inline this function into
|
|
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
|
|
*/
|
|
|
|
noinline static void
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
arc_buf_t **datap = &dr->dt.dl.dr_data;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
/*
|
|
|
|
* To be synced, we must be dirtied. But we
|
|
|
|
* might have been freed after the dirty.
|
|
|
|
*/
|
|
|
|
if (db->db_state == DB_UNCACHED) {
|
|
|
|
/* This buffer has been freed since it was dirtied */
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
} else if (db->db_state == DB_FILL) {
|
|
|
|
/* This buffer was freed and is now being re-filled */
|
|
|
|
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
|
|
|
|
} else {
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
2016-06-08 15:22:07 +08:00
|
|
|
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
|
|
|
|
/*
|
|
|
|
* In the previous transaction group, the bonus buffer
|
|
|
|
* was entirely used to store the attributes for the
|
|
|
|
* dnode which overrode the dn_spill field. However,
|
|
|
|
* when adding more attributes to the file a spill
|
|
|
|
* block was required to hold the extra attributes.
|
|
|
|
*
|
|
|
|
* Make sure to clear the garbage left in the dn_spill
|
|
|
|
* field from the previous attributes in the bonus
|
|
|
|
* buffer. Otherwise, after writing out the spill
|
|
|
|
* block to the new allocated dva, it will free
|
|
|
|
* the old block pointed to by the invalid dn_spill.
|
|
|
|
*/
|
|
|
|
db->db_blkptr = NULL;
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* If this is a bonus buffer, simply copy the bonus data into the
|
|
|
|
* dnode. It will be written out when the dnode is synced (and it
|
|
|
|
* will be synced, since it must have been dirty for dbuf_sync to
|
|
|
|
* be called).
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_dirty_record_t **drp;
|
|
|
|
|
|
|
|
ASSERT(*datap != NULL);
|
2013-05-10 14:17:03 -07:00
|
|
|
ASSERT0(db->db_level);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
|
|
|
|
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
|
2008-11-20 12:01:55 -08:00
|
|
|
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (*datap != db->db.db_data) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
int slots = DB_DNODE(db)->dn_num_slots;
|
|
|
|
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
|
|
|
|
zio_buf_free(*datap, bonuslen);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_space_return(bonuslen, ARC_SPACE_BONUS);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
db->db_data_pending = NULL;
|
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
while (*drp != dr)
|
|
|
|
drp = &(*drp)->dr_next;
|
|
|
|
ASSERT(dr->dr_next == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 12:01:55 -08:00
|
|
|
*drp = dr->dr_next;
|
2010-08-26 10:19:04 -07:00
|
|
|
if (dr->dr_dbuf->db_level != 0) {
|
|
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
2010-05-28 13:45:14 -07:00
|
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
os = dn->dn_objset;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This function may have dropped the db_mtx lock allowing a dmu_sync
|
|
|
|
* operation to sneak in. As a result, we need to ensure that we
|
|
|
|
* don't check the dr_override_state until we have returned from
|
|
|
|
* dbuf_check_blkptr.
|
|
|
|
*/
|
|
|
|
dbuf_check_blkptr(dn, db);
|
|
|
|
|
|
|
|
/*
|
2010-08-26 14:24:34 -07:00
|
|
|
* If this buffer is in the middle of an immediate write,
|
2008-11-20 12:01:55 -08:00
|
|
|
* wait for the synchronous IO to complete.
|
|
|
|
*/
|
|
|
|
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
|
|
|
|
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
|
|
|
|
}
|
|
|
|
|
2009-07-02 15:44:48 -07:00
|
|
|
if (db->db_state != DB_NOFILL &&
|
|
|
|
dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
refcount_count(&db->db_holds) > 1 &&
|
2010-05-28 13:45:14 -07:00
|
|
|
dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
|
2009-07-02 15:44:48 -07:00
|
|
|
*datap == db->db_buf) {
|
|
|
|
/*
|
|
|
|
* If this buffer is currently "in use" (i.e., there
|
|
|
|
* are active holds and db_data still references it),
|
|
|
|
* then make a copy before we start the write so that
|
|
|
|
* any modifications from the open txg will not leak
|
|
|
|
* into this write.
|
|
|
|
*
|
|
|
|
* NOTE: this copy does not need to be made for
|
|
|
|
* objects only modified in the syncing context (e.g.
|
|
|
|
* DMU_OT_DNODE blocks).
|
|
|
|
*/
|
|
|
|
int blksz = arc_buf_size(*datap);
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
|
|
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
|
|
|
|
bcopy(db->db.db_data, (*datap)->b_data, blksz);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
db->db_data_pending = dr;
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
dbuf_write(dr, *datap, tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
2010-08-26 14:24:34 -07:00
|
|
|
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
|
2008-11-20 12:01:55 -08:00
|
|
|
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Although zio_nowait() does not "wait for an IO", it does
|
|
|
|
* initiate the IO. If this is an empty write it seems plausible
|
|
|
|
* that the IO could actually be completed before the nowait
|
|
|
|
* returns. We need to DB_DNODE_EXIT() first in case
|
|
|
|
* zio_nowait() invalidates the dbuf.
|
|
|
|
*/
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
zio_nowait(dr->dr_zio);
|
2010-08-26 14:24:34 -07:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2015-07-02 18:23:20 +02:00
|
|
|
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
|
2010-08-26 09:52:42 -07:00
|
|
|
while ((dr = list_head(list))) {
|
2008-11-20 12:01:55 -08:00
|
|
|
if (dr->dr_zio != NULL) {
|
|
|
|
/*
|
|
|
|
* If we find an already initialized zio then we
|
|
|
|
* are processing the meta-dnode, and we have finished.
|
|
|
|
* The dbufs for all dnodes are put back on the list
|
|
|
|
* during processing, so that we can zio_wait()
|
|
|
|
* these IOs after initiating all child IOs.
|
|
|
|
*/
|
|
|
|
ASSERT3U(dr->dr_dbuf->db.db_object, ==,
|
|
|
|
DMU_META_DNODE_OBJECT);
|
|
|
|
break;
|
|
|
|
}
|
2015-07-02 18:23:20 +02:00
|
|
|
if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
|
|
|
|
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
|
|
|
|
VERIFY3U(dr->dr_dbuf->db_level, ==, level);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
list_remove(list, dr);
|
|
|
|
if (dr->dr_dbuf->db_level > 0)
|
|
|
|
dbuf_sync_indirect(dr, tx);
|
|
|
|
else
|
|
|
|
dbuf_sync_leaf(dr, tx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
2008-12-03 12:09:06 -08:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 12:01:55 -08:00
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
2010-05-28 13:45:14 -07:00
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
int64_t delta;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t fill = 0;
|
2010-05-28 13:45:14 -07:00
|
|
|
int i;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2016-04-21 11:23:37 -07:00
|
|
|
ASSERT3P(db->db_blkptr, !=, NULL);
|
|
|
|
ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
|
|
|
|
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
|
|
|
|
zio->io_prev_space_delta = delta;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-12-09 10:37:51 -08:00
|
|
|
if (bp->blk_birth != 0) {
|
|
|
|
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
|
|
|
|
BP_GET_TYPE(bp) == dn->dn_type) ||
|
|
|
|
(db->db_blkid == DMU_SPILL_BLKID &&
|
2014-06-05 13:19:08 -08:00
|
|
|
BP_GET_TYPE(bp) == dn->dn_bonustype) ||
|
|
|
|
BP_IS_EMBEDDED(bp));
|
2013-12-09 10:37:51 -08:00
|
|
|
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
2016-04-21 11:23:37 -07:00
|
|
|
ASSERT(!(BP_IS_HOLE(bp)) &&
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID)
|
2008-11-20 12:01:55 -08:00
|
|
|
dn->dn_phys->dn_maxblkid = db->db_blkid;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
|
|
|
|
if (dn->dn_type == DMU_OT_DNODE) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
i = 0;
|
|
|
|
while (i < db->db.db_size) {
|
|
|
|
dnode_phys_t *dnp = db->db.db_data + i;
|
|
|
|
|
|
|
|
i += DNODE_MIN_SIZE;
|
|
|
|
if (dnp->dn_type != DMU_OT_NONE) {
|
2008-11-20 12:01:55 -08:00
|
|
|
fill++;
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
i += dnp->dn_extra_slots *
|
|
|
|
DNODE_MIN_SIZE;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
} else {
|
2013-12-09 10:37:51 -08:00
|
|
|
if (BP_IS_HOLE(bp)) {
|
|
|
|
fill = 0;
|
|
|
|
} else {
|
|
|
|
fill = 1;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
} else {
|
2008-12-03 12:09:06 -08:00
|
|
|
blkptr_t *ibp = db->db.db_data;
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
2008-12-03 12:09:06 -08:00
|
|
|
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
|
|
|
|
if (BP_IS_HOLE(ibp))
|
2008-11-20 12:01:55 -08:00
|
|
|
continue;
|
2014-06-05 13:19:08 -08:00
|
|
|
fill += BP_GET_FILL(ibp);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
if (!BP_IS_EMBEDDED(bp))
|
|
|
|
bp->blk_fill = fill;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
2016-04-21 11:23:37 -07:00
|
|
|
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
|
|
|
*db->db_blkptr = *bp;
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2016-05-15 08:02:28 -07:00
|
|
|
/* ARGSUSED */
|
|
|
|
/*
|
|
|
|
* This function gets called just prior to running through the compression
|
|
|
|
* stage of the zio pipeline. If we're an indirect block comprised of only
|
|
|
|
* holes, then we want this indirect to be compressed away to a hole. In
|
|
|
|
* order to do that we must zero out any information about the holes that
|
|
|
|
* this indirect points to prior to before we try to compress it.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
dnode_t *dn;
|
|
|
|
blkptr_t *bp;
|
|
|
|
uint64_t i;
|
|
|
|
int epbs;
|
|
|
|
|
|
|
|
ASSERT3U(db->db_level, >, 0);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
/* Determine if all our children are holes */
|
|
|
|
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
|
|
|
|
if (!BP_IS_HOLE(bp))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If all the children are holes, then zero them all out so that
|
|
|
|
* we may get compressed away.
|
|
|
|
*/
|
|
|
|
if (i == 1 << epbs) {
|
|
|
|
/* didn't find any non-holes */
|
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
}
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
}
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
|
|
|
|
* The SPA will call this callback several times for each zio - once
|
|
|
|
* for every physical child i/o (zio->io_phys_children times). This
|
|
|
|
* allows the DMU to monitor the progress of each logical i/o. For example,
|
|
|
|
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
|
|
|
|
* block. There may be a long delay before all copies/fragments are completed,
|
|
|
|
* so this callback allows us to retire dirty space gradually, as the physical
|
|
|
|
* i/os complete.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = arg;
|
|
|
|
objset_t *os = db->db_objset;
|
|
|
|
dsl_pool_t *dp = dmu_objset_pool(os);
|
|
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
int delta = 0;
|
|
|
|
|
|
|
|
dr = db->db_data_pending;
|
|
|
|
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The callback will be called io_phys_children times. Retire one
|
|
|
|
* portion of our dirty space each time we are called. Any rounding
|
|
|
|
* error will be cleaned up by dsl_pool_sync()'s call to
|
|
|
|
* dsl_pool_undirty_space().
|
|
|
|
*/
|
|
|
|
delta = dr->dr_accounted / zio->io_phys_children;
|
|
|
|
dsl_pool_undirty_space(dp, delta, zio->io_txg);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
2010-05-28 13:45:14 -07:00
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
2013-12-09 10:37:51 -08:00
|
|
|
blkptr_t *bp = db->db_blkptr;
|
|
|
|
objset_t *os = db->db_objset;
|
|
|
|
dmu_tx_t *tx = os->os_synctx;
|
2008-11-20 12:01:55 -08:00
|
|
|
dbuf_dirty_record_t **drp, *dr;
|
|
|
|
|
2013-05-10 14:17:03 -07:00
|
|
|
ASSERT0(zio->io_error);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkptr == bp);
|
|
|
|
|
2013-05-10 12:47:54 -07:00
|
|
|
/*
|
|
|
|
* For nopwrites and rewrites we ensure that the bp matches our
|
|
|
|
* original and bypass all the accounting.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
|
|
|
} else {
|
2013-12-09 10:37:51 -08:00
|
|
|
dsl_dataset_t *ds = os->os_dsl_dataset;
|
2010-05-28 13:45:14 -07:00
|
|
|
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
|
|
|
|
dsl_dataset_block_born(ds, bp, tx);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
while ((dr = *drp) != db->db_data_pending)
|
|
|
|
drp = &dr->dr_next;
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(dr->dr_next == NULL);
|
|
|
|
*drp = dr->dr_next;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
|
|
|
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (db->db_level == 0) {
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
|
2008-12-03 12:09:06 -08:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
if (dr->dt.dl.dr_data != db->db_buf)
|
|
|
|
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
|
2013-09-04 07:00:57 -05:00
|
|
|
db));
|
2010-05-28 13:45:14 -07:00
|
|
|
else if (!arc_released(db->db_buf))
|
2008-12-03 12:09:06 -08:00
|
|
|
arc_set_callback(db->db_buf, dbuf_do_evict, db);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
2013-12-09 10:37:51 -08:00
|
|
|
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (!BP_IS_HOLE(db->db_blkptr)) {
|
2010-08-26 09:53:00 -07:00
|
|
|
ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
|
|
|
|
SPA_BLKPTRSHIFT);
|
2013-12-09 10:37:51 -08:00
|
|
|
ASSERT3U(db->db_blkid, <=,
|
|
|
|
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
|
|
|
|
db->db.db_size);
|
2014-06-05 13:19:08 -08:00
|
|
|
if (!arc_released(db->db_buf))
|
|
|
|
arc_set_callback(db->db_buf, dbuf_do_evict, db);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
|
|
}
|
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
|
|
|
|
cv_broadcast(&db->db_changed);
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
|
|
|
db->db_data_pending = NULL;
|
2013-12-09 10:37:51 -08:00
|
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_nofill_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_write_ready(zio, NULL, zio->io_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_nofill_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_write_done(zio, NULL, zio->io_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_override_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
|
|
|
|
dbuf_write_ready(zio, NULL, db);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_override_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (!BP_EQUAL(zio->io_bp, obp)) {
|
|
|
|
if (!BP_IS_HOLE(obp))
|
|
|
|
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
|
|
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
dbuf_write_done(zio, NULL, db);
|
|
|
|
}
|
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Issue I/O to commit a dirty buffer to disk. */
|
2010-05-28 13:45:14 -07:00
|
|
|
static void
|
|
|
|
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-26 14:24:34 -07:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2010-05-28 13:45:14 -07:00
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
uint64_t txg = tx->tx_txg;
|
2014-06-25 10:37:59 -08:00
|
|
|
zbookmark_phys_t zb;
|
2010-05-28 13:45:14 -07:00
|
|
|
zio_prop_t zp;
|
|
|
|
zio_t *zio;
|
|
|
|
int wp_flag = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2016-04-21 11:23:37 -07:00
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
os = dn->dn_objset;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
|
|
|
|
/*
|
|
|
|
* Private object buffers are released here rather
|
|
|
|
* than in dbuf_dirty() since they are only modified
|
|
|
|
* in the syncing context and we don't want the
|
|
|
|
* overhead of making multiple copies of the data.
|
|
|
|
*/
|
|
|
|
if (BP_IS_HOLE(db->db_blkptr)) {
|
|
|
|
arc_buf_thaw(data);
|
|
|
|
} else {
|
|
|
|
dbuf_release_bp(db);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (parent != dn->dn_dbuf) {
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Our parent is an indirect block. */
|
|
|
|
/* We have a dirty parent that has been scheduled for write. */
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(parent && parent->db_data_pending);
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Our parent's buffer is one level closer to the dnode. */
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(db->db_level == parent->db_level-1);
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* We're about to modify our parent's db_data by modifying
|
|
|
|
* our block pointer, so the parent must be released.
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(arc_released(parent->db_buf));
|
|
|
|
zio = parent->db_data_pending->dr_zio;
|
|
|
|
} else {
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Our parent is the dnode itself. */
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID) ||
|
|
|
|
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
|
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
|
|
|
ASSERT3P(db->db_blkptr, ==,
|
|
|
|
&dn->dn_phys->dn_blkptr[db->db_blkid]);
|
|
|
|
zio = dn->dn_zio;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(db->db_level == 0 || data == db->db_buf);
|
|
|
|
ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
|
|
|
|
ASSERT(zio);
|
|
|
|
|
|
|
|
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
|
|
|
|
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
|
|
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
|
|
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
|
|
wp_flag = WP_SPILL;
|
|
|
|
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
|
|
|
|
|
|
|
|
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
|
2010-08-26 14:24:34 -07:00
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2016-04-21 11:23:37 -07:00
|
|
|
/*
|
|
|
|
* We copy the blkptr now (rather than when we instantiate the dirty
|
|
|
|
* record), because its value can change between open context and
|
|
|
|
* syncing context. We do not need to hold dn_struct_rwlock to read
|
|
|
|
* db_blkptr because we are in syncing context.
|
|
|
|
*/
|
|
|
|
dr->dr_bp_copy = *db->db_blkptr;
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
if (db->db_level == 0 &&
|
|
|
|
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
|
|
|
/*
|
|
|
|
* The BP for this block has been provided by open context
|
|
|
|
* (by dmu_sync() or dmu_buf_write_embedded()).
|
|
|
|
*/
|
|
|
|
void *contents = (data != NULL) ? data->b_data : NULL;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
dr->dr_zio = zio_write(zio, os->os_spa, txg,
|
2016-04-21 11:23:37 -07:00
|
|
|
&dr->dr_bp_copy, contents, db->db.db_size, &zp,
|
2016-05-15 08:02:28 -07:00
|
|
|
dbuf_write_override_ready, NULL, NULL,
|
|
|
|
dbuf_write_override_done,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
|
|
|
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
2013-05-10 12:47:54 -07:00
|
|
|
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
} else if (db->db_state == DB_NOFILL) {
|
|
|
|
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
|
|
|
|
dr->dr_zio = zio_write(zio, os->os_spa, txg,
|
2016-04-21 11:23:37 -07:00
|
|
|
&dr->dr_bp_copy, NULL, db->db.db_size, &zp,
|
2016-05-15 08:02:28 -07:00
|
|
|
dbuf_write_nofill_ready, NULL, NULL,
|
|
|
|
dbuf_write_nofill_done, db,
|
2010-05-28 13:45:14 -07:00
|
|
|
ZIO_PRIORITY_ASYNC_WRITE,
|
|
|
|
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
|
|
|
|
} else {
|
2016-05-15 08:02:28 -07:00
|
|
|
arc_done_func_t *children_ready_cb = NULL;
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(arc_released(data));
|
2016-05-15 08:02:28 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For indirect blocks, we want to setup the children
|
|
|
|
* ready callback so that we can properly handle an indirect
|
|
|
|
* block that only contains holes.
|
|
|
|
*/
|
|
|
|
if (db->db_level != 0)
|
|
|
|
children_ready_cb = dbuf_write_children_ready;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
dr->dr_zio = arc_write(zio, os->os_spa, txg,
|
2016-04-21 11:23:37 -07:00
|
|
|
&dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
|
2013-08-01 13:02:10 -07:00
|
|
|
DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
|
2016-05-15 08:02:28 -07:00
|
|
|
children_ready_cb,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPUs (on machines with at least 4 CPUs; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
dbuf_write_physdone, dbuf_write_done, db,
|
|
|
|
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2010-08-26 11:49:16 -07:00
|
|
|
|
|
|
|
/*
 * Export the dbuf_* and dmu_buf_* entry points listed below through
 * EXPORT_SYMBOL().  The exports are compiled only when both _KERNEL and
 * HAVE_SPL are defined.
 * NOTE(review): presumably this makes the functions callable from other
 * Linux kernel modules -- confirm against the EXPORT_SYMBOL definition
 * used by this build.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2012-08-10 16:28:37 -07:00
|
|
|
EXPORT_SYMBOL(dbuf_find);
|
|
|
|
EXPORT_SYMBOL(dbuf_is_metadata);
|
|
|
|
EXPORT_SYMBOL(dbuf_evict);
|
|
|
|
EXPORT_SYMBOL(dbuf_loan_arcbuf);
|
|
|
|
EXPORT_SYMBOL(dbuf_whichblock);
|
|
|
|
EXPORT_SYMBOL(dbuf_read);
|
|
|
|
EXPORT_SYMBOL(dbuf_unoverride);
|
|
|
|
EXPORT_SYMBOL(dbuf_free_range);
|
|
|
|
EXPORT_SYMBOL(dbuf_new_size);
|
|
|
|
EXPORT_SYMBOL(dbuf_release_bp);
|
|
|
|
EXPORT_SYMBOL(dbuf_dirty);
|
2010-08-26 11:49:16 -07:00
|
|
|
EXPORT_SYMBOL(dmu_buf_will_dirty);
|
2012-08-10 16:28:37 -07:00
|
|
|
EXPORT_SYMBOL(dmu_buf_will_not_fill);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_will_fill);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_fill_done);
|
2012-08-14 08:35:32 -07:00
|
|
|
EXPORT_SYMBOL(dmu_buf_rele);
|
2012-08-10 16:28:37 -07:00
|
|
|
EXPORT_SYMBOL(dbuf_assign_arcbuf);
|
|
|
|
EXPORT_SYMBOL(dbuf_clear);
|
|
|
|
EXPORT_SYMBOL(dbuf_prefetch);
|
|
|
|
EXPORT_SYMBOL(dbuf_hold_impl);
|
|
|
|
EXPORT_SYMBOL(dbuf_hold);
|
|
|
|
EXPORT_SYMBOL(dbuf_hold_level);
|
|
|
|
EXPORT_SYMBOL(dbuf_create_bonus);
|
|
|
|
EXPORT_SYMBOL(dbuf_spill_set_blksz);
|
|
|
|
EXPORT_SYMBOL(dbuf_rm_spill);
|
|
|
|
EXPORT_SYMBOL(dbuf_add_ref);
|
|
|
|
EXPORT_SYMBOL(dbuf_rele);
|
|
|
|
EXPORT_SYMBOL(dbuf_rele_and_unlock);
|
|
|
|
EXPORT_SYMBOL(dbuf_refcount);
|
|
|
|
EXPORT_SYMBOL(dbuf_sync_list);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_set_user);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_set_user_ie);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_get_user);
|
|
|
|
EXPORT_SYMBOL(dmu_buf_freeable);
|
2014-11-13 10:09:05 -08:00
|
|
|
EXPORT_SYMBOL(dmu_buf_get_blkptr);
|
2010-08-26 11:49:16 -07:00
|
|
|
#endif /* _KERNEL && HAVE_SPL */
|