OpenZFS 6988 spa_sync() spends half its time in dmu_objset_do_userquota_updates
Using a benchmark which creates 2 million files in one TXG, I observe that the thread running spa_sync() is on CPU almost the entire time we are syncing, and therefore can be a performance bottleneck. About 50% of the time in spa_sync() is in dmu_objset_do_userquota_updates(). The problem is that dmu_objset_do_userquota_updates() calls zap_increment_int(DMU_USERUSED_OBJECT) once for every file that was modified (or created). In this benchmark, all the files are owned by the same user/group, so all 2 million calls to zap_increment_int() are modifying the same entry in the zap. The same issue exists for the DMU_GROUPUSED_OBJECT. We should keep an in-memory map from user to space delta while we are syncing, and when we finish, iterate over the in-memory map and modify the ZAP once per entry. This reduces the number of calls to zap_increment_int() from "number of objects modified" to "number of owners/groups of modified files". This reduced the time spent in spa_sync() in the file create benchmark by ~33%, from 11 seconds to 7 seconds. Upstream bugs: DLPX-44799 Ported by: Ned Bass <bass6@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/6988 ZFSonLinux-issue: https://github.com/zfsonlinux/zfs/issues/4642 OpenZFS-commit: unmerged Porting notes: - Added curly braces around declaration of userquota_cache_t cache to quiet compiler warning; - Handled the userobj accounting the same way it proposed in this path. Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
This commit is contained in:
parent
1de321e626
commit
9b7a83cbb6
@ -1338,37 +1338,113 @@ dmu_objset_userobjused_enabled(objset_t *os)
|
||||
spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
|
||||
}
|
||||
|
||||
typedef struct userquota_node {
|
||||
/* must be in the first filed, see userquota_update_cache() */
|
||||
char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
|
||||
int64_t uqn_delta;
|
||||
avl_node_t uqn_node;
|
||||
} userquota_node_t;
|
||||
|
||||
typedef struct userquota_cache {
|
||||
avl_tree_t uqc_user_deltas;
|
||||
avl_tree_t uqc_group_deltas;
|
||||
} userquota_cache_t;
|
||||
|
||||
static int
|
||||
userquota_compare(const void *l, const void *r)
|
||||
{
|
||||
const userquota_node_t *luqn = l;
|
||||
const userquota_node_t *ruqn = r;
|
||||
|
||||
/*
|
||||
* NB: can only access uqn_id because userquota_update_cache() doesn't
|
||||
* pass in an entire userquota_node_t.
|
||||
*/
|
||||
return (strcmp(luqn->uqn_id, ruqn->uqn_id));
|
||||
}
|
||||
|
||||
static void
|
||||
do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
|
||||
uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
|
||||
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
|
||||
{
|
||||
void *cookie;
|
||||
userquota_node_t *uqn;
|
||||
|
||||
ASSERT(dmu_tx_is_syncing(tx));
|
||||
|
||||
cookie = NULL;
|
||||
while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
|
||||
&cookie)) != NULL) {
|
||||
VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
|
||||
uqn->uqn_id, uqn->uqn_delta, tx));
|
||||
kmem_free(uqn, sizeof (*uqn));
|
||||
}
|
||||
avl_destroy(&cache->uqc_user_deltas);
|
||||
|
||||
cookie = NULL;
|
||||
while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
|
||||
&cookie)) != NULL) {
|
||||
VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
|
||||
uqn->uqn_id, uqn->uqn_delta, tx));
|
||||
kmem_free(uqn, sizeof (*uqn));
|
||||
}
|
||||
avl_destroy(&cache->uqc_group_deltas);
|
||||
}
|
||||
|
||||
static void
|
||||
userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
|
||||
{
|
||||
userquota_node_t *uqn;
|
||||
avl_index_t idx;
|
||||
|
||||
ASSERT(strlen(id) < sizeof (uqn->uqn_id));
|
||||
/*
|
||||
* Use id directly for searching because uqn_id is the first field of
|
||||
* userquota_node_t and fields after uqn_id won't be accessed in
|
||||
* avl_find().
|
||||
*/
|
||||
uqn = avl_find(avl, (const void *)id, &idx);
|
||||
if (uqn == NULL) {
|
||||
uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
|
||||
strcpy(uqn->uqn_id, id);
|
||||
avl_insert(avl, uqn, idx);
|
||||
}
|
||||
uqn->uqn_delta += delta;
|
||||
}
|
||||
|
||||
static void
|
||||
do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
|
||||
uint64_t user, uint64_t group, boolean_t subtract)
|
||||
{
|
||||
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
|
||||
int64_t delta = DNODE_MIN_SIZE + used;
|
||||
char name[20];
|
||||
|
||||
if (subtract)
|
||||
delta = -delta;
|
||||
VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
|
||||
user, delta, tx));
|
||||
VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
|
||||
group, delta, tx));
|
||||
|
||||
(void) sprintf(name, "%llx", (longlong_t)user);
|
||||
userquota_update_cache(&cache->uqc_user_deltas, name, delta);
|
||||
|
||||
(void) sprintf(name, "%llx", (longlong_t)group);
|
||||
userquota_update_cache(&cache->uqc_group_deltas, name, delta);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
do_userobjquota_update(objset_t *os, uint64_t flags, uint64_t user,
|
||||
uint64_t group, boolean_t subtract, dmu_tx_t *tx)
|
||||
do_userobjquota_update(userquota_cache_t *cache, uint64_t flags,
|
||||
uint64_t user, uint64_t group, boolean_t subtract)
|
||||
{
|
||||
if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
|
||||
char name[20 + DMU_OBJACCT_PREFIX_LEN];
|
||||
int delta = subtract ? -1 : 1;
|
||||
|
||||
(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
|
||||
(longlong_t)user);
|
||||
VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, name,
|
||||
subtract ? -1 : 1, tx));
|
||||
userquota_update_cache(&cache->uqc_user_deltas, name, delta);
|
||||
|
||||
(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
|
||||
(longlong_t)group);
|
||||
VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, name,
|
||||
subtract ? -1 : 1, tx));
|
||||
userquota_update_cache(&cache->uqc_group_deltas, name, delta);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1377,9 +1453,15 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
|
||||
{
|
||||
dnode_t *dn;
|
||||
list_t *list = &os->os_synced_dnodes;
|
||||
userquota_cache_t cache = { { 0 } };
|
||||
|
||||
ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
|
||||
|
||||
avl_create(&cache.uqc_user_deltas, userquota_compare,
|
||||
sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
|
||||
avl_create(&cache.uqc_group_deltas, userquota_compare,
|
||||
sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
|
||||
|
||||
while ((dn = list_head(list))) {
|
||||
int flags;
|
||||
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
|
||||
@ -1389,36 +1471,27 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
|
||||
|
||||
/* Allocate the user/groupused objects if necessary. */
|
||||
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
|
||||
VERIFY(0 == zap_create_claim(os,
|
||||
DMU_USERUSED_OBJECT,
|
||||
VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT,
|
||||
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
|
||||
VERIFY(0 == zap_create_claim(os,
|
||||
DMU_GROUPUSED_OBJECT,
|
||||
VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT,
|
||||
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
|
||||
}
|
||||
|
||||
/*
|
||||
* We intentionally modify the zap object even if the
|
||||
* net delta is zero. Otherwise
|
||||
* the block of the zap obj could be shared between
|
||||
* datasets but need to be different between them after
|
||||
* a bprewrite.
|
||||
*/
|
||||
|
||||
flags = dn->dn_id_flags;
|
||||
ASSERT(flags);
|
||||
if (flags & DN_ID_OLD_EXIST) {
|
||||
do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
|
||||
dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
|
||||
do_userobjquota_update(os, dn->dn_oldflags,
|
||||
dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
|
||||
do_userquota_update(&cache,
|
||||
dn->dn_oldused, dn->dn_oldflags,
|
||||
dn->dn_olduid, dn->dn_oldgid, B_TRUE);
|
||||
do_userobjquota_update(&cache, dn->dn_oldflags,
|
||||
dn->dn_olduid, dn->dn_oldgid, B_TRUE);
|
||||
}
|
||||
if (flags & DN_ID_NEW_EXIST) {
|
||||
do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
|
||||
dn->dn_phys->dn_flags, dn->dn_newuid,
|
||||
dn->dn_newgid, B_FALSE, tx);
|
||||
do_userobjquota_update(os, dn->dn_phys->dn_flags,
|
||||
dn->dn_newuid, dn->dn_newgid, B_FALSE, tx);
|
||||
do_userquota_update(&cache,
|
||||
DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
|
||||
dn->dn_newuid, dn->dn_newgid, B_FALSE);
|
||||
do_userobjquota_update(&cache, dn->dn_phys->dn_flags,
|
||||
dn->dn_newuid, dn->dn_newgid, B_FALSE);
|
||||
}
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
@ -1439,6 +1512,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
|
||||
list_remove(list, dn);
|
||||
dnode_rele(dn, list);
|
||||
}
|
||||
do_userquota_cacheflush(os, &cache, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2001,9 +2001,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
|
||||
boolean_t hole;
|
||||
int i, inc, error, span;
|
||||
|
||||
dprintf("probing object %llu offset %llx level %d of %u\n",
|
||||
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
|
||||
|
||||
hole = ((flags & DNODE_FIND_HOLE) != 0);
|
||||
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
|
||||
ASSERT(txg == 0 || !hole);
|
||||
|
Loading…
x
Reference in New Issue
Block a user