6988 spa_sync() spends half its time in dmu_objset_do_userquota_updates

Using a benchmark which creates 2 million files in one TXG, I observe
that the thread running spa_sync() is on CPU almost the entire time we
are syncing, and therefore can be a performance bottleneck. About 50% of
the time in spa_sync() is in dmu_objset_do_userquota_updates().

The problem is that dmu_objset_do_userquota_updates() calls
zap_increment_int(DMU_USERUSED_OBJECT) once for every file that was
modified (or created). In this benchmark, all the files are owned by the
same user/group, so all 2 million calls to zap_increment_int() are
modifying the same entry in the ZAP. The same issue exists for the
DMU_GROUPUSED_OBJECT.

We should keep an in-memory map from user (and group) ID to space delta
while we are syncing, and when we finish, iterate over the in-memory map
and modify the ZAP once per entry. This reduces the number of calls to
zap_increment_int() from "number of objects modified" to "number of
owners/groups of modified files".
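
To make the pattern concrete, here is a small user-space sketch of the
same idea. It is not the committed code: the patch below keys an AVL
tree by ID and flushes through zap_increment_int(), while this sketch
substitutes a plain linked list, and fake_zap_increment(), cache_update(),
and cache_flush() are illustrative names only.

/*
 * User-space sketch of the delta-aggregation scheme.  A linked list
 * stands in for the kernel's AVL tree (which gives O(log n) lookups),
 * and fake_zap_increment() stands in for zap_increment_int().
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct uq_node {
	uint64_t uqn_id;		/* user (or group) id */
	int64_t uqn_delta;		/* accumulated space delta */
	struct uq_node *uqn_next;
} uq_node_t;

static int zap_writes;			/* counts ZAP modifications */

/* Stand-in for zap_increment_int(): each call is one ZAP modification. */
static void
fake_zap_increment(uint64_t id, int64_t delta)
{
	zap_writes++;
	printf("zap: id %llu += %lld\n",
	    (unsigned long long)id, (long long)delta);
}

/* Accumulate a delta in memory instead of touching the ZAP per object. */
static void
cache_update(uq_node_t **head, uint64_t id, int64_t delta)
{
	uq_node_t *n;

	for (n = *head; n != NULL; n = n->uqn_next) {
		if (n->uqn_id == id) {
			n->uqn_delta += delta;
			return;
		}
	}
	/* New id: add a node (assume allocation succeeds, as KM_SLEEP does). */
	n = calloc(1, sizeof (*n));
	n->uqn_id = id;
	n->uqn_delta = delta;
	n->uqn_next = *head;
	*head = n;
}

/* Flush: one ZAP modification per distinct id, then free the map. */
static void
cache_flush(uq_node_t **head)
{
	uq_node_t *n;

	while ((n = *head) != NULL) {
		*head = n->uqn_next;
		fake_zap_increment(n->uqn_id, n->uqn_delta);
		free(n);
	}
}

int
main(void)
{
	uq_node_t *user_deltas = NULL;

	/* 2 million files, all owned by uid 1001: 2M in-memory updates... */
	for (int i = 0; i < 2000000; i++)
		cache_update(&user_deltas, 1001, 512);

	/* ...but a single ZAP write when the sync completes. */
	cache_flush(&user_deltas);
	printf("%d ZAP write(s) instead of 2000000\n", zap_writes);
	return (0);
}

Run standalone, this reports a single ZAP write for the two million
updates, which is exactly the reduction the patch achieves inside
spa_sync().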

This reduced the time spent in spa_sync() in the file-create benchmark
by roughly a third, from 11 seconds to 7 seconds.

Closes #107

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Steve Gonczi <steve.gonczi@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Jinshan Xiong <jinshan.xiong@intel.com>
Author: Matthew Ahrens <mahrens@delphix.com>

openzfs/openzfs@5fc46359c5
Committed by: Alexander Motin, 2016-10-14 11:46:17 +00:00
Parent: 23f49f4ada
Commit: d503f63e1b
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/vendor-sys/illumos/dist/; revision=307314


@@ -1207,18 +1207,83 @@ dmu_objset_userused_enabled(objset_t *os)
 	    DMU_USERUSED_DNODE(os) != NULL);
 }
 
+typedef struct userquota_node {
+	uint64_t uqn_id;
+	int64_t uqn_delta;
+	avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+	avl_tree_t uqc_user_deltas;
+	avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+	const userquota_node_t *luqn = l;
+	const userquota_node_t *ruqn = r;
+
+	if (luqn->uqn_id < ruqn->uqn_id)
+		return (-1);
+	if (luqn->uqn_id > ruqn->uqn_id)
+		return (1);
+	return (0);
+}
+
 static void
-do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
-    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 {
+	void *cookie;
+	userquota_node_t *uqn;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_user_deltas);
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+	userquota_node_t search = { .uqn_id = id };
+	avl_index_t idx;
+
+	userquota_node_t *uqn = avl_find(avl, &search, &idx);
+	if (uqn == NULL) {
+		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+		uqn->uqn_id = id;
+		avl_insert(avl, uqn, idx);
+	}
+	uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
+{
 	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
 		int64_t delta = DNODE_SIZE + used;
 		if (subtract)
 			delta = -delta;
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
-		    user, delta, tx));
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
-		    group, delta, tx));
+
+		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
 	}
 }
@@ -1227,9 +1292,15 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
+	userquota_cache_t cache = { 0 };
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
+	avl_create(&cache.uqc_user_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+	avl_create(&cache.uqc_group_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
@@ -1239,32 +1310,26 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
-		/*
-		 * We intentionally modify the zap object even if the
-		 * net delta is zero.  Otherwise
-		 * the block of the zap obj could be shared between
-		 * datasets but need to be different between them after
-		 * a bprewrite.
-		 */
-
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST) {
-			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
-			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+			do_userquota_update(&cache,
+			    dn->dn_oldused, dn->dn_oldflags,
+			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
-			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+			do_userquota_update(&cache,
+			    DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags, dn->dn_newuid,
-			    dn->dn_newgid, B_FALSE, tx);
+			    dn->dn_newgid, B_FALSE);
 		}
 
 		mutex_enter(&dn->dn_mtx);
@@ -1285,6 +1350,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 		list_remove(list, dn);
 		dnode_rele(dn, list);
 	}
+	do_userquota_cacheflush(os, &cache, tx);
 }
 
 /*