vfs: convert struct mount counters to per-cpu

There are 3 counters modified all the time in this structure: one keeping
the structure alive (mnt_ref), one preventing unmount (mnt_lockref) and
one tracking active writers (mnt_writeopcount). Exact values of these
counters are very rarely needed, which makes them prime candidates for
conversion to a per-cpu scheme, resulting in much better performance.

Sample benchmark performing fstatfs (which modifies 2 of the 3 counters) on
a 104-way, 2-socket Skylake system:
before:   852393 ops/s
after:  76682077 ops/s

Reviewed by:	kib, jeff
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D21637
Mateusz Guzik, 2019-09-16 21:37:47 +00:00
commit 4cace859c2 (parent e87f3f72f1)
8 changed files with 204 additions and 46 deletions
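The scheme is the classic split counter: each CPU gets its own slot, so the
hot add/sub paths only touch a CPU-local cache line, and an exact total is
computed on demand by summing all slots. A minimal userspace sketch of the
idea (not code from this commit; counter_add/counter_fetch are made-up names
and sched_getcpu(3) stands in for the kernel's curcpu):

#define _GNU_SOURCE
#include <sched.h>  /* sched_getcpu(); glibc extension, native on FreeBSD 13+ */
#include <stdio.h>

#define NCPU    128 /* illustrative fixed bound */

struct counter {
    int base;       /* authoritative part; drained slots land here */
    int pcpu[NCPU]; /* per-CPU deltas, no atomics on the fast path */
};

/* Fast path: touch only this CPU's slot. */
static void
counter_add(struct counter *c, int v)
{
    c->pcpu[sched_getcpu()] += v;
}

/* Slow path: an exact value requires summing all slots. */
static int
counter_fetch(const struct counter *c)
{
    int sum = c->base;

    for (int cpu = 0; cpu < NCPU; cpu++)
        sum += c->pcpu[cpu];
    return (sum);
}

int
main(void)
{
    struct counter c = { 0 };

    counter_add(&c, 1);
    counter_add(&c, 1);
    counter_add(&c, -1);
    printf("%d\n", counter_fetch(&c)); /* prints 1 */
    return (0);
}

The kernel version below additionally runs the fast path inside a critical
section (vfs_op_thread_enter()) so the slot update cannot race with migration
or with vfs_op_enter() draining the slots into the struct fields.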

sys/fs/tmpfs/tmpfs_subr.c

@ -190,8 +190,6 @@ tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type,
/* If the root directory of the 'tmp' file system is not yet
* allocated, this must be the request to do it. */
MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
KASSERT(tmp->tm_root == NULL || mp->mnt_writeopcount > 0,
("creating node not under vn_start_write"));
MPASS(IFF(type == VLNK, target != NULL));
MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));

sys/kern/vfs_default.c

@ -607,7 +607,7 @@ vop_stdgetwritemount(ap)
}
if (vfs_op_thread_enter(mp)) {
if (mp == vp->v_mount)
MNT_REF_UNLOCKED(mp);
vfs_mp_count_add_pcpu(mp, ref, 1);
else
mp = NULL;
vfs_op_thread_exit(mp);

sys/kern/vfs_mount.c

@ -126,6 +126,12 @@ mount_init(void *mem, int size, int flags)
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
M_WAITOK | M_ZERO);
mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
M_WAITOK | M_ZERO);
mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
M_WAITOK | M_ZERO);
mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
M_WAITOK | M_ZERO);
mp->mnt_ref = 0;
mp->mnt_vfs_ops = 1;
return (0);
@ -137,6 +143,9 @@ mount_fini(void *mem, int size)
struct mount *mp;
mp = (struct mount *)mem;
uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu);
uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu);
uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu);
uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu);
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_listmtx);
@ -452,7 +461,7 @@ vfs_ref(struct mount *mp)
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
if (vfs_op_thread_enter(mp)) {
MNT_REF_UNLOCKED(mp);
vfs_mp_count_add_pcpu(mp, ref, 1);
vfs_op_thread_exit(mp);
return;
}
@ -468,7 +477,7 @@ vfs_rel(struct mount *mp)
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
if (vfs_op_thread_enter(mp)) {
MNT_REL_UNLOCKED(mp);
vfs_mp_count_sub_pcpu(mp, ref, 1);
vfs_op_thread_exit(mp);
return;
}
@ -533,6 +542,8 @@ vfs_mount_destroy(struct mount *mp)
if (mp->mnt_vfs_ops == 0)
panic("%s: entered with zero vfs_ops\n", __func__);
vfs_assert_mount_counters(mp);
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_REFEXPIRE;
if (mp->mnt_kern_flag & MNTK_MWAIT) {
@ -1382,6 +1393,7 @@ dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
void
vfs_op_enter(struct mount *mp)
{
int cpu;
MNT_ILOCK(mp);
mp->mnt_vfs_ops++;
@ -1395,7 +1407,20 @@ vfs_op_enter(struct mount *mp)
*/
atomic_thread_fence_seq_cst();
vfs_op_barrier_wait(mp);
/*
* Paired with a fence in vfs_op_thread_exit().
*/
atomic_thread_fence_acq();
CPU_FOREACH(cpu) {
mp->mnt_ref +=
zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu);
mp->mnt_lockref +=
zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu);
mp->mnt_writeopcount +=
zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu);
}
MNT_IUNLOCK(mp);
vfs_assert_mount_counters(mp);
}
void
@ -1435,6 +1460,93 @@ vfs_op_barrier_wait(struct mount *mp)
}
}
#ifdef DIAGNOSTIC
void
vfs_assert_mount_counters(struct mount *mp)
{
int cpu;
if (mp->mnt_vfs_ops == 0)
return;
CPU_FOREACH(cpu) {
if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 ||
*(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 ||
*(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0)
vfs_dump_mount_counters(mp);
}
}
void
vfs_dump_mount_counters(struct mount *mp)
{
int cpu, *count;
int ref, lockref, writeopcount;
printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
printf(" ref : ");
ref = mp->mnt_ref;
CPU_FOREACH(cpu) {
count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu);
printf("%d ", *count);
ref += *count;
}
printf("\n");
printf(" lockref : ");
lockref = mp->mnt_lockref;
CPU_FOREACH(cpu) {
count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu);
printf("%d ", *count);
lockref += *count;
}
printf("\n");
printf("writeopcount: ");
writeopcount = mp->mnt_writeopcount;
CPU_FOREACH(cpu) {
count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu);
printf("%d ", *count);
writeopcount += *count;
}
printf("\n");
printf("counter struct total\n");
printf("ref %-5d %-5d\n", mp->mnt_ref, ref);
printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref);
printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount);
panic("invalid counts on struct mount");
}
#endif
int
vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
{
int *base, *pcpu;
int cpu, sum;
switch (which) {
case MNT_COUNT_REF:
base = &mp->mnt_ref;
pcpu = mp->mnt_ref_pcpu;
break;
case MNT_COUNT_LOCKREF:
base = &mp->mnt_lockref;
pcpu = mp->mnt_lockref_pcpu;
break;
case MNT_COUNT_WRITEOPCOUNT:
base = &mp->mnt_writeopcount;
pcpu = mp->mnt_writeopcount_pcpu;
break;
}
sum = *base;
CPU_FOREACH(cpu) {
sum += *(int *)zpcpu_get_cpu(pcpu, cpu);
}
return (sum);
}
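When a precise-enough value is needed outside of vfs_op_enter(), callers go
through this function. A hypothetical consumer (name not from the commit)
might look like:

/* Hypothetical helper, not part of the commit. */
static bool
mount_has_writers(struct mount *mp)
{

    return (vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT) > 0);
}

The sum is taken without stopping concurrent updaters, so the result can be
momentarily stale; the softdep_request_cleanup() change in the last hunk of
this diff only needs such an estimate.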
/*
* Do the actual filesystem unmount.
*/

sys/kern/vfs_subr.c

@ -645,8 +645,8 @@ vfs_busy(struct mount *mp, int flags)
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
MNT_REF_UNLOCKED(mp);
atomic_add_int(&mp->mnt_lockref, 1);
vfs_mp_count_add_pcpu(mp, ref, 1);
vfs_mp_count_add_pcpu(mp, lockref, 1);
vfs_op_thread_exit(mp);
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
@ -654,6 +654,7 @@ vfs_busy(struct mount *mp, int flags)
}
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
MNT_REF(mp);
/*
* If mount point is currently being unmounted, sleep until the
@ -685,7 +686,7 @@ vfs_busy(struct mount *mp, int flags)
}
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
atomic_add_int(&mp->mnt_lockref, 1);
mp->mnt_lockref++;
MNT_IUNLOCK(mp);
return (0);
}
@ -702,17 +703,23 @@ vfs_unbusy(struct mount *mp)
if (vfs_op_thread_enter(mp)) {
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1;
KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c));
MNT_REL_UNLOCKED(mp);
vfs_mp_count_sub_pcpu(mp, lockref, 1);
vfs_mp_count_sub_pcpu(mp, ref, 1);
vfs_op_thread_exit(mp);
return;
}
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
MNT_REL(mp);
c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1;
KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c));
c = --mp->mnt_lockref;
if (mp->mnt_vfs_ops == 0) {
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
MNT_IUNLOCK(mp);
return;
}
if (c < 0)
vfs_dump_mount_counters(mp);
if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
CTR1(KTR_VFS, "%s: waking up waiters", __func__);
@ -4040,16 +4047,19 @@ DB_SHOW_COMMAND(mount, db_show_mount)
if (jailed(mp->mnt_cred))
db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
db_printf(" }\n");
db_printf(" mnt_ref = %d\n", mp->mnt_ref);
db_printf(" mnt_ref = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
db_printf(" mnt_gen = %d\n", mp->mnt_gen);
db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
db_printf(" mnt_activevnodelistsize = %d\n",
mp->mnt_activevnodelistsize);
db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
db_printf(" mnt_lockref = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
db_printf(" mnt_secondary_accwrites = %d\n",
mp->mnt_secondary_accwrites);

sys/kern/vfs_vnops.c

@ -1628,7 +1628,7 @@ vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
vfs_op_thread_enter(mp)) {
MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
atomic_add_int(&mp->mnt_writeopcount, 1);
vfs_mp_count_add_pcpu(mp, writeopcount, 1);
vfs_op_thread_exit(mp);
return (0);
}
@ -1660,7 +1660,7 @@ vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
}
if (flags & V_XSLEEP)
goto unlock;
atomic_add_int(&mp->mnt_writeopcount, 1);
mp->mnt_writeopcount++;
unlock:
if (error != 0 || (flags & V_XSLEEP) != 0)
MNT_REL(mp);
@ -1797,19 +1797,23 @@ vn_finished_write(struct mount *mp)
return;
if (vfs_op_thread_enter(mp)) {
c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1;
if (c < 0)
panic("vn_finished_write: invalid writeopcount %d", c);
MNT_REL_UNLOCKED(mp);
vfs_mp_count_sub_pcpu(mp, writeopcount, 1);
vfs_mp_count_sub_pcpu(mp, ref, 1);
vfs_op_thread_exit(mp);
return;
}
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
MNT_REL(mp);
c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1;
c = --mp->mnt_writeopcount;
if (mp->mnt_vfs_ops == 0) {
MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
MNT_IUNLOCK(mp);
return;
}
if (c < 0)
panic("vn_finished_write: invalid writeopcount %d", c);
vfs_dump_mount_counters(mp);
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
wakeup(&mp->mnt_writeopcount);
MNT_IUNLOCK(mp);
@ -1852,6 +1856,7 @@ vfs_write_suspend(struct mount *mp, int flags)
vfs_op_enter(mp);
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
if (mp->mnt_susp_owner == curthread) {
vfs_op_exit_locked(mp);
MNT_IUNLOCK(mp);
@ -1909,7 +1914,7 @@ vfs_write_resume(struct mount *mp, int flags)
curthread->td_pflags &= ~TDP_IGNSUSP;
if ((flags & VR_START_WRITE) != 0) {
MNT_REF(mp);
atomic_add_int(&mp->mnt_writeopcount, 1);
mp->mnt_writeopcount++;
}
MNT_IUNLOCK(mp);
if ((flags & VR_NO_SUSPCLR) == 0)

sys/sys/mount.h

@ -228,6 +228,9 @@ struct mount {
TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/
int mnt_vfs_ops; /* (i) pending vfs ops */
int *mnt_thread_in_ops_pcpu;
int *mnt_ref_pcpu;
int *mnt_lockref_pcpu;
int *mnt_writeopcount_pcpu;
};
/*
@ -268,25 +271,16 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *);
#define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx)
#define MNT_MTX(mp) (&(mp)->mnt_mtx)
#define MNT_REF_UNLOCKED(mp) do { \
atomic_add_int(&(mp)->mnt_ref, 1); \
} while (0)
#define MNT_REL_UNLOCKED(mp) do { \
int _c; \
_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \
KASSERT(_c >= 0, ("negative mnt_ref %d", _c)); \
} while (0)
#define MNT_REF(mp) do { \
mtx_assert(MNT_MTX(mp), MA_OWNED); \
atomic_add_int(&(mp)->mnt_ref, 1); \
mp->mnt_ref++; \
} while (0)
#define MNT_REL(mp) do { \
int _c; \
mtx_assert(MNT_MTX(mp), MA_OWNED); \
_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \
KASSERT(_c >= 0, ("negative mnt_ref %d", _c)); \
if (_c == 0) \
(mp)->mnt_ref--; \
if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \
vfs_dump_mount_counters(mp); \
if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \
wakeup((mp)); \
} while (0)
@ -959,6 +953,17 @@ void vfs_op_enter(struct mount *);
void vfs_op_exit_locked(struct mount *);
void vfs_op_exit(struct mount *);
#ifdef DIAGNOSTIC
void vfs_assert_mount_counters(struct mount *);
void vfs_dump_mount_counters(struct mount *);
#else
#define vfs_assert_mount_counters(mp) do { } while (0)
#define vfs_dump_mount_counters(mp) do { } while (0)
#endif
enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT };
int vfs_mount_fetch_counter(struct mount *, enum mount_counter);
/*
* We mark ourselves as entering the section and post a sequentially consistent
* fence, meaning the store is completed before we get into the section and
@ -976,26 +981,41 @@ void vfs_op_exit(struct mount *);
* before making any changes or only make changes safe while the section is
* executed.
*/
#define vfs_op_thread_entered(mp) ({ \
MPASS(curthread->td_critnest > 0); \
*(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1; \
})
#define vfs_op_thread_enter(mp) ({ \
struct mount *_mp = (mp); \
bool _retval = true; \
critical_enter(); \
*(int *)zpcpu_get(_mp->mnt_thread_in_ops_pcpu) = 1; \
MPASS(!vfs_op_thread_entered(mp)); \
*(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 1; \
atomic_thread_fence_seq_cst(); \
if (__predict_false(_mp->mnt_vfs_ops > 0)) { \
vfs_op_thread_exit(_mp); \
if (__predict_false(mp->mnt_vfs_ops > 0)) { \
vfs_op_thread_exit(mp); \
_retval = false; \
} \
_retval; \
})
#define vfs_op_thread_exit(mp) do { \
MPASS(vfs_op_thread_entered(mp)); \
atomic_thread_fence_rel(); \
*(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \
critical_exit(); \
} while (0)
#define vfs_mp_count_add_pcpu(mp, count, val) do { \
MPASS(vfs_op_thread_entered(mp)); \
(*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) += val; \
} while (0)
#define vfs_mp_count_sub_pcpu(mp, count, val) do { \
MPASS(vfs_op_thread_entered(mp)); \
(*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) -= val; \
} while (0)
#else /* !_KERNEL */
#include <sys/cdefs.h>

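Putting the macros together: every fast path converted in this diff has the
same shape, condensed here from the vfs_ref() hunk in sys/kern/vfs_mount.c
(the slow-path lines are pre-existing code the diff context omits):

void
vfs_ref(struct mount *mp)
{

    /* Fast path: no unmount or suspension pending; update this CPU's slot. */
    if (vfs_op_thread_enter(mp)) {
        vfs_mp_count_add_pcpu(mp, ref, 1);
        vfs_op_thread_exit(mp);
        return;
    }

    /*
     * Slow path: vfs_op_enter() has drained the per-CPU slots, so the
     * counter lives in the struct field, protected by the interlock.
     */
    MNT_ILOCK(mp);
    MNT_REF(mp);
    MNT_IUNLOCK(mp);
}

vfs_op_enter() flips all CPUs to the slow path by bumping mnt_vfs_ops and
waiting for the section to drain; vfs_op_exit() reverses this once the
unmount or suspension is done.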
sys/sys/pcpu.h

@ -242,6 +242,18 @@ zpcpu_get_cpu(void *base, int cpu)
return ((char *)(base) + UMA_PCPU_ALLOC_SIZE * cpu);
}
/*
* This operation is NOT atomic and does not post any barriers.
* If you use this, the assumption is that the target CPU will not
* be modifying this variable.
* If you need atomicity, use xchg.
*/
#define zpcpu_replace_cpu(base, val, cpu) ({ \
__typeof(val) _old = *(__typeof(val) *)zpcpu_get_cpu(base, cpu);\
*(__typeof(val) *)zpcpu_get_cpu(base, cpu) = val; \
_old; \
})
/*
* Machine dependent callouts. cpu_pcpu_init() is responsible for
* initializing machine dependent fields of struct pcpu, and

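The only consumer added in this diff is the drain loop in vfs_op_enter():
once vfs_op_barrier_wait() guarantees no CPU is executing inside the
section, each slot can be read and zeroed without atomics and the result
folded into the struct field, e.g. for one of the three counters:

/* From vfs_op_enter() in sys/kern/vfs_mount.c above. */
CPU_FOREACH(cpu)
    mp->mnt_ref += zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu);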
sys/ufs/ffs/ffs_softdep.c

@ -13403,10 +13403,11 @@ softdep_request_cleanup(fs, vp, cred, resource)
* (fs_minfree).
*/
if (resource == FLUSH_INODES_WAIT) {
needed = vp->v_mount->mnt_writeopcount + 2;
needed = vfs_mount_fetch_counter(vp->v_mount,
MNT_COUNT_WRITEOPCOUNT) + 2;
} else if (resource == FLUSH_BLOCKS_WAIT) {
needed = (vp->v_mount->mnt_writeopcount + 2) *
fs->fs_contigsumsize;
needed = (vfs_mount_fetch_counter(vp->v_mount,
MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
needed += fragstoblks(fs,
roundup((fs->fs_dsize * fs->fs_minfree / 100) -