From 4cace859c2a5929e41352d23750b3d5f02978869 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 16 Sep 2019 21:37:47 +0000 Subject: [PATCH] vfs: convert struct mount counters to per-cpu There are 3 counters modified all the time in this structure - one for keeping the structure alive, one for preventing unmount and one for tracking active writers. Exact values of these counters are very rarely needed, which makes them a prime candidate for conversion to a per-cpu scheme, resulting in much better performance. Sample benchmark performing fstatfs (modifying 2 out of 3 counters) on a 104-way 2 socket Skylake system: before: 852393 ops/s after: 76682077 ops/s Reviewed by: kib, jeff Tested by: pho Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D21637 --- sys/fs/tmpfs/tmpfs_subr.c | 2 - sys/kern/vfs_default.c | 2 +- sys/kern/vfs_mount.c | 116 +++++++++++++++++++++++++++++++++++++- sys/kern/vfs_subr.c | 32 +++++++---- sys/kern/vfs_vnops.c | 23 +++++--- sys/sys/mount.h | 56 ++++++++++++------ sys/sys/pcpu.h | 12 ++++ sys/ufs/ffs/ffs_softdep.c | 7 ++- 8 files changed, 204 insertions(+), 46 deletions(-) diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index 32c929fefaa8..51a1a7af51a5 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -190,8 +190,6 @@ tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type, /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. 
*/ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); - KASSERT(tmp->tm_root == NULL || mp->mnt_writeopcount > 0, - ("creating node not under vn_start_write")); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 92fa3ef6826e..163a617a3ec5 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -607,7 +607,7 @@ vop_stdgetwritemount(ap) } if (vfs_op_thread_enter(mp)) { if (mp == vp->v_mount) - MNT_REF_UNLOCKED(mp); + vfs_mp_count_add_pcpu(mp, ref, 1); else mp = NULL; vfs_op_thread_exit(mp); diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 487a162f41af..494f2a536281 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -126,6 +126,12 @@ mount_init(void *mem, int size, int flags) lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); + mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); + mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); + mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; return (0); @@ -137,6 +143,9 @@ mount_fini(void *mem, int size) struct mount *mp; mp = (struct mount *)mem; + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); @@ -452,7 +461,7 @@ vfs_ref(struct mount *mp) CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { - MNT_REF_UNLOCKED(mp); + vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } @@ -468,7 +477,7 @@ vfs_rel(struct mount *mp) CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { - 
MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } @@ -533,6 +542,8 @@ vfs_mount_destroy(struct mount *mp) if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); + vfs_assert_mount_counters(mp); + MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { @@ -1382,6 +1393,7 @@ dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) void vfs_op_enter(struct mount *mp) { + int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; @@ -1395,7 +1407,20 @@ vfs_op_enter(struct mount *mp) */ atomic_thread_fence_seq_cst(); vfs_op_barrier_wait(mp); + /* + * Paired with a fence in vfs_op_thread_exit(). + */ + atomic_thread_fence_acq(); + CPU_FOREACH(cpu) { + mp->mnt_ref += + zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); + mp->mnt_lockref += + zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); + mp->mnt_writeopcount += + zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); + } MNT_IUNLOCK(mp); + vfs_assert_mount_counters(mp); } void @@ -1435,6 +1460,101 @@ vfs_op_barrier_wait(struct mount *mp) } } +#ifdef DIAGNOSTIC +void +vfs_assert_mount_counters(struct mount *mp) +{ + int cpu; + + if (mp->mnt_vfs_ops == 0) + return; + + CPU_FOREACH(cpu) { + if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || + *(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || + *(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) + vfs_dump_mount_counters(mp); + } +} + +void +vfs_dump_mount_counters(struct mount *mp) +{ + int cpu, *count; + int ref, lockref, writeopcount; + + printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); + + printf(" ref : "); + ref = mp->mnt_ref; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); + printf("%d ", *count); + ref += *count; + } + printf("\n"); + printf(" lockref : "); + lockref = mp->mnt_lockref; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); + printf("%d ", *count); + lockref += *count; + } + 
printf("\n"); + printf("writeopcount: "); + writeopcount = mp->mnt_writeopcount; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); + printf("%d ", *count); + writeopcount += *count; + } + printf("\n"); + + printf("counter struct total\n"); + printf("ref %-5d %-5d\n", mp->mnt_ref, ref); + printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); + printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); + + panic("invalid counts on struct mount"); +} +#endif + +/* + * Return the current total of the selected mount counter: the value + * already folded into struct mount plus every CPU's pending delta. + * The per-CPU slots are read without any synchronization here, so the + * sum may be stale unless the caller has otherwise quiesced updates. + */ +int +vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) +{ + int *base, *pcpu; + int cpu, sum; + + switch (which) { + case MNT_COUNT_REF: + base = &mp->mnt_ref; + pcpu = mp->mnt_ref_pcpu; + break; + case MNT_COUNT_LOCKREF: + base = &mp->mnt_lockref; + pcpu = mp->mnt_lockref_pcpu; + break; + case MNT_COUNT_WRITEOPCOUNT: + base = &mp->mnt_writeopcount; + pcpu = mp->mnt_writeopcount_pcpu; + break; + default: + panic("%s: invalid counter %d", __func__, (int)which); + } + + sum = *base; + CPU_FOREACH(cpu) { + sum += *(int *)zpcpu_get_cpu(pcpu, cpu); + } + return (sum); +} + /* * Do the actual filesystem unmount. 
*/ diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8a7995a79011..3bdc4d1e3f65 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -645,8 +645,8 @@ vfs_busy(struct mount *mp, int flags) MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); - MNT_REF_UNLOCKED(mp); - atomic_add_int(&mp->mnt_lockref, 1); + vfs_mp_count_add_pcpu(mp, ref, 1); + vfs_mp_count_add_pcpu(mp, lockref, 1); vfs_op_thread_exit(mp); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); @@ -654,6 +654,7 @@ vfs_busy(struct mount *mp, int flags) } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the @@ -685,7 +686,7 @@ vfs_busy(struct mount *mp, int flags) } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); - atomic_add_int(&mp->mnt_lockref, 1); + mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } @@ -702,17 +703,23 @@ vfs_unbusy(struct mount *mp) if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); - c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1; - KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c)); - MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, lockref, 1); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REL(mp); - c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1; - KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c)); + c = --mp->mnt_lockref; + if (mp->mnt_vfs_ops == 0) { + MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); + MNT_IUNLOCK(mp); + return; + } + if (c < 0) + vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); @@ -4040,16 +4047,19 @@ DB_SHOW_COMMAND(mount, db_show_mount) if (jailed(mp->mnt_cred)) db_printf(", jail=%d", 
mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); - db_printf(" mnt_ref = %d\n", mp->mnt_ref); + db_printf(" mnt_ref = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); - db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); + db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); - db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); + db_printf(" mnt_lockref = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 7cccc2914f9f..4116ee51120d 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1628,7 +1628,7 @@ vn_start_write_refed(struct mount *mp, int flags, bool mplocked) if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); - atomic_add_int(&mp->mnt_writeopcount, 1); + vfs_mp_count_add_pcpu(mp, writeopcount, 1); vfs_op_thread_exit(mp); return (0); } @@ -1660,7 +1660,7 @@ vn_start_write_refed(struct mount *mp, int flags, bool mplocked) } if (flags & V_XSLEEP) goto unlock; - atomic_add_int(&mp->mnt_writeopcount, 1); + mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); @@ -1797,19 +1797,23 @@ vn_finished_write(struct mount *mp) return; if (vfs_op_thread_enter(mp)) { - c = 
atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1; - if (c < 0) - panic("vn_finished_write: invalid writeopcount %d", c); - MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, writeopcount, 1); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REL(mp); - c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1; + c = --mp->mnt_writeopcount; + if (mp->mnt_vfs_ops == 0) { + MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); + MNT_IUNLOCK(mp); + return; + } if (c < 0) - panic("vn_finished_write: invalid writeopcount %d", c); + vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); @@ -1852,6 +1856,7 @@ vfs_write_suspend(struct mount *mp, int flags) vfs_op_enter(mp); MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); @@ -1909,7 +1914,7 @@ vfs_write_resume(struct mount *mp, int flags) curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); - atomic_add_int(&mp->mnt_writeopcount, 1); + mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 8bedb85f65a1..4a5333203f91 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -228,6 +228,9 @@ struct mount { TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ int mnt_vfs_ops; /* (i) pending vfs ops */ int *mnt_thread_in_ops_pcpu; + int *mnt_ref_pcpu; + int *mnt_lockref_pcpu; + int *mnt_writeopcount_pcpu; }; /* @@ -268,25 +271,16 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) -#define MNT_REF_UNLOCKED(mp) do { \ - atomic_add_int(&(mp)->mnt_ref, 1); \ -} while (0) -#define MNT_REL_UNLOCKED(mp) do { \ - int _c; \ - _c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \ - KASSERT(_c >= 0, 
("negative mnt_ref %d", _c)); \ -} while (0) - #define MNT_REF(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ - atomic_add_int(&(mp)->mnt_ref, 1); \ + (mp)->mnt_ref++; \ } while (0) #define MNT_REL(mp) do { \ - int _c; \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ - _c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \ - KASSERT(_c >= 0, ("negative mnt_ref %d", _c)); \ - if (_c == 0) \ + (mp)->mnt_ref--; \ + if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ + vfs_dump_mount_counters(mp); \ + if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) @@ -959,6 +953,17 @@ void vfs_op_enter(struct mount *); void vfs_op_exit_locked(struct mount *); void vfs_op_exit(struct mount *); +#ifdef DIAGNOSTIC +void vfs_assert_mount_counters(struct mount *); +void vfs_dump_mount_counters(struct mount *); +#else +#define vfs_assert_mount_counters(mp) do { } while (0) +#define vfs_dump_mount_counters(mp) do { } while (0) +#endif + +enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; +int vfs_mount_fetch_counter(struct mount *, enum mount_counter); + /* * We mark ourselves as entering the section and post a sequentially consistent * fence, meaning the store is completed before we get into the section and @@ -976,26 +981,41 @@ void vfs_op_exit(struct mount *); * before making any changes or only make changes safe while the section is * executed. 
*/ +#define vfs_op_thread_entered(mp) ({ \ + MPASS(curthread->td_critnest > 0); \ + *(int *)zpcpu_get((mp)->mnt_thread_in_ops_pcpu) == 1; \ +}) #define vfs_op_thread_enter(mp) ({ \ - struct mount *_mp = (mp); \ bool _retval = true; \ critical_enter(); \ - *(int *)zpcpu_get(_mp->mnt_thread_in_ops_pcpu) = 1; \ + MPASS(!vfs_op_thread_entered(mp)); \ + *(int *)zpcpu_get((mp)->mnt_thread_in_ops_pcpu) = 1; \ atomic_thread_fence_seq_cst(); \ - if (__predict_false(_mp->mnt_vfs_ops > 0)) { \ - vfs_op_thread_exit(_mp); \ + if (__predict_false((mp)->mnt_vfs_ops > 0)) { \ + vfs_op_thread_exit(mp); \ _retval = false; \ } \ _retval; \ }) #define vfs_op_thread_exit(mp) do { \ + MPASS(vfs_op_thread_entered(mp)); \ atomic_thread_fence_rel(); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \ critical_exit(); \ } while (0) +#define vfs_mp_count_add_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get((mp)->mnt_##count##_pcpu)) += (val); \ +} while (0) + +#define vfs_mp_count_sub_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get((mp)->mnt_##count##_pcpu)) -= (val); \ +} while (0) + #else /* !_KERNEL */ #include diff --git a/sys/sys/pcpu.h b/sys/sys/pcpu.h index 5298f38fe4ec..5813b8dd0c90 100644 --- a/sys/sys/pcpu.h +++ b/sys/sys/pcpu.h @@ -242,6 +242,18 @@ zpcpu_get_cpu(void *base, int cpu) return ((char *)(base) + UMA_PCPU_ALLOC_SIZE * cpu); } +/* + * This operation is NOT atomic and does not post any barriers. + * If you use this the assumption is that the target CPU will not + * be modifying this variable. + * If you need atomicity use xchg. + */ +#define zpcpu_replace_cpu(base, val, cpu) ({ \ + __typeof(val) _old = *(__typeof(val) *)zpcpu_get_cpu(base, cpu);\ + *(__typeof(val) *)zpcpu_get_cpu(base, cpu) = (val); \ + _old; \ +}) + /* * Machine dependent callouts. 
cpu_pcpu_init() is responsible for * initializing machine dependent fields of struct pcpu, and diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index d9f07f3d9930..cbba3d63c904 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -13403,10 +13403,11 @@ softdep_request_cleanup(fs, vp, cred, resource) * (fs_minfree). */ if (resource == FLUSH_INODES_WAIT) { - needed = vp->v_mount->mnt_writeopcount + 2; + needed = vfs_mount_fetch_counter(vp->v_mount, + MNT_COUNT_WRITEOPCOUNT) + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { - needed = (vp->v_mount->mnt_writeopcount + 2) * - fs->fs_contigsumsize; + needed = (vfs_mount_fetch_counter(vp->v_mount, + MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) -