amd64 pmap: rework delayed invalidation, removing global mutex.

For machines that have the cmpxchg16b instruction, i.e. everything but
very early Athlons, provide a lockless implementation of delayed
invalidation.

The implementation maintains a lockless singly-linked list, using the
trick from the T.L. Harris article of marking an element that is being
removed via the low bit of its next pointer.  A double-width CAS
(cmpxchg16b) is used to update both the link and the generation
atomically.  A new thread starting DI appends itself to the end of the
queue, setting its generation to the generation of the last element
+ 1.  On DI finish, the thread donates its generation to the previous
element.  The generation of the fake head of the list is the last
passed DI generation.  In essence, the implementation is a queued
spinlock, but without the spinning.
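
For illustration only, a minimal user-space sketch of the scheme
(hypothetical names, not the committed code): a 16-byte {generation, link}
node updated with a double-width CAS, with bit 0 of the link doubling as the
Harris-style "being removed" mark.  It assumes gcc or clang with -mcx16 on
amd64; plain loads are used for brevity where the kernel code snapshots
gen/next together with the same CAS.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct di_node {
	uint64_t	gen;	/* generation */
	struct di_node	*next;	/* link; bit 0 = removal mark */
} __attribute__((aligned(16)));

static bool
di_cas(struct di_node *p, struct di_node expect, struct di_node update)
{
	__int128 exp, upd;

	memcpy(&exp, &expect, sizeof(exp));
	memcpy(&upd, &update, sizeof(upd));
	/* Compiles to lock cmpxchg16b with -mcx16. */
	return (__sync_bool_compare_and_swap((__int128 *)p, exp, upd));
}

/* Append n after the current tail, taking the tail's generation + 1. */
static void
di_append(struct di_node *head, struct di_node *n)
{
	struct di_node *p, expect, update;

	for (;;) {
		/* Find the tail; restart if a marked link is observed. */
		for (p = head; p->next != NULL &&
		    ((uintptr_t)p->next & 1) == 0; p = p->next)
			;
		if (((uintptr_t)p->next & 1) != 0)
			continue;
		expect.gen = p->gen;
		expect.next = NULL;
		n->gen = expect.gen + 1;
		n->next = NULL;
		update.gen = expect.gen;
		update.next = n;
		if (di_cas(p, expect, update))
			return;
		/* Lost the race: rescan from the head. */
	}
}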

Many thanks to both Peter Holm and Mark Johnson for bearing with me
while I produced intermediate versions of the patch.

Reviewed by:	markj
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	1 month
MFC note:	td_md.md_invl_gen should go to the end of struct thread
Differential revision:	https://reviews.freebsd.org/D19630
Author:	Konstantin Belousov
Date:	2019-05-16 13:28:48 +00:00
Commit:	4d3b28bcdc (parent a9fd669b4a)
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=347695

6 changed files with 385 additions and 21 deletions

sys/amd64/amd64/pmap.c

@@ -107,6 +107,7 @@ __FBSDID("$FreeBSD$");
* and to when physical maps must be made correct.
*/
#include "opt_ddb.h"
#include "opt_pmap.h"
#include "opt_vm.h"
@@ -130,6 +131,10 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -468,22 +473,46 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
.lo_name = "invlts",
};
static bool
pmap_not_in_di(void)
{
return (curthread->td_md.md_invl_gen.gen == 0);
}
static struct pmap_invl_gen pmap_invl_gen_head = {
.gen = 1,
.next = NULL,
};
static u_long pmap_invl_gen = 1;
#define PMAP_ASSERT_NOT_IN_DI() \
KASSERT(pmap_not_in_di(), ("DI already started"))
static bool pmap_not_in_di_l(void);
static bool pmap_not_in_di_u(void);
DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
{
return ((cpu_feature2 & CPUID2_CX16) == 0 ? pmap_not_in_di_l :
pmap_not_in_di_u);
}
static bool
pmap_not_in_di_l(void)
{
struct pmap_invl_gen *invl_gen;
invl_gen = &curthread->td_md.md_invl_gen;
return (invl_gen->gen == 0);
}
static void
pmap_thread_init_invl_gen_l(struct thread *td)
{
struct pmap_invl_gen *invl_gen;
invl_gen = &td->td_md.md_invl_gen;
invl_gen->gen = 0;
}
/*
* Start a new Delayed Invalidation (DI) block of code, executed by
* the current thread. Within a DI block, the current thread may
@@ -493,7 +522,7 @@ pmap_not_in_di(void)
* pmap active.
*/
static void
pmap_delayed_invl_started(void)
pmap_delayed_invl_started_l(void)
{
struct pmap_invl_gen *invl_gen;
u_long currgen;
@@ -525,7 +554,7 @@ pmap_delayed_invl_started(void)
* current thread's DI.
*/
static void
pmap_delayed_invl_finished(void)
pmap_delayed_invl_finished_l(void)
{
struct pmap_invl_gen *invl_gen, *next;
struct turnstile *ts;
@@ -551,6 +580,284 @@ pmap_delayed_invl_finished(void)
invl_gen->gen = 0;
}
static bool
pmap_not_in_di_u(void)
{
struct pmap_invl_gen *invl_gen;
invl_gen = &curthread->td_md.md_invl_gen;
return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
}
static void
pmap_thread_init_invl_gen_u(struct thread *td)
{
struct pmap_invl_gen *invl_gen;
invl_gen = &td->td_md.md_invl_gen;
invl_gen->gen = 0;
invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
}
static bool
pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
{
uint64_t new_high, new_low, old_high, old_low;
char res;
old_low = new_low = 0;
old_high = new_high = (uintptr_t)0;
__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
: "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
: "b"(new_low), "c" (new_high)
: "memory", "cc");
if (res == 0) {
if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
return (false);
out->gen = old_low;
out->next = (void *)old_high;
} else {
out->gen = new_low;
out->next = (void *)new_high;
}
return (true);
}
static bool
pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
struct pmap_invl_gen *new_val)
{
uint64_t new_high, new_low, old_high, old_low;
char res;
new_low = new_val->gen;
new_high = (uintptr_t)new_val->next;
old_low = old_val->gen;
old_high = (uintptr_t)old_val->next;
__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
: "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
: "b"(new_low), "c" (new_high)
: "memory", "cc");
return (res);
}
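
As an aside, each of the two helpers above is a single lock cmpxchg16b; a
rough user-space equivalent using compiler builtins (an illustration under the
assumption of gcc/clang with -mcx16, not the committed interface) would be:

#include <stdbool.h>

/* The 16-byte {gen, next} pair viewed as a single integer. */
static inline __int128
di_load_snapshot(__int128 *ptr)
{
	/* A CAS against 0 never stores anything new, yet returns the
	 * current 16-byte contents in one atomic step. */
	return (__sync_val_compare_and_swap(ptr, (__int128)0, (__int128)0));
}

static inline bool
di_cas128(__int128 *ptr, __int128 oldv, __int128 newv)
{
	return (__sync_bool_compare_and_swap(ptr, oldv, newv));
}

The kernel version of the load additionally rejects snapshots whose
PMAP_INVL_GEN_NEXT_INVALID bit is set, and writing the asm directly keeps the
CX16 dependency explicit; the ifunc dispatch added later in this file only
selects this path when CPUID reports CX16.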
#ifdef PV_STATS
static long invl_start_restart;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
&invl_start_restart, 0,
"");
static long invl_finish_restart;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
&invl_finish_restart, 0,
"");
static int invl_max_qlen;
SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
&invl_max_qlen, 0,
"");
#endif
static struct lock_delay_config __read_frequently di_delay;
LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
static void
pmap_delayed_invl_started_u(void)
{
struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
struct thread *td;
struct lock_delay_arg lda;
uintptr_t prevl;
u_char pri;
#ifdef PV_STATS
int i, ii;
#endif
td = curthread;
invl_gen = &td->td_md.md_invl_gen;
PMAP_ASSERT_NOT_IN_DI();
lock_delay_arg_init(&lda, &di_delay);
thread_lock(td);
pri = td->td_base_pri;
if (pri < PVM) {
invl_gen->saved_pri = 0;
} else {
invl_gen->saved_pri = pri;
sched_prio(td, PVM);
}
thread_unlock(td);
again:
PV_STAT(i = 0);
for (p = &pmap_invl_gen_head;; p = prev.next) {
PV_STAT(i++);
prevl = atomic_load_ptr(&p->next);
if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
PV_STAT(atomic_add_long(&invl_start_restart, 1));
lock_delay(&lda);
goto again;
}
if (prevl == 0)
break;
prev.next = (void *)prevl;
}
#ifdef PV_STATS
if ((ii = invl_max_qlen) < i)
atomic_cmpset_int(&invl_max_qlen, ii, i);
#endif
if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
PV_STAT(atomic_add_long(&invl_start_restart, 1));
lock_delay(&lda);
goto again;
}
new_prev.gen = prev.gen;
new_prev.next = invl_gen;
invl_gen->gen = prev.gen + 1;
/* Formal fence between store to invl->gen and updating *p. */
atomic_thread_fence_rel();
/*
* After inserting an invl_gen element with invalid bit set,
* this thread blocks any other thread trying to enter the
* delayed invalidation block. Do not allow to remove us from
* the CPU, because it causes starvation for other threads.
*/
critical_enter();
/*
* ABA for *p is not possible there, since p->gen can only
* increase. So if the *p thread finished its di, then
* started a new one and got inserted into the list at the
* same place, its gen will appear greater than the previously
* read gen.
*/
if (!pmap_di_store_invl(p, &prev, &new_prev)) {
critical_exit();
PV_STAT(atomic_add_long(&invl_start_restart, 1));
lock_delay(&lda);
goto again;
}
/*
* There we clear PMAP_INVL_GEN_NEXT_INVALID in
* invl_gen->next, allowing other threads to iterate past us.
* pmap_di_store_invl() provides fence between the generation
* write and the update of next.
*/
invl_gen->next = NULL;
critical_exit();
}
static bool
pmap_delayed_invl_finished_u_crit(struct pmap_invl_gen *invl_gen,
struct pmap_invl_gen *p)
{
struct pmap_invl_gen prev, new_prev;
u_long mygen;
/*
* Load invl_gen->gen after setting invl_gen->next
* PMAP_INVL_GEN_NEXT_INVALID. This prevents larger
* generations to propagate to our invl_gen->gen. Lock prefix
* in atomic_set_ptr() worked as seq_cst fence.
*/
mygen = atomic_load_long(&invl_gen->gen);
if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
return (false);
KASSERT(prev.gen < mygen,
("invalid di gen sequence %lu %lu", prev.gen, mygen));
new_prev.gen = mygen;
new_prev.next = (void *)((uintptr_t)invl_gen->next &
~PMAP_INVL_GEN_NEXT_INVALID);
/* Formal fence between load of prev and storing update to it. */
atomic_thread_fence_rel();
return (pmap_di_store_invl(p, &prev, &new_prev));
}
static void
pmap_delayed_invl_finished_u(void)
{
struct pmap_invl_gen *invl_gen, *p;
struct thread *td;
struct lock_delay_arg lda;
uintptr_t prevl;
td = curthread;
invl_gen = &td->td_md.md_invl_gen;
KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
("missed invl_start: INVALID"));
lock_delay_arg_init(&lda, &di_delay);
again:
for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
prevl = atomic_load_ptr(&p->next);
if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
PV_STAT(atomic_add_long(&invl_finish_restart, 1));
lock_delay(&lda);
goto again;
}
if ((void *)prevl == invl_gen)
break;
}
/*
* It is legitimate to not find ourself on the list if a
* thread before us finished its DI and started it again.
*/
if (__predict_false(p == NULL)) {
PV_STAT(atomic_add_long(&invl_finish_restart, 1));
lock_delay(&lda);
goto again;
}
critical_enter();
atomic_set_ptr((uintptr_t *)&invl_gen->next,
PMAP_INVL_GEN_NEXT_INVALID);
if (!pmap_delayed_invl_finished_u_crit(invl_gen, p)) {
atomic_clear_ptr((uintptr_t *)&invl_gen->next,
PMAP_INVL_GEN_NEXT_INVALID);
critical_exit();
PV_STAT(atomic_add_long(&invl_finish_restart, 1));
lock_delay(&lda);
goto again;
}
critical_exit();
if (invl_gen->saved_pri != 0) {
thread_lock(td);
sched_prio(td, invl_gen->saved_pri);
thread_unlock(td);
}
}
#ifdef DDB
DB_SHOW_COMMAND(di_queue, pmap_di_queue)
{
struct pmap_invl_gen *p, *pn;
struct thread *td;
uintptr_t nextl;
bool first;
for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
first = false) {
nextl = atomic_load_ptr(&p->next);
pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
td = first ? NULL : __containerof(p, struct thread,
td_md.md_invl_gen);
db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
(nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
td != NULL ? td->td_tid : -1);
}
}
#endif
#ifdef PV_STATS
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
@@ -579,7 +886,7 @@ pmap_delayed_invl_genp(vm_page_t m)
* processor.
*/
static void
pmap_delayed_invl_wait(vm_page_t m)
pmap_delayed_invl_wait_l(vm_page_t m)
{
struct turnstile *ts;
u_long *m_gen;
@@ -603,6 +910,54 @@ pmap_delayed_invl_wait(vm_page_t m)
}
}
static void
pmap_delayed_invl_wait_u(vm_page_t m)
{
u_long *m_gen;
#ifdef PV_STATS
bool accounted = false;
#endif
m_gen = pmap_delayed_invl_genp(m);
while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
#ifdef PV_STATS
if (!accounted) {
atomic_add_long(&invl_wait, 1);
accounted = true;
}
#endif
kern_yield(PRI_USER);
}
}
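
A short worked example of the generation hand-off (numbers invented for
illustration): suppose pmap_invl_gen_head.gen is 10, thread A starts DI and
queues itself with gen 11, then thread B does the same with gen 12 and tags a
page m with gen 12 from its DI block.  If B finishes first, it unlinks itself
and donates gen 12 to A's node; the head still reads 10, so a waiter on m
keeps looping in pmap_delayed_invl_wait_u().  Only when A finishes does the
head advance, in one step, from 10 to the donated 12, at which point the
waiter's test *m_gen > head.gen becomes false and it proceeds.  Generations
thus become visible strictly in queue order, which is what makes the single
comparison against the head sufficient.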
DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
{
return ((cpu_feature2 & CPUID2_CX16) == 0 ?
pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u);
}
DEFINE_IFUNC(static, void, pmap_delayed_invl_started, (void), static)
{
return ((cpu_feature2 & CPUID2_CX16) == 0 ?
pmap_delayed_invl_started_l : pmap_delayed_invl_started_u);
}
DEFINE_IFUNC(static, void, pmap_delayed_invl_finished, (void), static)
{
return ((cpu_feature2 & CPUID2_CX16) == 0 ?
pmap_delayed_invl_finished_l : pmap_delayed_invl_finished_u);
}
DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
{
return ((cpu_feature2 & CPUID2_CX16) == 0 ?
pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u);
}
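
For context, a hypothetical caller of the selected entry points (a simplified
illustration, not a verbatim excerpt; the real users are pmap_remove() and
friends) might look like this, with pmap_delayed_invl_page() recording the
current DI generation on the page:

/* Illustrative only: remove one mapping of page m at va under a DI block. */
static void
example_remove_mapping(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	pmap_delayed_invl_started();	/* join the queue, gen = tail gen + 1 */
	PMAP_LOCK(pmap);
	/* ... clear the PTE and unlink the pv entry for (pmap, va) ... */
	pmap_delayed_invl_page(m);	/* tag m with this thread's DI gen */
	PMAP_UNLOCK(pmap);
	pmap_invalidate_page(pmap, va);	/* finish the TLB shootdown ... */
	pmap_delayed_invl_finished();	/* ... before leaving the DI block */
}

A thread that later needs m's old mappings to be provably gone, e.g. before
the page is reused, calls pmap_delayed_invl_wait(m), which returns once the
head generation has caught up with the generation recorded on m.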
/*
* Mark the page m's PV list as participating in the current thread's
* DI block. Any threads concurrently using m's PV list to remove or
@@ -2854,6 +3209,7 @@ void
pmap_pinit0(pmap_t pmap)
{
struct proc *p;
struct thread *td;
int i;
PMAP_LOCK_INIT(pmap);
@@ -2872,12 +3228,14 @@ pmap_pinit0(pmap_t pmap)
pmap->pm_pcids[i].pm_gen = 1;
}
pmap_activate_boot(pmap);
td = curthread;
if (pti) {
p = curproc;
p = td->td_proc;
PROC_LOCK(p);
p->p_md.md_flags |= P_MD_KPTI;
PROC_UNLOCK(p);
}
pmap_thread_init_invl_gen(td);
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
@@ -9327,11 +9685,7 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
return (error);
}
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;

sys/amd64/amd64/trap.c

@@ -1183,7 +1183,7 @@ amd64_syscall(struct thread *td, int traced)
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
("System call %s returning with mangled pcb_save",
syscallname(td->td_proc, td->td_sa.code)));
KASSERT(td->td_md.md_invl_gen.gen == 0,
KASSERT(pmap_not_in_di(),
("System call %s returning with leaked invl_gen %lu",
syscallname(td->td_proc, td->td_sa.code),
td->td_md.md_invl_gen.gen));

sys/amd64/amd64/vm_machdep.c

@@ -228,7 +228,7 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
/* Setup to release spin count in fork_exit(). */
td2->td_md.md_spinlock_count = 1;
td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
td2->td_md.md_invl_gen.gen = 0;
pmap_thread_init_invl_gen(td2);
/* As an i386, do not copy io permission bitmap. */
pcb2->pcb_tssp = NULL;
@@ -544,6 +544,7 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
/* Setup to release spin count in fork_exit(). */
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
pmap_thread_init_invl_gen(td);
}
/*

sys/amd64/include/pmap.h

@@ -441,6 +441,7 @@ void *pmap_mapbios(vm_paddr_t, vm_size_t);
void *pmap_mapdev(vm_paddr_t, vm_size_t);
void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
void *pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size);
bool pmap_not_in_di(void);
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
@@ -465,6 +466,7 @@ void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
u_int keyidx, int flags);
void pmap_thread_init_invl_gen(struct thread *td);
int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
#endif /* _KERNEL */

sys/amd64/include/proc.h

@@ -50,10 +50,17 @@ struct proc_ldt {
int ldt_refcnt;
};
#define PMAP_INVL_GEN_NEXT_INVALID 0x1ULL
struct pmap_invl_gen {
u_long gen; /* (k) */
LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
};
union {
LIST_ENTRY(pmap_invl_gen) link; /* (pp) */
struct {
struct pmap_invl_gen *next;
u_char saved_pri;
};
};
} __aligned(16);
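
Because lock cmpxchg16b faults on a misaligned operand and the lockless path
treats gen/next as one 16-byte unit, a compile-time guard along these lines
would be a reasonable companion (hypothetical, not part of this commit):

_Static_assert(__alignof__(struct pmap_invl_gen) == 16 &&
    __builtin_offsetof(struct pmap_invl_gen, next) == sizeof(u_long),
    "pmap_invl_gen: gen/next must form an aligned 16-byte pair");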
/*
* Machine-dependent part of the proc structure for AMD64.

sys/kern/kern_thread.c

@@ -84,7 +84,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0x104,
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x478,
"struct thread KBI td_frame");
_Static_assert(offsetof(struct thread, td_emuldata) == 0x530,
_Static_assert(offsetof(struct thread, td_emuldata) == 0x548,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");