7667824ade
Previously, non-preemptible epochs could not check; in_epoch() would always fail, usually because non-preemptible epochs don't imply THREAD_NO_SLEEPING. For default epochs, it's easy enough to verify that we're in the given epoch: if we're in a critical section and our record for the given epoch is active, then we're in it. This patch also adds some additional INVARIANTS bookkeeping. Notably, we set and check the recorded thread in epoch_enter/epoch_exit to try and catch some edge-cases for the caller. It also checks upon freeing that none of the records had a thread in the epoch, which may make it a little easier to diagnose some improper use if epoch_free() took place while some other thread was inside. This version differs slightly from what was just previously reviewed by the below-listed, in that in_epoch() will assert that no CPU has this thread recorded even if it *is* currently in a critical section. This is intended to catch cases where the caller might have somehow messed up critical section nesting, we can catch both if they exited the critical section or if they exited, migrated, then re-entered (on the wrong CPU). Reviewed by: kib, markj (both previous version) MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D27098
994 lines
25 KiB
C
994 lines
25 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/counter.h>
|
|
#include <sys/epoch.h>
|
|
#include <sys/gtaskqueue.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/limits.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/pcpu.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/sched.h>
|
|
#include <sys/sx.h>
|
|
#include <sys/smp.h>
|
|
#include <sys/sysctl.h>
|
|
#include <sys/turnstile.h>
|
|
#ifdef EPOCH_TRACE
|
|
#include <machine/stdarg.h>
|
|
#include <sys/stack.h>
|
|
#include <sys/tree.h>
|
|
#endif
|
|
#include <vm/vm.h>
|
|
#include <vm/vm_extern.h>
|
|
#include <vm/vm_kern.h>
|
|
#include <vm/uma.h>
|
|
|
|
#include <ck_epoch.h>
|
|
|
|
#ifdef __amd64__
|
|
#define EPOCH_ALIGN CACHE_LINE_SIZE*2
|
|
#else
|
|
#define EPOCH_ALIGN CACHE_LINE_SIZE
|
|
#endif
|
|
|
|
TAILQ_HEAD (epoch_tdlist, epoch_tracker);
|
|
typedef struct epoch_record {
|
|
ck_epoch_record_t er_record;
|
|
struct epoch_context er_drain_ctx;
|
|
struct epoch *er_parent;
|
|
volatile struct epoch_tdlist er_tdlist;
|
|
volatile uint32_t er_gen;
|
|
uint32_t er_cpuid;
|
|
#ifdef INVARIANTS
|
|
/* Used to verify record ownership for non-preemptible epochs. */
|
|
struct thread *er_td;
|
|
#endif
|
|
} __aligned(EPOCH_ALIGN) *epoch_record_t;
|
|
|
|
struct epoch {
|
|
struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
|
|
epoch_record_t e_pcpu_record;
|
|
int e_in_use;
|
|
int e_flags;
|
|
struct sx e_drain_sx;
|
|
struct mtx e_drain_mtx;
|
|
volatile int e_drain_count;
|
|
const char *e_name;
|
|
};
|
|
|
|
/* arbitrary --- needs benchmarking */
|
|
#define MAX_ADAPTIVE_SPIN 100
|
|
#define MAX_EPOCHS 64
|
|
|
|
CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
|
|
SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
"epoch information");
|
|
SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
"epoch stats");
|
|
|
|
/* Stats. */
|
|
static counter_u64_t block_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
|
|
&block_count, "# of times a thread was in an epoch when epoch_wait was called");
|
|
static counter_u64_t migrate_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
|
|
&migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
|
|
static counter_u64_t turnstile_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
|
|
&turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
|
|
static counter_u64_t switch_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
|
|
&switch_count, "# of times a thread voluntarily context switched in epoch_wait");
|
|
static counter_u64_t epoch_call_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
|
|
&epoch_call_count, "# of times a callback was deferred");
|
|
static counter_u64_t epoch_call_task_count;
|
|
|
|
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
|
|
&epoch_call_task_count, "# of times a callback task was run");
|
|
|
|
TAILQ_HEAD (threadlist, thread);
|
|
|
|
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
|
|
ck_epoch_entry_container)
|
|
|
|
static struct epoch epoch_array[MAX_EPOCHS];
|
|
|
|
DPCPU_DEFINE(struct grouptask, epoch_cb_task);
|
|
DPCPU_DEFINE(int, epoch_cb_count);
|
|
|
|
static __read_mostly int inited;
|
|
__read_mostly epoch_t global_epoch;
|
|
__read_mostly epoch_t global_epoch_preempt;
|
|
|
|
static void epoch_call_task(void *context __unused);
|
|
static uma_zone_t pcpu_zone_record;
|
|
|
|
static struct sx epoch_sx;
|
|
|
|
#define EPOCH_LOCK() sx_xlock(&epoch_sx)
|
|
#define EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
|
|
|
|
#ifdef EPOCH_TRACE
|
|
struct stackentry {
|
|
RB_ENTRY(stackentry) se_node;
|
|
struct stack se_stack;
|
|
};
|
|
|
|
static int
|
|
stackentry_compare(struct stackentry *a, struct stackentry *b)
|
|
{
|
|
|
|
if (a->se_stack.depth > b->se_stack.depth)
|
|
return (1);
|
|
if (a->se_stack.depth < b->se_stack.depth)
|
|
return (-1);
|
|
for (int i = 0; i < a->se_stack.depth; i++) {
|
|
if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
|
|
return (1);
|
|
if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
|
|
return (-1);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
|
|
RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);
|
|
|
|
static struct mtx epoch_stacks_lock;
|
|
MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);
|
|
|
|
static bool epoch_trace_stack_print = true;
|
|
SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
|
|
&epoch_trace_stack_print, 0, "Print stack traces on epoch reports");
|
|
|
|
static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
|
|
static inline void
|
|
epoch_trace_report(const char *fmt, ...)
|
|
{
|
|
va_list ap;
|
|
struct stackentry se, *new;
|
|
|
|
stack_zero(&se.se_stack); /* XXX: is it really needed? */
|
|
stack_save(&se.se_stack);
|
|
|
|
/* Tree is never reduced - go lockless. */
|
|
if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
|
|
return;
|
|
|
|
new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
|
|
if (new != NULL) {
|
|
bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));
|
|
|
|
mtx_lock(&epoch_stacks_lock);
|
|
new = RB_INSERT(stacktree, &epoch_stacks, new);
|
|
mtx_unlock(&epoch_stacks_lock);
|
|
if (new != NULL)
|
|
free(new, M_STACK);
|
|
}
|
|
|
|
va_start(ap, fmt);
|
|
(void)vprintf(fmt, ap);
|
|
va_end(ap);
|
|
if (epoch_trace_stack_print)
|
|
stack_print_ddb(&se.se_stack);
|
|
}
|
|
|
|
static inline void
|
|
epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
|
|
const char *file, int line)
|
|
{
|
|
epoch_tracker_t iet;
|
|
|
|
SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
|
|
if (iet->et_epoch == epoch)
|
|
epoch_trace_report("Recursively entering epoch %s "
|
|
"at %s:%d, previously entered at %s:%d\n",
|
|
epoch->e_name, file, line,
|
|
iet->et_file, iet->et_line);
|
|
et->et_epoch = epoch;
|
|
et->et_file = file;
|
|
et->et_line = line;
|
|
SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
|
|
}
|
|
|
|
static inline void
|
|
epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
|
|
const char *file, int line)
|
|
{
|
|
|
|
if (SLIST_FIRST(&td->td_epochs) != et) {
|
|
epoch_trace_report("Exiting epoch %s in a not nested order "
|
|
"at %s:%d. Most recently entered %s at %s:%d\n",
|
|
epoch->e_name,
|
|
file, line,
|
|
SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
|
|
SLIST_FIRST(&td->td_epochs)->et_file,
|
|
SLIST_FIRST(&td->td_epochs)->et_line);
|
|
/* This will panic if et is not anywhere on td_epochs. */
|
|
SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
|
|
} else
|
|
SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
|
|
}
|
|
|
|
/* Used by assertions that check thread state before going to sleep. */
|
|
void
|
|
epoch_trace_list(struct thread *td)
|
|
{
|
|
epoch_tracker_t iet;
|
|
|
|
SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
|
|
printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
|
|
iet->et_file, iet->et_line);
|
|
}
|
|
#endif /* EPOCH_TRACE */
|
|
|
|
static void
|
|
epoch_init(void *arg __unused)
|
|
{
|
|
int cpu;
|
|
|
|
block_count = counter_u64_alloc(M_WAITOK);
|
|
migrate_count = counter_u64_alloc(M_WAITOK);
|
|
turnstile_count = counter_u64_alloc(M_WAITOK);
|
|
switch_count = counter_u64_alloc(M_WAITOK);
|
|
epoch_call_count = counter_u64_alloc(M_WAITOK);
|
|
epoch_call_task_count = counter_u64_alloc(M_WAITOK);
|
|
|
|
pcpu_zone_record = uma_zcreate("epoch_record pcpu",
|
|
sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
|
|
UMA_ALIGN_PTR, UMA_ZONE_PCPU);
|
|
CPU_FOREACH(cpu) {
|
|
GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
|
|
epoch_call_task, NULL);
|
|
taskqgroup_attach_cpu(qgroup_softirq,
|
|
DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
|
|
"epoch call task");
|
|
}
|
|
#ifdef EPOCH_TRACE
|
|
SLIST_INIT(&thread0.td_epochs);
|
|
#endif
|
|
sx_init(&epoch_sx, "epoch-sx");
|
|
inited = 1;
|
|
global_epoch = epoch_alloc("Global", 0);
|
|
global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
|
|
}
|
|
SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
|
|
|
|
#if !defined(EARLY_AP_STARTUP)
|
|
static void
|
|
epoch_init_smp(void *dummy __unused)
|
|
{
|
|
inited = 2;
|
|
}
|
|
SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
|
|
#endif
|
|
|
|
static void
|
|
epoch_ctor(epoch_t epoch)
|
|
{
|
|
epoch_record_t er;
|
|
int cpu;
|
|
|
|
epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
|
|
CPU_FOREACH(cpu) {
|
|
er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
|
|
bzero(er, sizeof(*er));
|
|
ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
|
|
TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
|
|
er->er_cpuid = cpu;
|
|
er->er_parent = epoch;
|
|
}
|
|
}
|
|
|
|
static void
|
|
epoch_adjust_prio(struct thread *td, u_char prio)
|
|
{
|
|
|
|
thread_lock(td);
|
|
sched_prio(td, prio);
|
|
thread_unlock(td);
|
|
}
|
|
|
|
epoch_t
|
|
epoch_alloc(const char *name, int flags)
|
|
{
|
|
epoch_t epoch;
|
|
int i;
|
|
|
|
MPASS(name != NULL);
|
|
|
|
if (__predict_false(!inited))
|
|
panic("%s called too early in boot", __func__);
|
|
|
|
EPOCH_LOCK();
|
|
|
|
/*
|
|
* Find a free index in the epoch array. If no free index is
|
|
* found, try to use the index after the last one.
|
|
*/
|
|
for (i = 0;; i++) {
|
|
/*
|
|
* If too many epochs are currently allocated,
|
|
* return NULL.
|
|
*/
|
|
if (i == MAX_EPOCHS) {
|
|
epoch = NULL;
|
|
goto done;
|
|
}
|
|
if (epoch_array[i].e_in_use == 0)
|
|
break;
|
|
}
|
|
|
|
epoch = epoch_array + i;
|
|
ck_epoch_init(&epoch->e_epoch);
|
|
epoch_ctor(epoch);
|
|
epoch->e_flags = flags;
|
|
epoch->e_name = name;
|
|
sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
|
|
mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
|
|
|
|
/*
|
|
* Set e_in_use last, because when this field is set the
|
|
* epoch_call_task() function will start scanning this epoch
|
|
* structure.
|
|
*/
|
|
atomic_store_rel_int(&epoch->e_in_use, 1);
|
|
done:
|
|
EPOCH_UNLOCK();
|
|
return (epoch);
|
|
}
|
|
|
|
void
|
|
epoch_free(epoch_t epoch)
|
|
{
|
|
#ifdef INVARIANTS
|
|
int cpu;
|
|
#endif
|
|
|
|
EPOCH_LOCK();
|
|
|
|
MPASS(epoch->e_in_use != 0);
|
|
|
|
epoch_drain_callbacks(epoch);
|
|
|
|
atomic_store_rel_int(&epoch->e_in_use, 0);
|
|
/*
|
|
* Make sure the epoch_call_task() function see e_in_use equal
|
|
* to zero, by calling epoch_wait() on the global_epoch:
|
|
*/
|
|
epoch_wait(global_epoch);
|
|
#ifdef INVARIANTS
|
|
CPU_FOREACH(cpu) {
|
|
epoch_record_t er;
|
|
|
|
er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
|
|
|
|
/*
|
|
* Sanity check: none of the records should be in use anymore.
|
|
* We drained callbacks above and freeing the pcpu records is
|
|
* imminent.
|
|
*/
|
|
MPASS(er->er_td == NULL);
|
|
MPASS(TAILQ_EMPTY(&er->er_tdlist));
|
|
}
|
|
#endif
|
|
uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
|
|
mtx_destroy(&epoch->e_drain_mtx);
|
|
sx_destroy(&epoch->e_drain_sx);
|
|
memset(epoch, 0, sizeof(*epoch));
|
|
|
|
EPOCH_UNLOCK();
|
|
}
|
|
|
|
static epoch_record_t
|
|
epoch_currecord(epoch_t epoch)
|
|
{
|
|
|
|
return (zpcpu_get(epoch->e_pcpu_record));
|
|
}
|
|
|
|
#define INIT_CHECK(epoch) \
|
|
do { \
|
|
if (__predict_false((epoch) == NULL)) \
|
|
return; \
|
|
} while (0)
|
|
|
|
void
|
|
_epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
|
|
{
|
|
struct epoch_record *er;
|
|
struct thread *td;
|
|
|
|
MPASS(cold || epoch != NULL);
|
|
MPASS(epoch->e_flags & EPOCH_PREEMPT);
|
|
td = curthread;
|
|
MPASS((vm_offset_t)et >= td->td_kstack &&
|
|
(vm_offset_t)et + sizeof(struct epoch_tracker) <=
|
|
td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
|
|
|
|
INIT_CHECK(epoch);
|
|
#ifdef EPOCH_TRACE
|
|
epoch_trace_enter(td, epoch, et, file, line);
|
|
#endif
|
|
et->et_td = td;
|
|
THREAD_NO_SLEEPING();
|
|
critical_enter();
|
|
sched_pin();
|
|
td->td_pre_epoch_prio = td->td_priority;
|
|
er = epoch_currecord(epoch);
|
|
/* Record-level tracking is reserved for non-preemptible epochs. */
|
|
MPASS(er->er_td == NULL);
|
|
TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
|
|
ck_epoch_begin(&er->er_record, &et->et_section);
|
|
critical_exit();
|
|
}
|
|
|
|
void
|
|
epoch_enter(epoch_t epoch)
|
|
{
|
|
epoch_record_t er;
|
|
|
|
MPASS(cold || epoch != NULL);
|
|
INIT_CHECK(epoch);
|
|
critical_enter();
|
|
er = epoch_currecord(epoch);
|
|
#ifdef INVARIANTS
|
|
if (er->er_record.active == 0) {
|
|
MPASS(er->er_td == NULL);
|
|
er->er_td = curthread;
|
|
} else {
|
|
/* We've recursed, just make sure our accounting isn't wrong. */
|
|
MPASS(er->er_td == curthread);
|
|
}
|
|
#endif
|
|
ck_epoch_begin(&er->er_record, NULL);
|
|
}
|
|
|
|
void
|
|
_epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
|
|
{
|
|
struct epoch_record *er;
|
|
struct thread *td;
|
|
|
|
INIT_CHECK(epoch);
|
|
td = curthread;
|
|
critical_enter();
|
|
sched_unpin();
|
|
THREAD_SLEEPING_OK();
|
|
er = epoch_currecord(epoch);
|
|
MPASS(epoch->e_flags & EPOCH_PREEMPT);
|
|
MPASS(et != NULL);
|
|
MPASS(et->et_td == td);
|
|
#ifdef INVARIANTS
|
|
et->et_td = (void*)0xDEADBEEF;
|
|
/* Record-level tracking is reserved for non-preemptible epochs. */
|
|
MPASS(er->er_td == NULL);
|
|
#endif
|
|
ck_epoch_end(&er->er_record, &et->et_section);
|
|
TAILQ_REMOVE(&er->er_tdlist, et, et_link);
|
|
er->er_gen++;
|
|
if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
|
|
epoch_adjust_prio(td, td->td_pre_epoch_prio);
|
|
critical_exit();
|
|
#ifdef EPOCH_TRACE
|
|
epoch_trace_exit(td, epoch, et, file, line);
|
|
#endif
|
|
}
|
|
|
|
void
|
|
epoch_exit(epoch_t epoch)
|
|
{
|
|
epoch_record_t er;
|
|
|
|
INIT_CHECK(epoch);
|
|
er = epoch_currecord(epoch);
|
|
ck_epoch_end(&er->er_record, NULL);
|
|
#ifdef INVARIANTS
|
|
MPASS(er->er_td == curthread);
|
|
if (er->er_record.active == 0)
|
|
er->er_td = NULL;
|
|
#endif
|
|
critical_exit();
|
|
}
|
|
|
|
/*
|
|
* epoch_block_handler_preempt() is a callback from the CK code when another
|
|
* thread is currently in an epoch section.
|
|
*/
|
|
static void
|
|
epoch_block_handler_preempt(struct ck_epoch *global __unused,
|
|
ck_epoch_record_t *cr, void *arg __unused)
|
|
{
|
|
epoch_record_t record;
|
|
struct thread *td, *owner, *curwaittd;
|
|
struct epoch_tracker *tdwait;
|
|
struct turnstile *ts;
|
|
struct lock_object *lock;
|
|
int spincount, gen;
|
|
int locksheld __unused;
|
|
|
|
record = __containerof(cr, struct epoch_record, er_record);
|
|
td = curthread;
|
|
locksheld = td->td_locks;
|
|
spincount = 0;
|
|
counter_u64_add(block_count, 1);
|
|
/*
|
|
* We lost a race and there's no longer any threads
|
|
* on the CPU in an epoch section.
|
|
*/
|
|
if (TAILQ_EMPTY(&record->er_tdlist))
|
|
return;
|
|
|
|
if (record->er_cpuid != curcpu) {
|
|
/*
|
|
* If the head of the list is running, we can wait for it
|
|
* to remove itself from the list and thus save us the
|
|
* overhead of a migration
|
|
*/
|
|
gen = record->er_gen;
|
|
thread_unlock(td);
|
|
/*
|
|
* We can't actually check if the waiting thread is running
|
|
* so we simply poll for it to exit before giving up and
|
|
* migrating.
|
|
*/
|
|
do {
|
|
cpu_spinwait();
|
|
} while (!TAILQ_EMPTY(&record->er_tdlist) &&
|
|
gen == record->er_gen &&
|
|
spincount++ < MAX_ADAPTIVE_SPIN);
|
|
thread_lock(td);
|
|
/*
|
|
* If the generation has changed we can poll again
|
|
* otherwise we need to migrate.
|
|
*/
|
|
if (gen != record->er_gen)
|
|
return;
|
|
/*
|
|
* Being on the same CPU as that of the record on which
|
|
* we need to wait allows us access to the thread
|
|
* list associated with that CPU. We can then examine the
|
|
* oldest thread in the queue and wait on its turnstile
|
|
* until it resumes and so on until a grace period
|
|
* elapses.
|
|
*
|
|
*/
|
|
counter_u64_add(migrate_count, 1);
|
|
sched_bind(td, record->er_cpuid);
|
|
/*
|
|
* At this point we need to return to the ck code
|
|
* to scan to see if a grace period has elapsed.
|
|
* We can't move on to check the thread list, because
|
|
* in the meantime new threads may have arrived that
|
|
* in fact belong to a different epoch.
|
|
*/
|
|
return;
|
|
}
|
|
/*
|
|
* Try to find a thread in an epoch section on this CPU
|
|
* waiting on a turnstile. Otherwise find the lowest
|
|
* priority thread (highest prio value) and drop our priority
|
|
* to match to allow it to run.
|
|
*/
|
|
TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
|
|
/*
|
|
* Propagate our priority to any other waiters to prevent us
|
|
* from starving them. They will have their original priority
|
|
* restore on exit from epoch_wait().
|
|
*/
|
|
curwaittd = tdwait->et_td;
|
|
if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
|
|
critical_enter();
|
|
thread_unlock(td);
|
|
thread_lock(curwaittd);
|
|
sched_prio(curwaittd, td->td_priority);
|
|
thread_unlock(curwaittd);
|
|
thread_lock(td);
|
|
critical_exit();
|
|
}
|
|
if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
|
|
((ts = curwaittd->td_blocked) != NULL)) {
|
|
/*
|
|
* We unlock td to allow turnstile_wait to reacquire
|
|
* the thread lock. Before unlocking it we enter a
|
|
* critical section to prevent preemption after we
|
|
* reenable interrupts by dropping the thread lock in
|
|
* order to prevent curwaittd from getting to run.
|
|
*/
|
|
critical_enter();
|
|
thread_unlock(td);
|
|
|
|
if (turnstile_lock(ts, &lock, &owner)) {
|
|
if (ts == curwaittd->td_blocked) {
|
|
MPASS(TD_IS_INHIBITED(curwaittd) &&
|
|
TD_ON_LOCK(curwaittd));
|
|
critical_exit();
|
|
turnstile_wait(ts, owner,
|
|
curwaittd->td_tsqueue);
|
|
counter_u64_add(turnstile_count, 1);
|
|
thread_lock(td);
|
|
return;
|
|
}
|
|
turnstile_unlock(ts, lock);
|
|
}
|
|
thread_lock(td);
|
|
critical_exit();
|
|
KASSERT(td->td_locks == locksheld,
|
|
("%d extra locks held", td->td_locks - locksheld));
|
|
}
|
|
}
|
|
/*
|
|
* We didn't find any threads actually blocked on a lock
|
|
* so we have nothing to do except context switch away.
|
|
*/
|
|
counter_u64_add(switch_count, 1);
|
|
mi_switch(SW_VOL | SWT_RELINQUISH);
|
|
/*
|
|
* It is important the thread lock is dropped while yielding
|
|
* to allow other threads to acquire the lock pointed to by
|
|
* TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
|
|
* thread lock before returning. Else a deadlock like
|
|
* situation might happen.
|
|
*/
|
|
thread_lock(td);
|
|
}
|
|
|
|
void
|
|
epoch_wait_preempt(epoch_t epoch)
|
|
{
|
|
struct thread *td;
|
|
int was_bound;
|
|
int old_cpu;
|
|
int old_pinned;
|
|
u_char old_prio;
|
|
int locks __unused;
|
|
|
|
MPASS(cold || epoch != NULL);
|
|
INIT_CHECK(epoch);
|
|
td = curthread;
|
|
#ifdef INVARIANTS
|
|
locks = curthread->td_locks;
|
|
MPASS(epoch->e_flags & EPOCH_PREEMPT);
|
|
if ((epoch->e_flags & EPOCH_LOCKED) == 0)
|
|
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
|
|
"epoch_wait() can be long running");
|
|
KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
|
|
"of an epoch section of the same epoch"));
|
|
#endif
|
|
DROP_GIANT();
|
|
thread_lock(td);
|
|
|
|
old_cpu = PCPU_GET(cpuid);
|
|
old_pinned = td->td_pinned;
|
|
old_prio = td->td_priority;
|
|
was_bound = sched_is_bound(td);
|
|
sched_unbind(td);
|
|
td->td_pinned = 0;
|
|
sched_bind(td, old_cpu);
|
|
|
|
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
|
|
NULL);
|
|
|
|
/* restore CPU binding, if any */
|
|
if (was_bound != 0) {
|
|
sched_bind(td, old_cpu);
|
|
} else {
|
|
/* get thread back to initial CPU, if any */
|
|
if (old_pinned != 0)
|
|
sched_bind(td, old_cpu);
|
|
sched_unbind(td);
|
|
}
|
|
/* restore pinned after bind */
|
|
td->td_pinned = old_pinned;
|
|
|
|
/* restore thread priority */
|
|
sched_prio(td, old_prio);
|
|
thread_unlock(td);
|
|
PICKUP_GIANT();
|
|
KASSERT(td->td_locks == locks,
|
|
("%d residual locks held", td->td_locks - locks));
|
|
}
|
|
|
|
static void
|
|
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
|
|
void *arg __unused)
|
|
{
|
|
cpu_spinwait();
|
|
}
|
|
|
|
void
|
|
epoch_wait(epoch_t epoch)
|
|
{
|
|
|
|
MPASS(cold || epoch != NULL);
|
|
INIT_CHECK(epoch);
|
|
MPASS(epoch->e_flags == 0);
|
|
critical_enter();
|
|
ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
|
|
critical_exit();
|
|
}
|
|
|
|
void
|
|
epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
|
|
{
|
|
epoch_record_t er;
|
|
ck_epoch_entry_t *cb;
|
|
|
|
cb = (void *)ctx;
|
|
|
|
MPASS(callback);
|
|
/* too early in boot to have epoch set up */
|
|
if (__predict_false(epoch == NULL))
|
|
goto boottime;
|
|
#if !defined(EARLY_AP_STARTUP)
|
|
if (__predict_false(inited < 2))
|
|
goto boottime;
|
|
#endif
|
|
|
|
critical_enter();
|
|
*DPCPU_PTR(epoch_cb_count) += 1;
|
|
er = epoch_currecord(epoch);
|
|
ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
|
|
critical_exit();
|
|
return;
|
|
boottime:
|
|
callback(ctx);
|
|
}
|
|
|
|
static void
|
|
epoch_call_task(void *arg __unused)
|
|
{
|
|
ck_stack_entry_t *cursor, *head, *next;
|
|
ck_epoch_record_t *record;
|
|
epoch_record_t er;
|
|
epoch_t epoch;
|
|
ck_stack_t cb_stack;
|
|
int i, npending, total;
|
|
|
|
ck_stack_init(&cb_stack);
|
|
critical_enter();
|
|
epoch_enter(global_epoch);
|
|
for (total = i = 0; i != MAX_EPOCHS; i++) {
|
|
epoch = epoch_array + i;
|
|
if (__predict_false(
|
|
atomic_load_acq_int(&epoch->e_in_use) == 0))
|
|
continue;
|
|
er = epoch_currecord(epoch);
|
|
record = &er->er_record;
|
|
if ((npending = record->n_pending) == 0)
|
|
continue;
|
|
ck_epoch_poll_deferred(record, &cb_stack);
|
|
total += npending - record->n_pending;
|
|
}
|
|
epoch_exit(global_epoch);
|
|
*DPCPU_PTR(epoch_cb_count) -= total;
|
|
critical_exit();
|
|
|
|
counter_u64_add(epoch_call_count, total);
|
|
counter_u64_add(epoch_call_task_count, 1);
|
|
|
|
head = ck_stack_batch_pop_npsc(&cb_stack);
|
|
for (cursor = head; cursor != NULL; cursor = next) {
|
|
struct ck_epoch_entry *entry =
|
|
ck_epoch_entry_container(cursor);
|
|
|
|
next = CK_STACK_NEXT(cursor);
|
|
entry->function(entry);
|
|
}
|
|
}
|
|
|
|
static int
|
|
in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
|
|
{
|
|
epoch_record_t er;
|
|
struct epoch_tracker *tdwait;
|
|
struct thread *td;
|
|
|
|
MPASS(epoch != NULL);
|
|
MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
|
|
td = curthread;
|
|
if (THREAD_CAN_SLEEP())
|
|
return (0);
|
|
critical_enter();
|
|
er = epoch_currecord(epoch);
|
|
TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
|
|
if (tdwait->et_td == td) {
|
|
critical_exit();
|
|
return (1);
|
|
}
|
|
#ifdef INVARIANTS
|
|
if (dump_onfail) {
|
|
MPASS(td->td_pinned);
|
|
printf("cpu: %d id: %d\n", curcpu, td->td_tid);
|
|
TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
|
|
printf("td_tid: %d ", tdwait->et_td->td_tid);
|
|
printf("\n");
|
|
}
|
|
#endif
|
|
critical_exit();
|
|
return (0);
|
|
}
|
|
|
|
#ifdef INVARIANTS
|
|
static void
|
|
epoch_assert_nocpu(epoch_t epoch, struct thread *td)
|
|
{
|
|
epoch_record_t er;
|
|
int cpu;
|
|
bool crit;
|
|
|
|
crit = td->td_critnest > 0;
|
|
|
|
/* Check for a critical section mishap. */
|
|
CPU_FOREACH(cpu) {
|
|
er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
|
|
KASSERT(er->er_td != td,
|
|
("%s critical section in epoch '%s', from cpu %d",
|
|
(crit ? "exited" : "re-entered"), epoch->e_name, cpu));
|
|
}
|
|
}
|
|
#else
|
|
#define epoch_assert_nocpu(e, td)
|
|
#endif
|
|
|
|
int
|
|
in_epoch_verbose(epoch_t epoch, int dump_onfail)
|
|
{
|
|
epoch_record_t er;
|
|
struct thread *td;
|
|
|
|
if (__predict_false((epoch) == NULL))
|
|
return (0);
|
|
if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
|
|
return (in_epoch_verbose_preempt(epoch, dump_onfail));
|
|
|
|
/*
|
|
* The thread being in a critical section is a necessary
|
|
* condition to be correctly inside a non-preemptible epoch,
|
|
* so it's definitely not in this epoch.
|
|
*/
|
|
td = curthread;
|
|
if (td->td_critnest == 0) {
|
|
epoch_assert_nocpu(epoch, td);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* The current cpu is in a critical section, so the epoch record will be
|
|
* stable for the rest of this function. Knowing that the record is not
|
|
* active is sufficient for knowing whether we're in this epoch or not,
|
|
* since it's a pcpu record.
|
|
*/
|
|
er = epoch_currecord(epoch);
|
|
if (er->er_record.active == 0) {
|
|
epoch_assert_nocpu(epoch, td);
|
|
return (0);
|
|
}
|
|
|
|
MPASS(er->er_td == td);
|
|
return (1);
|
|
}
|
|
|
|
int
|
|
in_epoch(epoch_t epoch)
|
|
{
|
|
return (in_epoch_verbose(epoch, 0));
|
|
}
|
|
|
|
static void
|
|
epoch_drain_cb(struct epoch_context *ctx)
|
|
{
|
|
struct epoch *epoch =
|
|
__containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
|
|
|
|
if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
|
|
mtx_lock(&epoch->e_drain_mtx);
|
|
wakeup(epoch);
|
|
mtx_unlock(&epoch->e_drain_mtx);
|
|
}
|
|
}
|
|
|
|
void
|
|
epoch_drain_callbacks(epoch_t epoch)
|
|
{
|
|
epoch_record_t er;
|
|
struct thread *td;
|
|
int was_bound;
|
|
int old_pinned;
|
|
int old_cpu;
|
|
int cpu;
|
|
|
|
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
|
|
"epoch_drain_callbacks() may sleep!");
|
|
|
|
/* too early in boot to have epoch set up */
|
|
if (__predict_false(epoch == NULL))
|
|
return;
|
|
#if !defined(EARLY_AP_STARTUP)
|
|
if (__predict_false(inited < 2))
|
|
return;
|
|
#endif
|
|
DROP_GIANT();
|
|
|
|
sx_xlock(&epoch->e_drain_sx);
|
|
mtx_lock(&epoch->e_drain_mtx);
|
|
|
|
td = curthread;
|
|
thread_lock(td);
|
|
old_cpu = PCPU_GET(cpuid);
|
|
old_pinned = td->td_pinned;
|
|
was_bound = sched_is_bound(td);
|
|
sched_unbind(td);
|
|
td->td_pinned = 0;
|
|
|
|
CPU_FOREACH(cpu)
|
|
epoch->e_drain_count++;
|
|
CPU_FOREACH(cpu) {
|
|
er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
|
|
sched_bind(td, cpu);
|
|
epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
|
|
}
|
|
|
|
/* restore CPU binding, if any */
|
|
if (was_bound != 0) {
|
|
sched_bind(td, old_cpu);
|
|
} else {
|
|
/* get thread back to initial CPU, if any */
|
|
if (old_pinned != 0)
|
|
sched_bind(td, old_cpu);
|
|
sched_unbind(td);
|
|
}
|
|
/* restore pinned after bind */
|
|
td->td_pinned = old_pinned;
|
|
|
|
thread_unlock(td);
|
|
|
|
while (epoch->e_drain_count != 0)
|
|
msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
|
|
|
|
mtx_unlock(&epoch->e_drain_mtx);
|
|
sx_xunlock(&epoch->e_drain_sx);
|
|
|
|
PICKUP_GIANT();
|
|
}
|