Add an atomic-free, tick-moderated lazy update variant of SMR.

This enables very cheap read sections with free-to-use latency and memory
overhead similar to epoch.  On a recent AMD platform a read section costs
about 1ns vs 5ns for the default SMR.  On Xeon the numbers should be more
like 1ns vs 11ns.  Memory consumption should be proportional to the product
of the free rate and 2/hz (two clock ticks), while normal SMR consumption is
proportional to the product of the free rate and the maximum read section
time.

While here refactor the code to make future additions more
straightforward.

Name the overall technique Global Unbounded Sequences (GUS) and adjust some
comments accordingly.  This helps distinguish discussions of the general
technique (SMR) from this specific implementation (GUS).

Discussed with:	rlibby, markj
jeff 2020-02-22 03:44:10 +00:00
parent 4d75f3c558
commit 51fe124dfe
4 changed files with 443 additions and 175 deletions

View File

@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
/*
* Global Unbounded Sequences (GUS)
*
* This is a novel safe memory reclamation technique inspired by
* epoch based reclamation from Samy Al Bahra's concurrency kit which
* in turn was based on work described in:
@ -53,7 +55,8 @@ __FBSDID("$FreeBSD$");
* This is not an implementation of hazard pointers or related
* techniques. The term safe memory reclamation is used as a
* generic descriptor for algorithms that defer frees to avoid
* use-after-free errors with lockless datastructures.
* use-after-free errors with lockless datastructures or as
* a mechanism to detect quiescence for writer synchronization.
*
* The basic approach is to maintain a monotonic write sequence
* number that is updated on some application defined granularity.
@ -67,7 +70,7 @@ __FBSDID("$FreeBSD$");
* a global write clock that is used to mark memory on free.
*
* The write and read sequence numbers can be thought of as a two
* handed clock with readers always advancing towards writers. SMR
* handed clock with readers always advancing towards writers. GUS
* maintains the invariant that all readers can safely access memory
* that was visible at the time they loaded their copy of the sequence
* number. Periodically the read sequence or hand is polled and
@ -80,9 +83,12 @@ __FBSDID("$FreeBSD$");
* A stored sequence number that falls outside of this range has expired
* and needs no scan to reclaim.
*
* A notable distinction between this SMR and Epoch, qsbr, rcu, etc. is
* A notable distinction between GUS and Epoch, qsbr, rcu, etc. is
* that advancing the sequence number is decoupled from detecting its
* observation. This results in a more granular assignment of sequence
* observation. That is to say, the delta between read and write
* sequence numbers is not bounded. This can be thought of as a more
* generalized form of epoch, which requires them to be at most one step
* apart. This results in a more granular assignment of sequence
* numbers even as read latencies prohibit all or some expiration.
* It also allows writers to advance the sequence number and save the
* poll for expiration until a later time when it is likely to
@ -164,31 +170,192 @@ static uma_zone_t smr_zone;
#define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2
#endif
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
/*
* The grace period for lazy (tick based) SMR.
*
* Hardclock is responsible for advancing ticks on a single CPU while every
* CPU receives a regular clock interrupt. The clock interrupts flush the
* store buffers and any speculative loads that may violate our invariants.
* Because these interrupts are not synchronized we must wait one additional
* tick in the future to be certain that all processors have had their state
* synchronized by an interrupt.
*
* This assumes that the clock interrupt will only be delayed by other causes
* that will flush the store buffer or prevent access to the section protected
* data. For example, an idle processor, or a system management interrupt,
* or a vm exit.
*
* We must wait one additional tick if we are around the wrap condition
* because the write seq will move forward by two with one interrupt.
*/
#define SMR_LAZY_GRACE 2
#define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1)
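As a rough worked timeline (illustrative): a write at tick t receives the
lazy goal t + SMR_LAZY_GRACE = t + 2.  The interrupt that advances ticks to
t + 1 is not synchronized with the other CPUs, whose interrupts for that
period may have fired just before the write; but at least one full 1/hz
period elapses before ticks reaches t + 2, so every CPU takes an interrupt
after the write, and is synchronized by it, before the goal can expire.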
/*
* Advance the write sequence and return the new value for use as the
* The maximum sequence number ahead of wr_seq that may still be valid. The
* sequence may not be advanced on write for lazy or deferred SMRs. In this
* case poll needs to attempt to forward the sequence number if the goal is
* within wr_seq + SMR_SEQ_ADVANCE.
*/
#define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX)
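For example, a deferred advance may hand back a goal of
wr_seq + SMR_SEQ_INCR without ever storing it, and a lazy goal sits
SMR_LAZY_GRACE ticks ahead of the clock.  A blocking poll on a deferred
context that finds such a goal at most SMR_SEQ_ADVANCE ahead of wr_seq
performs the advance itself before scanning; a non-blocking poll simply
reports that the goal has not been reached yet.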
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, "");
static counter_u64_t poll_fail = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, "");
/*
* Advance a lazy write sequence number. These move forward at the rate of
* ticks. Grace is two ticks in the future. Lazy write sequence numbers can
* be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap.
*
* This returns the _current_ write sequence number. The lazy goal sequence
* number is SMR_LAZY_GRACE ticks ahead.
*/
static smr_seq_t
smr_lazy_advance(smr_t smr, smr_shared_t s)
{
smr_seq_t s_rd_seq, s_wr_seq, goal;
int t;
CRITICAL_ASSERT(curthread);
/*
* Load s_wr_seq prior to ticks to ensure that the thread that
* observes the largest value wins.
*/
s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
/*
* We must not allow a zero tick value. We go back in time one tick
* and advance the grace period forward one tick around zero.
*/
t = ticks;
if (t == SMR_SEQ_INVALID)
t--;
/*
* The most probable condition is that the update already took place.
*/
if (__predict_true(t == s_wr_seq))
goto out;
/*
* After long idle periods the read sequence may fall too far
* behind write. Prevent poll from ever seeing this condition
* by updating the stale rd_seq. This assumes that there can
* be no valid section 2bn ticks old. The rd_seq update must
* be visible before wr_seq to avoid races with other advance
* callers.
*/
s_rd_seq = atomic_load_int(&s->s_rd_seq);
if (SMR_SEQ_GT(s_rd_seq, t))
atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t);
/*
* Release to synchronize with the wr_seq load above. Ignore
* cmpset failures from simultaneous updates.
*/
atomic_cmpset_rel_int(&s->s_wr_seq, s_wr_seq, t);
counter_u64_add(advance, 1);
/* If we lost either update race another thread did it. */
s_wr_seq = t;
out:
goal = s_wr_seq + SMR_LAZY_GRACE;
/* Skip over the SMR_SEQ_INVALID tick. */
if (goal < SMR_LAZY_GRACE)
goal++;
return (goal);
}
/*
* Increment the shared write sequence by 2. Since it is initialized
* to 1 this means the only valid values are odd and an observed value
* of 0 in a particular CPU means it is not currently in a read section.
*/
static smr_seq_t
smr_shared_advance(smr_shared_t s)
{
return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
}
/*
* Advance the write sequence number for a normal smr section. If the
* write sequence is too far behind the read sequence we have to poll
* to advance rd_seq and prevent undetectable wraps.
*/
static smr_seq_t
smr_default_advance(smr_t smr, smr_shared_t s)
{
smr_seq_t goal, s_rd_seq;
CRITICAL_ASSERT(curthread);
KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
("smr_default_advance: called with lazy smr."));
/*
* Load the current read seq before incrementing the goal so
* we are guaranteed it is always < goal.
*/
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
goal = smr_shared_advance(s);
/*
* Force a synchronization here if the goal is getting too
* far ahead of the read sequence number. This keeps the
* wrap detecting arithmetic working in pathological cases.
*/
if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) {
counter_u64_add(advance_wait, 1);
smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
}
counter_u64_add(advance, 1);
return (goal);
}
/*
* Deferred SMRs conditionally update s_wr_seq based on a
* cpu-local interval count.
*/
static smr_seq_t
smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
{
if (++self->c_deferred < self->c_limit)
return (smr_shared_current(s) + SMR_SEQ_INCR);
self->c_deferred = 0;
return (smr_default_advance(smr, s));
}
/*
* Advance the write sequence and return the value for use as the
* wait goal. This guarantees that any changes made by the calling
* thread prior to this call will be visible to all threads after
* rd_seq meets or exceeds the return value.
*
* This function may busy loop if the readers are roughly 1 billion
* sequence numbers behind the writers.
*
* Lazy SMRs will not busy loop and the wrap happens every 49.6 days
* at 1khz and 119 hours at 10khz. Readers can block for no longer
* than half of this for SMR_SEQ_ macros to continue working.
*/
smr_seq_t
smr_advance(smr_t smr)
{
smr_t self;
smr_shared_t s;
smr_seq_t goal, s_rd_seq;
smr_seq_t goal;
int flags;
/*
* It is illegal to enter while in an smr section.
@ -201,55 +368,121 @@ smr_advance(smr_t smr)
*/
atomic_thread_fence_rel();
/*
* Load the current read seq before incrementing the goal so
* we are guaranteed it is always < goal.
*/
s = zpcpu_get(smr)->c_shared;
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
/*
* Increment the shared write sequence by 2. Since it is
* initialized to 1 this means the only valid values are
* odd and an observed value of 0 in a particular CPU means
* it is not currently in a read section.
*/
goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
counter_u64_add(advance, 1);
/*
* Force a synchronization here if the goal is getting too
* far ahead of the read sequence number. This keeps the
* wrap detecting arithmetic working in pathological cases.
*/
if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) {
counter_u64_add(advance_wait, 1);
smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
}
critical_enter();
/* Try to touch the line once. */
self = zpcpu_get(smr);
s = self->c_shared;
flags = self->c_flags;
goal = SMR_SEQ_INVALID;
if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
goal = smr_default_advance(smr, s);
else if ((flags & SMR_LAZY) != 0)
goal = smr_lazy_advance(smr, s);
else if ((flags & SMR_DEFERRED) != 0)
goal = smr_deferred_advance(smr, s, self);
critical_exit();
return (goal);
}
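As a concrete illustration of the advance/poll split, a minimal deferred-free
sketch follows.  It is only a sketch: struct my_item, the singly linked
deferred list, and M_TEMP are illustrative assumptions; smr_advance() and
smr_poll() are the real interfaces.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smr.h>

struct my_item {
        struct my_item  *mi_next;
        smr_seq_t        mi_goal;       /* Goal recorded when retired. */
};

/*
 * Retire an item: cheap, no scan.  Must not be called from within a read
 * section.  The item may not be reused until mi_goal has been observed.
 */
static void
my_retire(smr_t smr, struct my_item *item, struct my_item **deferredp)
{
        item->mi_goal = smr_advance(smr);
        item->mi_next = *deferredp;
        *deferredp = item;
}

/* Reclaim every deferred item whose goal all CPUs have observed. */
static void
my_reclaim(smr_t smr, struct my_item **deferredp)
{
        struct my_item *item, *next, *keep;

        keep = NULL;
        for (item = *deferredp; item != NULL; item = next) {
                next = item->mi_next;
                if (smr_poll(smr, item->mi_goal, false)) {
                        /* Expired on every CPU; assume malloc(9)/M_TEMP. */
                        free(item, M_TEMP);
                } else {
                        /* Possibly still visible to a reader; keep it. */
                        item->mi_next = keep;
                        keep = item;
                }
        }
        *deferredp = keep;
}

The retire side never scans; the scanning cost is batched into my_reclaim(),
which may run much later.  UMA's per-CPU bucket layer plays this role for
SMR zones.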
smr_seq_t
smr_advance_deferred(smr_t smr, int limit)
/*
* Poll to determine the currently observed sequence number on a cpu
* and spinwait if the 'wait' argument is true.
*/
static smr_seq_t
smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
{
smr_seq_t goal;
smr_t csmr;
smr_seq_t c_seq;
SMR_ASSERT_NOT_ENTERED(smr);
c_seq = SMR_SEQ_INVALID;
for (;;) {
c_seq = atomic_load_int(&c->c_seq);
if (c_seq == SMR_SEQ_INVALID)
break;
critical_enter();
csmr = zpcpu_get(smr);
if (++csmr->c_deferred >= limit) {
goal = SMR_SEQ_INVALID;
csmr->c_deferred = 0;
} else
goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
critical_exit();
if (goal != SMR_SEQ_INVALID)
return (goal);
/*
* There is a race described in smr.h:smr_enter that
* can lead to a stale seq value but not stale data
* access. If we find a value out of range here we
* pin it to the current min to prevent it from
* advancing until that stale section has expired.
*
* The race is created when a cpu loads the s_wr_seq
* value in a local register and then another thread
* advances s_wr_seq and calls smr_poll() which will
* observe no value yet in c_seq and advance s_rd_seq
* up to s_wr_seq which is beyond the register
* cached value. This is only likely to happen on a
* hypervisor or with a system management interrupt.
*/
if (SMR_SEQ_LT(c_seq, s_rd_seq))
c_seq = s_rd_seq;
return (smr_advance(smr));
/*
* If the sequence number meets the goal we are done
* with this cpu.
*/
if (SMR_SEQ_LEQ(goal, c_seq))
break;
if (!wait)
break;
cpu_spinwait();
}
return (c_seq);
}
/*
* Loop until all cores have observed the goal sequence or have
* gone inactive. Returns the oldest sequence currently active.
*
* This function assumes a snapshot of sequence values has
* been obtained and validated by smr_poll().
*/
static smr_seq_t
smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
{
smr_seq_t rd_seq, c_seq;
int i;
CRITICAL_ASSERT(curthread);
counter_u64_add_protected(poll_scan, 1);
/*
* The read sequence can be no larger than the write sequence at
* the start of the poll.
*/
rd_seq = s_wr_seq;
CPU_FOREACH(i) {
/*
* Query the active sequence on this cpu. If we're not
* waiting and we don't meet the goal we will still scan
* the rest of the cpus to update s_rd_seq before returning
* failure.
*/
c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
wait);
/*
* Limit the minimum observed rd_seq whether we met the goal
* or not.
*/
if (c_seq != SMR_SEQ_INVALID)
rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
}
/*
* Advance the rd_seq as long as we observed a more recent value.
*/
s_rd_seq = atomic_load_int(&s->s_rd_seq);
if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) {
atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
s_rd_seq = rd_seq;
}
return (s_rd_seq);
}
/*
@ -268,9 +501,10 @@ bool
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
smr_shared_t s;
smr_t c;
smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
int i;
smr_t self;
smr_seq_t s_wr_seq, s_rd_seq;
smr_delta_t delta;
int flags;
bool success;
/*
@ -278,6 +512,8 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
*/
KASSERT(!wait || !SMR_ENTERED(smr),
("smr_poll: Blocking not allowed in a SMR section."));
KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
("smr_poll: Blocking not allowed on lazy smrs."));
/*
* Use a critical section so that we can avoid ABA races
@ -285,9 +521,19 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
*/
success = true;
critical_enter();
s = zpcpu_get(smr)->c_shared;
/* Attempt to load from self only once. */
self = zpcpu_get(smr);
s = self->c_shared;
flags = self->c_flags;
counter_u64_add_protected(poll, 1);
/*
* Conditionally advance the lazy write clock on any writer
* activity. This may reset s_rd_seq.
*/
if ((flags & SMR_LAZY) != 0)
smr_lazy_advance(smr, s);
/*
* Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
* observe an updated read sequence that is larger than write.
@ -295,106 +541,59 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
/*
* wr_seq must be loaded prior to any c_seq value so that a stale
* c_seq can only reference time after this wr_seq.
* If we have already observed the sequence number we can immediately
* return success. Most polls should meet this criterion.
*/
if (SMR_SEQ_LEQ(goal, s_rd_seq))
goto out;
/*
* wr_seq must be loaded prior to any c_seq value so that a
* stale c_seq can only reference time after this wr_seq.
*/
s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
/*
* This may have come from a deferred advance. Consider one
* increment past the current wr_seq valid and make sure we
* have advanced far enough to succeed. We simply add to avoid
* an additional fence.
* This is the distance from s_wr_seq to goal. Positive values
* are in the future.
*/
if (goal == s_wr_seq + SMR_SEQ_INCR) {
atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
s_wr_seq = goal;
delta = SMR_SEQ_DELTA(goal, s_wr_seq);
/*
* Detect a stale wr_seq.
*
* This goal may have come from a deferred advance or a lazy
* smr. If we are not blocking we can not succeed but the
* sequence number is valid.
*/
if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE &&
(flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
if (!wait) {
success = false;
goto out;
}
/* LAZY is always !wait. */
s_wr_seq = smr_shared_advance(s);
delta = 0;
}
/*
* Detect whether the goal is valid and has already been observed.
* Detect an invalid goal.
*
* The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
* it to be valid. If it is not then the caller held on to it and
* the integer wrapped. If we wrapped back within range the caller
* will harmlessly scan.
*
* A valid goal must be greater than s_rd_seq or we have not verified
* that it has been observed and must fall through to polling.
*/
if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
if (delta > 0)
goto out;
/*
* Loop until all cores have observed the goal sequence or have
* gone inactive. Keep track of the oldest sequence currently
* active as rd_seq.
*/
counter_u64_add_protected(poll_scan, 1);
rd_seq = s_wr_seq;
CPU_FOREACH(i) {
c = zpcpu_get_cpu(smr, i);
c_seq = SMR_SEQ_INVALID;
for (;;) {
c_seq = atomic_load_int(&c->c_seq);
if (c_seq == SMR_SEQ_INVALID)
break;
/*
* There is a race described in smr.h:smr_enter that
* can lead to a stale seq value but not stale data
* access. If we find a value out of range here we
* pin it to the current min to prevent it from
* advancing until that stale section has expired.
*
* The race is created when a cpu loads the s_wr_seq
* value in a local register and then another thread
* advances s_wr_seq and calls smr_poll() which will
* observe no value yet in c_seq and advance s_rd_seq
* up to s_wr_seq which is beyond the register
* cached value. This is only likely to happen on a
* hypervisor or with a system management interrupt.
*/
if (SMR_SEQ_LT(c_seq, s_rd_seq))
c_seq = s_rd_seq;
/*
* If the sequence number meets the goal we are
* done with this cpu.
*/
if (SMR_SEQ_GEQ(c_seq, goal))
break;
/*
* If we're not waiting we will still scan the rest
* of the cpus and update s_rd_seq before returning
* an error.
*/
if (!wait) {
success = false;
break;
}
cpu_spinwait();
}
/*
* Limit the minimum observed rd_seq whether we met the goal
* or not.
*/
if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
rd_seq = c_seq;
}
/*
* Advance the rd_seq as long as we observed the most recent one.
*/
s_rd_seq = atomic_load_int(&s->s_rd_seq);
do {
if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
goto out;
} while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
/* Determine the lowest visible sequence number. */
s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
success = SMR_SEQ_LEQ(goal, s_rd_seq);
out:
if (!success)
counter_u64_add_protected(poll_fail, 1);
critical_exit();
/*
@ -407,7 +606,7 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
}
smr_t
smr_create(const char *name)
smr_create(const char *name, int limit, int flags)
{
smr_t smr, c;
smr_shared_t s;
@ -417,13 +616,19 @@ smr_create(const char *name)
smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
s->s_name = name;
s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
if ((flags & SMR_LAZY) == 0)
s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
else
s->s_rd_seq = s->s_wr_seq = ticks;
/* Initialize all CPUS, not just those running. */
for (i = 0; i <= mp_maxid; i++) {
c = zpcpu_get_cpu(smr, i);
c->c_seq = SMR_SEQ_INVALID;
c->c_shared = s;
c->c_deferred = 0;
c->c_limit = limit;
c->c_flags = flags;
}
atomic_thread_fence_seq_cst();
@ -460,5 +665,6 @@ smr_init_counters(void *unused)
advance_wait = counter_u64_alloc(M_WAITOK);
poll = counter_u64_alloc(M_WAITOK);
poll_scan = counter_u64_alloc(M_WAITOK);
poll_fail = counter_u64_alloc(M_WAITOK);
}
SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL);

View File

@ -32,6 +32,7 @@
#define _SYS__SMR_H_
typedef uint32_t smr_seq_t;
typedef int32_t smr_delta_t;
typedef struct smr *smr_t;
#endif /* __SYS_SMR_H_ */

View File

@ -45,11 +45,13 @@
* Modular arithmetic for comparing sequence numbers that have
* potentially wrapped. Copied from tcp_seq.h.
*/
#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0)
#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0)
#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0)
#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
#define SMR_SEQ_DELTA(a, b) ((int32_t)((a)-(b)))
#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0)
#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0)
#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0)
#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0)
#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b)))
#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT((a), (b)) ? (a) : (b))
#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT((a), (b)) ? (a) : (b))
#define SMR_SEQ_INVALID 0
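A quick worked example of the wrap-safe arithmetic (a sketch; the function
name is illustrative):

#include <sys/param.h>
#include <sys/smr.h>

static inline bool
my_seq_wrap_check(void)
{
        smr_seq_t newer = 0x00000004;   /* Recently wrapped past zero. */
        smr_seq_t older = 0xfffffffc;

        /* (smr_delta_t)(newer - older) == 8 > 0, so 'newer' compares newer. */
        return (SMR_SEQ_GT(newer, older) && SMR_SEQ_MAX(newer, older) == newer);
}

The comparisons only stay meaningful while the two values are within roughly
2^31 of each other, which is why the default advance path forces a
synchronization once the goal gets SMR_SEQ_MAX_DELTA ahead of the read
sequence.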
@ -66,8 +68,13 @@ struct smr {
smr_seq_t c_seq; /* Current observed sequence. */
smr_shared_t c_shared; /* Shared SMR state. */
int c_deferred; /* Deferred advance counter. */
int c_limit; /* Deferred advance limit. */
int c_flags; /* SMR Configuration */
};
#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */
#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */
#define SMR_ENTERED(smr) \
(curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
@ -94,7 +101,7 @@ struct smr {
* All accesses include a parameter for an assert to verify the required
* synchronization. For example, a writer might use:
*
* smr_serilized_store(pointer, value, mtx_owned(&writelock));
* smr_serialized_store(pointer, value, mtx_owned(&writelock));
*
* These are only enabled in INVARIANTS kernels.
*/
@ -127,6 +134,9 @@ typedef struct { \
* Store 'v' to an SMR protected pointer while serialized by an
* external mechanism. 'ex' should contain an assert that the
* external mechanism is held. i.e. mtx_owned()
*
* Writers that are serialized with mutual exclusion or on a single
* thread should use smr_serialized_store() rather than swap.
*/
#define smr_serialized_store(p, v, ex) do { \
SMR_ASSERT(ex, "smr_serialized_store"); \
@ -138,6 +148,8 @@ typedef struct { \
* swap 'v' with an SMR protected pointer and return the old value
* while serialized by an external mechanism. 'ex' should contain
* an assert that the external mechanism is provided. i.e. mtx_owned()
*
* Swap permits multiple writers to update a pointer concurrently.
*/
#define smr_serialized_swap(p, v, ex) ({ \
SMR_ASSERT(ex, "smr_serialized_swap"); \
@ -170,7 +182,8 @@ typedef struct { \
} while (0)
/*
* Return the current write sequence number.
* Return the current write sequence number. This is not the same as the
* current goal which may be in the future.
*/
static inline smr_seq_t
smr_shared_current(smr_shared_t s)
@ -195,6 +208,8 @@ smr_enter(smr_t smr)
critical_enter();
smr = zpcpu_get(smr);
KASSERT((smr->c_flags & SMR_LAZY) == 0,
("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq == 0,
("smr_enter(%s) does not support recursion.",
smr->c_shared->s_name));
@ -228,6 +243,8 @@ smr_exit(smr_t smr)
smr = zpcpu_get(smr);
CRITICAL_ASSERT(curthread);
KASSERT((smr->c_flags & SMR_LAZY) == 0,
("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq != SMR_SEQ_INVALID,
("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
@ -242,19 +259,63 @@ smr_exit(smr_t smr)
critical_exit();
}
/*
* Enter a lazy smr section. This is used for read-mostly state that
* can tolerate a high free latency.
*/
static inline void
smr_lazy_enter(smr_t smr)
{
critical_enter();
smr = zpcpu_get(smr);
KASSERT((smr->c_flags & SMR_LAZY) != 0,
("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq == 0,
("smr_lazy_enter(%s) does not support recursion.",
smr->c_shared->s_name));
/*
* This needs no serialization. If an interrupt occurs before we
* assign wr_seq to c_seq any speculative loads will be discarded.
* If we assign a stale wr_seq value due to an interrupt we use the
* same algorithm that renders smr_enter() safe.
*/
smr->c_seq = smr_shared_current(smr->c_shared);
}
/*
* Exit a lazy smr section. This is used for read-mostly state that
* can tolerate a high free latency.
*/
static inline void
smr_lazy_exit(smr_t smr)
{
smr = zpcpu_get(smr);
CRITICAL_ASSERT(curthread);
KASSERT((smr->c_flags & SMR_LAZY) != 0,
("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq != SMR_SEQ_INVALID,
("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
/*
* All loads/stores must be retired before the sequence becomes
* visible. The fence compiles away on amd64. Another
* alternative would be to omit the fence but store the exit
* time and wait 1 tick longer.
*/
atomic_thread_fence_rel();
smr->c_seq = SMR_SEQ_INVALID;
critical_exit();
}
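A minimal read-side sketch for a lazy context (assumptions: my_smr was
created with SMR_LAZY, my_head is published by a serialized writer, and the
raw acquire load stands in for the typed accessors declared earlier in this
header):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smr.h>
#include <machine/atomic.h>

struct my_obj {
        int     mo_key;
        int     mo_value;
};

static smr_t             my_smr;        /* smr_create(..., 0, SMR_LAZY) */
static struct my_obj    *my_head;

static bool
my_lookup(int key, int *valuep)
{
        struct my_obj *obj;
        bool found;

        smr_lazy_enter(my_smr);
        obj = (void *)atomic_load_acq_ptr((volatile uintptr_t *)&my_head);
        found = (obj != NULL && obj->mo_key == key);
        if (found)
                *valuep = obj->mo_value;
        smr_lazy_exit(my_smr);

        return (found);
}

Writers pair this with smr_advance() and non-blocking polls; blocking polls
on a lazy context are rejected by the KASSERT in smr_poll().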
/*
* Advances the write sequence number. Returns the sequence number
* required to ensure that all modifications are visible to readers.
*/
smr_seq_t smr_advance(smr_t smr);
/*
* Advances the write sequence number only after N calls. Returns
* the correct goal for a wr_seq that has not yet occurred. Used to
* minimize shared cacheline invalidations for frequent writers.
*/
smr_seq_t smr_advance_deferred(smr_t smr, int limit);
/*
* Returns true if a goal sequence has been reached. If
* wait is true this will busy loop until success.
@ -262,7 +323,9 @@ smr_seq_t smr_advance_deferred(smr_t smr, int limit);
bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
/* Create a new SMR context. */
smr_t smr_create(const char *name);
smr_t smr_create(const char *name, int limit, int flags);
/* Destroy the context. */
void smr_destroy(smr_t smr);
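A sketch of creating contexts with the new signature (the names and the
limit of 16 are illustrative; zone_ctor() in the UMA change below passes
0, 0 for an ordinary context):

#include <sys/param.h>
#include <sys/smr.h>

static smr_t my_lazy_smr;       /* Cheapest reads; tick-granular reclamation. */
static smr_t my_deferred_smr;   /* Batches shared wr_seq updates. */

static void
my_smr_setup(void)
{
        /* Lazy: pair with smr_lazy_enter()/smr_lazy_exit(), wait == false. */
        my_lazy_smr = smr_create("my lazy", 0, SMR_LAZY);
        /* Deferred: only every 16th advance per CPU bumps the shared wr_seq. */
        my_deferred_smr = smr_create("my deferred", 16, SMR_DEFERRED);
}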
/*

View File

@ -1140,7 +1140,6 @@ hash_free(struct uma_hash *hash)
* Returns:
* Nothing
*/
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
@ -1200,7 +1199,7 @@ cache_drain(uma_zone_t zone)
*/
seq = SMR_SEQ_INVALID;
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
seq = smr_current(zone->uz_smr);
seq = smr_advance(zone->uz_smr);
CPU_FOREACH(cpu) {
cache = &zone->uz_cpu[cpu];
bucket = cache_bucket_unload_alloc(cache);
@ -1329,7 +1328,7 @@ bucket_cache_reclaim(uma_zone_t zone, bool drain)
* the item count. Reclaim it individually here.
*/
zdom = ZDOM_GET(zone, i);
if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
ZONE_CROSS_LOCK(zone);
bucket = zdom->uzd_cross;
zdom->uzd_cross = NULL;
@ -2679,7 +2678,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
/* Caller requests a private SMR context. */
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
zone->uz_smr = smr_create(zone->uz_name);
zone->uz_smr = smr_create(zone->uz_name, 0, 0);
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
@ -4137,22 +4136,21 @@ zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
"uma_zfree: zone %s(%p) draining cross bucket %p",
zone->uz_name, zone, bucket);
STAILQ_INIT(&fullbuckets);
/*
* To avoid having ndomain * ndomain buckets for sorting we have a
* lock on the current crossfree bucket. A full matrix with
* per-domain locking could be used if necessary.
*/
ZONE_CROSS_LOCK(zone);
/*
* It is possible for buckets to arrive here out of order so we fetch
* the current smr seq rather than accepting the bucket's.
*/
seq = SMR_SEQ_INVALID;
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
seq = smr_current(zone->uz_smr);
seq = smr_advance(zone->uz_smr);
/*
* To avoid having ndomain * ndomain buckets for sorting we have a
* lock on the current crossfree bucket. A full matrix with
* per-domain locking could be used if necessary.
*/
STAILQ_INIT(&fullbuckets);
ZONE_CROSS_LOCK(zone);
while (bucket->ub_cnt > 0) {
item = bucket->ub_bucket[bucket->ub_cnt - 1];
domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));