locks: extend speculative spin waiting for readers to drain

Now that 10 years have passed since the original limit of 10000 was
committed, bump it a little bit.

Spinning waiting for writers is semi-informed in the sense that we always
know if the owner is running and base the decision to spin on that.
However, no such information is provided for read-locking. In particular
this means that it is possible for a write-spinner to completely waste cpu
time waiting for the lock to be released, while the reader holding it was
preempted and is now waiting for the spinner to go off cpu.

Nonetheless, in the majority of cases it is an improvement to spin instead of
instantly giving up and going to sleep.

The current approach is pretty simple: snatch the number of current readers
and perform that many pauses before checking again. The total number of
pauses to execute is limited to 10k. If the lock is still not free by
that time, go to sleep.

Given the previously noted problem of not knowing whether spinning makes
any sense to begin with, the new limit has to remain rather conservative.
But at the very least it should also be related to the machine. Waiting
for writers uses parameters selected based on the number of activated
hardware threads. The upper limit of pause instructions to be executed
in-between re-reads of the lock is typically 16384 or 32678. That value was
selected as the new limit on total spins. The lower bound is set to the
already present 10000 so as not to change it for smaller machines.

Bumping the limit reduces system time by a few percent during benchmarks like
buildworld, buildkernel and others. Tested on 2- and 4-socket machines
(Broadwell, Skylake).

Figuring out how to make a more informed decision while not pessimizing
the fast path is left as an exercise for the reader.
This commit is contained in:
Mateusz Guzik 2018-04-11 01:43:29 +00:00
parent 0629b15276
commit e0e259a888
2 changed files with 22 additions and 6 deletions

View File

@ -95,8 +95,8 @@ struct lock_class lock_class_rw = {
}; };
#ifdef ADAPTIVE_RWLOCKS #ifdef ADAPTIVE_RWLOCKS
static int __read_frequently rowner_retries = 10; static int __read_frequently rowner_retries;
static int __read_frequently rowner_loops = 10000; static int __read_frequently rowner_loops;
static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
"rwlock debugging"); "rwlock debugging");
SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
@ -109,7 +109,15 @@ SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base,
SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
0, ""); 0, "");
LOCK_DELAY_SYSINIT_DEFAULT(rw_delay); static void
rw_lock_delay_init(void *arg __unused)
{
lock_delay_default_init(&rw_delay);
rowner_retries = 10;
rowner_loops = max(10000, rw_delay.max);
}
LOCK_DELAY_SYSINIT(rw_lock_delay_init);
#endif #endif
/* /*

View File

@ -145,8 +145,8 @@ struct lock_class lock_class_sx = {
#endif #endif
#ifdef ADAPTIVE_SX #ifdef ADAPTIVE_SX
static __read_frequently u_int asx_retries = 10; static __read_frequently u_int asx_retries;
static __read_frequently u_int asx_loops = 10000; static __read_frequently u_int asx_loops;
static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
@ -158,7 +158,15 @@ SYSCTL_INT(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base,
SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max,
0, ""); 0, "");
LOCK_DELAY_SYSINIT_DEFAULT(sx_delay); static void
sx_lock_delay_init(void *arg __unused)
{
lock_delay_default_init(&sx_delay);
asx_retries = 10;
asx_loops = max(10000, sx_delay.max);
}
LOCK_DELAY_SYSINIT(sx_lock_delay_init);
#endif #endif
void void