/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
#include <sys/systm.h>

#include <vm/uma.h>

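/*
 * A lock request over the half-open byte range [rl_q_start, rl_q_end).
 * All requests for a lock, granted or not, sit on the lock's rl_waiters
 * queue, ordered as: granted write requests first, then granted read
 * requests, then not-yet-granted requests in order of arrival.
 * lock->rl_currdep points at the first request that has not been
 * granted yet, or is NULL when every queued request is granted.
 */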
struct rl_q_entry {
	TAILQ_ENTRY(rl_q_entry) rl_q_link;
	off_t	rl_q_start, rl_q_end;
	int	rl_q_flags;
};

static uma_zone_t rl_entry_zone;

static void
rangelock_sys_init(void)
{

	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);

static struct rl_q_entry *
rlqentry_alloc(void)
{

	return (uma_zalloc(rl_entry_zone, M_WAITOK));
}

void
rlqentry_free(struct rl_q_entry *rleq)
{

	uma_zfree(rl_entry_zone, rleq);
}

void
rangelock_init(struct rangelock *lock)
{

	TAILQ_INIT(&lock->rl_waiters);
	lock->rl_currdep = NULL;
}

void
rangelock_destroy(struct rangelock *lock)
{

	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
}

/*
 * Two entries are compatible if their ranges do not overlap, or both
 * entries are for read.
 */
static int
ranges_overlap(const struct rl_q_entry *e1,
    const struct rl_q_entry *e2)
{

	if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
		return (1);
	return (0);
}

/*
 * Recalculate the lock->rl_currdep after an unlock.
 */
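/*
 * Scan the not-yet-granted requests, starting at rl_currdep, and grant
 * each one that is compatible with all requests already granted.  A
 * read request only needs to be checked against the granted writes at
 * the head of the queue: once an entry with RL_LOCK_READ is reached,
 * everything that follows is either a granted read or a request that
 * has not been granted yet.  A write request has to be checked against
 * every granted request.  The scan stops at the first request that
 * cannot be granted, so pending requests are granted strictly in
 * arrival order.
 *
 * A small worked example (not from this file): with a granted write
 * over [0, 10) at the head of the queue and pending reads over
 * [20, 30) and then [5, 15), the first read does not overlap the write
 * and is granted; the second overlaps [0, 10), so the scan stops and
 * rl_currdep is left pointing at it.
 */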
static void
rangelock_calc_block(struct rangelock *lock)
{
	struct rl_q_entry *entry, *nextentry, *entry1;

	for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
		nextentry = TAILQ_NEXT(entry, rl_q_link);
		if (entry->rl_q_flags & RL_LOCK_READ) {
			/* Reads must not overlap with granted writes. */
			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
			    !(entry1->rl_q_flags & RL_LOCK_READ);
			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
				if (ranges_overlap(entry, entry1))
					goto out;
			}
		} else {
			/* Write must not overlap with any granted locks. */
			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
			    entry1 != entry;
			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
				if (ranges_overlap(entry, entry1))
					goto out;
			}

			/* Move grantable write locks to the front. */
			TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
			TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
		}

		/* Grant this lock. */
		entry->rl_q_flags |= RL_LOCK_GRANTED;
		wakeup(entry);
	}
out:
	lock->rl_currdep = entry;
}

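/*
 * Remove the entry from the waiters queue, recompute which pending
 * requests can now be granted (unless do_calc_block is false, i.e. a
 * failed trylock is being backed out), and drop the interlock.  The
 * entry is cached in curthread->td_rlqe for reuse by this thread's
 * next rangelock_enqueue() call, or returned to the UMA zone if the
 * cache slot is already occupied.
 */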
static void
rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
    struct mtx *ilk, bool do_calc_block)
{

	MPASS(lock != NULL && entry != NULL && ilk != NULL);
	mtx_assert(ilk, MA_OWNED);

	if (!do_calc_block) {
		/*
		 * This is the case where rangelock_enqueue() has been called
		 * with trylock == true and just inserted this entry in the
		 * queue.
		 * If rl_currdep is this entry, rl_currdep needs to
		 * be set to the next entry in the rl_waiters list.
		 * However, since this entry is the last entry in the
		 * list, the next entry is NULL.
		 */
		if (lock->rl_currdep == entry) {
			KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL,
			    ("rangelock_enqueue: next entry not NULL"));
			lock->rl_currdep = NULL;
		}
	} else
		KASSERT(entry != lock->rl_currdep, ("stuck currdep"));

	TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
	if (do_calc_block)
		rangelock_calc_block(lock);
	mtx_unlock(ilk);
	if (curthread->td_rlqe == NULL)
		curthread->td_rlqe = entry;
	else
		rlqentry_free(entry);
}

void
rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
{

	MPASS(lock != NULL && cookie != NULL && ilk != NULL);

	mtx_lock(ilk);
	rangelock_unlock_locked(lock, cookie, ilk, true);
}

/*
 * Unlock the trailing sub-range of a granted lock: the locked range
 * shrinks from [start, old end) to [start, end).  Returns NULL if the
 * entire range was unlocked, otherwise a cookie for the remaining
 * locked range.
 */
void *
rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
    off_t end, struct mtx *ilk)
{
	struct rl_q_entry *entry;

	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
	entry = cookie;
	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
	    ("Unlocking non-granted lock"));
	KASSERT(entry->rl_q_start == start, ("wrong start"));
	KASSERT(entry->rl_q_end >= end, ("wrong end"));

	mtx_lock(ilk);
	if (entry->rl_q_end == end) {
		rangelock_unlock_locked(lock, cookie, ilk, true);
		return (NULL);
	}
	entry->rl_q_end = end;
	rangelock_calc_block(lock);
	mtx_unlock(ilk);
	return (cookie);
}

/*
 * Add the lock request to the queue of the pending requests for
 * rangelock.  Sleep until the request can be granted unless
 * trylock == true.
 */
static void *
rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
    struct mtx *ilk, bool trylock)
{
	struct rl_q_entry *entry;
	struct thread *td;

	MPASS(lock != NULL && ilk != NULL);

	td = curthread;
	if (td->td_rlqe != NULL) {
		entry = td->td_rlqe;
		td->td_rlqe = NULL;
	} else
		entry = rlqentry_alloc();
	MPASS(entry != NULL);
	entry->rl_q_flags = mode;
	entry->rl_q_start = start;
	entry->rl_q_end = end;

	mtx_lock(ilk);
	/*
	 * XXXKIB TODO. Check that a thread does not try to enqueue a
	 * lock that is incompatible with another request from the same
	 * thread.
	 */

	TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
	/*
	 * If rl_currdep == NULL, there is no entry waiting for a conflicting
	 * range to be resolved, so set rl_currdep to this entry.  If there is
	 * no conflicting entry for this entry, rl_currdep will be set back to
	 * NULL by rangelock_calc_block().
	 */
	if (lock->rl_currdep == NULL)
		lock->rl_currdep = entry;
	rangelock_calc_block(lock);
	while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) {
		if (trylock) {
			/*
			 * For this case, the range is not actually locked
			 * yet, but removal from the list requires the same
			 * steps, except for not doing a rangelock_calc_block()
			 * call, since rangelock_calc_block() was called above.
			 */
			rangelock_unlock_locked(lock, entry, ilk, false);
			return (NULL);
		}
		msleep(entry, ilk, 0, "range", 0);
	}
	mtx_unlock(ilk);
	return (entry);
}

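/*
 * Public entry points.  The blocking variants sleep until the range
 * can be granted and return a cookie to pass to rangelock_unlock();
 * the "try" variants never sleep and return NULL if the range cannot
 * be granted immediately.
 *
 * A minimal usage sketch; the containing structure and its interlock
 * mutex are hypothetical, not part of this file:
 *
 *	struct foo {
 *		struct mtx	 f_interlock;
 *		struct rangelock f_rl;
 *	} *fp;
 *	void *cookie;
 *
 *	cookie = rangelock_wlock(&fp->f_rl, off, off + len,
 *	    &fp->f_interlock);
 *	... modify the byte range [off, off + len) ...
 *	rangelock_unlock(&fp->f_rl, cookie, &fp->f_interlock);
 */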
void *
rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false));
}

void *
rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
    struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true));
}

void *
rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false));
}

void *
rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
    struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true));
}