1229323d5f
Under Linux filesystem threads responsible for handling I/O are normally created with the maximum priority. Non-I/O filesystem processes run with the default priority. ZFS should adopt the same priority scheme under Linux to maintain good performance and so that it will complete fairly when other Linux filesystems are active. The priorities have been updated to the following: $ ps -eLo rtprio,cls,pid,pri,nice,cmd | egrep 'z_|spl_|zvol|arc|dbu|meta' - TS 10743 19 -20 [spl_kmem_cache] - TS 10744 19 -20 [spl_system_task] - TS 10745 19 -20 [spl_dynamic_tas] - TS 10764 19 0 [dbu_evict] - TS 10765 19 0 [arc_prune] - TS 10766 19 0 [arc_reclaim] - TS 10767 19 0 [arc_user_evicts] - TS 10768 19 0 [l2arc_feed] - TS 10769 39 0 [z_unmount] - TS 10770 39 -20 [zvol] - TS 11011 39 -20 [z_null_iss] - TS 11012 39 -20 [z_null_int] - TS 11013 39 -20 [z_rd_iss] - TS 11014 39 -20 [z_rd_int_0] - TS 11022 38 -19 [z_wr_iss] - TS 11023 39 -20 [z_wr_iss_h] - TS 11024 39 -20 [z_wr_int_0] - TS 11032 39 -20 [z_wr_int_h] - TS 11033 39 -20 [z_fr_iss_0] - TS 11041 39 -20 [z_fr_int] - TS 11042 39 -20 [z_cl_iss] - TS 11043 39 -20 [z_cl_int] - TS 11044 39 -20 [z_ioctl_iss] - TS 11045 39 -20 [z_ioctl_int] - TS 11046 39 -20 [metaslab_group_] - TS 11050 19 0 [z_iput] - TS 11121 38 -19 [z_wr_iss] Note that under Linux the meaning of a processes priority is inverted with respect to illumos. High values on Linux indicate a _low_ priority while high value on illumos indicate a _high_ priority. In order to preserve the logical meaning of the minclsyspri and maxclsyspri macros when they are used by the illumos wrapper functions their values have been inverted. This way when changes are merged from upstream illumos we won't need to remember to invert the macro. It could also lead to confusion. This patch depends on https://github.com/zfsonlinux/spl/pull/466. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ned Bass <bass6@llnl.gov> Closes #3607
384 lines
8.6 KiB
C
384 lines
8.6 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*/
|
|
/*
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
|
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
|
|
* Copyright (c) 2014 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
int taskq_now;
|
|
taskq_t *system_taskq;
|
|
|
|
#define TASKQ_ACTIVE 0x00010000
|
|
#define TASKQ_NAMELEN 31
|
|
|
|
struct taskq {
|
|
char tq_name[TASKQ_NAMELEN + 1];
|
|
kmutex_t tq_lock;
|
|
krwlock_t tq_threadlock;
|
|
kcondvar_t tq_dispatch_cv;
|
|
kcondvar_t tq_wait_cv;
|
|
kthread_t **tq_threadlist;
|
|
int tq_flags;
|
|
int tq_active;
|
|
int tq_nthreads;
|
|
int tq_nalloc;
|
|
int tq_minalloc;
|
|
int tq_maxalloc;
|
|
kcondvar_t tq_maxalloc_cv;
|
|
int tq_maxalloc_wait;
|
|
taskq_ent_t *tq_freelist;
|
|
taskq_ent_t tq_task;
|
|
};
|
|
|
|
static taskq_ent_t *
|
|
task_alloc(taskq_t *tq, int tqflags)
|
|
{
|
|
taskq_ent_t *t;
|
|
int rv;
|
|
|
|
again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
|
|
ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
|
|
tq->tq_freelist = t->tqent_next;
|
|
} else {
|
|
if (tq->tq_nalloc >= tq->tq_maxalloc) {
|
|
if (!(tqflags & KM_SLEEP))
|
|
return (NULL);
|
|
|
|
/*
|
|
* We don't want to exceed tq_maxalloc, but we can't
|
|
* wait for other tasks to complete (and thus free up
|
|
* task structures) without risking deadlock with
|
|
* the caller. So, we just delay for one second
|
|
* to throttle the allocation rate. If we have tasks
|
|
* complete before one second timeout expires then
|
|
* taskq_ent_free will signal us and we will
|
|
* immediately retry the allocation.
|
|
*/
|
|
tq->tq_maxalloc_wait++;
|
|
rv = cv_timedwait(&tq->tq_maxalloc_cv,
|
|
&tq->tq_lock, ddi_get_lbolt() + hz);
|
|
tq->tq_maxalloc_wait--;
|
|
if (rv > 0)
|
|
goto again; /* signaled */
|
|
}
|
|
mutex_exit(&tq->tq_lock);
|
|
|
|
t = kmem_alloc(sizeof (taskq_ent_t), tqflags);
|
|
|
|
mutex_enter(&tq->tq_lock);
|
|
if (t != NULL) {
|
|
/* Make sure we start without any flags */
|
|
t->tqent_flags = 0;
|
|
tq->tq_nalloc++;
|
|
}
|
|
}
|
|
return (t);
|
|
}
|
|
|
|
static void
|
|
task_free(taskq_t *tq, taskq_ent_t *t)
|
|
{
|
|
if (tq->tq_nalloc <= tq->tq_minalloc) {
|
|
t->tqent_next = tq->tq_freelist;
|
|
tq->tq_freelist = t;
|
|
} else {
|
|
tq->tq_nalloc--;
|
|
mutex_exit(&tq->tq_lock);
|
|
kmem_free(t, sizeof (taskq_ent_t));
|
|
mutex_enter(&tq->tq_lock);
|
|
}
|
|
|
|
if (tq->tq_maxalloc_wait)
|
|
cv_signal(&tq->tq_maxalloc_cv);
|
|
}
|
|
|
|
taskqid_t
|
|
taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
|
|
{
|
|
taskq_ent_t *t;
|
|
|
|
if (taskq_now) {
|
|
func(arg);
|
|
return (1);
|
|
}
|
|
|
|
mutex_enter(&tq->tq_lock);
|
|
ASSERT(tq->tq_flags & TASKQ_ACTIVE);
|
|
if ((t = task_alloc(tq, tqflags)) == NULL) {
|
|
mutex_exit(&tq->tq_lock);
|
|
return (0);
|
|
}
|
|
if (tqflags & TQ_FRONT) {
|
|
t->tqent_next = tq->tq_task.tqent_next;
|
|
t->tqent_prev = &tq->tq_task;
|
|
} else {
|
|
t->tqent_next = &tq->tq_task;
|
|
t->tqent_prev = tq->tq_task.tqent_prev;
|
|
}
|
|
t->tqent_next->tqent_prev = t;
|
|
t->tqent_prev->tqent_next = t;
|
|
t->tqent_func = func;
|
|
t->tqent_arg = arg;
|
|
t->tqent_flags = 0;
|
|
cv_signal(&tq->tq_dispatch_cv);
|
|
mutex_exit(&tq->tq_lock);
|
|
return (1);
|
|
}
|
|
|
|
taskqid_t
|
|
taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags,
|
|
clock_t expire_time)
|
|
{
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
taskq_empty_ent(taskq_ent_t *t)
|
|
{
|
|
return (t->tqent_next == NULL);
|
|
}
|
|
|
|
void
|
|
taskq_init_ent(taskq_ent_t *t)
|
|
{
|
|
t->tqent_next = NULL;
|
|
t->tqent_prev = NULL;
|
|
t->tqent_func = NULL;
|
|
t->tqent_arg = NULL;
|
|
t->tqent_flags = 0;
|
|
}
|
|
|
|
void
|
|
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
|
|
taskq_ent_t *t)
|
|
{
|
|
ASSERT(func != NULL);
|
|
|
|
/*
|
|
* Mark it as a prealloc'd task. This is important
|
|
* to ensure that we don't free it later.
|
|
*/
|
|
t->tqent_flags |= TQENT_FLAG_PREALLOC;
|
|
/*
|
|
* Enqueue the task to the underlying queue.
|
|
*/
|
|
mutex_enter(&tq->tq_lock);
|
|
|
|
if (flags & TQ_FRONT) {
|
|
t->tqent_next = tq->tq_task.tqent_next;
|
|
t->tqent_prev = &tq->tq_task;
|
|
} else {
|
|
t->tqent_next = &tq->tq_task;
|
|
t->tqent_prev = tq->tq_task.tqent_prev;
|
|
}
|
|
t->tqent_next->tqent_prev = t;
|
|
t->tqent_prev->tqent_next = t;
|
|
t->tqent_func = func;
|
|
t->tqent_arg = arg;
|
|
cv_signal(&tq->tq_dispatch_cv);
|
|
mutex_exit(&tq->tq_lock);
|
|
}
|
|
|
|
void
|
|
taskq_wait(taskq_t *tq)
|
|
{
|
|
mutex_enter(&tq->tq_lock);
|
|
while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
|
|
cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
|
|
mutex_exit(&tq->tq_lock);
|
|
}
|
|
|
|
void
|
|
taskq_wait_id(taskq_t *tq, taskqid_t id)
|
|
{
|
|
taskq_wait(tq);
|
|
}
|
|
|
|
void
|
|
taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
|
|
{
|
|
taskq_wait(tq);
|
|
}
|
|
|
|
static void
|
|
taskq_thread(void *arg)
|
|
{
|
|
taskq_t *tq = arg;
|
|
taskq_ent_t *t;
|
|
boolean_t prealloc;
|
|
|
|
mutex_enter(&tq->tq_lock);
|
|
while (tq->tq_flags & TASKQ_ACTIVE) {
|
|
if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
|
|
if (--tq->tq_active == 0)
|
|
cv_broadcast(&tq->tq_wait_cv);
|
|
cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
|
|
tq->tq_active++;
|
|
continue;
|
|
}
|
|
t->tqent_prev->tqent_next = t->tqent_next;
|
|
t->tqent_next->tqent_prev = t->tqent_prev;
|
|
t->tqent_next = NULL;
|
|
t->tqent_prev = NULL;
|
|
prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
|
|
mutex_exit(&tq->tq_lock);
|
|
|
|
rw_enter(&tq->tq_threadlock, RW_READER);
|
|
t->tqent_func(t->tqent_arg);
|
|
rw_exit(&tq->tq_threadlock);
|
|
|
|
mutex_enter(&tq->tq_lock);
|
|
if (!prealloc)
|
|
task_free(tq, t);
|
|
}
|
|
tq->tq_nthreads--;
|
|
cv_broadcast(&tq->tq_wait_cv);
|
|
mutex_exit(&tq->tq_lock);
|
|
thread_exit();
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
taskq_t *
|
|
taskq_create(const char *name, int nthreads, pri_t pri,
|
|
int minalloc, int maxalloc, uint_t flags)
|
|
{
|
|
taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
|
|
int t;
|
|
|
|
if (flags & TASKQ_THREADS_CPU_PCT) {
|
|
int pct;
|
|
ASSERT3S(nthreads, >=, 0);
|
|
ASSERT3S(nthreads, <=, 100);
|
|
pct = MIN(nthreads, 100);
|
|
pct = MAX(pct, 0);
|
|
|
|
nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100;
|
|
nthreads = MAX(nthreads, 1); /* need at least 1 thread */
|
|
} else {
|
|
ASSERT3S(nthreads, >=, 1);
|
|
}
|
|
|
|
rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
|
|
mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
|
|
cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
|
|
cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
|
|
(void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
|
|
tq->tq_flags = flags | TASKQ_ACTIVE;
|
|
tq->tq_active = nthreads;
|
|
tq->tq_nthreads = nthreads;
|
|
tq->tq_minalloc = minalloc;
|
|
tq->tq_maxalloc = maxalloc;
|
|
tq->tq_task.tqent_next = &tq->tq_task;
|
|
tq->tq_task.tqent_prev = &tq->tq_task;
|
|
tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *),
|
|
KM_SLEEP);
|
|
|
|
if (flags & TASKQ_PREPOPULATE) {
|
|
mutex_enter(&tq->tq_lock);
|
|
while (minalloc-- > 0)
|
|
task_free(tq, task_alloc(tq, KM_SLEEP));
|
|
mutex_exit(&tq->tq_lock);
|
|
}
|
|
|
|
for (t = 0; t < nthreads; t++)
|
|
VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0,
|
|
taskq_thread, tq, TS_RUN, NULL, 0, pri)) != NULL);
|
|
|
|
return (tq);
|
|
}
|
|
|
|
void
|
|
taskq_destroy(taskq_t *tq)
|
|
{
|
|
int nthreads = tq->tq_nthreads;
|
|
|
|
taskq_wait(tq);
|
|
|
|
mutex_enter(&tq->tq_lock);
|
|
|
|
tq->tq_flags &= ~TASKQ_ACTIVE;
|
|
cv_broadcast(&tq->tq_dispatch_cv);
|
|
|
|
while (tq->tq_nthreads != 0)
|
|
cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
|
|
|
|
tq->tq_minalloc = 0;
|
|
while (tq->tq_nalloc != 0) {
|
|
ASSERT(tq->tq_freelist != NULL);
|
|
task_free(tq, task_alloc(tq, KM_SLEEP));
|
|
}
|
|
|
|
mutex_exit(&tq->tq_lock);
|
|
|
|
kmem_free(tq->tq_threadlist, nthreads * sizeof (kthread_t *));
|
|
|
|
rw_destroy(&tq->tq_threadlock);
|
|
mutex_destroy(&tq->tq_lock);
|
|
cv_destroy(&tq->tq_dispatch_cv);
|
|
cv_destroy(&tq->tq_wait_cv);
|
|
cv_destroy(&tq->tq_maxalloc_cv);
|
|
|
|
kmem_free(tq, sizeof (taskq_t));
|
|
}
|
|
|
|
int
|
|
taskq_member(taskq_t *tq, kthread_t *t)
|
|
{
|
|
int i;
|
|
|
|
if (taskq_now)
|
|
return (1);
|
|
|
|
for (i = 0; i < tq->tq_nthreads; i++)
|
|
if (tq->tq_threadlist[i] == t)
|
|
return (1);
|
|
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
taskq_cancel_id(taskq_t *tq, taskqid_t id)
|
|
{
|
|
return (ENOENT);
|
|
}
|
|
|
|
void
|
|
system_taskq_init(void)
|
|
{
|
|
system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512,
|
|
TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
|
|
}
|
|
|
|
void
|
|
system_taskq_fini(void)
|
|
{
|
|
taskq_destroy(system_taskq);
|
|
system_taskq = NULL; /* defensive */
|
|
}
|