c6d31b8306
Make most AST handlers dynamically registered. This allows to have subsystem-specific handler source located in the subsystem files, instead of making subr_trap.c aware of it. For instance, signal delivery code on return to userspace is now moved to kern_sig.c. Also, it allows to have some handlers designated as the cleanup (kclear) type, which are called both at AST and on thread/process exit. For instance, ast(), exit1(), and NFS server no longer need to be aware about UFS softdep processing. The dynamic registration also allows third-party modules to register AST handlers if needed. There is one caveat with loadable modules: the code does not make any effort to ensure that the module is not unloaded before all threads processed through AST handler in it. In fact, this is already present behavior for hwpmc.ko and ufs.ko. I do not think it is worth the efforts and the runtime overhead to try to fix it. Reviewed by: markj Tested by: emaste (arm64), pho Discussed with: jhb Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D35888
1373 lines
34 KiB
C
1373 lines
34 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (c) 2010 The FreeBSD Foundation
|
|
*
|
|
* This software was developed by Edward Tomasz Napierala under sponsorship
|
|
* from the FreeBSD Foundation.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include "opt_sched.h"
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/eventhandler.h>
|
|
#include <sys/jail.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/kthread.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/loginclass.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/racct.h>
|
|
#include <sys/resourcevar.h>
|
|
#include <sys/sbuf.h>
|
|
#include <sys/sched.h>
|
|
#include <sys/sdt.h>
|
|
#include <sys/smp.h>
|
|
#include <sys/sx.h>
|
|
#include <sys/sysctl.h>
|
|
#include <sys/sysproto.h>
|
|
#include <sys/umtxvar.h>
|
|
#include <machine/smp.h>
|
|
|
|
#ifdef RCTL
|
|
#include <sys/rctl.h>
|
|
#endif
|
|
|
|
#ifdef RACCT
|
|
|
|
FEATURE(racct, "Resource Accounting");
|
|
|
|
/*
|
|
* Do not block processes that have their %cpu usage <= pcpu_threshold.
|
|
*/
|
|
static int pcpu_threshold = 1;
|
|
#ifdef RACCT_DEFAULT_TO_DISABLED
|
|
bool __read_frequently racct_enable = false;
|
|
#else
|
|
bool __read_frequently racct_enable = true;
|
|
#endif
|
|
|
|
SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
"Resource Accounting");
|
|
SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
|
|
0, "Enable RACCT/RCTL");
|
|
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
|
|
0, "Processes with higher %cpu usage than this value can be throttled.");
|
|
|
|
/*
|
|
* How many seconds it takes to use the scheduler %cpu calculations. When a
|
|
* process starts, we compute its %cpu usage by dividing its runtime by the
|
|
* process wall clock time. After RACCT_PCPU_SECS pass, we use the value
|
|
* provided by the scheduler.
|
|
*/
|
|
#define RACCT_PCPU_SECS 3
|
|
|
|
struct mtx racct_lock;
|
|
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
|
|
|
|
static uma_zone_t racct_zone;
|
|
|
|
static void racct_sub_racct(struct racct *dest, const struct racct *src);
|
|
static void racct_sub_cred_locked(struct ucred *cred, int resource,
|
|
uint64_t amount);
|
|
static void racct_add_cred_locked(struct ucred *cred, int resource,
|
|
uint64_t amount);
|
|
|
|
SDT_PROVIDER_DEFINE(racct);
|
|
SDT_PROBE_DEFINE3(racct, , rusage, add,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, add__failure,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, add__buf,
|
|
"struct proc *", "const struct buf *", "int");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
|
|
"struct ucred *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, add__force,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, set,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, set__force,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, sub,
|
|
"struct proc *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
|
|
"struct ucred *", "int", "uint64_t");
|
|
SDT_PROBE_DEFINE1(racct, , racct, create,
|
|
"struct racct *");
|
|
SDT_PROBE_DEFINE1(racct, , racct, destroy,
|
|
"struct racct *");
|
|
SDT_PROBE_DEFINE2(racct, , racct, join,
|
|
"struct racct *", "struct racct *");
|
|
SDT_PROBE_DEFINE2(racct, , racct, join__failure,
|
|
"struct racct *", "struct racct *");
|
|
SDT_PROBE_DEFINE2(racct, , racct, leave,
|
|
"struct racct *", "struct racct *");
|
|
|
|
int racct_types[] = {
|
|
[RACCT_CPU] =
|
|
RACCT_IN_MILLIONS,
|
|
[RACCT_DATA] =
|
|
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
|
|
[RACCT_STACK] =
|
|
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
|
|
[RACCT_CORE] =
|
|
RACCT_DENIABLE,
|
|
[RACCT_RSS] =
|
|
RACCT_RECLAIMABLE,
|
|
[RACCT_MEMLOCK] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE,
|
|
[RACCT_NPROC] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE,
|
|
[RACCT_NOFILE] =
|
|
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
|
|
[RACCT_VMEM] =
|
|
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
|
|
[RACCT_NPTS] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_SWAP] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_NTHR] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE,
|
|
[RACCT_MSGQQUEUED] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_MSGQSIZE] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_NMSGQ] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_NSEM] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_NSEMOP] =
|
|
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
|
|
[RACCT_NSHM] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_SHMSIZE] =
|
|
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
|
|
[RACCT_WALLCLOCK] =
|
|
RACCT_IN_MILLIONS,
|
|
[RACCT_PCTCPU] =
|
|
RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
|
|
[RACCT_READBPS] =
|
|
RACCT_DECAYING,
|
|
[RACCT_WRITEBPS] =
|
|
RACCT_DECAYING,
|
|
[RACCT_READIOPS] =
|
|
RACCT_DECAYING,
|
|
[RACCT_WRITEIOPS] =
|
|
RACCT_DECAYING };
|
|
|
|
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
|
|
|
|
#ifdef SCHED_4BSD
|
|
/*
|
|
* Contains intermediate values for %cpu calculations to avoid using floating
|
|
* point in the kernel.
|
|
* ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
|
|
* It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to
|
|
* zero so the calculations are more straightforward.
|
|
*/
|
|
fixpt_t ccpu_exp[] = {
|
|
[0] = FSCALE * 1,
|
|
[1] = FSCALE * 0.95122942450071400909,
|
|
[2] = FSCALE * 0.90483741803595957316,
|
|
[3] = FSCALE * 0.86070797642505780722,
|
|
[4] = FSCALE * 0.81873075307798185866,
|
|
[5] = FSCALE * 0.77880078307140486824,
|
|
[6] = FSCALE * 0.74081822068171786606,
|
|
[7] = FSCALE * 0.70468808971871343435,
|
|
[8] = FSCALE * 0.67032004603563930074,
|
|
[9] = FSCALE * 0.63762815162177329314,
|
|
[10] = FSCALE * 0.60653065971263342360,
|
|
[11] = FSCALE * 0.57694981038048669531,
|
|
[12] = FSCALE * 0.54881163609402643262,
|
|
[13] = FSCALE * 0.52204577676101604789,
|
|
[14] = FSCALE * 0.49658530379140951470,
|
|
[15] = FSCALE * 0.47236655274101470713,
|
|
[16] = FSCALE * 0.44932896411722159143,
|
|
[17] = FSCALE * 0.42741493194872666992,
|
|
[18] = FSCALE * 0.40656965974059911188,
|
|
[19] = FSCALE * 0.38674102345450120691,
|
|
[20] = FSCALE * 0.36787944117144232159,
|
|
[21] = FSCALE * 0.34993774911115535467,
|
|
[22] = FSCALE * 0.33287108369807955328,
|
|
[23] = FSCALE * 0.31663676937905321821,
|
|
[24] = FSCALE * 0.30119421191220209664,
|
|
[25] = FSCALE * 0.28650479686019010032,
|
|
[26] = FSCALE * 0.27253179303401260312,
|
|
[27] = FSCALE * 0.25924026064589150757,
|
|
[28] = FSCALE * 0.24659696394160647693,
|
|
[29] = FSCALE * 0.23457028809379765313,
|
|
[30] = FSCALE * 0.22313016014842982893,
|
|
[31] = FSCALE * 0.21224797382674305771,
|
|
[32] = FSCALE * 0.20189651799465540848,
|
|
[33] = FSCALE * 0.19204990862075411423,
|
|
[34] = FSCALE * 0.18268352405273465022,
|
|
[35] = FSCALE * 0.17377394345044512668,
|
|
[36] = FSCALE * 0.16529888822158653829,
|
|
[37] = FSCALE * 0.15723716631362761621,
|
|
[38] = FSCALE * 0.14956861922263505264,
|
|
[39] = FSCALE * 0.14227407158651357185,
|
|
[40] = FSCALE * 0.13533528323661269189,
|
|
[41] = FSCALE * 0.12873490358780421886,
|
|
[42] = FSCALE * 0.12245642825298191021,
|
|
[43] = FSCALE * 0.11648415777349695786,
|
|
[44] = FSCALE * 0.11080315836233388333,
|
|
[45] = FSCALE * 0.10539922456186433678,
|
|
[46] = FSCALE * 0.10025884372280373372,
|
|
[47] = FSCALE * 0.09536916221554961888,
|
|
[48] = FSCALE * 0.09071795328941250337,
|
|
[49] = FSCALE * 0.08629358649937051097,
|
|
[50] = FSCALE * 0.08208499862389879516,
|
|
[51] = FSCALE * 0.07808166600115315231,
|
|
[52] = FSCALE * 0.07427357821433388042,
|
|
[53] = FSCALE * 0.07065121306042958674,
|
|
[54] = FSCALE * 0.06720551273974976512,
|
|
[55] = FSCALE * 0.06392786120670757270,
|
|
[56] = FSCALE * 0.06081006262521796499,
|
|
[57] = FSCALE * 0.05784432087483846296,
|
|
[58] = FSCALE * 0.05502322005640722902,
|
|
[59] = FSCALE * 0.05233970594843239308,
|
|
[60] = FSCALE * 0.04978706836786394297,
|
|
[61] = FSCALE * 0.04735892439114092119,
|
|
[62] = FSCALE * 0.04504920239355780606,
|
|
[63] = FSCALE * 0.04285212686704017991,
|
|
[64] = FSCALE * 0.04076220397836621516,
|
|
[65] = FSCALE * 0.03877420783172200988,
|
|
[66] = FSCALE * 0.03688316740124000544,
|
|
[67] = FSCALE * 0.03508435410084502588,
|
|
[68] = FSCALE * 0.03337326996032607948,
|
|
[69] = FSCALE * 0.03174563637806794323,
|
|
[70] = FSCALE * 0.03019738342231850073,
|
|
[71] = FSCALE * 0.02872463965423942912,
|
|
[72] = FSCALE * 0.02732372244729256080,
|
|
[73] = FSCALE * 0.02599112877875534358,
|
|
[74] = FSCALE * 0.02472352647033939120,
|
|
[75] = FSCALE * 0.02351774585600910823,
|
|
[76] = FSCALE * 0.02237077185616559577,
|
|
[77] = FSCALE * 0.02127973643837716938,
|
|
[78] = FSCALE * 0.02024191144580438847,
|
|
[79] = FSCALE * 0.01925470177538692429,
|
|
[80] = FSCALE * 0.01831563888873418029,
|
|
[81] = FSCALE * 0.01742237463949351138,
|
|
[82] = FSCALE * 0.01657267540176124754,
|
|
[83] = FSCALE * 0.01576441648485449082,
|
|
[84] = FSCALE * 0.01499557682047770621,
|
|
[85] = FSCALE * 0.01426423390899925527,
|
|
[86] = FSCALE * 0.01356855901220093175,
|
|
[87] = FSCALE * 0.01290681258047986886,
|
|
[88] = FSCALE * 0.01227733990306844117,
|
|
[89] = FSCALE * 0.01167856697039544521,
|
|
[90] = FSCALE * 0.01110899653824230649,
|
|
[91] = FSCALE * 0.01056720438385265337,
|
|
[92] = FSCALE * 0.01005183574463358164,
|
|
[93] = FSCALE * 0.00956160193054350793,
|
|
[94] = FSCALE * 0.00909527710169581709,
|
|
[95] = FSCALE * 0.00865169520312063417,
|
|
[96] = FSCALE * 0.00822974704902002884,
|
|
[97] = FSCALE * 0.00782837754922577143,
|
|
[98] = FSCALE * 0.00744658307092434051,
|
|
[99] = FSCALE * 0.00708340892905212004,
|
|
[100] = FSCALE * 0.00673794699908546709,
|
|
[101] = FSCALE * 0.00640933344625638184,
|
|
[102] = FSCALE * 0.00609674656551563610,
|
|
[103] = FSCALE * 0.00579940472684214321,
|
|
[104] = FSCALE * 0.00551656442076077241,
|
|
[105] = FSCALE * 0.00524751839918138427,
|
|
[106] = FSCALE * 0.00499159390691021621,
|
|
[107] = FSCALE * 0.00474815099941147558,
|
|
[108] = FSCALE * 0.00451658094261266798,
|
|
[109] = FSCALE * 0.00429630469075234057,
|
|
[110] = FSCALE * 0.00408677143846406699,
|
|
};
|
|
#endif
|
|
|
|
#define CCPU_EXP_MAX 110
|
|
|
|
/*
|
|
* This function is analogical to the getpcpu() function in the ps(1) command.
|
|
* They should both calculate in the same way so that the racct %cpu
|
|
* calculations are consistent with the values showed by the ps(1) tool.
|
|
* The calculations are more complex in the 4BSD scheduler because of the value
|
|
* of the ccpu variable. In ULE it is defined to be zero which saves us some
|
|
* work.
|
|
*/
|
|
static uint64_t
|
|
racct_getpcpu(struct proc *p, u_int pcpu)
|
|
{
|
|
u_int swtime;
|
|
#ifdef SCHED_4BSD
|
|
fixpt_t pctcpu, pctcpu_next;
|
|
#endif
|
|
#ifdef SMP
|
|
struct pcpu *pc;
|
|
int found;
|
|
#endif
|
|
fixpt_t p_pctcpu;
|
|
struct thread *td;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
/*
|
|
* If the process is swapped out, we count its %cpu usage as zero.
|
|
* This behaviour is consistent with the userland ps(1) tool.
|
|
*/
|
|
if ((p->p_flag & P_INMEM) == 0)
|
|
return (0);
|
|
swtime = (ticks - p->p_swtick) / hz;
|
|
|
|
/*
|
|
* For short-lived processes, the sched_pctcpu() returns small
|
|
* values even for cpu intensive processes. Therefore we use
|
|
* our own estimate in this case.
|
|
*/
|
|
if (swtime < RACCT_PCPU_SECS)
|
|
return (pcpu);
|
|
|
|
p_pctcpu = 0;
|
|
FOREACH_THREAD_IN_PROC(p, td) {
|
|
if (td == PCPU_GET(idlethread))
|
|
continue;
|
|
#ifdef SMP
|
|
found = 0;
|
|
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
|
|
if (td == pc->pc_idlethread) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (found)
|
|
continue;
|
|
#endif
|
|
thread_lock(td);
|
|
#ifdef SCHED_4BSD
|
|
pctcpu = sched_pctcpu(td);
|
|
/* Count also the yet unfinished second. */
|
|
pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
|
|
pctcpu_next += sched_pctcpu_delta(td);
|
|
p_pctcpu += max(pctcpu, pctcpu_next);
|
|
#else
|
|
/*
|
|
* In ULE the %cpu statistics are updated on every
|
|
* sched_pctcpu() call. So special calculations to
|
|
* account for the latest (unfinished) second are
|
|
* not needed.
|
|
*/
|
|
p_pctcpu += sched_pctcpu(td);
|
|
#endif
|
|
thread_unlock(td);
|
|
}
|
|
|
|
#ifdef SCHED_4BSD
|
|
if (swtime <= CCPU_EXP_MAX)
|
|
return ((100 * (uint64_t)p_pctcpu * 1000000) /
|
|
(FSCALE - ccpu_exp[swtime]));
|
|
#endif
|
|
|
|
return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
|
|
}
|
|
|
|
static void
|
|
racct_add_racct(struct racct *dest, const struct racct *src)
|
|
{
|
|
int i;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
RACCT_LOCK_ASSERT();
|
|
|
|
/*
|
|
* Update resource usage in dest.
|
|
*/
|
|
for (i = 0; i <= RACCT_MAX; i++) {
|
|
KASSERT(dest->r_resources[i] >= 0,
|
|
("%s: resource %d propagation meltdown: dest < 0",
|
|
__func__, i));
|
|
KASSERT(src->r_resources[i] >= 0,
|
|
("%s: resource %d propagation meltdown: src < 0",
|
|
__func__, i));
|
|
dest->r_resources[i] += src->r_resources[i];
|
|
}
|
|
}
|
|
|
|
static void
|
|
racct_sub_racct(struct racct *dest, const struct racct *src)
|
|
{
|
|
int i;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
RACCT_LOCK_ASSERT();
|
|
|
|
/*
|
|
* Update resource usage in dest.
|
|
*/
|
|
for (i = 0; i <= RACCT_MAX; i++) {
|
|
if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
|
|
KASSERT(dest->r_resources[i] >= 0,
|
|
("%s: resource %d propagation meltdown: dest < 0",
|
|
__func__, i));
|
|
KASSERT(src->r_resources[i] >= 0,
|
|
("%s: resource %d propagation meltdown: src < 0",
|
|
__func__, i));
|
|
KASSERT(src->r_resources[i] <= dest->r_resources[i],
|
|
("%s: resource %d propagation meltdown: src > dest",
|
|
__func__, i));
|
|
}
|
|
if (RACCT_CAN_DROP(i)) {
|
|
dest->r_resources[i] -= src->r_resources[i];
|
|
if (dest->r_resources[i] < 0)
|
|
dest->r_resources[i] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
racct_create(struct racct **racctp)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE1(racct, , racct, create, racctp);
|
|
|
|
KASSERT(*racctp == NULL, ("racct already allocated"));
|
|
|
|
*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
|
|
}
|
|
|
|
static void
|
|
racct_destroy_locked(struct racct **racctp)
|
|
{
|
|
struct racct *racct;
|
|
int i;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
SDT_PROBE1(racct, , racct, destroy, racctp);
|
|
|
|
RACCT_LOCK_ASSERT();
|
|
KASSERT(racctp != NULL, ("NULL racctp"));
|
|
KASSERT(*racctp != NULL, ("NULL racct"));
|
|
|
|
racct = *racctp;
|
|
|
|
for (i = 0; i <= RACCT_MAX; i++) {
|
|
if (RACCT_IS_SLOPPY(i))
|
|
continue;
|
|
if (!RACCT_IS_RECLAIMABLE(i))
|
|
continue;
|
|
KASSERT(racct->r_resources[i] == 0,
|
|
("destroying non-empty racct: "
|
|
"%ju allocated for resource %d\n",
|
|
racct->r_resources[i], i));
|
|
}
|
|
uma_zfree(racct_zone, racct);
|
|
*racctp = NULL;
|
|
}
|
|
|
|
void
|
|
racct_destroy(struct racct **racct)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
RACCT_LOCK();
|
|
racct_destroy_locked(racct);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
/*
|
|
* Increase consumption of 'resource' by 'amount' for 'racct',
|
|
* but not its parents. Differently from other cases, 'amount' here
|
|
* may be less than zero.
|
|
*/
|
|
static void
|
|
racct_adjust_resource(struct racct *racct, int resource,
|
|
int64_t amount)
|
|
{
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
RACCT_LOCK_ASSERT();
|
|
KASSERT(racct != NULL, ("NULL racct"));
|
|
|
|
racct->r_resources[resource] += amount;
|
|
if (racct->r_resources[resource] < 0) {
|
|
KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
|
|
("%s: resource %d usage < 0", __func__, resource));
|
|
racct->r_resources[resource] = 0;
|
|
}
|
|
|
|
/*
|
|
* There are some cases where the racct %cpu resource would grow
|
|
* beyond 100% per core. For example in racct_proc_exit() we add
|
|
* the process %cpu usage to the ucred racct containers. If too
|
|
* many processes terminated in a short time span, the ucred %cpu
|
|
* resource could grow too much. Also, the 4BSD scheduler sometimes
|
|
* returns for a thread more than 100% cpu usage. So we set a sane
|
|
* boundary here to 100% * the maximum number of CPUs.
|
|
*/
|
|
if ((resource == RACCT_PCTCPU) &&
|
|
(racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
|
|
racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
|
|
}
|
|
|
|
static int
|
|
racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
|
|
{
|
|
#ifdef RCTL
|
|
int error;
|
|
#endif
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
/*
|
|
* We need proc lock to dereference p->p_ucred.
|
|
*/
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
#ifdef RCTL
|
|
error = rctl_enforce(p, resource, amount);
|
|
if (error && !force && RACCT_IS_DENIABLE(resource)) {
|
|
SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
|
|
return (error);
|
|
}
|
|
#endif
|
|
racct_adjust_resource(p->p_racct, resource, amount);
|
|
racct_add_cred_locked(p->p_ucred, resource, amount);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Increase allocation of 'resource' by 'amount' for process 'p'.
|
|
* Return 0 if it's below limits, or errno, if it's not.
|
|
*/
|
|
int
|
|
racct_add(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
int error;
|
|
|
|
if (!racct_enable)
|
|
return (0);
|
|
|
|
SDT_PROBE3(racct, , rusage, add, p, resource, amount);
|
|
|
|
RACCT_LOCK();
|
|
error = racct_add_locked(p, resource, amount, 0);
|
|
RACCT_UNLOCK();
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Increase allocation of 'resource' by 'amount' for process 'p'.
|
|
* Doesn't check for limits and never fails.
|
|
*/
|
|
void
|
|
racct_add_force(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
|
|
|
|
RACCT_LOCK();
|
|
racct_add_locked(p, resource, amount, 1);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
static void
|
|
racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
|
|
{
|
|
struct prison *pr;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
|
|
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
|
|
racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
|
|
amount);
|
|
racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
|
|
}
|
|
|
|
/*
|
|
* Increase allocation of 'resource' by 'amount' for credential 'cred'.
|
|
* Doesn't check for limits and never fails.
|
|
*/
|
|
void
|
|
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);
|
|
|
|
RACCT_LOCK();
|
|
racct_add_cred_locked(cred, resource, amount);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
/*
|
|
* Account for disk IO resource consumption. Checks for limits,
|
|
* but never fails, due to disk limits being undeniable.
|
|
*/
|
|
void
|
|
racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
|
|
{
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);
|
|
|
|
RACCT_LOCK();
|
|
if (is_write) {
|
|
racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
|
|
racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
|
|
} else {
|
|
racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
|
|
racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
|
|
}
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
static int
|
|
racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
|
|
{
|
|
int64_t old_amount, decayed_amount, diff_proc, diff_cred;
|
|
#ifdef RCTL
|
|
int error;
|
|
#endif
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
/*
|
|
* We need proc lock to dereference p->p_ucred.
|
|
*/
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
old_amount = p->p_racct->r_resources[resource];
|
|
/*
|
|
* The diffs may be negative.
|
|
*/
|
|
diff_proc = amount - old_amount;
|
|
if (resource == RACCT_PCTCPU) {
|
|
/*
|
|
* Resources in per-credential racct containers may decay.
|
|
* If this is the case, we need to calculate the difference
|
|
* between the new amount and the proportional value of the
|
|
* old amount that has decayed in the ucred racct containers.
|
|
*/
|
|
decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
|
|
diff_cred = amount - decayed_amount;
|
|
} else
|
|
diff_cred = diff_proc;
|
|
#ifdef notyet
|
|
KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
|
|
("%s: usage of non-droppable resource %d dropping", __func__,
|
|
resource));
|
|
#endif
|
|
#ifdef RCTL
|
|
if (diff_proc > 0) {
|
|
error = rctl_enforce(p, resource, diff_proc);
|
|
if (error && !force && RACCT_IS_DENIABLE(resource)) {
|
|
SDT_PROBE3(racct, , rusage, set__failure, p, resource,
|
|
amount);
|
|
return (error);
|
|
}
|
|
}
|
|
#endif
|
|
racct_adjust_resource(p->p_racct, resource, diff_proc);
|
|
if (diff_cred > 0)
|
|
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
|
|
else if (diff_cred < 0)
|
|
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Set allocation of 'resource' to 'amount' for process 'p'.
|
|
* Return 0 if it's below limits, or errno, if it's not.
|
|
*
|
|
* Note that decreasing the allocation always returns 0,
|
|
* even if it's above the limit.
|
|
*/
|
|
int
|
|
racct_set_unlocked(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
int error;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
PROC_LOCK(p);
|
|
error = racct_set(p, resource, amount);
|
|
PROC_UNLOCK(p);
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
racct_set(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
int error;
|
|
|
|
if (!racct_enable)
|
|
return (0);
|
|
|
|
SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
|
|
|
|
RACCT_LOCK();
|
|
error = racct_set_locked(p, resource, amount, 0);
|
|
RACCT_UNLOCK();
|
|
return (error);
|
|
}
|
|
|
|
void
|
|
racct_set_force(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE3(racct, , rusage, set, p, resource, amount);
|
|
|
|
RACCT_LOCK();
|
|
racct_set_locked(p, resource, amount, 1);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
/*
|
|
* Returns amount of 'resource' the process 'p' can keep allocated.
|
|
* Allocating more than that would be denied, unless the resource
|
|
* is marked undeniable. Amount of already allocated resource does
|
|
* not matter.
|
|
*/
|
|
uint64_t
|
|
racct_get_limit(struct proc *p, int resource)
|
|
{
|
|
#ifdef RCTL
|
|
uint64_t available;
|
|
|
|
if (!racct_enable)
|
|
return (UINT64_MAX);
|
|
|
|
RACCT_LOCK();
|
|
available = rctl_get_limit(p, resource);
|
|
RACCT_UNLOCK();
|
|
|
|
return (available);
|
|
#else
|
|
|
|
return (UINT64_MAX);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Returns amount of 'resource' the process 'p' can keep allocated.
|
|
* Allocating more than that would be denied, unless the resource
|
|
* is marked undeniable. Amount of already allocated resource does
|
|
* matter.
|
|
*/
|
|
uint64_t
|
|
racct_get_available(struct proc *p, int resource)
|
|
{
|
|
#ifdef RCTL
|
|
uint64_t available;
|
|
|
|
if (!racct_enable)
|
|
return (UINT64_MAX);
|
|
|
|
RACCT_LOCK();
|
|
available = rctl_get_available(p, resource);
|
|
RACCT_UNLOCK();
|
|
|
|
return (available);
|
|
#else
|
|
|
|
return (UINT64_MAX);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Returns amount of the %cpu resource that process 'p' can add to its %cpu
|
|
* utilization. Adding more than that would lead to the process being
|
|
* throttled.
|
|
*/
|
|
static int64_t
|
|
racct_pcpu_available(struct proc *p)
|
|
{
|
|
#ifdef RCTL
|
|
uint64_t available;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
RACCT_LOCK();
|
|
available = rctl_pcpu_available(p);
|
|
RACCT_UNLOCK();
|
|
|
|
return (available);
|
|
#else
|
|
|
|
return (INT64_MAX);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Decrease allocation of 'resource' by 'amount' for process 'p'.
|
|
*/
|
|
void
|
|
racct_sub(struct proc *p, int resource, uint64_t amount)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE3(racct, , rusage, sub, p, resource, amount);
|
|
|
|
/*
|
|
* We need proc lock to dereference p->p_ucred.
|
|
*/
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
KASSERT(RACCT_CAN_DROP(resource),
|
|
("%s: called for non-droppable resource %d", __func__, resource));
|
|
|
|
RACCT_LOCK();
|
|
KASSERT(amount <= p->p_racct->r_resources[resource],
|
|
("%s: freeing %ju of resource %d, which is more "
|
|
"than allocated %jd for %s (pid %d)", __func__, amount, resource,
|
|
(intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
|
|
|
|
racct_adjust_resource(p->p_racct, resource, -amount);
|
|
racct_sub_cred_locked(p->p_ucred, resource, amount);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
static void
|
|
racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
|
|
{
|
|
struct prison *pr;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
|
|
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
|
|
racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
|
|
-amount);
|
|
racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
|
|
}
|
|
|
|
/*
|
|
* Decrease allocation of 'resource' by 'amount' for credential 'cred'.
|
|
*/
|
|
void
|
|
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);
|
|
|
|
#ifdef notyet
|
|
KASSERT(RACCT_CAN_DROP(resource),
|
|
("%s: called for resource %d which can not drop", __func__,
|
|
resource));
|
|
#endif
|
|
|
|
RACCT_LOCK();
|
|
racct_sub_cred_locked(cred, resource, amount);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
/*
|
|
* Inherit resource usage information from the parent process.
|
|
*/
|
|
int
|
|
racct_proc_fork(struct proc *parent, struct proc *child)
|
|
{
|
|
int i, error = 0;
|
|
|
|
if (!racct_enable)
|
|
return (0);
|
|
|
|
/*
|
|
* Create racct for the child process.
|
|
*/
|
|
racct_create(&child->p_racct);
|
|
|
|
PROC_LOCK(parent);
|
|
PROC_LOCK(child);
|
|
RACCT_LOCK();
|
|
|
|
#ifdef RCTL
|
|
error = rctl_proc_fork(parent, child);
|
|
if (error != 0)
|
|
goto out;
|
|
#endif
|
|
|
|
/* Init process cpu time. */
|
|
child->p_prev_runtime = 0;
|
|
child->p_throttled = 0;
|
|
|
|
/*
|
|
* Inherit resource usage.
|
|
*/
|
|
for (i = 0; i <= RACCT_MAX; i++) {
|
|
if (parent->p_racct->r_resources[i] == 0 ||
|
|
!RACCT_IS_INHERITABLE(i))
|
|
continue;
|
|
|
|
error = racct_set_locked(child, i,
|
|
parent->p_racct->r_resources[i], 0);
|
|
if (error != 0)
|
|
goto out;
|
|
}
|
|
|
|
error = racct_add_locked(child, RACCT_NPROC, 1, 0);
|
|
error += racct_add_locked(child, RACCT_NTHR, 1, 0);
|
|
|
|
out:
|
|
RACCT_UNLOCK();
|
|
PROC_UNLOCK(child);
|
|
PROC_UNLOCK(parent);
|
|
|
|
if (error != 0)
|
|
racct_proc_exit(child);
|
|
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Called at the end of fork1(), to handle rules that require the process
|
|
* to be fully initialized.
|
|
*/
|
|
void
|
|
racct_proc_fork_done(struct proc *child)
|
|
{
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
#ifdef RCTL
|
|
PROC_LOCK(child);
|
|
RACCT_LOCK();
|
|
rctl_enforce(child, RACCT_NPROC, 0);
|
|
rctl_enforce(child, RACCT_NTHR, 0);
|
|
RACCT_UNLOCK();
|
|
PROC_UNLOCK(child);
|
|
#endif
|
|
}
|
|
|
|
void
|
|
racct_proc_exit(struct proc *p)
|
|
{
|
|
struct timeval wallclock;
|
|
uint64_t pct_estimate, pct, runtime;
|
|
int i;
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
PROC_LOCK(p);
|
|
/*
|
|
* We don't need to calculate rux, proc_reap() has already done this.
|
|
*/
|
|
runtime = cputick2usec(p->p_rux.rux_runtime);
|
|
#ifdef notyet
|
|
KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
|
|
#else
|
|
if (runtime < p->p_prev_runtime)
|
|
runtime = p->p_prev_runtime;
|
|
#endif
|
|
microuptime(&wallclock);
|
|
timevalsub(&wallclock, &p->p_stats->p_start);
|
|
if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
|
|
pct_estimate = (1000000 * runtime * 100) /
|
|
((uint64_t)wallclock.tv_sec * 1000000 +
|
|
wallclock.tv_usec);
|
|
} else
|
|
pct_estimate = 0;
|
|
pct = racct_getpcpu(p, pct_estimate);
|
|
|
|
RACCT_LOCK();
|
|
racct_set_locked(p, RACCT_CPU, runtime, 0);
|
|
racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
|
|
|
|
KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
|
|
("process reaped with %ju allocated for RSS\n",
|
|
p->p_racct->r_resources[RACCT_RSS]));
|
|
for (i = 0; i <= RACCT_MAX; i++) {
|
|
if (p->p_racct->r_resources[i] == 0)
|
|
continue;
|
|
if (!RACCT_IS_RECLAIMABLE(i))
|
|
continue;
|
|
racct_set_locked(p, i, 0, 0);
|
|
}
|
|
|
|
#ifdef RCTL
|
|
rctl_racct_release(p->p_racct);
|
|
#endif
|
|
racct_destroy_locked(&p->p_racct);
|
|
RACCT_UNLOCK();
|
|
PROC_UNLOCK(p);
|
|
}
|
|
|
|
/*
|
|
* Called after credentials change, to move resource utilisation
|
|
* between raccts.
|
|
*/
|
|
void
|
|
racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
|
|
struct ucred *newcred)
|
|
{
|
|
struct uidinfo *olduip, *newuip;
|
|
struct loginclass *oldlc, *newlc;
|
|
struct prison *oldpr, *newpr, *pr;
|
|
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
newuip = newcred->cr_ruidinfo;
|
|
olduip = oldcred->cr_ruidinfo;
|
|
newlc = newcred->cr_loginclass;
|
|
oldlc = oldcred->cr_loginclass;
|
|
newpr = newcred->cr_prison;
|
|
oldpr = oldcred->cr_prison;
|
|
|
|
RACCT_LOCK();
|
|
if (newuip != olduip) {
|
|
racct_sub_racct(olduip->ui_racct, p->p_racct);
|
|
racct_add_racct(newuip->ui_racct, p->p_racct);
|
|
}
|
|
if (newlc != oldlc) {
|
|
racct_sub_racct(oldlc->lc_racct, p->p_racct);
|
|
racct_add_racct(newlc->lc_racct, p->p_racct);
|
|
}
|
|
if (newpr != oldpr) {
|
|
for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
|
|
racct_sub_racct(pr->pr_prison_racct->prr_racct,
|
|
p->p_racct);
|
|
for (pr = newpr; pr != NULL; pr = pr->pr_parent)
|
|
racct_add_racct(pr->pr_prison_racct->prr_racct,
|
|
p->p_racct);
|
|
}
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
void
|
|
racct_move(struct racct *dest, struct racct *src)
|
|
{
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
RACCT_LOCK();
|
|
racct_add_racct(dest, src);
|
|
racct_sub_racct(src, src);
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
static void
|
|
ast_racct(struct thread *td, int tda __unused)
|
|
{
|
|
struct proc *p;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
p = td->td_proc;
|
|
if (p->p_throttled == 0)
|
|
return;
|
|
|
|
PROC_LOCK(p);
|
|
while (p->p_throttled != 0) {
|
|
msleep(p->p_racct, &p->p_mtx, 0, "racct",
|
|
p->p_throttled < 0 ? 0 : p->p_throttled);
|
|
if (p->p_throttled > 0)
|
|
p->p_throttled = 0;
|
|
}
|
|
PROC_UNLOCK(p);
|
|
}
|
|
|
|
/*
|
|
* Make the process sleep in userret() for 'timeout' ticks. Setting
|
|
* timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
|
|
*/
|
|
void
|
|
racct_proc_throttle(struct proc *p, int timeout)
|
|
{
|
|
struct thread *td;
|
|
#ifdef SMP
|
|
int cpuid;
|
|
#endif
|
|
|
|
KASSERT(timeout != 0, ("timeout %d", timeout));
|
|
ASSERT_RACCT_ENABLED();
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
/*
|
|
* Do not block kernel processes. Also do not block processes with
|
|
* low %cpu utilization to improve interactivity.
|
|
*/
|
|
if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
|
|
return;
|
|
|
|
if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
|
|
return;
|
|
|
|
p->p_throttled = timeout;
|
|
|
|
FOREACH_THREAD_IN_PROC(p, td) {
|
|
thread_lock(td);
|
|
ast_sched_locked(td, TDA_RACCT);
|
|
|
|
switch (TD_GET_STATE(td)) {
|
|
case TDS_RUNQ:
|
|
/*
|
|
* If the thread is on the scheduler run-queue, we can
|
|
* not just remove it from there. So we set the flag
|
|
* TDA_SCHED for the thread, so that once it is
|
|
* running, it is taken off the cpu as soon as possible.
|
|
*/
|
|
ast_sched_locked(td, TDA_SCHED);
|
|
break;
|
|
case TDS_RUNNING:
|
|
/*
|
|
* If the thread is running, we request a context
|
|
* switch for it by setting the TDA_SCHED flag.
|
|
*/
|
|
ast_sched_locked(td, TDA_SCHED);
|
|
#ifdef SMP
|
|
cpuid = td->td_oncpu;
|
|
if ((cpuid != NOCPU) && (td != curthread))
|
|
ipi_cpu(cpuid, IPI_AST);
|
|
#endif
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
thread_unlock(td);
|
|
}
|
|
}
|
|
|
|
static void
|
|
racct_proc_wakeup(struct proc *p)
|
|
{
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
PROC_LOCK_ASSERT(p, MA_OWNED);
|
|
|
|
if (p->p_throttled != 0) {
|
|
p->p_throttled = 0;
|
|
wakeup(p->p_racct);
|
|
}
|
|
}
|
|
|
|
static void
|
|
racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
|
|
{
|
|
int64_t r_old, r_new;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
RACCT_LOCK_ASSERT();
|
|
|
|
#ifdef RCTL
|
|
rctl_throttle_decay(racct, RACCT_READBPS);
|
|
rctl_throttle_decay(racct, RACCT_WRITEBPS);
|
|
rctl_throttle_decay(racct, RACCT_READIOPS);
|
|
rctl_throttle_decay(racct, RACCT_WRITEIOPS);
|
|
#endif
|
|
|
|
r_old = racct->r_resources[RACCT_PCTCPU];
|
|
|
|
/* If there is nothing to decay, just exit. */
|
|
if (r_old <= 0)
|
|
return;
|
|
|
|
r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
|
|
racct->r_resources[RACCT_PCTCPU] = r_new;
|
|
}
|
|
|
|
static void
|
|
racct_decay_pre(void)
|
|
{
|
|
|
|
RACCT_LOCK();
|
|
}
|
|
|
|
static void
|
|
racct_decay_post(void)
|
|
{
|
|
|
|
RACCT_UNLOCK();
|
|
}
|
|
|
|
static void
|
|
racct_decay(void)
|
|
{
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
ui_racct_foreach(racct_decay_callback, racct_decay_pre,
|
|
racct_decay_post, NULL, NULL);
|
|
loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
|
|
racct_decay_post, NULL, NULL);
|
|
prison_racct_foreach(racct_decay_callback, racct_decay_pre,
|
|
racct_decay_post, NULL, NULL);
|
|
}
|
|
|
|
static void
|
|
racctd(void)
|
|
{
|
|
struct thread *td;
|
|
struct proc *p;
|
|
struct timeval wallclock;
|
|
uint64_t pct, pct_estimate, runtime;
|
|
|
|
ASSERT_RACCT_ENABLED();
|
|
|
|
for (;;) {
|
|
racct_decay();
|
|
|
|
sx_slock(&allproc_lock);
|
|
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
|
PROC_LOCK(p);
|
|
if (p->p_state != PRS_NORMAL) {
|
|
if (p->p_state == PRS_ZOMBIE)
|
|
racct_set(p, RACCT_PCTCPU, 0);
|
|
PROC_UNLOCK(p);
|
|
continue;
|
|
}
|
|
|
|
microuptime(&wallclock);
|
|
timevalsub(&wallclock, &p->p_stats->p_start);
|
|
PROC_STATLOCK(p);
|
|
FOREACH_THREAD_IN_PROC(p, td)
|
|
ruxagg(p, td);
|
|
runtime = cputick2usec(p->p_rux.rux_runtime);
|
|
PROC_STATUNLOCK(p);
|
|
#ifdef notyet
|
|
KASSERT(runtime >= p->p_prev_runtime,
|
|
("runtime < p_prev_runtime"));
|
|
#else
|
|
if (runtime < p->p_prev_runtime)
|
|
runtime = p->p_prev_runtime;
|
|
#endif
|
|
p->p_prev_runtime = runtime;
|
|
if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
|
|
pct_estimate = (1000000 * runtime * 100) /
|
|
((uint64_t)wallclock.tv_sec * 1000000 +
|
|
wallclock.tv_usec);
|
|
} else
|
|
pct_estimate = 0;
|
|
pct = racct_getpcpu(p, pct_estimate);
|
|
RACCT_LOCK();
|
|
#ifdef RCTL
|
|
rctl_throttle_decay(p->p_racct, RACCT_READBPS);
|
|
rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
|
|
rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
|
|
rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
|
|
#endif
|
|
racct_set_locked(p, RACCT_PCTCPU, pct, 1);
|
|
racct_set_locked(p, RACCT_CPU, runtime, 0);
|
|
racct_set_locked(p, RACCT_WALLCLOCK,
|
|
(uint64_t)wallclock.tv_sec * 1000000 +
|
|
wallclock.tv_usec, 0);
|
|
RACCT_UNLOCK();
|
|
PROC_UNLOCK(p);
|
|
}
|
|
|
|
/*
|
|
* To ensure that processes are throttled in a fair way, we need
|
|
* to iterate over all processes again and check the limits
|
|
* for %cpu resource only after ucred racct containers have been
|
|
* properly filled.
|
|
*/
|
|
FOREACH_PROC_IN_SYSTEM(p) {
|
|
PROC_LOCK(p);
|
|
if (p->p_state != PRS_NORMAL) {
|
|
PROC_UNLOCK(p);
|
|
continue;
|
|
}
|
|
|
|
if (racct_pcpu_available(p) <= 0) {
|
|
if (p->p_racct->r_resources[RACCT_PCTCPU] >
|
|
pcpu_threshold)
|
|
racct_proc_throttle(p, -1);
|
|
} else if (p->p_throttled == -1) {
|
|
racct_proc_wakeup(p);
|
|
}
|
|
PROC_UNLOCK(p);
|
|
}
|
|
sx_sunlock(&allproc_lock);
|
|
pause("-", hz);
|
|
}
|
|
}
|
|
|
|
static struct kproc_desc racctd_kp = {
|
|
"racctd",
|
|
racctd,
|
|
NULL
|
|
};
|
|
|
|
static void
|
|
racctd_init(void)
|
|
{
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
kproc_start(&racctd_kp);
|
|
}
|
|
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
|
|
|
|
static void
|
|
racct_init(void)
|
|
{
|
|
if (!racct_enable)
|
|
return;
|
|
|
|
racct_zone = uma_zcreate("racct", sizeof(struct racct),
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
|
|
ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct);
|
|
|
|
/*
|
|
* XXX: Move this somewhere.
|
|
*/
|
|
prison0.pr_prison_racct = prison_racct_find("0");
|
|
}
|
|
SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
|
|
|
|
#endif /* !RACCT */
|