freebsd-skq/sys/kern/kern_racct.c
kib 11cee2ecf7 The process spin lock currently has the following distinct uses:
- Threads lifetime cycle, in particular, counting of the threads in
  the process, and interlocking with process mutex and thread lock.
  The main reason of this is that turnstile locks are after thread
  locks, so you e.g. cannot unlock blockable mutex (think process
  mutex) while owning thread lock.

- Virtual and profiling itimers, since the timers activation is done
  from the clock interrupt context.  Replace the p_slock by p_itimmtx
  and PROC_ITIMLOCK().

- Profiling code (profil(2)), for similar reason.  Replace the p_slock
  by p_profmtx and PROC_PROFLOCK().

- Resource usage accounting.  Need for the spinlock there is subtle,
  my understanding is that spinlock blocks context switching for the
  current thread, which prevents td_runtime and similar fields from
  changing (updates are done at the mi_switch()).  Replace the p_slock
  by p_statmtx and PROC_STATLOCK().

The split is done mostly for code clarity, and should not affect
scalability.

Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
2014-11-26 14:10:00 +00:00

1207 lines
32 KiB
C

/*-
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/umtx.h>
#include <machine/smp.h>
#ifdef RCTL
#include <sys/rctl.h>
#endif
#ifdef RACCT
FEATURE(racct, "Resource Accounting");
/*
* Do not block processes that have their %cpu usage <= pcpu_threshold.
*/
static int pcpu_threshold = 1;
SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
0, "Processes with higher %cpu usage than this value can be throttled.");
/*
* How many seconds it takes to use the scheduler %cpu calculations. When a
* process starts, we compute its %cpu usage by dividing its runtime by the
* process wall clock time. After RACCT_PCPU_SECS pass, we use the value
* provided by the scheduler.
*/
#define RACCT_PCPU_SECS 3
static struct mtx racct_lock;
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
static uma_zone_t racct_zone;
static void racct_sub_racct(struct racct *dest, const struct racct *src);
static void racct_sub_cred_locked(struct ucred *cred, int resource,
uint64_t amount);
static void racct_add_cred_locked(struct ucred *cred, int resource,
uint64_t amount);
SDT_PROVIDER_DEFINE(racct);
SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int",
"uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure,
"struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *",
"int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *",
"int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int",
"uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure,
"struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int",
"uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *",
"int", "uint64_t");
SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *");
SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *",
"struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure,
"struct racct *", "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *",
"struct racct *");
int racct_types[] = {
[RACCT_CPU] =
RACCT_IN_MILLIONS,
[RACCT_DATA] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_STACK] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_CORE] =
RACCT_DENIABLE,
[RACCT_RSS] =
RACCT_RECLAIMABLE,
[RACCT_MEMLOCK] =
RACCT_RECLAIMABLE | RACCT_DENIABLE,
[RACCT_NPROC] =
RACCT_RECLAIMABLE | RACCT_DENIABLE,
[RACCT_NOFILE] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_VMEM] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_NPTS] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_SWAP] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_NTHR] =
RACCT_RECLAIMABLE | RACCT_DENIABLE,
[RACCT_MSGQQUEUED] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_MSGQSIZE] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_NMSGQ] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_NSEM] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_NSEMOP] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_NSHM] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_SHMSIZE] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_WALLCLOCK] =
RACCT_IN_MILLIONS,
[RACCT_PCTCPU] =
RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
#ifdef SCHED_4BSD
/*
* Contains intermediate values for %cpu calculations to avoid using floating
* point in the kernel.
* ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
* It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to
* zero so the calculations are more straightforward.
*/
fixpt_t ccpu_exp[] = {
[0] = FSCALE * 1,
[1] = FSCALE * 0.95122942450071400909,
[2] = FSCALE * 0.90483741803595957316,
[3] = FSCALE * 0.86070797642505780722,
[4] = FSCALE * 0.81873075307798185866,
[5] = FSCALE * 0.77880078307140486824,
[6] = FSCALE * 0.74081822068171786606,
[7] = FSCALE * 0.70468808971871343435,
[8] = FSCALE * 0.67032004603563930074,
[9] = FSCALE * 0.63762815162177329314,
[10] = FSCALE * 0.60653065971263342360,
[11] = FSCALE * 0.57694981038048669531,
[12] = FSCALE * 0.54881163609402643262,
[13] = FSCALE * 0.52204577676101604789,
[14] = FSCALE * 0.49658530379140951470,
[15] = FSCALE * 0.47236655274101470713,
[16] = FSCALE * 0.44932896411722159143,
[17] = FSCALE * 0.42741493194872666992,
[18] = FSCALE * 0.40656965974059911188,
[19] = FSCALE * 0.38674102345450120691,
[20] = FSCALE * 0.36787944117144232159,
[21] = FSCALE * 0.34993774911115535467,
[22] = FSCALE * 0.33287108369807955328,
[23] = FSCALE * 0.31663676937905321821,
[24] = FSCALE * 0.30119421191220209664,
[25] = FSCALE * 0.28650479686019010032,
[26] = FSCALE * 0.27253179303401260312,
[27] = FSCALE * 0.25924026064589150757,
[28] = FSCALE * 0.24659696394160647693,
[29] = FSCALE * 0.23457028809379765313,
[30] = FSCALE * 0.22313016014842982893,
[31] = FSCALE * 0.21224797382674305771,
[32] = FSCALE * 0.20189651799465540848,
[33] = FSCALE * 0.19204990862075411423,
[34] = FSCALE * 0.18268352405273465022,
[35] = FSCALE * 0.17377394345044512668,
[36] = FSCALE * 0.16529888822158653829,
[37] = FSCALE * 0.15723716631362761621,
[38] = FSCALE * 0.14956861922263505264,
[39] = FSCALE * 0.14227407158651357185,
[40] = FSCALE * 0.13533528323661269189,
[41] = FSCALE * 0.12873490358780421886,
[42] = FSCALE * 0.12245642825298191021,
[43] = FSCALE * 0.11648415777349695786,
[44] = FSCALE * 0.11080315836233388333,
[45] = FSCALE * 0.10539922456186433678,
[46] = FSCALE * 0.10025884372280373372,
[47] = FSCALE * 0.09536916221554961888,
[48] = FSCALE * 0.09071795328941250337,
[49] = FSCALE * 0.08629358649937051097,
[50] = FSCALE * 0.08208499862389879516,
[51] = FSCALE * 0.07808166600115315231,
[52] = FSCALE * 0.07427357821433388042,
[53] = FSCALE * 0.07065121306042958674,
[54] = FSCALE * 0.06720551273974976512,
[55] = FSCALE * 0.06392786120670757270,
[56] = FSCALE * 0.06081006262521796499,
[57] = FSCALE * 0.05784432087483846296,
[58] = FSCALE * 0.05502322005640722902,
[59] = FSCALE * 0.05233970594843239308,
[60] = FSCALE * 0.04978706836786394297,
[61] = FSCALE * 0.04735892439114092119,
[62] = FSCALE * 0.04504920239355780606,
[63] = FSCALE * 0.04285212686704017991,
[64] = FSCALE * 0.04076220397836621516,
[65] = FSCALE * 0.03877420783172200988,
[66] = FSCALE * 0.03688316740124000544,
[67] = FSCALE * 0.03508435410084502588,
[68] = FSCALE * 0.03337326996032607948,
[69] = FSCALE * 0.03174563637806794323,
[70] = FSCALE * 0.03019738342231850073,
[71] = FSCALE * 0.02872463965423942912,
[72] = FSCALE * 0.02732372244729256080,
[73] = FSCALE * 0.02599112877875534358,
[74] = FSCALE * 0.02472352647033939120,
[75] = FSCALE * 0.02351774585600910823,
[76] = FSCALE * 0.02237077185616559577,
[77] = FSCALE * 0.02127973643837716938,
[78] = FSCALE * 0.02024191144580438847,
[79] = FSCALE * 0.01925470177538692429,
[80] = FSCALE * 0.01831563888873418029,
[81] = FSCALE * 0.01742237463949351138,
[82] = FSCALE * 0.01657267540176124754,
[83] = FSCALE * 0.01576441648485449082,
[84] = FSCALE * 0.01499557682047770621,
[85] = FSCALE * 0.01426423390899925527,
[86] = FSCALE * 0.01356855901220093175,
[87] = FSCALE * 0.01290681258047986886,
[88] = FSCALE * 0.01227733990306844117,
[89] = FSCALE * 0.01167856697039544521,
[90] = FSCALE * 0.01110899653824230649,
[91] = FSCALE * 0.01056720438385265337,
[92] = FSCALE * 0.01005183574463358164,
[93] = FSCALE * 0.00956160193054350793,
[94] = FSCALE * 0.00909527710169581709,
[95] = FSCALE * 0.00865169520312063417,
[96] = FSCALE * 0.00822974704902002884,
[97] = FSCALE * 0.00782837754922577143,
[98] = FSCALE * 0.00744658307092434051,
[99] = FSCALE * 0.00708340892905212004,
[100] = FSCALE * 0.00673794699908546709,
[101] = FSCALE * 0.00640933344625638184,
[102] = FSCALE * 0.00609674656551563610,
[103] = FSCALE * 0.00579940472684214321,
[104] = FSCALE * 0.00551656442076077241,
[105] = FSCALE * 0.00524751839918138427,
[106] = FSCALE * 0.00499159390691021621,
[107] = FSCALE * 0.00474815099941147558,
[108] = FSCALE * 0.00451658094261266798,
[109] = FSCALE * 0.00429630469075234057,
[110] = FSCALE * 0.00408677143846406699,
};
#endif
#define CCPU_EXP_MAX 110
/*
* This function is analogical to the getpcpu() function in the ps(1) command.
* They should both calculate in the same way so that the racct %cpu
* calculations are consistent with the values showed by the ps(1) tool.
* The calculations are more complex in the 4BSD scheduler because of the value
* of the ccpu variable. In ULE it is defined to be zero which saves us some
* work.
*/
static uint64_t
racct_getpcpu(struct proc *p, u_int pcpu)
{
u_int swtime;
#ifdef SCHED_4BSD
fixpt_t pctcpu, pctcpu_next;
#endif
#ifdef SMP
struct pcpu *pc;
int found;
#endif
fixpt_t p_pctcpu;
struct thread *td;
/*
* If the process is swapped out, we count its %cpu usage as zero.
* This behaviour is consistent with the userland ps(1) tool.
*/
if ((p->p_flag & P_INMEM) == 0)
return (0);
swtime = (ticks - p->p_swtick) / hz;
/*
* For short-lived processes, the sched_pctcpu() returns small
* values even for cpu intensive processes. Therefore we use
* our own estimate in this case.
*/
if (swtime < RACCT_PCPU_SECS)
return (pcpu);
p_pctcpu = 0;
FOREACH_THREAD_IN_PROC(p, td) {
if (td == PCPU_GET(idlethread))
continue;
#ifdef SMP
found = 0;
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
if (td == pc->pc_idlethread) {
found = 1;
break;
}
}
if (found)
continue;
#endif
thread_lock(td);
#ifdef SCHED_4BSD
pctcpu = sched_pctcpu(td);
/* Count also the yet unfinished second. */
pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
pctcpu_next += sched_pctcpu_delta(td);
p_pctcpu += max(pctcpu, pctcpu_next);
#else
/*
* In ULE the %cpu statistics are updated on every
* sched_pctcpu() call. So special calculations to
* account for the latest (unfinished) second are
* not needed.
*/
p_pctcpu += sched_pctcpu(td);
#endif
thread_unlock(td);
}
#ifdef SCHED_4BSD
if (swtime <= CCPU_EXP_MAX)
return ((100 * (uint64_t)p_pctcpu * 1000000) /
(FSCALE - ccpu_exp[swtime]));
#endif
return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
}
static void
racct_add_racct(struct racct *dest, const struct racct *src)
{
int i;
mtx_assert(&racct_lock, MA_OWNED);
/*
* Update resource usage in dest.
*/
for (i = 0; i <= RACCT_MAX; i++) {
KASSERT(dest->r_resources[i] >= 0,
("%s: resource %d propagation meltdown: dest < 0",
__func__, i));
KASSERT(src->r_resources[i] >= 0,
("%s: resource %d propagation meltdown: src < 0",
__func__, i));
dest->r_resources[i] += src->r_resources[i];
}
}
static void
racct_sub_racct(struct racct *dest, const struct racct *src)
{
int i;
mtx_assert(&racct_lock, MA_OWNED);
/*
* Update resource usage in dest.
*/
for (i = 0; i <= RACCT_MAX; i++) {
if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
KASSERT(dest->r_resources[i] >= 0,
("%s: resource %d propagation meltdown: dest < 0",
__func__, i));
KASSERT(src->r_resources[i] >= 0,
("%s: resource %d propagation meltdown: src < 0",
__func__, i));
KASSERT(src->r_resources[i] <= dest->r_resources[i],
("%s: resource %d propagation meltdown: src > dest",
__func__, i));
}
if (RACCT_CAN_DROP(i)) {
dest->r_resources[i] -= src->r_resources[i];
if (dest->r_resources[i] < 0) {
KASSERT(RACCT_IS_SLOPPY(i) ||
RACCT_IS_DECAYING(i),
("%s: resource %d usage < 0", __func__, i));
dest->r_resources[i] = 0;
}
}
}
}
void
racct_create(struct racct **racctp)
{
SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
KASSERT(*racctp == NULL, ("racct already allocated"));
*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
}
static void
racct_destroy_locked(struct racct **racctp)
{
int i;
struct racct *racct;
SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
mtx_assert(&racct_lock, MA_OWNED);
KASSERT(racctp != NULL, ("NULL racctp"));
KASSERT(*racctp != NULL, ("NULL racct"));
racct = *racctp;
for (i = 0; i <= RACCT_MAX; i++) {
if (RACCT_IS_SLOPPY(i))
continue;
if (!RACCT_IS_RECLAIMABLE(i))
continue;
KASSERT(racct->r_resources[i] == 0,
("destroying non-empty racct: "
"%ju allocated for resource %d\n",
racct->r_resources[i], i));
}
uma_zfree(racct_zone, racct);
*racctp = NULL;
}
void
racct_destroy(struct racct **racct)
{
mtx_lock(&racct_lock);
racct_destroy_locked(racct);
mtx_unlock(&racct_lock);
}
/*
* Increase consumption of 'resource' by 'amount' for 'racct'
* and all its parents. Differently from other cases, 'amount' here
* may be less than zero.
*/
static void
racct_alloc_resource(struct racct *racct, int resource,
uint64_t amount)
{
mtx_assert(&racct_lock, MA_OWNED);
KASSERT(racct != NULL, ("NULL racct"));
racct->r_resources[resource] += amount;
if (racct->r_resources[resource] < 0) {
KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
("%s: resource %d usage < 0", __func__, resource));
racct->r_resources[resource] = 0;
}
/*
* There are some cases where the racct %cpu resource would grow
* beyond 100%.
* For example in racct_proc_exit() we add the process %cpu usage
* to the ucred racct containers. If too many processes terminated
* in a short time span, the ucred %cpu resource could grow too much.
* Also, the 4BSD scheduler sometimes returns for a thread more than
* 100% cpu usage. So we set a boundary here to 100%.
*/
if ((resource == RACCT_PCTCPU) &&
(racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
}
static int
racct_add_locked(struct proc *p, int resource, uint64_t amount)
{
#ifdef RCTL
int error;
#endif
SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
/*
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
#ifdef RCTL
error = rctl_enforce(p, resource, amount);
if (error && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE(racct, kernel, rusage, add__failure, p, resource,
amount, 0, 0);
return (error);
}
#endif
racct_alloc_resource(p->p_racct, resource, amount);
racct_add_cred_locked(p->p_ucred, resource, amount);
return (0);
}
/*
* Increase allocation of 'resource' by 'amount' for process 'p'.
* Return 0 if it's below limits, or errno, if it's not.
*/
int
racct_add(struct proc *p, int resource, uint64_t amount)
{
int error;
mtx_lock(&racct_lock);
error = racct_add_locked(p, resource, amount);
mtx_unlock(&racct_lock);
return (error);
}
static void
racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
struct prison *pr;
SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount,
0, 0);
racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
amount);
racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
}
/*
* Increase allocation of 'resource' by 'amount' for credential 'cred'.
* Doesn't check for limits and never fails.
*
* XXX: Shouldn't this ever return an error?
*/
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{
mtx_lock(&racct_lock);
racct_add_cred_locked(cred, resource, amount);
mtx_unlock(&racct_lock);
}
/*
* Increase allocation of 'resource' by 'amount' for process 'p'.
* Doesn't check for limits and never fails.
*/
void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{
SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0);
/*
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_lock(&racct_lock);
racct_alloc_resource(p->p_racct, resource, amount);
mtx_unlock(&racct_lock);
racct_add_cred(p->p_ucred, resource, amount);
}
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount)
{
int64_t old_amount, decayed_amount;
int64_t diff_proc, diff_cred;
#ifdef RCTL
int error;
#endif
SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
/*
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
old_amount = p->p_racct->r_resources[resource];
/*
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
if (RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
* between the new amount and the proportional value of the
* old amount that has decayed in the ucred racct containers.
*/
decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
diff_cred = amount - decayed_amount;
} else
diff_cred = diff_proc;
#ifdef notyet
KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
("%s: usage of non-droppable resource %d dropping", __func__,
resource));
#endif
#ifdef RCTL
if (diff_proc > 0) {
error = rctl_enforce(p, resource, diff_proc);
if (error && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE(racct, kernel, rusage, set__failure, p,
resource, amount, 0, 0);
return (error);
}
}
#endif
racct_alloc_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
else if (diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
return (0);
}
/*
* Set allocation of 'resource' to 'amount' for process 'p'.
* Return 0 if it's below limits, or errno, if it's not.
*
* Note that decreasing the allocation always returns 0,
* even if it's above the limit.
*/
int
racct_set(struct proc *p, int resource, uint64_t amount)
{
int error;
mtx_lock(&racct_lock);
error = racct_set_locked(p, resource, amount);
mtx_unlock(&racct_lock);
return (error);
}
static void
racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
{
int64_t old_amount, decayed_amount;
int64_t diff_proc, diff_cred;
SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
/*
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
old_amount = p->p_racct->r_resources[resource];
/*
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
if (RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
* between the new amount and the proportional value of the
* old amount that has decayed in the ucred racct containers.
*/
decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
diff_cred = amount - decayed_amount;
} else
diff_cred = diff_proc;
racct_alloc_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
else if (diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
}
void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{
mtx_lock(&racct_lock);
racct_set_force_locked(p, resource, amount);
mtx_unlock(&racct_lock);
}
/*
* Returns amount of 'resource' the process 'p' can keep allocated.
* Allocating more than that would be denied, unless the resource
* is marked undeniable. Amount of already allocated resource does
* not matter.
*/
uint64_t
racct_get_limit(struct proc *p, int resource)
{
#ifdef RCTL
return (rctl_get_limit(p, resource));
#else
return (UINT64_MAX);
#endif
}
/*
* Returns amount of 'resource' the process 'p' can keep allocated.
* Allocating more than that would be denied, unless the resource
* is marked undeniable. Amount of already allocated resource does
* matter.
*/
uint64_t
racct_get_available(struct proc *p, int resource)
{
#ifdef RCTL
return (rctl_get_available(p, resource));
#else
return (UINT64_MAX);
#endif
}
/*
* Returns amount of the %cpu resource that process 'p' can add to its %cpu
* utilization. Adding more than that would lead to the process being
* throttled.
*/
static int64_t
racct_pcpu_available(struct proc *p)
{
#ifdef RCTL
return (rctl_pcpu_available(p));
#else
return (INT64_MAX);
#endif
}
/*
* Decrease allocation of 'resource' by 'amount' for process 'p'.
*/
void
racct_sub(struct proc *p, int resource, uint64_t amount)
{
SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
/*
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(RACCT_CAN_DROP(resource),
("%s: called for non-droppable resource %d", __func__, resource));
mtx_lock(&racct_lock);
KASSERT(amount <= p->p_racct->r_resources[resource],
("%s: freeing %ju of resource %d, which is more "
"than allocated %jd for %s (pid %d)", __func__, amount, resource,
(intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
racct_alloc_resource(p->p_racct, resource, -amount);
racct_sub_cred_locked(p->p_ucred, resource, amount);
mtx_unlock(&racct_lock);
}
static void
racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
struct prison *pr;
SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount,
0, 0);
#ifdef notyet
KASSERT(RACCT_CAN_DROP(resource),
("%s: called for resource %d which can not drop", __func__,
resource));
#endif
racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
-amount);
racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
}
/*
* Decrease allocation of 'resource' by 'amount' for credential 'cred'.
*/
void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{
mtx_lock(&racct_lock);
racct_sub_cred_locked(cred, resource, amount);
mtx_unlock(&racct_lock);
}
/*
* Inherit resource usage information from the parent process.
*/
int
racct_proc_fork(struct proc *parent, struct proc *child)
{
int i, error = 0;
/*
* Create racct for the child process.
*/
racct_create(&child->p_racct);
PROC_LOCK(parent);
PROC_LOCK(child);
mtx_lock(&racct_lock);
#ifdef RCTL
error = rctl_proc_fork(parent, child);
if (error != 0)
goto out;
#endif
/* Init process cpu time. */
child->p_prev_runtime = 0;
child->p_throttled = 0;
/*
* Inherit resource usage.
*/
for (i = 0; i <= RACCT_MAX; i++) {
if (parent->p_racct->r_resources[i] == 0 ||
!RACCT_IS_INHERITABLE(i))
continue;
error = racct_set_locked(child, i,
parent->p_racct->r_resources[i]);
if (error != 0)
goto out;
}
error = racct_add_locked(child, RACCT_NPROC, 1);
error += racct_add_locked(child, RACCT_NTHR, 1);
out:
mtx_unlock(&racct_lock);
PROC_UNLOCK(child);
PROC_UNLOCK(parent);
if (error != 0)
racct_proc_exit(child);
return (error);
}
/*
* Called at the end of fork1(), to handle rules that require the process
* to be fully initialized.
*/
void
racct_proc_fork_done(struct proc *child)
{
#ifdef RCTL
PROC_LOCK(child);
mtx_lock(&racct_lock);
rctl_enforce(child, RACCT_NPROC, 0);
rctl_enforce(child, RACCT_NTHR, 0);
mtx_unlock(&racct_lock);
PROC_UNLOCK(child);
#endif
}
void
racct_proc_exit(struct proc *p)
{
int i;
uint64_t runtime;
struct timeval wallclock;
uint64_t pct_estimate, pct;
PROC_LOCK(p);
/*
* We don't need to calculate rux, proc_reap() has already done this.
*/
runtime = cputick2usec(p->p_rux.rux_runtime);
#ifdef notyet
KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
#else
if (runtime < p->p_prev_runtime)
runtime = p->p_prev_runtime;
#endif
microuptime(&wallclock);
timevalsub(&wallclock, &p->p_stats->p_start);
if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
pct_estimate = (1000000 * runtime * 100) /
((uint64_t)wallclock.tv_sec * 1000000 +
wallclock.tv_usec);
} else
pct_estimate = 0;
pct = racct_getpcpu(p, pct_estimate);
mtx_lock(&racct_lock);
racct_set_locked(p, RACCT_CPU, runtime);
racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
for (i = 0; i <= RACCT_MAX; i++) {
if (p->p_racct->r_resources[i] == 0)
continue;
if (!RACCT_IS_RECLAIMABLE(i))
continue;
racct_set_locked(p, i, 0);
}
mtx_unlock(&racct_lock);
PROC_UNLOCK(p);
#ifdef RCTL
rctl_racct_release(p->p_racct);
#endif
racct_destroy(&p->p_racct);
}
/*
* Called after credentials change, to move resource utilisation
* between raccts.
*/
void
racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
struct ucred *newcred)
{
struct uidinfo *olduip, *newuip;
struct loginclass *oldlc, *newlc;
struct prison *oldpr, *newpr, *pr;
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
newuip = newcred->cr_ruidinfo;
olduip = oldcred->cr_ruidinfo;
newlc = newcred->cr_loginclass;
oldlc = oldcred->cr_loginclass;
newpr = newcred->cr_prison;
oldpr = oldcred->cr_prison;
mtx_lock(&racct_lock);
if (newuip != olduip) {
racct_sub_racct(olduip->ui_racct, p->p_racct);
racct_add_racct(newuip->ui_racct, p->p_racct);
}
if (newlc != oldlc) {
racct_sub_racct(oldlc->lc_racct, p->p_racct);
racct_add_racct(newlc->lc_racct, p->p_racct);
}
if (newpr != oldpr) {
for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
racct_sub_racct(pr->pr_prison_racct->prr_racct,
p->p_racct);
for (pr = newpr; pr != NULL; pr = pr->pr_parent)
racct_add_racct(pr->pr_prison_racct->prr_racct,
p->p_racct);
}
mtx_unlock(&racct_lock);
#ifdef RCTL
rctl_proc_ucred_changed(p, newcred);
#endif
}
void
racct_move(struct racct *dest, struct racct *src)
{
mtx_lock(&racct_lock);
racct_add_racct(dest, src);
racct_sub_racct(src, src);
mtx_unlock(&racct_lock);
}
static void
racct_proc_throttle(struct proc *p)
{
struct thread *td;
#ifdef SMP
int cpuid;
#endif
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Do not block kernel processes. Also do not block processes with
* low %cpu utilization to improve interactivity.
*/
if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
(p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
return;
p->p_throttled = 1;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
switch (td->td_state) {
case TDS_RUNQ:
/*
* If the thread is on the scheduler run-queue, we can
* not just remove it from there. So we set the flag
* TDF_NEEDRESCHED for the thread, so that once it is
* running, it is taken off the cpu as soon as possible.
*/
td->td_flags |= TDF_NEEDRESCHED;
break;
case TDS_RUNNING:
/*
* If the thread is running, we request a context
* switch for it by setting the TDF_NEEDRESCHED flag.
*/
td->td_flags |= TDF_NEEDRESCHED;
#ifdef SMP
cpuid = td->td_oncpu;
if ((cpuid != NOCPU) && (td != curthread))
ipi_cpu(cpuid, IPI_AST);
#endif
break;
default:
break;
}
thread_unlock(td);
}
}
static void
racct_proc_wakeup(struct proc *p)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
if (p->p_throttled) {
p->p_throttled = 0;
wakeup(p->p_racct);
}
}
static void
racct_decay_resource(struct racct *racct, void * res, void* dummy)
{
int resource;
int64_t r_old, r_new;
resource = *(int *)res;
r_old = racct->r_resources[resource];
/* If there is nothing to decay, just exit. */
if (r_old <= 0)
return;
mtx_lock(&racct_lock);
r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
racct->r_resources[resource] = r_new;
mtx_unlock(&racct_lock);
}
static void
racct_decay(int resource)
{
ui_racct_foreach(racct_decay_resource, &resource, NULL);
loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
prison_racct_foreach(racct_decay_resource, &resource, NULL);
}
static void
racctd(void)
{
struct thread *td;
struct proc *p;
struct timeval wallclock;
uint64_t runtime;
uint64_t pct, pct_estimate;
for (;;) {
racct_decay(RACCT_PCTCPU);
sx_slock(&allproc_lock);
LIST_FOREACH(p, &zombproc, p_list) {
PROC_LOCK(p);
racct_set(p, RACCT_PCTCPU, 0);
PROC_UNLOCK(p);
}
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
continue;
}
microuptime(&wallclock);
timevalsub(&wallclock, &p->p_stats->p_start);
PROC_STATLOCK(p);
FOREACH_THREAD_IN_PROC(p, td)
ruxagg(p, td);
runtime = cputick2usec(p->p_rux.rux_runtime);
PROC_STATUNLOCK(p);
#ifdef notyet
KASSERT(runtime >= p->p_prev_runtime,
("runtime < p_prev_runtime"));
#else
if (runtime < p->p_prev_runtime)
runtime = p->p_prev_runtime;
#endif
p->p_prev_runtime = runtime;
if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
pct_estimate = (1000000 * runtime * 100) /
((uint64_t)wallclock.tv_sec * 1000000 +
wallclock.tv_usec);
} else
pct_estimate = 0;
pct = racct_getpcpu(p, pct_estimate);
mtx_lock(&racct_lock);
racct_set_force_locked(p, RACCT_PCTCPU, pct);
racct_set_locked(p, RACCT_CPU, runtime);
racct_set_locked(p, RACCT_WALLCLOCK,
(uint64_t)wallclock.tv_sec * 1000000 +
wallclock.tv_usec);
mtx_unlock(&racct_lock);
PROC_UNLOCK(p);
}
/*
* To ensure that processes are throttled in a fair way, we need
* to iterate over all processes again and check the limits
* for %cpu resource only after ucred racct containers have been
* properly filled.
*/
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
continue;
}
if (racct_pcpu_available(p) <= 0)
racct_proc_throttle(p);
else if (p->p_throttled)
racct_proc_wakeup(p);
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
pause("-", hz);
}
}
static struct kproc_desc racctd_kp = {
"racctd",
racctd,
NULL
};
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
static void
racct_init(void)
{
racct_zone = uma_zcreate("racct", sizeof(struct racct),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
/*
* XXX: Move this somewhere.
*/
prison0.pr_prison_racct = prison_racct_find("0");
}
SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
#endif /* !RACCT */