/*- * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. */ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks struct kg_sched { int skg_slptime; }; #define kg_slptime kg_sched->skg_slptime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct ke_sched ke_sched; struct kg_sched kg_sched; struct td_sched td_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * This priority range has 20 priorities on either end that are reachable * only through nice values. */ #define SCHED_PRI_NRESV 40 #define SCHED_PRI_RANGE ((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) - \ SCHED_PRI_NRESV) /* * These determine how sleep time effects the priority of a process. * * SLP_MAX: Maximum amount of accrued sleep time. * SLP_SCALE: Scale the number of ticks slept across the dynamic priority * range. * SLP_TOPRI: Convert a number of ticks slept into a priority value. * SLP_DECAY: Reduce the sleep time to 50% for every granted slice. */ #define SCHED_SLP_MAX (hz * 2) #define SCHED_SLP_SCALE(slp) (((slp) * SCHED_PRI_RANGE) / SCHED_SLP_MAX) #define SCHED_SLP_TOPRI(slp) (SCHED_PRI_RANGE - SCHED_SLP_SCALE((slp)) + \ SCHED_PRI_NRESV / 2) #define SCHED_SLP_DECAY(slp) ((slp) / 2) /* XXX Multiple kses break */ /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per unit of pri or slp. * PRI_TOSLICE: Compute a slice size that is proportional to the priority. * SLP_TOSLICE: Compute a slice size that is inversely proportional to the * amount of time slept. (smaller slices for interactive ksegs) * PRI_COMP: This determines what fraction of the actual slice comes from * the slice size computed from the priority. * SLP_COMP: This determines what component of the actual slice comes from * the slize size computed from the sleep time. */ #define SCHED_SLICE_MIN (hz / 100) #define SCHED_SLICE_MAX (hz / 10) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_PRI_TOSLICE(pri) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((pri), SCHED_PRI_RANGE)) #define SCHED_SLP_TOSLICE(slp) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((slp), SCHED_SLP_MAX)) #define SCHED_SLP_COMP(slice) (((slice) / 5) * 3) /* 60% */ #define SCHED_PRI_COMP(slice) (((slice) / 5) * 2) /* 40% */ /* * This macro determines whether or not the kse belongs on the current or * next run queue. */ #define SCHED_CURR(kg) ((kg)->kg_slptime > (hz / 4) || \ (kg)->kg_pri_class != PRI_TIMESHARE) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 60 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - pair of runqs per processor */ struct kseq { struct runq ksq_runqs[2]; struct runq *ksq_curr; struct runq *ksq_next; int ksq_load; /* Total runnable */ }; /* * One kse queue per processor. */ struct kseq kseq_cpu[MAXCPU]; static int sched_slice(struct ksegrp *kg); static int sched_priority(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); void sched_check_runqs(void); int sched_pickcpu(void); static void sched_setup(void *dummy) { int i; mtx_lock_spin(&sched_lock); /* init kseqs */ for (i = 0; i < MAXCPU; i++) { kseq_cpu[i].ksq_load = 0; kseq_cpu[i].ksq_curr = &kseq_cpu[i].ksq_runqs[0]; kseq_cpu[i].ksq_next = &kseq_cpu[i].ksq_runqs[1]; runq_init(kseq_cpu[i].ksq_curr); runq_init(kseq_cpu[i].ksq_next); } mtx_unlock_spin(&sched_lock); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static int sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return (kg->kg_user_pri); pri = SCHED_SLP_TOPRI(kg->kg_slptime); CTR2(KTR_RUNQ, "sched_priority: slptime: %d\tpri: %d", kg->kg_slptime, pri); pri += PRI_MIN_TIMESHARE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return (kg->kg_user_pri); } /* * Calculate a time slice based on the process priority. */ static int sched_slice(struct ksegrp *kg) { int pslice; int sslice; int slice; int pri; pri = kg->kg_user_pri; pri -= PRI_MIN_TIMESHARE; pslice = SCHED_PRI_TOSLICE(pri); sslice = SCHED_SLP_TOSLICE(kg->kg_slptime); slice = SCHED_SLP_COMP(sslice) + SCHED_PRI_COMP(pslice); kg->kg_slptime = SCHED_SLP_DECAY(kg->kg_slptime); CTR4(KTR_RUNQ, "sched_slice: pri: %d\tsslice: %d\tpslice: %d\tslice: %d", pri, sslice, pslice, slice); if (slice < SCHED_SLICE_MIN) slice = SCHED_SLICE_MIN; else if (slice > SCHED_SLICE_MAX) slice = SCHED_SLICE_MAX; return (slice); } int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. */ ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP int sched_pickcpu(void) { int cpu; int load; int i; if (!smp_started) return (0); cpu = PCPU_GET(cpuid); load = kseq_cpu[cpu].ksq_load; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; if (kseq_cpu[i].ksq_load < load) { cpu = i; load = kseq_cpu[i].ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { struct kse *ke; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_priority = prio; if (TD_ON_RUNQ(td)) { rq = ke->ke_runq; runq_remove(rq, ke); runq_add(rq, ke); } } void sched_switchout(struct thread *td) { struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = ke->ke_oncpu; ke->ke_flags &= ~KEF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { setrunqueue(td); return; } else td->td_kse->ke_runq = NULL; /* * We will not be on the run queue. So we must be * sleeping or similar. */ if (td->td_proc->p_flag & P_KSES) kse_reassign(ke); } void sched_switchin(struct thread *td) { /* struct kse *ke = td->td_kse; */ mtx_assert(&sched_lock, MA_OWNED); td->td_kse->ke_oncpu = PCPU_GET(cpuid); /* XXX */ if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && td->td_priority != td->td_ksegrp->kg_user_pri) curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; } void sched_nice(struct ksegrp *kg, int nice) { struct thread *td; kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { td->td_kse->ke_flags |= KEF_NEEDRESCHED; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; /* * If this is an interactive task clear its queue so it moves back * on to curr when it wakes up. Otherwise let it stay on the queue * that it was assigned to. */ if (SCHED_CURR(td->td_kse->ke_ksegrp)) td->td_kse->ke_runq = NULL; } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ kg = td->td_ksegrp; if (td->td_slptime) { kg->kg_slptime += ticks - td->td_slptime; if (kg->kg_slptime > SCHED_SLP_MAX) kg->kg_slptime = SCHED_SLP_MAX; td->td_priority = sched_priority(kg); } td->td_slptime = 0; setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct ksegrp *kg, struct ksegrp *child) { struct kse *ckse; struct kse *pkse; mtx_assert(&sched_lock, MA_OWNED); ckse = FIRST_KSE_IN_KSEGRP(child); pkse = FIRST_KSE_IN_KSEGRP(kg); /* XXX Need something better here */ child->kg_slptime = kg->kg_slptime; child->kg_user_pri = kg->kg_user_pri; if (pkse->ke_oncpu != PCPU_GET(cpuid)) { printf("pkse->ke_oncpu = %d\n", pkse->ke_oncpu); printf("cpuid = %d", PCPU_GET(cpuid)); Debugger("stop"); } ckse->ke_slice = pkse->ke_slice; ckse->ke_oncpu = pkse->ke_oncpu; /* sched_pickcpu(); */ ckse->ke_runq = NULL; /* * Claim that we've been running for one second for statistical * purposes. */ ckse->ke_ticks = 0; ckse->ke_ltick = ticks; ckse->ke_ftick = ticks - hz; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct ksegrp *kg, struct ksegrp *child) { struct kseq *kseq; struct kse *ke; /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); kg->kg_slptime = child->kg_slptime; sched_priority(kg); /* * We drop the load here so that the running process leaves us with a * load of at least one. */ ke = FIRST_KSE_IN_KSEGRP(kg); kseq = &kseq_cpu[ke->ke_oncpu]; } int sched_clock_switches; void sched_clock(struct thread *td) { struct kse *ke; #if 0 struct kse *nke; #endif struct ksegrp *kg; struct kseq *kseq; int cpu; cpu = PCPU_GET(cpuid); kseq = &kseq_cpu[cpu]; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); ke = td->td_kse; kg = td->td_ksegrp; ke->ke_ticks += 10000; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_kse->ke_flags & KEF_IDLEKSE) { #if 0 if (nke && nke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { printf("Idle running with %s on the runq!\n", nke->ke_proc->p_comm); Debugger("stop"); } #endif return; } #if 0 nke = runq_choose(kseq->ksq_curr); if (nke && nke->ke_thread && nke->ke_thread->td_priority < td->td_priority) { sched_clock_switches++; ke->ke_flags |= KEF_NEEDRESCHED; } #endif /* * We used a tick, decrease our total sleep time. This decreases our * "interactivity". */ if (kg->kg_slptime) kg->kg_slptime--; /* * We used up one time slice. */ ke->ke_slice--; /* * We're out of time, recompute priorities and requeue */ if (ke->ke_slice == 0) { struct kseq *kseq; kseq = &kseq_cpu[ke->ke_oncpu]; td->td_priority = sched_priority(kg); ke->ke_slice = sched_slice(kg); ke->ke_flags |= KEF_NEEDRESCHED; ke->ke_runq = NULL; } } void sched_print_load(void); void sched_print_load(void) { int cpu; for (cpu = 0; cpu < mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; printf("%d: %d\n", cpu, kseq_cpu[cpu].ksq_load); } } int sched_runnable(void) { struct kseq *kseq; int cpu; cpu = PCPU_GET(cpuid); kseq = &kseq_cpu[cpu]; if (runq_check(kseq->ksq_curr)) return (1); if (runq_check(kseq->ksq_next)) return (1); #ifdef SMP if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; if (kseq_cpu[i].ksq_load && i != cpu) return (1); } } #endif return (0); } void sched_userret(struct thread *td) { struct ksegrp *kg; kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } void sched_check_runqs(void) { struct kseq *kseq; int cpu; for (cpu = 0; cpu < mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; kseq = &kseq_cpu[cpu]; if (kseq->ksq_load != (runq_depth(kseq->ksq_curr) + runq_depth(kseq->ksq_next))) { printf("CPU: %d\tload: %d\tcurr: %d\tnext: %d\n", cpu, kseq->ksq_load, runq_depth(kseq->ksq_curr), runq_depth(kseq->ksq_next)); Debugger("Imbalance"); } } } struct kse * sched_choose_kseq(struct kseq *kseq); struct kse * sched_choose_kseq(struct kseq *kseq) { struct kse *ke; struct runq *swap; if ((ke = runq_choose(kseq->ksq_curr)) == NULL) { swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; ke = runq_choose(kseq->ksq_curr); } return (ke); } struct kse * sched_choose(void) { struct kse *ke; int cpu; cpu = PCPU_GET(cpuid); ke = sched_choose_kseq(&kseq_cpu[cpu]); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; #ifdef SMP kseq_cpu[cpu].ksq_load--; #if 0 sched_check_runqs(); #endif #endif } #ifdef SMP if (ke == NULL && smp_started) { int load; int me; int i; me = cpu; /* * Find the cpu with the highest load and steal one proc. */ for (load = 0, i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || i == me) continue; if (kseq_cpu[i].ksq_load > load) { load = kseq_cpu[i].ksq_load; cpu = i; } } if (load) { ke = sched_choose_kseq(&kseq_cpu[cpu]); kseq_cpu[cpu].ksq_load--; ke->ke_state = KES_THREAD; runq_remove(ke->ke_runq, ke); ke->ke_runq = NULL; ke->ke_oncpu = me; } } #endif return (ke); } void sched_add(struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); if (ke->ke_runq == NULL) { struct kseq *kseq; kseq = &kseq_cpu[ke->ke_oncpu]; if (SCHED_CURR(ke->ke_ksegrp)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); #ifdef SMP kseq_cpu[ke->ke_oncpu].ksq_load++; #if 0 sched_check_runqs(); #endif #endif } void sched_rem(struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); /* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */ runq_remove(ke->ke_runq, ke); ke->ke_runq = NULL; ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; #ifdef SMP kseq_cpu[ke->ke_oncpu].ksq_load--; #if 0 sched_check_runqs(); #endif #endif } fixpt_t sched_pctcpu(struct kse *ke) { fixpt_t pctcpu; pctcpu = 0; if (ke->ke_ticks) { int rtick; /* Update to account for time potentially spent sleeping */ ke->ke_ltick = ticks; sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = ke->ke_ticks / (SCHED_CPU_TIME * 10000); pctcpu = (FSCALE * ((FSCALE * rtick)/stathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); }