Implement a unified run queue and adjust priority levels accordingly.

- All processes go into the same array of queues, with different
  scheduling classes using different portions of the array.  This
  allows user processes to have their priorities propagated up into
  the interrupt thread range if need be.
- I chose 64 run queues as an arbitrary number that is greater than
  32.  We used to have 4 separate arrays of 32 queues each, so this
  may not be optimal.  The new run queue code was written with this
  in mind; changing the number of run queues only requires changing
  constants in runq.h and adjusting the priority levels.
- The new run queue code takes the run queue as a parameter.  This
  is intended to be used to create per-cpu run queues.  Implement
  wrappers for compatibility with the old interface which pass in
  the global run queue structure; see the sketch after this list.
- Group the priority level, user priority, native priority (before
  propagation) and the scheduling class into a struct priority.
- Change any hard-coded priority levels that I found to use
  symbolic constants (TTIPRI and TTOPRI).
- Remove the curpriority global variable and use that of curproc.
  This was used to detect when a process' priority had been lowered
  and it should yield.  We now effectively yield on every interrupt.
- Activate propagate_priority().  It should now have the desired
  effect without needing to also propagate the scheduling class.
- Temporarily comment out the call to vm_page_zero_idle() in the
  idle loop.  It interfered with propagate_priority() because
  the idle process needed to do a non-blocking acquire of Giant
  and then other processes would try to propagate their priority
  onto it.  The idle process should not do anything except idle.
  vm_page_zero_idle() will return in the form of an idle priority
  kernel thread which is woken up at appropriate times by the vm
  system.
- Update struct kinfo_proc to the new priority interface.  Deliberately
  change its size by adjusting the spare fields.  Without this it would
  have remained the same size while the layout changed, so userland
  processes that use it would parse the data incorrectly.  The size
  constraint should really be changed to an arbitrary version number.
  Also add a debug.sizeof sysctl node for struct kinfo_proc.
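
The compatibility wrappers pass the global run queue into the new
parameterized interface; per-cpu queues would simply pass their own.
A minimal sketch of that idea — NRQ_CPUS and the wrapper names are
hypothetical, not part of this commit:

/*
 * Hypothetical per-cpu run queues layered on the new interface;
 * NRQ_CPUS and these wrappers are illustrative only.  Each queue
 * would be runq_init()'d at boot, like the global one.
 */
#define NRQ_CPUS    16

static struct runq runq_percpu[NRQ_CPUS];

void
setrunqueue_cpu(struct proc *p, int cpu)
{
    /* Same insertion logic, just a different queue structure. */
    runq_add(&runq_percpu[cpu], p);
}

struct proc *
chooseproc_cpu(int cpu)
{
    return (runq_choose(&runq_percpu[cpu]));
}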
Jake Burkholder 2001-02-12 00:20:08 +00:00
parent 216a89d6a4
commit d5a08a6065
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=72376
39 changed files with 608 additions and 581 deletions

View File

@ -116,7 +116,7 @@ userret(p, frame, oticks)
postsig(sig);
}
mtx_lock_spin(&sched_lock);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
if (resched_wanted()) {
/*
* Since we are curproc, a clock interrupt could
@ -152,7 +152,6 @@ userret(p, frame, oticks)
addupc_task(p, frame->tf_regs[FRAME_PC],
(int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
mtx_unlock_spin(&sched_lock);
}

View File

@ -39,8 +39,6 @@
#include "opt_npx.h"
#include "opt_user_ldt.h"
#include <sys/rtprio.h>
#include <machine/asmacros.h>
#include <machine/ipl.h>
@ -87,17 +85,12 @@ ENTRY(cpu_switch)
testl %ecx,%ecx
jz sw1
#ifdef SMP
movb P_ONCPU(%ecx), %al /* save "last" cpu */
movb %al, P_LASTCPU(%ecx)
movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */
#endif /* SMP */
movl P_VMSPACE(%ecx), %edx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif /* SMP */
btrl %eax, VM_PMAP+PM_ACTIVE(%edx)
movl P_ADDR(%ecx),%edx
@ -201,11 +194,7 @@ sw1b:
movl %ebx,%cr3
4:
#ifdef SMP
movl PCPU(CPUID), %esi
#else
xorl %esi, %esi
#endif
cmpl $0, PCB_EXT(%edx) /* has pcb extension? */
je 1f
btsl %esi, _private_tss /* mark use of private tss */
@ -232,11 +221,7 @@ sw1b:
ltr %si
3:
movl P_VMSPACE(%ecx), %ebx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif
btsl %eax, VM_PMAP+PM_ACTIVE(%ebx)
/* restore context */
@ -256,9 +241,10 @@ sw1b:
andl $~APIC_TPR_PRIO, _lapic+LA_TPR
#endif /** CHEAP_TPR */
#endif /** GRAB_LOPRIO */
#endif /* SMP */
movl PCPU(CPUID),%eax
movb %al, P_ONCPU(%ecx)
#endif /* SMP */
movl %edx, PCPU(CURPCB)
movl %ecx, PCPU(CURPROC) /* into next process */

View File

@ -88,10 +88,8 @@ ASSYM(P_WCHAN, offsetof(struct proc, p_wchan));
ASSYM(PS_ASTPENDING, PS_ASTPENDING);
ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED);
#ifdef SMP
ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu));
ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu));
#endif
ASSYM(SSLEEP, SSLEEP);
ASSYM(SRUN, SRUN);
@ -198,9 +196,9 @@ ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf));
ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data));
#endif
#ifdef SMP
ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid));
#ifdef SMP
ASSYM(LA_VER, offsetof(struct LAPIC, version));
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
ASSYM(LA_EOI, offsetof(struct LAPIC, eoi));

View File

@ -39,8 +39,6 @@
#include "opt_npx.h"
#include "opt_user_ldt.h"
#include <sys/rtprio.h>
#include <machine/asmacros.h>
#include <machine/ipl.h>
@ -87,17 +85,12 @@ ENTRY(cpu_switch)
testl %ecx,%ecx
jz sw1
#ifdef SMP
movb P_ONCPU(%ecx), %al /* save "last" cpu */
movb %al, P_LASTCPU(%ecx)
movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */
#endif /* SMP */
movl P_VMSPACE(%ecx), %edx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif /* SMP */
btrl %eax, VM_PMAP+PM_ACTIVE(%edx)
movl P_ADDR(%ecx),%edx
@ -201,11 +194,7 @@ sw1b:
movl %ebx,%cr3
4:
#ifdef SMP
movl PCPU(CPUID), %esi
#else
xorl %esi, %esi
#endif
cmpl $0, PCB_EXT(%edx) /* has pcb extension? */
je 1f
btsl %esi, _private_tss /* mark use of private tss */
@ -232,11 +221,7 @@ sw1b:
ltr %si
3:
movl P_VMSPACE(%ecx), %ebx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif
btsl %eax, VM_PMAP+PM_ACTIVE(%ebx)
/* restore context */
@ -256,9 +241,10 @@ sw1b:
andl $~APIC_TPR_PRIO, _lapic+LA_TPR
#endif /** CHEAP_TPR */
#endif /** GRAB_LOPRIO */
#endif /* SMP */
movl PCPU(CPUID),%eax
movb %al, P_ONCPU(%ecx)
#endif /* SMP */
movl %edx, PCPU(CURPCB)
movl %ecx, PCPU(CURPROC) /* into next process */

View File

@ -179,7 +179,7 @@ userret(p, frame, oticks)
}
mtx_lock_spin(&sched_lock);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
if (resched_wanted()) {
/*
* Since we are curproc, clock will normally just change
@ -216,7 +216,6 @@ userret(p, frame, oticks)
addupc_task(p, TRAPF_PC(frame),
(u_int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
mtx_unlock_spin(&sched_lock);
}

View File

@ -132,7 +132,7 @@ AcpiOsSleep (UINT32 Seconds, UINT32 Milliseconds)
timo = (Seconds * hz) + Milliseconds / (1000 * hz);
if (timo == 0)
timo = 1;
tsleep(NULL, 0, "acpislp", timo);
tsleep(NULL, PZERO, "acpislp", timo);
return_VOID;
}

View File

@ -88,10 +88,8 @@ ASSYM(P_WCHAN, offsetof(struct proc, p_wchan));
ASSYM(PS_ASTPENDING, PS_ASTPENDING);
ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED);
#ifdef SMP
ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu));
ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu));
#endif
ASSYM(SSLEEP, SSLEEP);
ASSYM(SRUN, SRUN);
@ -198,9 +196,9 @@ ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf));
ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data));
#endif
#ifdef SMP
ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid));
#ifdef SMP
ASSYM(LA_VER, offsetof(struct LAPIC, version));
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
ASSYM(LA_EOI, offsetof(struct LAPIC, eoi));

View File

@ -39,8 +39,6 @@
#include "opt_npx.h"
#include "opt_user_ldt.h"
#include <sys/rtprio.h>
#include <machine/asmacros.h>
#include <machine/ipl.h>
@ -87,17 +85,12 @@ ENTRY(cpu_switch)
testl %ecx,%ecx
jz sw1
#ifdef SMP
movb P_ONCPU(%ecx), %al /* save "last" cpu */
movb %al, P_LASTCPU(%ecx)
movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */
#endif /* SMP */
movl P_VMSPACE(%ecx), %edx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif /* SMP */
btrl %eax, VM_PMAP+PM_ACTIVE(%edx)
movl P_ADDR(%ecx),%edx
@ -201,11 +194,7 @@ sw1b:
movl %ebx,%cr3
4:
#ifdef SMP
movl PCPU(CPUID), %esi
#else
xorl %esi, %esi
#endif
cmpl $0, PCB_EXT(%edx) /* has pcb extension? */
je 1f
btsl %esi, _private_tss /* mark use of private tss */
@ -232,11 +221,7 @@ sw1b:
ltr %si
3:
movl P_VMSPACE(%ecx), %ebx
#ifdef SMP
movl PCPU(CPUID), %eax
#else
xorl %eax, %eax
#endif
btsl %eax, VM_PMAP+PM_ACTIVE(%ebx)
/* restore context */
@ -256,9 +241,10 @@ sw1b:
andl $~APIC_TPR_PRIO, _lapic+LA_TPR
#endif /** CHEAP_TPR */
#endif /** GRAB_LOPRIO */
#endif /* SMP */
movl PCPU(CPUID),%eax
movb %al, P_ONCPU(%ecx)
#endif /* SMP */
movl %edx, PCPU(CURPCB)
movl %ecx, PCPU(CURPROC) /* into next process */

View File

@ -179,7 +179,7 @@ userret(p, frame, oticks)
}
mtx_lock_spin(&sched_lock);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
if (resched_wanted()) {
/*
* Since we are curproc, clock will normally just change
@ -216,7 +216,6 @@ userret(p, frame, oticks)
addupc_task(p, TRAPF_PC(frame),
(u_int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
mtx_unlock_spin(&sched_lock);
}

View File

@ -94,7 +94,7 @@ userret(register struct proc *p, struct trapframe *frame, u_quad_t oticks)
postsig(sig);
}
mtx_lock_spin(&sched_lock);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
if (want_resched) {
/*
* Since we are curproc, a clock interrupt could
@ -131,7 +131,6 @@ userret(register struct proc *p, struct trapframe *frame, u_quad_t oticks)
addupc_task(p, frame->tf_cr_iip,
(int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
mtx_unlock_spin(&sched_lock);
}

View File

@ -305,8 +305,9 @@ proc0_init(void *dummy __unused)
p->p_sflag = PS_INMEM;
p->p_stat = SRUN;
p->p_nice = NZERO;
p->p_rtprio.type = RTP_PRIO_NORMAL;
p->p_rtprio.prio = 0;
p->p_pri.pri_class = PRI_TIMESHARE;
p->p_pri.pri_level = PVM;
p->p_pri.pri_user = PUSER;
p->p_peers = 0;
p->p_leader = p;

View File

@ -171,7 +171,7 @@ cv_waitq_add(struct cv *cvp, struct proc *p)
p->p_wchan = cvp;
p->p_wmesg = cvp->cv_description;
p->p_slptime = 0;
p->p_nativepri = p->p_priority;
p->p_pri.pri_native = p->p_pri.pri_level;
CTR3(KTR_PROC, "cv_waitq_add: proc %p (pid %d, %s)", p, p->p_pid,
p->p_comm);
TAILQ_INSERT_TAIL(&cvp->cv_waitq, p, p_slpq);
@ -217,7 +217,6 @@ cv_wait(struct cv *cvp, struct mtx *mp)
cv_waitq_add(cvp, p);
cv_switch(p);
curpriority = p->p_usrpri;
mtx_unlock_spin(&sched_lock);
#ifdef KTRACE
@ -271,7 +270,6 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp)
cv_waitq_add(cvp, p);
sig = cv_switch_catch(p);
curpriority = p->p_usrpri;
mtx_unlock_spin(&sched_lock);
PICKUP_GIANT();
@ -338,7 +336,6 @@ cv_timedwait(struct cv *cvp, struct mtx *mp, int timo)
cv_waitq_add(cvp, p);
callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p);
cv_switch(p);
curpriority = p->p_usrpri;
if (p->p_sflag & PS_TIMEOUT) {
p->p_sflag &= ~PS_TIMEOUT;
@ -401,7 +398,6 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
cv_waitq_add(cvp, p);
callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p);
sig = cv_switch_catch(p);
curpriority = p->p_usrpri;
if (p->p_sflag & PS_TIMEOUT) {
p->p_sflag &= ~PS_TIMEOUT;

View File

@ -101,8 +101,10 @@ idle_proc(void *dummy)
" for a process");
#endif
#if 0
if (vm_page_zero_idle() != 0)
continue;
#endif
#ifdef __i386__
cpu_idle();

View File

@ -121,13 +121,13 @@ ithread_update(struct ithd *ithd)
strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name));
ih = TAILQ_FIRST(&ithd->it_handlers);
if (ih == NULL) {
p->p_rtprio.prio = RTP_PRIO_MAX;
p->p_pri.pri_level = PRI_MAX_ITHD;
ithd->it_flags &= ~IT_ENTROPY;
return;
}
entropy = 0;
p->p_rtprio.prio = ih->ih_pri;
p->p_pri.pri_level = ih->ih_pri;
TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) {
if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 <
sizeof(p->p_comm)) {
@ -179,8 +179,8 @@ ithread_create(struct ithd **ithread, int vector, int flags,
free(ithd, M_ITHREAD);
return (error);
}
p->p_rtprio.type = RTP_PRIO_ITHREAD;
p->p_rtprio.prio = RTP_PRIO_MAX;
p->p_pri.pri_class = PRI_ITHD;
p->p_pri.pri_level = PRI_MAX_ITHD;
p->p_stat = SWAIT;
ithd->it_proc = p;
p->p_ithd = ithd;
@ -320,8 +320,8 @@ swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler,
if (ithdp != NULL)
*ithdp = ithd;
}
return (ithread_add_handler(ithd, name, handler, arg, pri + PI_SOFT,
flags, cookiep));
return (ithread_add_handler(ithd, name, handler, arg,
(pri * RQ_PPQ) + PI_SOFT, flags, cookiep));
}
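
With RQ_PPQ = 4 and PI_SOFT = 36, this scales the old small soft
interrupt levels onto the new 256-level range: a handler registered at
pri 2 now runs at ithread priority 2 * 4 + 36 = 44, preserving one run
queue per soft interrupt level.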

View File

@ -253,3 +253,7 @@ SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
0, sizeof(struct bio), "sizeof(struct bio)");
SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
0, sizeof(struct buf), "sizeof(struct buf)");
#include <sys/user.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
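
Userland can use the new node to detect the layout change at runtime;
a minimal sketch using the standard sysctlbyname(3) interface:

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>

#include <stdio.h>

int
main(void)
{
    int ksize;
    size_t len = sizeof(ksize);

    /* Ask the kernel for its sizeof(struct kinfo_proc). */
    if (sysctlbyname("debug.sizeof.kinfo_proc", &ksize, &len,
        NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    if (ksize != (int)sizeof(struct kinfo_proc))
        fprintf(stderr, "kinfo_proc mismatch: kernel %d, header %d\n",
            ksize, (int)sizeof(struct kinfo_proc));
    return (0);
}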

View File

@ -106,7 +106,7 @@ struct mtx_debug {
: (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
#define RETIP(x) *(((uintptr_t *)(&x)) - 1)
#define SET_PRIO(p, pri) (p)->p_priority = (pri)
#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri)
/*
* Early WITNESS-enabled declarations.
@ -180,7 +180,7 @@ static void propagate_priority(struct proc *);
static void
propagate_priority(struct proc *p)
{
int pri = p->p_priority;
int pri = p->p_pri.pri_level;
struct mtx *m = p->p_blocked;
mtx_assert(&sched_lock, MA_OWNED);
@ -201,7 +201,7 @@ propagate_priority(struct proc *p)
MPASS(p->p_magic == P_MAGIC);
KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
if (p->p_priority <= pri)
if (p->p_pri.pri_level <= pri)
return;
/*
@ -212,32 +212,16 @@ propagate_priority(struct proc *p)
/*
* If lock holder is actually running, just bump priority.
*/
#ifdef SMP
/*
* For SMP, we can check the p_oncpu field to see if we are
* running.
*/
if (p->p_oncpu != 0xff) {
MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
return;
}
#else
/*
* For UP, we check to see if p is curproc (this shouldn't
* ever happen however as it would mean we are in a deadlock.)
*/
if (p == curproc) {
panic("Deadlock detected");
return;
}
#endif
/*
* If on run queue move to new run queue, and
* quit.
*/
if (p->p_stat == SRUN) {
printf("XXX: moving proc %d(%s) to a new run queue\n",
p->p_pid, p->p_comm);
MPASS(p->p_blocked == NULL);
remrunqueue(p);
setrunqueue(p);
@ -258,23 +242,16 @@ propagate_priority(struct proc *p)
m = p->p_blocked;
MPASS(m != NULL);
printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
p->p_comm, m->mtx_description);
/*
* Check if the proc needs to be moved up on
* the blocked chain
*/
if (p == TAILQ_FIRST(&m->mtx_blocked)) {
printf("XXX: process at head of run queue\n");
continue;
}
p1 = TAILQ_PREV(p, rq, p_procq);
if (p1->p_priority <= pri) {
printf(
"XXX: previous process %d(%s) has higher priority\n",
p->p_pid, p->p_comm);
p1 = TAILQ_PREV(p, procqueue, p_procq);
if (p1->p_pri.pri_level <= pri) {
continue;
}
@ -288,7 +265,7 @@ propagate_priority(struct proc *p)
TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
MPASS(p1->p_magic == P_MAGIC);
if (p1->p_priority > pri)
if (p1->p_pri.pri_level > pri)
break;
}
@ -371,7 +348,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
* p_nativepri is only read when we are blocked on a mutex, so that
* can't be happening right now either.
*/
p->p_nativepri = p->p_priority;
p->p_pri.pri_native = p->p_pri.pri_level;
while (!_obtain_lock(m, p)) {
uintptr_t v;
@ -396,8 +373,8 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
MPASS(p1 != NULL);
m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
if (p1->p_priority < p->p_priority)
SET_PRIO(p, p1->p_priority);
if (p1->p_pri.pri_level < p->p_pri.pri_level)
SET_PRIO(p, p1->p_pri.pri_level);
mtx_unlock_spin(&sched_lock);
return;
}
@ -446,7 +423,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
} else {
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
if (p1->p_priority > p->p_priority)
if (p1->p_pri.pri_level > p->p_pri.pri_level)
break;
if (p1)
TAILQ_INSERT_BEFORE(p1, p, p_procq);
@ -460,9 +437,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
p->p_blocked = m;
p->p_mtxname = m->mtx_description;
p->p_stat = SMTX;
#if 0
propagate_priority(p);
#endif
if ((opts & MTX_QUIET) == 0)
CTR3(KTR_LOCK,
@ -565,15 +540,15 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
} else
atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
pri = MAXPRI;
pri = PRI_MAX;
LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level;
if (cp < pri)
pri = cp;
}
if (pri > p->p_nativepri)
pri = p->p_nativepri;
if (pri > p->p_pri.pri_native)
pri = p->p_pri.pri_native;
SET_PRIO(p, pri);
if ((opts & MTX_QUIET) == 0)
@ -585,7 +560,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
p1->p_stat = SRUN;
setrunqueue(p1);
if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) {
#ifdef notyet
if (p->p_flag & (P_ITHD | P_SITHD)) {
ithd_t *it = (ithd_t *)p;

View File

@ -439,11 +439,8 @@ fill_kinfo_proc(p, kp)
kp->ki_swtime = p->p_swtime;
kp->ki_wchan = p->p_wchan;
kp->ki_traceflag = p->p_traceflag;
kp->ki_priority = p->p_priority;
kp->ki_usrpri = p->p_usrpri;
kp->ki_nativepri = p->p_nativepri;
kp->ki_pri = p->p_pri;
kp->ki_nice = p->p_nice;
kp->ki_rtprio = p->p_rtprio;
kp->ki_runtime = p->p_runtime;
kp->ki_pid = p->p_pid;
kp->ki_rqindex = p->p_rqindex;

View File

@ -264,7 +264,8 @@ rtprio(curp, uap)
switch (uap->function) {
case RTP_LOOKUP:
return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio)));
pri_to_rtp(&p->p_pri, &rtp);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_can(curp, p, P_CAN_SCHED, NULL)))
return (error);
@ -287,26 +288,59 @@ rtprio(curp, uap)
if (rtp.type != RTP_PRIO_NORMAL)
return (EPERM);
}
switch (rtp.type) {
#ifdef RTP_PRIO_FIFO
case RTP_PRIO_FIFO:
#endif
case RTP_PRIO_REALTIME:
case RTP_PRIO_NORMAL:
case RTP_PRIO_IDLE:
if (rtp.prio > RTP_PRIO_MAX)
return (EINVAL);
p->p_rtprio = rtp;
if (rtp_to_pri(&rtp, &p->p_pri) == 0)
return (0);
default:
return (EINVAL);
}
return (EINVAL);
default:
return (EINVAL);
}
}
int
rtp_to_pri(struct rtprio *rtp, struct priority *pri)
{
if (rtp->prio > RTP_PRIO_MAX)
return (-1);
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
pri->pri_level = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
pri->pri_level = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
pri->pri_level = PRI_MIN_IDLE + rtp->prio;
break;
default:
return (-1);
}
pri->pri_class = rtp->type;
pri->pri_native = pri->pri_level;
pri->pri_user = pri->pri_level;
return (0);
}
void
pri_to_rtp(struct priority *pri, struct rtprio *rtp)
{
switch (PRI_BASE(pri->pri_class)) {
case PRI_REALTIME:
rtp->prio = pri->pri_level - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
rtp->prio = pri->pri_level - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
rtp->prio = pri->pri_level - PRI_MIN_IDLE;
break;
default:
break;
}
rtp->type = pri->pri_class;
}
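
A worked example of the round trip, with values following the ranges in
sys/priority.h:

    struct rtprio rtp = { RTP_PRIO_REALTIME, 5 };
    struct priority pri;

    rtp_to_pri(&rtp, &pri);  /* pri_level = PRI_MIN_REALTIME + 5 = 133 */
    pri_to_rtp(&pri, &rtp);  /* rtp.prio = 133 - PRI_MIN_REALTIME = 5 */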
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct osetrlimit_args {

View File

@ -1271,8 +1271,8 @@ psignal(p, sig)
* Raise priority to at least PUSER.
*/
mtx_lock_spin(&sched_lock);
if (p->p_priority > PUSER)
p->p_priority = PUSER;
if (p->p_pri.pri_level > PUSER)
p->p_pri.pri_level = PUSER;
run:
/* If we jump here, sched_lock has to be owned. */
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);

View File

@ -379,7 +379,7 @@ uio_yield()
s = splhigh();
mtx_lock_spin(&sched_lock);
DROP_GIANT_NOSWITCH();
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
setrunqueue(p);
p->p_stats->p_ru.ru_nivcsw++;
mi_switch();

View File

@ -1,6 +1,8 @@
/*
* Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
* All rights reserved.
* Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -32,225 +34,205 @@
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
/*
* We have NQS (32) run queues per scheduling class. For the normal
* class, there are 128 priorities scaled onto these 32 queues. New
* processes are added to the last entry in each queue, and processes
* are selected for running by taking them from the head and maintaining
* a simple FIFO arrangement.
*
* Interrupt, real time and idle priority processes have an explicit
* 0-31 priority which maps directly onto their class queue index.
* When a queue has something in it, the corresponding bit is set in
* the queuebits variable, allowing a single read to determine the
* state of all 32 queues and then a ffs() to find the first busy
* queue.
*
* XXX This needs fixing. First, we only have one idle process, so we
* hardly need 32 queues for it. Secondly, the number of classes
* makes things unwieldy. We should be able to merge them into a
* single 96 or 128 entry queue.
* Global run queue.
*/
struct rq itqueues[NQS]; /* interrupt threads */
struct rq rtqueues[NQS]; /* real time processes */
struct rq queues[NQS]; /* time sharing processes */
struct rq idqueues[NQS]; /* idle process */
u_int32_t itqueuebits;
u_int32_t rtqueuebits;
u_int32_t queuebits;
u_int32_t idqueuebits;
static struct runq runq;
SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq)
/*
* Initialize the run queues at boot time.
* Wrappers which implement old interface; act on global run queue.
*/
static void
rqinit(void *dummy)
{
int i;
for (i = 0; i < NQS; i++) {
TAILQ_INIT(&itqueues[i]);
TAILQ_INIT(&rtqueues[i]);
TAILQ_INIT(&queues[i]);
TAILQ_INIT(&idqueues[i]);
}
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
/*
* setrunqueue() examines a process priority and class and inserts it on
* the tail of it's appropriate run queue (based on class and priority).
* This sets the queue busy bit.
* The process must be runnable.
* This must be called at splhigh().
*/
void
setrunqueue(struct proc *p)
{
struct rq *q;
u_int8_t pri;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \
p->p_comm));
/*
* Decide which class we want to run. We now have four
* queues, and this is becoming ugly. We should be able to
* collapse the first three classes into a single contiguous
* queue. XXX FIXME.
*/
CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %lx",
p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock);
if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */
pri = p->p_rtprio.prio;
q = &itqueues[pri];
itqueuebits |= 1 << pri;
} else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */
p->p_rtprio.type == RTP_PRIO_FIFO) {
pri = p->p_rtprio.prio;
q = &rtqueues[pri];
rtqueuebits |= 1 << pri;
} else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */
pri = p->p_priority >> 2;
q = &queues[pri];
queuebits |= 1 << pri;
} else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */
pri = p->p_rtprio.prio;
q = &idqueues[pri];
idqueuebits |= 1 << pri;
} else {
panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type);
}
p->p_rqindex = pri; /* remember the queue index */
TAILQ_INSERT_TAIL(q, p, p_procq);
}
/*
* remrunqueue() removes a given process from the run queue that it is on,
* clearing the queue busy bit if it becomes empty.
* This must be called at splhigh().
*/
void
remrunqueue(struct proc *p)
{
struct rq *q;
u_int32_t *which;
u_int8_t pri;
CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %lx",
p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock);
mtx_assert(&sched_lock, MA_OWNED);
pri = p->p_rqindex;
if (p->p_rtprio.type == RTP_PRIO_ITHREAD) {
q = &itqueues[pri];
which = &itqueuebits;
} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
p->p_rtprio.type == RTP_PRIO_FIFO) {
q = &rtqueues[pri];
which = &rtqueuebits;
} else if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
q = &queues[pri];
which = &queuebits;
} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
q = &idqueues[pri];
which = &idqueuebits;
} else {
panic("remrunqueue: invalid rtprio type");
}
TAILQ_REMOVE(q, p, p_procq);
if (TAILQ_EMPTY(q)) {
KASSERT((*which & (1 << pri)) != 0,
("remrunqueue: remove from empty queue"));
*which &= ~(1 << pri);
}
}
/*
* procrunnable() returns a boolean true (non-zero) value if there are
* any runnable processes. This is intended to be called from the idle
* loop to avoid the more expensive (and destructive) chooseproc().
*
* MP SAFE. CALLED WITHOUT THE MP LOCK
*
* XXX I doubt this. It's possibly fail-safe, but there's obviously
* the case here where one of the bits words gets loaded, the
* processor gets preempted, and by the time it returns from this
* function, some other processor has picked the runnable process.
* What am I missing? (grog, 23 July 2000).
*/
u_int32_t
procrunnable(void)
{
return (itqueuebits || rtqueuebits || queuebits || idqueuebits);
}
/*
* chooseproc() selects the next process to run. Ideally, cpu_switch()
* would have determined that there is a process available before calling
* this, but it is not a requirement. The selected process is removed
* from it's queue, and the queue busy bit is cleared if it becomes empty.
* This must be called at splhigh().
*
* For SMP, trivial affinity is implemented by locating the first process
* on the queue that has a matching lastcpu id. Since normal priorities
* are mapped four priority levels per queue, this may allow the cpu to
* choose a slightly lower priority process in order to preserve the cpu
* caches.
*/
struct proc *
chooseproc(void)
{
struct proc *p;
struct rq *q;
u_int32_t *which;
u_int32_t pri;
#ifdef SMP
u_char id;
#endif
return runq_choose(&runq);
}
int
procrunnable(void)
{
return runq_check(&runq);
}
void
remrunqueue(struct proc *p)
{
runq_remove(&runq, p);
}
void
setrunqueue(struct proc *p)
{
runq_add(&runq, p);
}
/*
* Clear the status bit of the queue corresponding to priority level pri,
* indicating that it is empty.
*/
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
struct rqbits *rqb;
rqb = &rq->rq_status;
CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
rqb->rqb_bits[RQB_WORD(pri)],
rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
RQB_BIT(pri), RQB_WORD(pri));
rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}
/*
* Find the index of the first non-empty run queue. This is done by
* scanning the status bits, a set bit indicates a non-empty queue.
*/
static __inline int
runq_findbit(struct runq *rq)
{
struct rqbits *rqb;
int pri;
int i;
rqb = &rq->rq_status;
for (i = 0; i < RQB_LEN; i++)
if (rqb->rqb_bits[i]) {
pri = (RQB_FFS(rqb->rqb_bits[i]) - 1) +
(i << RQB_L2BPW);
CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
rqb->rqb_bits[i], i, pri);
return (pri);
}
return (-1);
}
/*
* Set the status bit of the queue corresponding to priority level pri,
* indicating that it is non-empty.
*/
static __inline void
runq_setbit(struct runq *rq, int pri)
{
struct rqbits *rqb;
rqb = &rq->rq_status;
CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
rqb->rqb_bits[RQB_WORD(pri)],
rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
RQB_BIT(pri), RQB_WORD(pri));
rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}
/*
* Add the process to the queue specified by its priority, and set the
* corresponding status bit.
*/
void
runq_add(struct runq *rq, struct proc *p)
{
struct rqhead *rqh;
int pri;
mtx_assert(&sched_lock, MA_OWNED);
if (itqueuebits) {
pri = ffs(itqueuebits) - 1;
q = &itqueues[pri];
which = &itqueuebits;
} else if (rtqueuebits) {
pri = ffs(rtqueuebits) - 1;
q = &rtqueues[pri];
which = &rtqueuebits;
} else if (queuebits) {
pri = ffs(queuebits) - 1;
q = &queues[pri];
which = &queuebits;
} else if (idqueuebits) {
pri = ffs(idqueuebits) - 1;
q = &idqueues[pri];
which = &idqueuebits;
} else {
CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %lx",
(long)sched_lock.mtx_lock);
return PCPU_GET(idleproc);
}
p = TAILQ_FIRST(q);
#ifdef SMP
/* wander down the current run queue for this pri level for a match */
id = PCPU_GET(cpuid);
while (p->p_lastcpu != id) {
p = TAILQ_NEXT(p, p_procq);
if (p == NULL) {
p = TAILQ_FIRST(q);
break;
}
}
#endif
CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %lx",
p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock);
KASSERT(p, ("chooseproc: no proc on busy queue"));
TAILQ_REMOVE(q, p, p_procq);
if (TAILQ_EMPTY(q))
*which &= ~(1 << pri);
return p;
KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN",
p, p->p_comm));
pri = p->p_pri.pri_level / RQ_PPQ;
p->p_rqindex = pri;
runq_setbit(rq, pri);
rqh = &rq->rq_queues[pri];
CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p",
p, p->p_pri.pri_level, pri, rqh);
TAILQ_INSERT_TAIL(rqh, p, p_procq);
}
/*
* Return true if there are runnable processes of any priority on the run
* queue, false otherwise. Has no side effects, does not modify the run
* queue structure.
*/
int
runq_check(struct runq *rq)
{
struct rqbits *rqb;
int i;
rqb = &rq->rq_status;
for (i = 0; i < RQB_LEN; i++)
if (rqb->rqb_bits[i]) {
CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
rqb->rqb_bits[i], i);
return (1);
}
CTR0(KTR_RUNQ, "runq_check: empty");
return (0);
}
/*
* Find and remove the highest priority process from the run queue.
* If there are no runnable processes, the per-cpu idle process is
* returned. Will not return NULL under any circumstances.
*/
struct proc *
runq_choose(struct runq *rq)
{
struct rqhead *rqh;
struct proc *p;
int pri;
mtx_assert(&sched_lock, MA_OWNED);
if ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
p = TAILQ_FIRST(rqh);
CTR3(KTR_RUNQ, "runq_choose: pri=%d p=%p rqh=%p", pri, p, rqh);
TAILQ_REMOVE(rqh, p, p_procq);
if (TAILQ_EMPTY(rqh)) {
CTR0(KTR_RUNQ, "runq_choose: empty");
runq_clrbit(rq, pri);
}
return (p);
}
CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
return (PCPU_GET(idleproc));
}
/*
* Initialize a run queue structure.
*/
void
runq_init(struct runq *rq)
{
int i;
for (i = 0; i < RQ_NQS; i++)
TAILQ_INIT(&rq->rq_queues[i]);
}
/*
* Remove the process from the queue specified by its priority, and clear the
* corresponding status bit if the queue becomes empty.
*/
void
runq_remove(struct runq *rq, struct proc *p)
{
struct rqhead *rqh;
int pri;
mtx_assert(&sched_lock, MA_OWNED);
pri = p->p_rqindex;
rqh = &rq->rq_queues[pri];
CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p",
p, p->p_pri.pri_level, pri, rqh);
KASSERT(p != NULL, ("runq_remove: no proc on busy queue"));
TAILQ_REMOVE(rqh, p, p_procq);
if (TAILQ_EMPTY(rqh)) {
CTR0(KTR_RUNQ, "runq_remove: empty");
runq_clrbit(rq, pri);
}
}

View File

@ -68,7 +68,6 @@
static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
u_char curpriority;
int hogticks;
int lbolt;
int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
@ -76,7 +75,6 @@ int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
static struct callout schedcpu_callout;
static struct callout roundrobin_callout;
static int curpriority_cmp __P((struct proc *p));
static void endtsleep __P((void *));
static void roundrobin __P((void *arg));
static void schedcpu __P((void *arg));
@ -100,56 +98,16 @@ sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
/*-
* Compare priorities. Return:
* <0: priority of p < current priority
* 0: priority of p == current priority
* >0: priority of p > current priority
* The priorities are the normal priorities or the normal realtime priorities
* if p is on the same scheduler as curproc. Otherwise the process on the
* more realtimeish scheduler has lowest priority. As usual, a higher
* priority really means a lower priority.
*/
static int
curpriority_cmp(p)
struct proc *p;
{
int c_class, p_class;
c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
p_class = RTP_PRIO_BASE(p->p_rtprio.type);
if (p_class != c_class)
return (p_class - c_class);
if (p_class == RTP_PRIO_NORMAL)
return (((int)p->p_priority - (int)curpriority) / PPQ);
return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
}
/*
* Arrange to reschedule if necessary, taking the priorities and
* schedulers into account.
*/
void
maybe_resched(chk)
struct proc *chk;
maybe_resched(p)
struct proc *p;
{
struct proc *p = curproc; /* XXX */
/*
* XXX idle scheduler still broken because proccess stays on idle
* scheduler during waits (such as when getting FS locks). If a
* standard process becomes runaway cpu-bound, the system can lockup
* due to idle-scheduler processes in wakeup never getting any cpu.
*/
if (p == PCPU_GET(idleproc)) {
#if 0
need_resched();
#endif
} else if (chk == p) {
/* We may need to yield if our priority has been raised. */
if (curpriority_cmp(chk) > 0)
need_resched();
} else if (curpriority_cmp(chk) < 0)
if (p->p_pri.pri_level < curproc->p_pri.pri_level)
need_resched();
}
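
With a single unified priority space, the removed cross-class
curpriority_cmp() machinery reduces to the one comparison above: a
lower pri_level is a better priority, so a reschedule is requested only
when the newly runnable process is strictly more important than curproc.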
@ -325,19 +283,20 @@ schedcpu(arg)
p->p_cpticks = 0;
p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
resetpriority(p);
if (p->p_priority >= PUSER) {
if (p->p_pri.pri_level >= PUSER) {
if ((p != curproc) &&
#ifdef SMP
p->p_oncpu == 0xff && /* idle */
#endif
p->p_stat == SRUN &&
(p->p_sflag & PS_INMEM) &&
(p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
(p->p_pri.pri_level / RQ_PPQ) !=
(p->p_pri.pri_user / RQ_PPQ)) {
remrunqueue(p);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
setrunqueue(p);
} else
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
}
mtx_unlock_spin(&sched_lock);
splx(s);
@ -461,7 +420,7 @@ msleep(ident, mtx, priority, wmesg, timo)
p->p_wchan = ident;
p->p_wmesg = wmesg;
p->p_slptime = 0;
p->p_priority = priority & PRIMASK;
p->p_pri.pri_level = priority & PRIMASK;
CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
@ -503,7 +462,6 @@ msleep(ident, mtx, priority, wmesg, timo)
"msleep resume: proc %p (pid %d, %s), schedlock %p",
p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
resume:
curpriority = p->p_usrpri;
splx(s);
p->p_sflag &= ~PS_SINTR;
if (p->p_sflag & PS_TIMEOUT) {
@ -671,7 +629,6 @@ mawait(struct mtx *mtx, int priority, int timo)
p->p_stats->p_ru.ru_nvcsw++;
mi_switch();
resume:
curpriority = p->p_usrpri;
splx(s);
p->p_sflag &= ~PS_SINTR;
@ -1033,11 +990,12 @@ resetpriority(p)
register unsigned int newpriority;
mtx_lock_spin(&sched_lock);
if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
if (p->p_pri.pri_class == PRI_TIMESHARE) {
newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
NICE_WEIGHT * (p->p_nice - PRIO_MIN);
newpriority = min(newpriority, MAXPRI);
p->p_usrpri = newpriority;
newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
PRI_MAX_TIMESHARE);
p->p_pri.pri_user = newpriority;
}
maybe_resched(p);
mtx_unlock_spin(&sched_lock);
@ -1080,8 +1038,8 @@ schedclock(p)
p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
resetpriority(p);
if (p->p_priority >= PUSER)
p->p_priority = p->p_usrpri;
if (p->p_pri.pri_level >= PUSER)
p->p_pri.pri_level = p->p_pri.pri_user;
}
}
@ -1098,7 +1056,7 @@ yield(struct proc *p, struct yield_args *uap)
s = splhigh();
mtx_lock_spin(&sched_lock);
DROP_GIANT_NOSWITCH();
p->p_priority = MAXPRI;
p->p_pri.pri_level = PRI_MAX_TIMESHARE;
setrunqueue(p);
p->p_stats->p_ru.ru_nvcsw++;
mi_switch();

View File

@ -96,9 +96,11 @@ int ksched_detach(struct ksched *p)
static __inline int
getscheduler(register_t *ret, struct ksched *ksched, struct proc *p)
{
struct rtprio rtp;
int e = 0;
switch (p->p_rtprio.type)
pri_to_rtp(&p->p_pri, &rtp);
switch (rtp.type)
{
case RTP_PRIO_FIFO:
*ret = SCHED_FIFO;
@ -138,8 +140,11 @@ int ksched_setparam(register_t *ret, struct ksched *ksched,
int ksched_getparam(register_t *ret, struct ksched *ksched,
struct proc *p, struct sched_param *param)
{
if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type))
param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio);
struct rtprio rtp;
pri_to_rtp(&p->p_pri, &rtp);
if (RTP_PRIO_IS_REALTIME(rtp.type))
param->sched_priority = rtpprio_to_p4prio(rtp.prio);
return 0;
}
@ -169,7 +174,7 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
rtp.type = (policy == SCHED_FIFO)
? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
p->p_rtprio = rtp;
rtp_to_pri(&rtp, &p->p_pri);
need_resched();
}
else
@ -182,7 +187,7 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
{
rtp.type = RTP_PRIO_NORMAL;
rtp.prio = p4prio_to_rtpprio(param->sched_priority);
p->p_rtprio = rtp;
rtp_to_pri(&rtp, &p->p_pri);
/* XXX Simply revert to whatever we had for last
* normal scheduler priorities.

View File

@ -179,7 +179,7 @@ userret(p, frame, oticks)
}
mtx_lock_spin(&sched_lock);
p->p_priority = p->p_usrpri;
p->p_pri.pri_level = p->p_pri.pri_user;
if (resched_wanted()) {
/*
* Since we are curproc, clock will normally just change
@ -216,7 +216,6 @@ userret(p, frame, oticks)
addupc_task(p, TRAPF_PC(frame),
(u_int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
mtx_unlock_spin(&sched_lock);
}

View File

@ -106,7 +106,7 @@ struct mtx_debug {
: (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
#define RETIP(x) *(((uintptr_t *)(&x)) - 1)
#define SET_PRIO(p, pri) (p)->p_priority = (pri)
#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri)
/*
* Early WITNESS-enabled declarations.
@ -180,7 +180,7 @@ static void propagate_priority(struct proc *);
static void
propagate_priority(struct proc *p)
{
int pri = p->p_priority;
int pri = p->p_pri.pri_level;
struct mtx *m = p->p_blocked;
mtx_assert(&sched_lock, MA_OWNED);
@ -201,7 +201,7 @@ propagate_priority(struct proc *p)
MPASS(p->p_magic == P_MAGIC);
KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
if (p->p_priority <= pri)
if (p->p_pri.pri_level <= pri)
return;
/*
@ -212,32 +212,16 @@ propagate_priority(struct proc *p)
/*
* If lock holder is actually running, just bump priority.
*/
#ifdef SMP
/*
* For SMP, we can check the p_oncpu field to see if we are
* running.
*/
if (p->p_oncpu != 0xff) {
MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
return;
}
#else
/*
* For UP, we check to see if p is curproc (this shouldn't
* ever happen however as it would mean we are in a deadlock.)
*/
if (p == curproc) {
panic("Deadlock detected");
return;
}
#endif
/*
* If on run queue move to new run queue, and
* quit.
*/
if (p->p_stat == SRUN) {
printf("XXX: moving proc %d(%s) to a new run queue\n",
p->p_pid, p->p_comm);
MPASS(p->p_blocked == NULL);
remrunqueue(p);
setrunqueue(p);
@ -258,23 +242,16 @@ propagate_priority(struct proc *p)
m = p->p_blocked;
MPASS(m != NULL);
printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
p->p_comm, m->mtx_description);
/*
* Check if the proc needs to be moved up on
* the blocked chain
*/
if (p == TAILQ_FIRST(&m->mtx_blocked)) {
printf("XXX: process at head of run queue\n");
continue;
}
p1 = TAILQ_PREV(p, rq, p_procq);
if (p1->p_priority <= pri) {
printf(
"XXX: previous process %d(%s) has higher priority\n",
p->p_pid, p->p_comm);
p1 = TAILQ_PREV(p, procqueue, p_procq);
if (p1->p_pri.pri_level <= pri) {
continue;
}
@ -288,7 +265,7 @@ propagate_priority(struct proc *p)
TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
MPASS(p1->p_magic == P_MAGIC);
if (p1->p_priority > pri)
if (p1->p_pri.pri_level > pri)
break;
}
@ -371,7 +348,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
* p_nativepri is only read when we are blocked on a mutex, so that
* can't be happening right now either.
*/
p->p_nativepri = p->p_priority;
p->p_pri.pri_native = p->p_pri.pri_level;
while (!_obtain_lock(m, p)) {
uintptr_t v;
@ -396,8 +373,8 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
MPASS(p1 != NULL);
m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
if (p1->p_priority < p->p_priority)
SET_PRIO(p, p1->p_priority);
if (p1->p_pri.pri_level < p->p_pri.pri_level)
SET_PRIO(p, p1->p_pri.pri_level);
mtx_unlock_spin(&sched_lock);
return;
}
@ -446,7 +423,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
} else {
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
if (p1->p_priority > p->p_priority)
if (p1->p_pri.pri_level > p->p_pri.pri_level)
break;
if (p1)
TAILQ_INSERT_BEFORE(p1, p, p_procq);
@ -460,9 +437,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
p->p_blocked = m;
p->p_mtxname = m->mtx_description;
p->p_stat = SMTX;
#if 0
propagate_priority(p);
#endif
if ((opts & MTX_QUIET) == 0)
CTR3(KTR_LOCK,
@ -565,15 +540,15 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
} else
atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
pri = MAXPRI;
pri = PRI_MAX;
LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level;
if (cp < pri)
pri = cp;
}
if (pri > p->p_nativepri)
pri = p->p_nativepri;
if (pri > p->p_pri.pri_native)
pri = p->p_pri.pri_native;
SET_PRIO(p, pri);
if ((opts & MTX_QUIET) == 0)
@ -585,7 +560,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
p1->p_stat = SRUN;
setrunqueue(p1);
if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) {
#ifdef notyet
if (p->p_flag & (P_ITHD | P_SITHD)) {
ithd_t *it = (ithd_t *)p;

View File

@ -106,7 +106,7 @@ struct mtx_debug {
: (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
#define RETIP(x) *(((uintptr_t *)(&x)) - 1)
#define SET_PRIO(p, pri) (p)->p_priority = (pri)
#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri)
/*
* Early WITNESS-enabled declarations.
@ -180,7 +180,7 @@ static void propagate_priority(struct proc *);
static void
propagate_priority(struct proc *p)
{
int pri = p->p_priority;
int pri = p->p_pri.pri_level;
struct mtx *m = p->p_blocked;
mtx_assert(&sched_lock, MA_OWNED);
@ -201,7 +201,7 @@ propagate_priority(struct proc *p)
MPASS(p->p_magic == P_MAGIC);
KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
if (p->p_priority <= pri)
if (p->p_pri.pri_level <= pri)
return;
/*
@ -212,32 +212,16 @@ propagate_priority(struct proc *p)
/*
* If lock holder is actually running, just bump priority.
*/
#ifdef SMP
/*
* For SMP, we can check the p_oncpu field to see if we are
* running.
*/
if (p->p_oncpu != 0xff) {
MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
return;
}
#else
/*
* For UP, we check to see if p is curproc (this shouldn't
* ever happen however as it would mean we are in a deadlock.)
*/
if (p == curproc) {
panic("Deadlock detected");
return;
}
#endif
/*
* If on run queue move to new run queue, and
* quit.
*/
if (p->p_stat == SRUN) {
printf("XXX: moving proc %d(%s) to a new run queue\n",
p->p_pid, p->p_comm);
MPASS(p->p_blocked == NULL);
remrunqueue(p);
setrunqueue(p);
@ -258,23 +242,16 @@ propagate_priority(struct proc *p)
m = p->p_blocked;
MPASS(m != NULL);
printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
p->p_comm, m->mtx_description);
/*
* Check if the proc needs to be moved up on
* the blocked chain
*/
if (p == TAILQ_FIRST(&m->mtx_blocked)) {
printf("XXX: process at head of run queue\n");
continue;
}
p1 = TAILQ_PREV(p, rq, p_procq);
if (p1->p_priority <= pri) {
printf(
"XXX: previous process %d(%s) has higher priority\n",
p->p_pid, p->p_comm);
p1 = TAILQ_PREV(p, procqueue, p_procq);
if (p1->p_pri.pri_level <= pri) {
continue;
}
@ -288,7 +265,7 @@ propagate_priority(struct proc *p)
TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
MPASS(p1->p_magic == P_MAGIC);
if (p1->p_priority > pri)
if (p1->p_pri.pri_level > pri)
break;
}
@ -371,7 +348,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
* p_nativepri is only read when we are blocked on a mutex, so that
* can't be happening right now either.
*/
p->p_nativepri = p->p_priority;
p->p_pri.pri_native = p->p_pri.pri_level;
while (!_obtain_lock(m, p)) {
uintptr_t v;
@ -396,8 +373,8 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
MPASS(p1 != NULL);
m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
if (p1->p_priority < p->p_priority)
SET_PRIO(p, p1->p_priority);
if (p1->p_pri.pri_level < p->p_pri.pri_level)
SET_PRIO(p, p1->p_pri.pri_level);
mtx_unlock_spin(&sched_lock);
return;
}
@ -446,7 +423,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
} else {
TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
if (p1->p_priority > p->p_priority)
if (p1->p_pri.pri_level > p->p_pri.pri_level)
break;
if (p1)
TAILQ_INSERT_BEFORE(p1, p, p_procq);
@ -460,9 +437,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
p->p_blocked = m;
p->p_mtxname = m->mtx_description;
p->p_stat = SMTX;
#if 0
propagate_priority(p);
#endif
if ((opts & MTX_QUIET) == 0)
CTR3(KTR_LOCK,
@ -565,15 +540,15 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
} else
atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
pri = MAXPRI;
pri = PRI_MAX;
LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level;
if (cp < pri)
pri = cp;
}
if (pri > p->p_nativepri)
pri = p->p_nativepri;
if (pri > p->p_pri.pri_native)
pri = p->p_pri.pri_native;
SET_PRIO(p, pri);
if ((opts & MTX_QUIET) == 0)
@ -585,7 +560,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
p1->p_stat = SRUN;
setrunqueue(p1);
if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) {
#ifdef notyet
if (p->p_flag & (P_ITHD | P_SITHD)) {
ithd_t *it = (ithd_t *)p;

View File

@ -96,9 +96,11 @@ int ksched_detach(struct ksched *p)
static __inline int
getscheduler(register_t *ret, struct ksched *ksched, struct proc *p)
{
struct rtprio rtp;
int e = 0;
switch (p->p_rtprio.type)
pri_to_rtp(&p->p_pri, &rtp);
switch (rtp.type)
{
case RTP_PRIO_FIFO:
*ret = SCHED_FIFO;
@ -138,8 +140,11 @@ int ksched_setparam(register_t *ret, struct ksched *ksched,
int ksched_getparam(register_t *ret, struct ksched *ksched,
struct proc *p, struct sched_param *param)
{
if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type))
param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio);
struct rtprio rtp;
pri_to_rtp(&p->p_pri, &rtp);
if (RTP_PRIO_IS_REALTIME(rtp.type))
param->sched_priority = rtpprio_to_p4prio(rtp.prio);
return 0;
}
@ -169,7 +174,7 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
rtp.type = (policy == SCHED_FIFO)
? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
p->p_rtprio = rtp;
rtp_to_pri(&rtp, &p->p_pri);
need_resched();
}
else
@ -182,7 +187,7 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched,
{
rtp.type = RTP_PRIO_NORMAL;
rtp.prio = p4prio_to_rtpprio(param->sched_priority);
p->p_rtprio = rtp;
rtp_to_pri(&rtp, &p->p_pri);
/* XXX Simply revert to whatever we had for last
* normal scheduler priorities.

View File

@ -66,6 +66,7 @@
#define KTR_VOP 0x00080000 /* The obvious */
#define KTR_VM 0x00100000 /* The virtual memory system */
#define KTR_IDLELOOP 0x00200000 /* checks done in the idle process */
#define KTR_RUNQ 0x00400000 /* Run queue */
/*
* Trace classes which can be assigned to particular use at compile time

View File

@ -81,6 +81,7 @@
#include <sys/cdefs.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/priority.h>
#define FALSE 0
#define TRUE 1
@ -97,24 +98,6 @@
#include <machine/limits.h>
#endif
/*
* Priorities. Note that with 32 run queues, differences less than 4 are
* insignificant.
*/
#define PSWP 0
#define PVM 4
#define PINOD 8
#define PRIBIO 16
#define PVFS 20
#define PZERO 22 /* No longer magic, shouldn't be here. XXX */
#define PSOCK 24
#define PWAIT 32
#define PCONFIG 32
#define PLOCK 36
#define PPAUSE 40
#define PUSER 48
#define MAXPRI 127 /* Priorities range from 0 through MAXPRI. */
#define PRIMASK 0x0ff
#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */
#define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */

130
sys/sys/priority.h Normal file
View File

@ -0,0 +1,130 @@
/*
* Copyright (c) 1994, Henrik Vestergaard Draboel
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by (name).
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_PRIORITY_H_
#define _SYS_PRIORITY_H_
/*
* Process priority specifications.
*/
/*
* Priority classes.
*/
#define PRI_ITHD 1 /* Interrupt thread. */
#define PRI_REALTIME 2 /* Real time process. */
#define PRI_TIMESHARE 3 /* Time sharing process. */
#define PRI_IDLE 4 /* Idle process. */
/*
* PRI_FIFO is POSIX.1B SCHED_FIFO.
*/
#define PRI_FIFO_BIT 8
#define PRI_FIFO (PRI_FIFO_BIT | PRI_REALTIME)
#define PRI_BASE(P) ((P) & ~PRI_FIFO_BIT)
#define PRI_IS_REALTIME(P) (PRI_BASE(P) == PRI_REALTIME)
#define PRI_NEED_RR(P) ((P) != PRI_FIFO)
/*
 * Priorities range from 0 to 255, but differences of less than 4 (RQ_PPQ)
 * are insignificant.  Ranges are as follows:
*
* Interrupt threads: 0 - 63
* Top half kernel threads: 64 - 127
* Realtime user threads: 128 - 159
* Time sharing user threads: 160 - 223
* Idle user threads: 224 - 255
*
* XXX If/When the specific interrupt thread and top half thread ranges
* disappear, a larger range can be used for user processes.
*/
#define PRI_MIN (0) /* Highest priority. */
#define PRI_MAX (255) /* Lowest priority. */
#define PRI_MIN_ITHD (PRI_MIN)
#define PRI_MAX_ITHD (PRI_MIN_KERN - 1)
#define PI_REALTIME (PRI_MIN_ITHD + 0)
#define PI_AV (PRI_MIN_ITHD + 4)
#define PI_TTYHIGH (PRI_MIN_ITHD + 8)
#define PI_TAPE (PRI_MIN_ITHD + 12)
#define PI_NET (PRI_MIN_ITHD + 16)
#define PI_DISK (PRI_MIN_ITHD + 20)
#define PI_TTYLOW (PRI_MIN_ITHD + 24)
#define PI_DISKLOW (PRI_MIN_ITHD + 28)
#define PI_DULL (PRI_MIN_ITHD + 32)
#define PI_SOFT (PRI_MIN_ITHD + 36)
#define PRI_MIN_KERN (64)
#define PRI_MAX_KERN (PRI_MIN_REALTIME - 1)
#define PSWP (PRI_MIN_KERN + 0)
#define PVM (PRI_MIN_KERN + 4)
#define PINOD (PRI_MIN_KERN + 8)
#define PRIBIO (PRI_MIN_KERN + 12)
#define PVFS (PRI_MIN_KERN + 16)
#define PZERO (PRI_MIN_KERN + 20)
#define PSOCK (PRI_MIN_KERN + 24)
#define PWAIT (PRI_MIN_KERN + 28)
#define PCONFIG (PRI_MIN_KERN + 32)
#define PLOCK (PRI_MIN_KERN + 36)
#define PPAUSE (PRI_MIN_KERN + 40)
#define PRI_MIN_REALTIME (128)
#define PRI_MAX_REALTIME (PRI_MIN_TIMESHARE - 1)
#define PRI_MIN_TIMESHARE (160)
#define PRI_MAX_TIMESHARE (PRI_MIN_IDLE - 1)
#define PUSER (PRI_MIN_TIMESHARE)
#define PRI_MIN_IDLE (224)
#define PRI_MAX_IDLE (PRI_MAX)
struct priority {
u_char pri_class; /* Scheduling class. */
u_char pri_level; /* Normal priority level. */
u_char pri_native; /* Priority before propagation. */
u_char pri_user; /* User priority based on p_cpu and p_nice. */
};
#endif /* !_SYS_PRIORITY_H_ */
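
The boundary macros tile the 0-255 space exactly; a compile-time sanity
sketch (illustrative only, using the kernel's CTASSERT() macro from
<sys/systm.h>):

    CTASSERT(PRI_MAX_ITHD == 63 && PRI_MIN_KERN == 64);
    CTASSERT(PRI_MAX_KERN == 127 && PRI_MIN_REALTIME == 128);
    CTASSERT(PRI_MAX_REALTIME == 159 && PRI_MIN_TIMESHARE == 160);
    CTASSERT(PRI_MAX_TIMESHARE == 223 && PRI_MIN_IDLE == 224);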

View File

@ -46,7 +46,9 @@
#include <sys/event.h> /* For struct klist. */
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/rtprio.h> /* For struct rtprio. */
#include <sys/priority.h>
#include <sys/rtprio.h> /* XXX */
#include <sys/runq.h>
#include <sys/signal.h>
#ifndef _KERNEL
#include <sys/time.h> /* For structs itimerval, timeval. */
@ -251,15 +253,12 @@ struct proc {
stack_t p_sigstk; /* (c) Stack pointer and on-stack flag. */
int p_magic; /* (b) Magic number. */
u_char p_priority; /* (j) Process priority. */
u_char p_usrpri; /* (j) User priority based on p_cpu and p_nice. */
u_char p_nativepri; /* (j) Priority before propagation. */
struct priority p_pri; /* (j) Process priority. */
char p_nice; /* (j?/k?) Process "nice" value. */
char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */
struct pgrp *p_pgrp; /* (e?/c?) Pointer to process group. */
struct sysentvec *p_sysent; /* (b) System call dispatch information. */
struct rtprio p_rtprio; /* (j) Realtime priority. */
struct prison *p_prison; /* (b?) jail(4). */
struct pargs *p_args; /* (b?) Process arguments. */
@ -497,18 +496,12 @@ extern int ps_showallprocs;
extern int sched_quantum; /* Scheduling quantum in ticks. */
LIST_HEAD(proclist, proc);
TAILQ_HEAD(procqueue, proc);
extern struct proclist allproc; /* List of all processes. */
extern struct proclist zombproc; /* List of zombie processes. */
extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
extern struct proc *updateproc; /* Process slot for syncer (sic). */
#define NQS 32 /* 32 run queues. */
TAILQ_HEAD(rq, proc);
extern struct rq itqueues[];
extern struct rq rtqueues[];
extern struct rq queues[];
extern struct rq idqueues[];
extern struct vm_zone *proc_zone;
/*
@ -519,10 +512,9 @@ extern struct vm_zone *proc_zone;
*/
#define ESTCPULIM(e) \
min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
#define NICE_WEIGHT 1 /* Priorities per nice level. */
#define PPQ (128 / NQS) /* Priorities per queue. */
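
Expanding the new constants: ESTCPULIM(e) caps p_estcpu at
8 * (1 * (PRIO_MAX - PRIO_MIN) - RQ_PPQ) + 8 - 1 = 8 * 36 + 7 = 295,
so the estcpu contribution to a timeshare priority is at most
295 / 8 = 36 levels; resetpriority() then clamps the result into the
160-223 timeshare range.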
struct mtx;
struct trapframe;
@ -547,7 +539,7 @@ int p_can __P((const struct proc *p1, const struct proc *p2, int operation,
int p_trespass __P((struct proc *p1, struct proc *p2));
void procinit __P((void));
void proc_reparent __P((struct proc *child, struct proc *newparent));
u_int32_t procrunnable __P((void));
int procrunnable __P((void));
void remrunqueue __P((struct proc *));
void resetpriority __P((struct proc *));
int roundrobin_interval __P((void));

View File

@ -34,25 +34,26 @@
#ifndef _SYS_RTPRIO_H_
#define _SYS_RTPRIO_H_
#include <sys/priority.h>
/*
* Process realtime-priority specifications to rtprio.
*/
/* priority types. Start at 1 to catch uninitialized fields. */
#define RTP_PRIO_ITHREAD 1 /* interrupt thread */
#define RTP_PRIO_REALTIME 2 /* real time process */
#define RTP_PRIO_NORMAL 3 /* time sharing process */
#define RTP_PRIO_IDLE 4 /* idle process */
#define RTP_PRIO_REALTIME PRI_REALTIME /* real time process */
#define RTP_PRIO_NORMAL PRI_TIMESHARE /* time sharing process */
#define RTP_PRIO_IDLE PRI_IDLE /* idle process */
/* RTP_PRIO_FIFO is POSIX.1B SCHED_FIFO.
*/
#define RTP_PRIO_FIFO_BIT 4
#define RTP_PRIO_FIFO (RTP_PRIO_REALTIME | RTP_PRIO_FIFO_BIT)
#define RTP_PRIO_BASE(P) ((P) & ~RTP_PRIO_FIFO_BIT)
#define RTP_PRIO_IS_REALTIME(P) (RTP_PRIO_BASE(P) == RTP_PRIO_REALTIME)
#define RTP_PRIO_NEED_RR(P) ((P) != RTP_PRIO_FIFO)
#define RTP_PRIO_FIFO_BIT PRI_FIFO_BIT
#define RTP_PRIO_FIFO PRI_FIFO
#define RTP_PRIO_BASE(P) PRI_BASE(P)
#define RTP_PRIO_IS_REALTIME(P) PRI_IS_REALTIME(P)
#define RTP_PRIO_NEED_RR(P) PRI_NEED_RR(P)
/* priority range */
#define RTP_PRIO_MIN 0 /* Highest priority */
@ -66,32 +67,18 @@
#ifndef LOCORE
/*
* Scheduling class information. This is strictly speaking not only
* for real-time processes. We should replace it with two variables:
* class and priority. At the moment we use prio here for real-time
* and interrupt processes, and for others we use proc.p_pri. FIXME.
* Scheduling class information.
*/
struct rtprio {
u_short type; /* scheduling class */
u_short prio;
};
#ifdef _KERNEL
int rtp_to_pri(struct rtprio *, struct priority *);
void pri_to_rtp(struct priority *, struct rtprio *);
#endif
#endif
/*
* Interrupt thread priorities, after BSD/OS.
*/
#define PI_REALTIME 1 /* very high priority (clock) */
#define PI_AV 2 /* Audio/video devices */
#define PI_TTYHIGH 3 /* High priority tty's (small FIFOs) */
#define PI_TAPE 4 /* Tape devices (high for streaming) */
#define PI_NET 5 /* Network interfaces */
#define PI_DISK 6 /* Disks and SCSI */
#define PI_TTYLOW 7 /* Ttys with big buffers */
#define PI_DISKLOW 8 /* Disks that do programmed I/O */
#define PI_DULL 9 /* We don't know or care */
/* Soft interrupt threads */
#define PI_SOFT 15 /* All soft interrupts */
#ifndef _KERNEL
#include <sys/cdefs.h>

80
sys/sys/runq.h Normal file
View File

@ -0,0 +1,80 @@
/*
* Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _RUNQ_H_
#define _RUNQ_H_
/*
* Run queue parameters.
*/
#define RQ_NQS (64) /* Number of run queues. */
#define RQ_PPQ (4) /* Priorities per queue. */
#define RQB_LEN (2) /* Number of priority status words. */
#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY). */
#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
#define RQB_BIT(pri) (1 << ((pri) & (RQB_BPW - 1)))
#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
#define RQB_FFS(word) (ffs(word))
/*
* Type of run queue status word.
*/
typedef u_int32_t rqb_word_t;
/*
* Head of run queues.
*/
TAILQ_HEAD(rqhead, proc);
/*
* Bit array which maintains the status of a run queue. When a queue is
* non-empty the bit corresponding to the queue number will be set.
*/
struct rqbits {
rqb_word_t rqb_bits[RQB_LEN];
};
/*
* Run queue structure. Contains an array of run queues on which processes
* are placed, and a structure to maintain the status of each queue.
*/
struct runq {
struct rqbits rq_status;
struct rqhead rq_queues[RQ_NQS];
};
void runq_add(struct runq *, struct proc *);
int runq_check(struct runq *);
struct proc *runq_choose(struct runq *);
void runq_init(struct runq *);
void runq_remove(struct runq *, struct proc *);
#endif
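
A worked example of the status-word arithmetic: a timeshare process at
pri_level 160 (PUSER) maps to queue 160 / RQ_PPQ = 40, whose status bit
lives in word RQB_WORD(40) = 40 >> 5 = 1 as RQB_BIT(40) =
1 << (40 & 31) = 1 << 8; runq_findbit() then recovers the queue index
as (ffs(1 << 8) - 1) + (1 << RQB_L2BPW) = 8 + 32 = 40.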

View File

@ -58,8 +58,6 @@ extern int nswap; /* size of swap space */
extern int selwait; /* select timeout address */
extern u_char curpriority; /* priority of current process */
extern int physmem; /* physical memory */
extern dev_t dumpdev; /* dump device */

View File

@ -115,8 +115,8 @@ struct tty {
#define t_ospeed t_termios.c_ospeed
#define t_time t_termios.c_time
#define TTIPRI 25 /* Sleep priority for tty reads. */
#define TTOPRI 26 /* Sleep priority for tty writes. */
#define TTIPRI (PSOCK + 1) /* Sleep priority for tty reads. */
#define TTOPRI (PSOCK + 2) /* Sleep priority for tty writes. */
/*
* User data unfortunately has to be copied through buffers on the way to

View File

@ -73,9 +73,9 @@
* fill_kinfo_proc and in lib/libkvm/kvm_proc.c in the function kvm_proclist.
*/
#ifdef __alpha__
#define KINFO_PROC_SIZE 904 /* the correct size for kinfo_proc */
#define KINFO_PROC_SIZE 912 /* the correct size for kinfo_proc */
#else
#define KINFO_PROC_SIZE 640 /* the correct size for kinfo_proc */
#define KINFO_PROC_SIZE 644 /* the correct size for kinfo_proc */
#endif
#define WMESGLEN 8 /* size of returned wchan message */
#define MTXNAMELEN 8 /* size of returned mutex name */
@ -127,9 +127,6 @@ struct kinfo_proc {
long ki_flag; /* P_* flags */
long ki_kiflag; /* KI_* flags (below) */
int ki_traceflag; /* Kernel trace points */
u_char ki_priority; /* Process priority */
u_char ki_usrpri; /* User-priority based on p_cpu */
u_char ki_nativepri; /* Priority before propogation */
char ki_stat; /* S* process status */
char ki_nice; /* Process "nice" value */
char ki_lock; /* Process lock (prevent swap) count */
@ -141,10 +138,10 @@ struct kinfo_proc {
char ki_login[MAXLOGNAME+1]; /* setlogin name */
char ki_mtxname[MTXNAMELEN+1]; /* mutex name */
char ki_sparestrings[102]; /* spare string space */
struct rtprio ki_rtprio; /* Realtime priority */
struct rusage ki_rusage; /* process rusage statistics */
long ki_sflag; /* PS_* flags */
long ki_spare[24]; /* spare constants */
struct priority ki_pri; /* process priority */
long ki_spare[25]; /* spare constants */
};
void fill_kinfo_proc __P((struct proc *, struct kinfo_proc *));

View File

@ -985,7 +985,7 @@ ffs_copyonwrite(ap)
VOP_UNLOCK(vp, 0, p);
if (error != EWOULDBLOCK)
break;
tsleep(vp, p->p_usrpri, "nap", 1);
tsleep(vp, p->p_pri.pri_user, "nap", 1);
goto retry;
}
indiroff = (lbn - NDADDR) % NINDIR(fs);
@ -1013,7 +1013,7 @@ ffs_copyonwrite(ap)
if (error) {
if (error != EWOULDBLOCK)
break;
tsleep(vp, p->p_usrpri, "nap", 1);
tsleep(vp, p->p_pri.pri_user, "nap", 1);
goto retry;
}
#ifdef DEBUG

View File

@ -477,7 +477,7 @@ int action;
/*
* do not swapout a realtime process
*/
if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) {
if (PRI_IS_REALTIME(p->p_pri.pri_class)) {
mtx_unlock_spin(&sched_lock);
continue;
}
@ -487,7 +487,7 @@ int action;
* event of some kind. Also guarantee swap_idle_threshold1
* time in memory.
*/
if (((p->p_priority & 0x7f) < PSOCK) ||
if (((p->p_pri.pri_level) < PSOCK) ||
(p->p_slptime < swap_idle_threshold1)) {
mtx_unlock_spin(&sched_lock);
continue;

View File

@ -82,7 +82,8 @@ loadav(struct loadavg *avg)
for (nrun = 0, p = LIST_FIRST(&allproc); p != 0; p = LIST_NEXT(p, p_list)) {
switch (p->p_stat) {
case SSLEEP:
if (p->p_priority > PZERO || p->p_slptime != 0)
if (p->p_pri.pri_level > PZERO ||
p->p_slptime != 0)
continue;
/* FALLTHROUGH */
case SRUN:
@ -163,7 +164,7 @@ vmtotal(SYSCTL_HANDLER_ARGS)
case SSLEEP:
case SSTOP:
if (p->p_sflag & PS_INMEM) {
if (p->p_priority <= PZERO)
if (p->p_pri.pri_level <= PZERO)
totalp->t_dw++;
else if (p->p_slptime < maxslp)
totalp->t_sl++;