Refactor a bunch of scheduler code to give basically the same behaviour
but with slightly cleaned up interfaces.

The KSE structure has become the same as the "per-thread scheduler
private data" structure. To keep the diffs from being too large,
one is #defined as the other at this time.
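
For illustration only, a minimal sketch of that aliasing (the #define
names come from the sched_4bsd.c and kern_switch.c hunks below; the
structure layout here is simplified, not the real one):

struct td_sched {
	int	ke_cpticks;		/* example field; real layout is per-scheduler */
};

#define	kse	td_sched		/* old structure name still compiles */
#define	td_kse	td_sched		/* old thread member name still compiles */

struct thread {
	struct td_sched	*td_sched;	/* scheduler-private data, one per thread */
};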

The KSE (or td_sched) structure is now allocated per thread and has no
allocation code of its own.

Concurrency for a KSEGRP is now tracked via a simple pair of counters
rather than by using KSE structures as tokens.
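
As a sketch (the counter names come from the hunks below; the wrapper
struct and helper names here are invented for illustration), the pair
is used roughly like this:

struct kg_counters {
	int	kg_concurrency;		/* requested concurrency for the group */
	int	kg_avail_opennings;	/* run-queue slots still unfilled */
};

/* A slot is consumed when a thread is put on the system run queue... */
static void
slot_take(struct kg_counters *kg)
{
	kg->kg_avail_opennings--;
}

/* ...and handed back when the thread leaves it, e.g. on thread exit. */
static void
slot_give(struct kg_counters *kg)
{
	kg->kg_avail_opennings++;
}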

Since the KSE structure is different in each scheduler, kern_switch.c
is now included at the end of each scheduler's source file. Nothing outside
the scheduler knows the contents of the KSE (aka td_sched) structure.
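
The kern_switch.c hunk below adds a KERN_SWITCH_INCLUDE guard and
conf/files drops its standalone build entry, so each scheduler presumably
pulls it in with something like the following (not shown in these hunks):

#define	KERN_SWITCH_INCLUDE 1
#include "kern_switch.c"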

The fields in the ksegrp structure that relate to the scheduler's
queueing mechanisms have been moved to the kg_sched structure
(the per-ksegrp scheduler private data structure). In other words, how the
scheduler queues and keeps track of threads is no one's business except
the scheduler's. This should allow people to write experimental
schedulers with completely different internal structuring.
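
The sched_4bsd.c hunk below shows the shape this takes: a kg_sched
structure reached through accessor macros, so only the scheduler sees
the layout (excerpted and lightly annotated):

struct thread;					/* forward declaration for the sketch */

struct kg_sched {
	struct thread	*skg_last_assigned;	/* (j) last thread given a slot */
	int		skg_avail_opennings;	/* (j) slots currently unfilled */
	int		skg_concurrency;	/* (j) requested concurrency */
	int		skg_runq_kses;		/* (j) threads on the run queue */
};
#define	kg_last_assigned	kg_sched->skg_last_assigned
#define	kg_avail_opennings	kg_sched->skg_avail_opennings
#define	kg_concurrency		kg_sched->skg_concurrency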

A scheduler call, sched_set_concurrency(kg, N), has been added that
notifies the scheduler that no more than N threads from that ksegrp
should be allowed to be concurrently scheduled. This is also
used to enforce 'fairness' at this time, so that a ksegrp with
10000 threads cannot swamp the run queue and force out a process
with 1 thread: the current code will not set the concurrency above
NCPU, and neither scheduler will allow more than that many threads
onto the system run queue at a time. Each scheduler should eventually
develop its own method to do this now that they are effectively separated.
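
For reference, the 4BSD implementation added in kern_switch.c below
boils down to adjusting the free-slot count by the delta, so slots
already in use stay accounted for (sketch with the ksegrp reduced to
its two counters):

struct ksegrp {
	int	kg_avail_opennings;
	int	kg_concurrency;
};

void
sched_set_concurrency(struct ksegrp *kg, int concurrency)
{
	/* Handle a rising or declining concurrency with one adjustment. */
	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
	kg->kg_concurrency = concurrency;
}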

Rejig libthr's kernel interface to follow the same code paths as
libkse for system-scope threads. This has slightly hurt libthr's
performance, but I will work to recover as much of it as I can.

Thread exit code has been cleaned up greatly.
The exit and exec code now transitions a process back to
'standard non-threaded mode' before taking the next step.
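
A hedged sketch of that shared step (simplified from the kern_exit.c and
kern_exec.c hunks below; the flag and constant values are placeholders):

struct proc { int p_flag; };
#define	P_HADTHREADS	0x1		/* placeholder value */
#define	SINGLE_EXIT	1		/* placeholder value */

int	thread_single(int how);		/* abort/collect all other threads */
void	thread_single_end(void);

static void
dethread(struct proc *p)
{
	if (p->p_flag & P_HADTHREADS) {
		if (thread_single(SINGLE_EXIT))
			return;		/* raced with another exiting thread */
		/* All other threads are gone; threading mode is now off. */
		thread_single_end();
	}
}
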
Reviewed by:	scottl, peter
MFC after:	1 week
Julian Elischer 2004-09-05 02:09:54 +00:00
parent 057f1760a8
commit ed062c8d66
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=134791
28 changed files with 813 additions and 857 deletions

View File

@ -104,7 +104,7 @@ kvm_proclist(kd, what, arg, p, bp, maxcnt)
struct pstats pstats;
struct ucred ucred;
struct thread mtd;
struct kse mke;
/*struct kse mke;*/
struct ksegrp mkg;
struct proc proc;
struct proc pproc;
@ -137,6 +137,7 @@ kvm_proclist(kd, what, arg, p, bp, maxcnt)
TAILQ_FIRST(&proc.p_ksegrps));
return (-1);
}
#if 0
if (KREAD(kd,
(u_long)TAILQ_FIRST(&mkg.kg_kseq), &mke)) {
_kvm_err(kd, kd->program,
@ -144,6 +145,7 @@ kvm_proclist(kd, what, arg, p, bp, maxcnt)
TAILQ_FIRST(&mkg.kg_kseq));
return (-1);
}
#endif
}
}
if (KREAD(kd, (u_long)proc.p_ucred, &ucred) == 0) {
@ -386,9 +388,14 @@ kvm_proclist(kd, what, arg, p, bp, maxcnt)
kp->ki_pri.pri_user = mkg.kg_user_pri;
kp->ki_estcpu = mkg.kg_estcpu;
#if 0
/* Stuff from the kse */
kp->ki_pctcpu = mke.ke_pctcpu;
kp->ki_rqindex = mke.ke_rqindex;
#else
kp->ki_pctcpu = 0;
kp->ki_rqindex = 0;
#endif
} else {
kp->ki_tdflags = -1;
/* All the rest are 0 for now */

View File

@ -846,7 +846,7 @@ alpha_init(pfn, ptb, bim, bip, biv)
}
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
/*
* Init mapping for u page(s) for proc 0
*/

View File

@ -1116,7 +1116,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
preload_bootstrap_relocate(KERNBASE);

View File

@ -370,7 +370,7 @@ initarm(void *arg, void *arg2)
/* Set stack for exception handlers */
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
proc0.p_uarea = (struct user *) proc0_uarea.pv_va;
thread0.td_kstack = kernelstack.pv_va;
thread0.td_pcb = (struct pcb *)

View File

@ -1086,7 +1086,6 @@ kern/kern_fork.c standard
kern/kern_idle.c standard
kern/kern_intr.c standard
kern/kern_jail.c standard
kern/kern_thr.c standard
kern/kern_kse.c standard
kern/kern_kthread.c standard
kern/kern_ktr.c optional ktr
@ -1110,12 +1109,12 @@ kern/kern_sema.c standard
kern/kern_shutdown.c standard
kern/kern_sig.c standard
kern/kern_subr.c standard
kern/kern_switch.c standard
kern/kern_sx.c standard
kern/kern_synch.c standard
kern/kern_syscalls.c standard
kern/kern_sysctl.c standard
kern/kern_tc.c standard
kern/kern_thr.c standard
kern/kern_thread.c standard
kern/kern_time.c standard
kern/kern_timeout.c standard

View File

@ -100,7 +100,7 @@ db_ps(dummy1, dummy2, dummy3, dummy4)
p->p_ucred != NULL ? p->p_ucred->cr_ruid : 0, pp->p_pid,
p->p_pgrp != NULL ? p->p_pgrp->pg_id : 0, p->p_flag,
state);
if (p->p_flag & P_SA)
if (p->p_flag & P_HADTHREADS)
db_printf("(threaded) %s\n", p->p_comm);
FOREACH_THREAD_IN_PROC(p, td) {
dumpthread(p, td);
@ -120,7 +120,7 @@ static void
dumpthread(volatile struct proc *p, volatile struct thread *td)
{
if (p->p_flag & P_SA)
if (p->p_flag & P_HADTHREADS)
db_printf( " thread %p ksegrp %p ", td, td->td_ksegrp);
if (TD_ON_SLEEPQ(td))
db_printf("[SLPQ %s %p]", td->td_wmesg, (void *)td->td_wchan);
@ -159,9 +159,11 @@ dumpthread(volatile struct proc *p, volatile struct thread *td)
default:
db_printf("[UNK: %#x]", td->td_state);
}
if (p->p_flag & P_SA) {
if (p->p_flag & P_HADTHREADS) {
#ifdef KEF_DIDRUN
if (td->td_kse)
db_printf("[kse %p]", td->td_kse);
#endif
db_printf("\n");
} else
db_printf(" %s\n", p->p_comm);

View File

@ -1952,7 +1952,7 @@ init386(first)
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
metadata_missing = 0;
if (bootinfo.bi_modulep) {

View File

@ -724,7 +724,7 @@ ia64_init(void)
msgbufp = (struct msgbuf *)pmap_steal_memory(MSGBUF_SIZE);
msgbufinit(msgbufp, MSGBUF_SIZE);
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
/*
* Init mapping for u page(s) for proc 0
*/

View File

@ -90,7 +90,6 @@ static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread thread0;
struct kse kse0;
struct ksegrp ksegrp0;
static struct filedesc0 filedesc0;
struct vmspace vmspace0;
@ -320,30 +319,28 @@ proc0_init(void *dummy __unused)
register unsigned i;
struct thread *td;
struct ksegrp *kg;
struct kse *ke;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
ke = &kse0;
kg = &ksegrp0;
ke->ke_sched = kse0_sched;
kg->kg_sched = ksegrp0_sched;
p->p_sched = proc0_sched;
td->td_sched = thread0_sched;
/*
* Initialize magic number.
*/
p->p_magic = P_MAGIC;
/*
* Initialize thread, process and pgrp structures.
* Initialize thread, process and ksegrp structures.
*/
procinit();
threadinit();
procinit(); /* set up proc zone */
threadinit(); /* set up thead, upcall and KSEGRP zones */
/*
* Initialise scheduler resources.
* Add scheduler specific parts to proc, ksegrp, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
/*
* Initialize sleep queue hash table
*/
@ -371,13 +368,6 @@ proc0_init(void *dummy __unused)
session0.s_leader = p;
p->p_sysent = &null_sysvec;
/*
* proc_linkup was already done in init_i386() or alphainit() etc.
* because the earlier code needed to follow td->td_proc. Otherwise
* I would have done it here.. maybe this means this should be
* done earlier too.
*/
p->p_flag = P_SYSTEM;
p->p_sflag = PS_INMEM;
p->p_state = PRS_NORMAL;
@ -388,10 +378,7 @@ proc0_init(void *dummy __unused)
kg->kg_user_pri = PUSER;
td->td_priority = PVM;
td->td_base_pri = PUSER;
td->td_kse = ke; /* XXXKSE */
td->td_oncpu = 0;
ke->ke_state = KES_THREAD;
ke->ke_thread = td;
p->p_peers = 0;
p->p_leader = p;

View File

@ -254,7 +254,7 @@ kern_execve(td, fname, argv, envv, mac_p)
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
if (p->p_flag & P_SA || p->p_numthreads > 1) {
if (p->p_flag & P_HADTHREADS) {
if (thread_single(SINGLE_EXIT)) {
PROC_UNLOCK(p);
mtx_unlock(&Giant);
@ -262,12 +262,8 @@ kern_execve(td, fname, argv, envv, mac_p)
}
/*
* If we get here all other threads are dead,
* so unset the associated flags and lose KSE mode.
* and threading mode has been turned off
*/
p->p_flag &= ~P_SA;
td->td_mailbox = NULL;
td->td_pflags &= ~TDP_SA;
thread_single_end();
}
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);

View File

@ -134,7 +134,7 @@ exit1(struct thread *td, int rv)
* MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
if (p->p_flag & P_SA || p->p_numthreads > 1) {
if (p->p_flag & P_HADTHREADS) {
retry:
/*
* First check if some other thread got here before us..
@ -164,13 +164,8 @@ exit1(struct thread *td, int rv)
goto retry;
/*
* All other activity in this process is now stopped.
* Remove excess KSEs and KSEGRPS. XXXKSE (when we have them)
* ...
* Turn off threading support.
* Threading support has been turned off.
*/
p->p_flag &= ~P_SA;
td->td_pflags &= ~TDP_SA;
thread_single_end(); /* Don't need this any more. */
}
p->p_flag |= P_WEXIT;
@ -384,16 +379,6 @@ exit1(struct thread *td, int rv)
PROC_UNLOCK(p);
lim_free(plim);
/*
* Release this thread's reference to the ucred. The actual proc
* reference will stay around until the proc is harvested by
* wait(). At this point the ucred is immutable (no other threads
* from this proc are around that can change it) so we leave the
* per-thread ucred pointer intact in case it is needed although
* in theory nothing should be using it at this point.
*/
crfree(td->td_ucred);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.

View File

@ -203,7 +203,6 @@ fork1(td, flags, pages, procp)
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct thread *td2;
struct kse *ke2;
struct ksegrp *kg2;
struct sigacts *newsigacts;
int error;
@ -466,7 +465,6 @@ fork1(td, flags, pages, procp)
*/
td2 = FIRST_THREAD_IN_PROC(p2);
kg2 = FIRST_KSEGRP_IN_PROC(p2);
ke2 = FIRST_KSE_IN_KSEGRP(kg2);
/* Allocate and switch to an alternate kstack if specified. */
if (pages != 0)
@ -479,8 +477,6 @@ fork1(td, flags, pages, procp)
bzero(&p2->p_startzero,
(unsigned) RANGEOF(struct proc, p_startzero, p_endzero));
bzero(&ke2->ke_startzero,
(unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero));
bzero(&td2->td_startzero,
(unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
bzero(&kg2->kg_startzero,
@ -496,11 +492,6 @@ fork1(td, flags, pages, procp)
td2->td_sigstk = td->td_sigstk;
/* Set up the thread as an active thread (as if runnable). */
ke2->ke_state = KES_THREAD;
ke2->ke_thread = td2;
td2->td_kse = ke2;
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
@ -515,7 +506,7 @@ fork1(td, flags, pages, procp)
* Allow the scheduler to adjust the priority of the child and
* parent while we hold the sched_lock.
*/
sched_fork(td, p2);
sched_fork(td, td2);
mtx_unlock_spin(&sched_lock);
p2->p_ucred = crhold(td->td_ucred);
@ -792,7 +783,7 @@ fork_exit(callout, arg, frame)
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
cpu_critical_fork_exit();
CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)",
td, td->td_kse, p->p_pid, p->p_comm);
td, td->td_sched, p->p_pid, p->p_comm);
/*
* Processes normally resume in mi_switch() after being

View File

@ -444,6 +444,7 @@ swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler,
}
return (ithread_add_handler(ithd, name, handler, arg,
(pri * RQ_PPQ) + PI_SOFT, flags, cookiep));
/* XXKSE.. think of a better way to get separate queues */
}

View File

@ -68,12 +68,6 @@ TAILQ_HEAD(, kse_upcall) zombie_upcalls =
static int thread_update_usr_ticks(struct thread *td);
static void thread_alloc_spare(struct thread *td);
/* move to proc.h */
extern void kse_purge(struct proc *p, struct thread *td);
extern void kse_purge_group(struct thread *td);
void kseinit(void);
void kse_GC(void);
struct kse_upcall *
upcall_alloc(void)
{
@ -277,7 +271,6 @@ kse_exit(struct thread *td, struct kse_exit_args *uap)
{
struct proc *p;
struct ksegrp *kg;
struct kse *ke;
struct kse_upcall *ku, *ku2;
int error, count;
@ -330,22 +323,38 @@ kse_exit(struct thread *td, struct kse_exit_args *uap)
psignal(p, SIGSEGV);
mtx_lock_spin(&sched_lock);
upcall_remove(td);
ke = td->td_kse;
if (p->p_numthreads == 1) {
kse_purge(p, td);
p->p_flag &= ~P_SA;
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
} else {
if (kg->kg_numthreads == 1) { /* Shutdown a group */
kse_purge_group(td);
ke->ke_flags |= KEF_EXIT;
}
if (p->p_numthreads != 1) {
/*
* If we are not the last thread, but we are the last
* thread in this ksegrp, then by definition this is not
* the last group and we need to clean it up as well.
* thread_exit will clean up the kseg as needed.
*/
thread_stopped(p);
thread_exit();
/* NOTREACHED */
}
/*
* This is the last thread. Just return to the user.
* We know that there is only one ksegrp too, as any others
* would have been discarded in previous calls to thread_exit().
* Effectively we have left threading mode..
* The only real thing left to do is ensure that the
* scheduler sets out concurrency back to 1 as that may be a
* resource leak otherwise.
* This is an A[PB]I issue.. what SHOULD we do?
* One possibility is to return to the user. It may not cope well.
* The other possibility would be to let the process exit.
*/
p->p_flag &= ~(P_SA|P_HADTHREADS);
sched_set_concurrency(td->td_ksegrp, 1);
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
#if 1
return (0);
#else
exit1(td, 0);
#endif
}
/*
@ -489,6 +498,10 @@ kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
/*
* No new KSEG: first call: use current KSE, don't schedule an upcall
* All other situations, do allocate max new KSEs and schedule an upcall.
*
* XXX should be changed so that 'first' behaviour lasts for as long
* as you have not made a kse in this ksegrp. i.e. as long as we do not have
* a mailbox..
*/
/* struct kse_create_args {
struct kse_mailbox *mbx;
@ -497,7 +510,6 @@ kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
int
kse_create(struct thread *td, struct kse_create_args *uap)
{
struct kse *newke;
struct ksegrp *newkg;
struct ksegrp *kg;
struct proc *p;
@ -510,6 +522,13 @@ kse_create(struct thread *td, struct kse_create_args *uap)
if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
return (err);
/*
* Processes using the other threading model can't
* suddenly start calling this one
*/
if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS)
return (EINVAL);
ncpus = mp_ncpus;
if (virtual_cpu != 0)
ncpus = virtual_cpu;
@ -531,7 +550,7 @@ kse_create(struct thread *td, struct kse_create_args *uap)
PROC_LOCK(p);
if (!(p->p_flag & P_SA)) {
first = 1;
p->p_flag |= P_SA;
p->p_flag |= P_SA|P_HADTHREADS;
}
PROC_UNLOCK(p);
/*
@ -612,17 +631,7 @@ kse_create(struct thread *td, struct kse_create_args *uap)
* an upcall when blocked. This simulates pthread system
* scope thread behaviour.
*/
while (newkg->kg_kses < ncpus) {
newke = kse_alloc();
bzero(&newke->ke_startzero, RANGEOF(struct kse,
ke_startzero, ke_endzero));
mtx_lock_spin(&sched_lock);
kse_link(newke, newkg);
sched_fork_kse(td, newke);
/* Add engine */
kse_reassign(newke);
mtx_unlock_spin(&sched_lock);
}
sched_set_concurrency(newkg, ncpus);
}
/*
* Even bound LWPs get a mailbox and an upcall to hold it.
@ -981,7 +990,8 @@ thread_update_usr_ticks(struct thread *td)
/*
* This function is intended to be used to initialize a spare thread
* for upcall. Initialize thread's large data area outside sched_lock
* for thread_schedule_upcall().
* for thread_schedule_upcall(). The crhold is also here to get it out
* from the schedlock as it has a mutex op itself.
*/
void
thread_alloc_spare(struct thread *td)
@ -1037,7 +1047,6 @@ thread_schedule_upcall(struct thread *td, struct kse_upcall *ku)
td2->td_upcall = ku;
td2->td_flags = 0;
td2->td_pflags = TDP_SA|TDP_UPCALLING;
td2->td_kse = NULL;
td2->td_state = TDS_CAN_RUN;
td2->td_inhibitors = 0;
SIGFILLSET(td2->td_sigmask);
@ -1075,9 +1084,9 @@ thread_signal_add(struct thread *td, int sig)
PROC_LOCK(p);
mtx_lock(&ps->ps_mtx);
}
void
thread_switchout(struct thread *td)
#include "opt_sched.h"
struct thread *
thread_switchout(struct thread *td, int flags, struct thread *nextthread)
{
struct kse_upcall *ku;
struct thread *td2;
@ -1113,8 +1122,20 @@ thread_switchout(struct thread *td)
td->td_upcall = NULL;
td->td_pflags &= ~TDP_CAN_UNBIND;
td2 = thread_schedule_upcall(td, ku);
#ifdef SCHED_4BSD
if (flags & SW_INVOL || nextthread) {
setrunqueue(td2, SRQ_YIELDING);
} else {
/* Keep up with reality.. we have one extra thread
* in the picture.. and it's 'running'.
*/
return td2;
}
#else
setrunqueue(td2, SRQ_YIELDING);
#endif
}
return (nextthread);
}
/*

View File

@ -100,8 +100,6 @@ int uarea_pages = UAREA_PAGES;
SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
SYSCTL_INT(_kern, OID_AUTO, uarea_pages, CTLFLAG_RD, &uarea_pages, 0, "");
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
/*
@ -145,19 +143,20 @@ proc_dtor(void *mem, int size, void *arg)
{
struct proc *p;
struct thread *td;
#ifdef INVARIANTS
struct ksegrp *kg;
struct kse *ke;
#endif
/* INVARIANTS checks go here */
p = (struct proc *)mem;
td = FIRST_THREAD_IN_PROC(p);
#ifdef INVARIANTS
KASSERT((p->p_numthreads == 1),
("bad number of threads in exiting process"));
td = FIRST_THREAD_IN_PROC(p);
KASSERT((td != NULL), ("proc_dtor: bad thread pointer"));
kg = FIRST_KSEGRP_IN_PROC(p);
KASSERT((kg != NULL), ("proc_dtor: bad kg pointer"));
ke = FIRST_KSE_IN_KSEGRP(kg);
KASSERT((ke != NULL), ("proc_dtor: bad ke pointer"));
#endif
/* Dispose of an alternate kstack, if it exists.
* XXX What if there are more than one thread in the proc?
@ -166,14 +165,6 @@ proc_dtor(void *mem, int size, void *arg)
*/
if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
vm_thread_dispose_altkstack(td);
/*
* We want to make sure we know the initial linkages.
* so for now tear them down and remake them.
* This is probably un-needed as we can probably rely
* on the state coming in here from wait4().
*/
proc_linkup(p, kg, ke, td);
}
/*
@ -185,17 +176,16 @@ proc_init(void *mem, int size, int flags)
struct proc *p;
struct thread *td;
struct ksegrp *kg;
struct kse *ke;
p = (struct proc *)mem;
p->p_sched = (struct p_sched *)&p[1];
vm_proc_new(p);
td = thread_alloc();
ke = kse_alloc();
kg = ksegrp_alloc();
proc_linkup(p, kg, ke, td);
bzero(&p->p_mtx, sizeof(struct mtx));
mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
proc_linkup(p, kg, td);
sched_newproc(p, kg, td);
return (0);
}
@ -208,7 +198,6 @@ proc_fini(void *mem, int size)
struct proc *p;
struct thread *td;
struct ksegrp *kg;
struct kse *ke;
p = (struct proc *)mem;
KASSERT((p->p_numthreads == 1),
@ -217,12 +206,10 @@ proc_fini(void *mem, int size)
KASSERT((td != NULL), ("proc_fini: bad thread pointer"));
kg = FIRST_KSEGRP_IN_PROC(p);
KASSERT((kg != NULL), ("proc_fini: bad kg pointer"));
ke = FIRST_KSE_IN_KSEGRP(kg);
KASSERT((ke != NULL), ("proc_fini: bad ke pointer"));
vm_proc_dispose(p);
sched_destroyproc(p);
thread_free(td);
ksegrp_free(kg);
kse_free(ke);
mtx_destroy(&p->p_mtx);
}
@ -635,7 +622,6 @@ fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
{
struct proc *p;
struct thread *td0;
struct kse *ke;
struct ksegrp *kg;
struct tty *tp;
struct session *sp;
@ -756,7 +742,6 @@ fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
}
kg = td->td_ksegrp;
ke = td->td_kse;
/* things in the KSE GROUP */
kp->ki_estcpu = kg->kg_estcpu;
@ -777,11 +762,8 @@ fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
kp->ki_kstack = (void *)td->td_kstack;
kp->ki_pctcpu = sched_pctcpu(td);
/* Things in the kse */
if (ke)
kp->ki_rqindex = ke->ke_rqindex;
else
kp->ki_rqindex = 0;
/* We can't get this anymore but ps etc never used it anyway. */
kp->ki_rqindex = 0;
} else {
kp->ki_stat = SZOMB;

View File

@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include "opt_sched.h"
#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
@ -100,6 +101,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
@ -116,6 +118,8 @@ __FBSDID("$FreeBSD$");
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
#define td_kse td_sched
/************************************************************************
* Functions that manipulate runnability from a thread perspective. *
************************************************************************/
@ -149,7 +153,7 @@ choosethread(void)
td = ke->ke_thread;
KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
kg = ke->ke_ksegrp;
if (td->td_proc->p_flag & P_SA) {
if (td->td_proc->p_flag & P_HADTHREADS) {
if (kg->kg_last_assigned == td) {
kg->kg_last_assigned = TAILQ_PREV(td,
threadqueue, td_runq);
@ -183,51 +187,41 @@ choosethread(void)
}
/*
* Given a surplus KSE, either assign a new runable thread to it
* (and put it in the run queue) or put it in the ksegrp's idle KSE list.
* Assumes that the original thread is not runnable.
* Given a surplus system slot, try assign a new runnable thread to it.
* Called from:
* sched_thread_exit() (local)
* sched_switch() (local)
* sched_thread_exit() (local)
* remrunqueue() (local) (commented out)
*/
void
kse_reassign(struct kse *ke)
static void
slot_fill(struct ksegrp *kg)
{
struct ksegrp *kg;
struct thread *td;
struct thread *original;
mtx_assert(&sched_lock, MA_OWNED);
original = ke->ke_thread;
KASSERT(original == NULL || TD_IS_INHIBITED(original),
("reassigning KSE with runnable thread"));
kg = ke->ke_ksegrp;
if (original)
original->td_kse = NULL;
while (kg->kg_avail_opennings > 0) {
/*
* Find the first unassigned thread
*/
if ((td = kg->kg_last_assigned) != NULL)
td = TAILQ_NEXT(td, td_runq);
else
td = TAILQ_FIRST(&kg->kg_runq);
/*
* Find the first unassigned thread
*/
if ((td = kg->kg_last_assigned) != NULL)
td = TAILQ_NEXT(td, td_runq);
else
td = TAILQ_FIRST(&kg->kg_runq);
/*
* If we found one, assign it the kse, otherwise idle the kse.
*/
if (td) {
kg->kg_last_assigned = td;
td->td_kse = ke;
ke->ke_thread = td;
CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td);
sched_add(td, SRQ_BORING);
return;
/*
* If we found one, send it to the system scheduler.
*/
if (td) {
kg->kg_last_assigned = td;
kg->kg_avail_opennings--;
sched_add(td, SRQ_BORING);
CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
} else {
/* no threads to use up the slots. quit now */
break;
}
}
ke->ke_state = KES_IDLE;
ke->ke_thread = NULL;
TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
kg->kg_idle_kses++;
CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke);
return;
}
#if 0
@ -253,16 +247,17 @@ remrunqueue(struct thread *td)
/*
* If it is not a threaded process, take the shortcut.
*/
if ((td->td_proc->p_flag & P_SA) == 0) {
if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
/* Bring its kse with it, leave the thread attached */
sched_rem(td);
kg->kg_avail_opennings++;
ke->ke_state = KES_THREAD;
return;
}
td3 = TAILQ_PREV(td, threadqueue, td_runq);
TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
kg->kg_runnable--;
if (ke) {
if (ke->ke_state == KES_ONRUNQ) {
/*
* This thread has been assigned to a KSE.
* We need to dissociate it and try assign the
@ -270,12 +265,13 @@ remrunqueue(struct thread *td)
* see if we need to move the KSE in the run queues.
*/
sched_rem(td);
kg->kg_avail_opennings++;
ke->ke_state = KES_THREAD;
td2 = kg->kg_last_assigned;
KASSERT((td2 != NULL), ("last assigned has wrong value"));
if (td2 == td)
kg->kg_last_assigned = td3;
kse_reassign(ke);
slot_fill(kg);
}
}
#endif
@ -297,7 +293,7 @@ adjustrunqueue( struct thread *td, int newpri)
/*
* If it is not a threaded process, take the shortcut.
*/
if ((td->td_proc->p_flag & P_SA) == 0) {
if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
/* We only care about the kse in the run queue. */
td->td_priority = newpri;
if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
@ -310,77 +306,67 @@ adjustrunqueue( struct thread *td, int newpri)
/* It is a threaded process */
kg = td->td_ksegrp;
TD_SET_CAN_RUN(td);
if (ke) {
if (ke->ke_state == KES_ONRUNQ) {
if (kg->kg_last_assigned == td) {
kg->kg_last_assigned =
TAILQ_PREV(td, threadqueue, td_runq);
}
sched_rem(td);
kg->kg_avail_opennings++;
}
TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
kg->kg_runnable--;
td->td_priority = newpri;
setrunqueue(td, SRQ_BORING);
}
int limitcount;
void
setrunqueue(struct thread *td, int flags)
{
struct kse *ke;
struct ksegrp *kg;
struct thread *td2;
struct thread *tda;
int count;
CTR4(KTR_RUNQ, "setrunqueue: td:%p ke:%p kg:%p pid:%d",
td, td->td_kse, td->td_ksegrp, td->td_proc->p_pid);
CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
td, td->td_ksegrp, td->td_proc->p_pid);
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
("setrunqueue: bad thread state"));
TD_SET_RUNQ(td);
kg = td->td_ksegrp;
if ((td->td_proc->p_flag & P_SA) == 0) {
if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
/*
* Common path optimisation: Only one of everything
* and the KSE is always already attached.
* Totally ignore the ksegrp run queue.
*/
if (kg->kg_avail_opennings != 1) {
if (limitcount < 100) {
limitcount++;
printf("pid %d: bad slot count (%d)\n",
td->td_proc->p_pid, kg->kg_avail_opennings);
}
kg->kg_avail_opennings = 1;
}
kg->kg_avail_opennings--;
sched_add(td, flags);
return;
}
tda = kg->kg_last_assigned;
if ((ke = td->td_kse) == NULL) {
if (kg->kg_idle_kses) {
/*
* There is a free one so it's ours for the asking..
*/
ke = TAILQ_FIRST(&kg->kg_iq);
CTR2(KTR_RUNQ, "setrunqueue: kg:%p: Use free ke:%p",
kg, ke);
TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
ke->ke_state = KES_THREAD;
kg->kg_idle_kses--;
} else if (tda && (tda->td_priority > td->td_priority)) {
/*
* None free, but there is one we can commandeer.
*/
ke = tda->td_kse;
CTR3(KTR_RUNQ,
"setrunqueue: kg:%p: take ke:%p from td: %p",
kg, ke, tda);
sched_rem(tda);
tda->td_kse = NULL;
ke->ke_thread = NULL;
tda = kg->kg_last_assigned =
TAILQ_PREV(tda, threadqueue, td_runq);
}
} else {
/*
* Temporarily disassociate so it looks like the other cases.
if ((kg->kg_avail_opennings <= 0) &&
(tda && (tda->td_priority > td->td_priority))) {
/*
* None free, but there is one we can commandeer.
*/
ke->ke_thread = NULL;
td->td_kse = NULL;
CTR2(KTR_RUNQ,
"setrunqueue: kg:%p: take slot from td: %p", kg, tda);
sched_rem(tda);
tda = kg->kg_last_assigned =
TAILQ_PREV(tda, threadqueue, td_runq);
kg->kg_avail_opennings++;
}
/*
@ -407,40 +393,30 @@ setrunqueue(struct thread *td, int flags)
}
/*
* If we have a ke to use, then put it on the run queue and
* If needed, readjust the last_assigned pointer.
* If we have a slot to use, then put the thread on the system
* run queue and if needed, readjust the last_assigned pointer.
*/
if (ke) {
if (kg->kg_avail_opennings > 0) {
if (tda == NULL) {
/*
* No pre-existing last assigned so whoever is first
* gets the KSE we brought in.. (maybe us)
*/
td2 = TAILQ_FIRST(&kg->kg_runq);
KASSERT((td2->td_kse == NULL),
("unexpected ke present"));
td2->td_kse = ke;
ke->ke_thread = td2;
kg->kg_last_assigned = td2;
} else if (tda->td_priority > td->td_priority) {
/*
* It's ours, grab it, but last_assigned is past us
* so don't change it.
*/
td->td_kse = ke;
ke->ke_thread = td;
td2 = td;
} else {
/*
* We are past last_assigned, so
* put the new kse on whatever is next,
* gave the next slot to whatever is next,
* which may or may not be us.
*/
td2 = TAILQ_NEXT(tda, td_runq);
kg->kg_last_assigned = td2;
td2->td_kse = ke;
ke->ke_thread = td2;
}
sched_add(ke->ke_thread, flags);
kg->kg_avail_opennings--;
sched_add(td2, flags);
} else {
CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
td, td->td_ksegrp, td->td_proc->p_pid);
@ -692,7 +668,6 @@ runq_check(struct runq *rq)
#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_DECL(_kern_sched);
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif
@ -766,3 +741,115 @@ runq_remove(struct runq *rq, struct kse *ke)
}
}
/****** functions that are temporarily here ***********/
#include <vm/uma.h>
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
extern struct mtx kse_zombie_lock;
/*
* Allocate scheduler specific per-process resources.
* The thread and ksegrp have already been linked in.
* In this case just set the default concurrency value.
*
* Called from:
* proc_init() (UMA init method)
*/
void
sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
{
/* This can go in sched_fork */
sched_init_concurrency(kg);
}
/*
* Called by the uma process fini routine..
* undo anything we may have done in the uma_init method.
* Panic if it's not all 1:1:1:1
* Called from:
* proc_fini() (UMA method)
*/
void
sched_destroyproc(struct proc *p)
{
/* this function slated for destruction */
KASSERT((p->p_numthreads == 1), ("Cached proc with > 1 thread "));
KASSERT((p->p_numksegrps == 1), ("Cached proc with > 1 ksegrp "));
}
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
/*
* thread is being either created or recycled.
* Fix up the per-scheduler resources associated with it.
* Called from:
* sched_fork_thread()
* thread_dtor() (*may go away)
* thread_init() (*may go away)
*/
void
sched_newthread(struct thread *td)
{
struct td_sched *ke;
ke = (struct td_sched *) (td + 1);
bzero(ke, sizeof(*ke));
td->td_sched = ke;
ke->ke_thread = td;
ke->ke_oncpu = NOCPU;
ke->ke_state = KES_THREAD;
}
/*
* Set up an initial concurrency of 1
* and set the given thread (if given) to be using that
* concurrency slot.
* May be used "offline"..before the ksegrp is attached to the world
* and thus wouldn't need schedlock in that case.
* Called from:
* thr_create()
* proc_init() (UMA) via sched_newproc()
*/
void
sched_init_concurrency(struct ksegrp *kg)
{
kg->kg_concurrency = 1;
kg->kg_avail_opennings = 1;
}
/*
* Change the concurrency of an existing ksegrp to N
* Called from:
* kse_create()
* kse_exit()
* thread_exit()
* thread_single()
*/
void
sched_set_concurrency(struct ksegrp *kg, int concurrency)
{
/* Handle the case for a declining concurrency */
kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
kg->kg_concurrency = concurrency;
}
/*
* Called from thread_exit() for all exiting thread
*
* Not to be confused with sched_exit_thread()
* that is only called from thread_exit() for threads exiting
* without the rest of the process exiting because it is also called from
* sched_exit() and we wouldn't want to call it twice.
* XXX This can probably be fixed.
*/
void
sched_thread_exit(struct thread *td)
{
td->td_ksegrp->kg_avail_opennings++;
slot_fill(td->td_ksegrp);
}
#endif /* KERN_SWITCH_INCLUDE */

View File

@ -334,13 +334,13 @@ mi_switch(int flags, struct thread *newtd)
PCPU_SET(switchtime, new_switchtime);
PCPU_SET(switchticks, ticks);
CTR4(KTR_PROC, "mi_switch: old thread %p (kse %p, pid %ld, %s)",
(void *)td, td->td_kse, (long)p->p_pid, p->p_comm);
(void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
if (td->td_proc->p_flag & P_SA)
thread_switchout(td);
newtd = thread_switchout(td, flags, newtd);
sched_switch(td, newtd);
CTR4(KTR_PROC, "mi_switch: new thread %p (kse %p, pid %ld, %s)",
(void *)td, td->td_kse, (long)p->p_pid, p->p_comm);
(void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
/*
* If the last thread was exiting, finish cleaning it up.

View File

@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
@ -43,74 +44,13 @@ __FBSDID("$FreeBSD$");
#include <machine/frame.h>
extern int max_threads_per_proc;
extern int max_groups_per_proc;
/*
* Back end support functions.
*/
void
thr_exit1(void)
{
struct ksegrp *kg;
struct thread *td;
struct kse *ke;
struct proc *p;
td = curthread;
p = td->td_proc;
kg = td->td_ksegrp;
ke = td->td_kse;
mtx_assert(&sched_lock, MA_OWNED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
/*
* Shutting down last thread in the proc. This will actually
* call exit() in the trampoline when it returns.
*/
if (p->p_numthreads == 1) {
PROC_UNLOCK(p);
return;
}
/*
* XXX Undelivered process wide signals should be reposted to the
* proc.
*/
/* Clean up cpu resources. */
cpu_thread_exit(td);
/* Unlink the thread from the process and kseg. */
thread_unlink(td);
ke->ke_state = KES_UNQUEUED;
ke->ke_thread = NULL;
kse_unlink(ke);
sched_exit_kse(TAILQ_NEXT(ke, ke_kglist), td);
/*
* If we were stopped while waiting for all threads to exit and this
* is the last thread wakeup the exiting thread.
*/
if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE)
if (p->p_numthreads == 1)
thread_unsuspend_one(p->p_singlethread);
PROC_UNLOCK(p);
td->td_kse = NULL;
td->td_state = TDS_INACTIVE;
#if 0
td->td_proc = NULL;
#endif
td->td_ksegrp = NULL;
td->td_last_kse = NULL;
sched_exit_thread(TAILQ_NEXT(td, td_kglist), td);
thread_stash(td);
cpu_throw(td, choosethread());
}
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
/*
@ -120,69 +60,79 @@ int
thr_create(struct thread *td, struct thr_create_args *uap)
/* ucontext_t *ctx, long *id, int flags */
{
struct kse *ke0;
struct thread *td0;
struct thread *newtd;
ucontext_t ctx;
long id;
int error;
struct ksegrp *kg, *newkg;
struct proc *p;
p = td->td_proc;
kg = td->td_ksegrp;
if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
return (error);
/* Initialize our td. */
td0 = thread_alloc();
/* Have race condition but it is cheap */
if ((p->p_numksegrps >= max_groups_per_proc) ||
(p->p_numthreads >= max_threads_per_proc)) {
return (EPROCLIM);
}
/* Initialize our td and new ksegrp.. */
newtd = thread_alloc();
newkg = ksegrp_alloc();
/*
* Try the copyout as soon as we allocate the td so we don't have to
* tear things down in a failure case below.
*/
id = td0->td_tid;
id = newtd->td_tid;
if ((error = copyout(&id, uap->id, sizeof(long)))) {
thread_free(td0);
ksegrp_free(newkg);
thread_free(newtd);
return (error);
}
bzero(&td0->td_startzero,
(unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &td0->td_startcopy,
bzero(&newtd->td_startzero,
(unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &newtd->td_startcopy,
(unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
td0->td_proc = td->td_proc;
PROC_LOCK(td->td_proc);
td0->td_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
td0->td_ucred = crhold(td->td_ucred);
bzero(&newkg->kg_startzero,
(unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero));
bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
(unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
/* Initialize our kse structure. */
ke0 = kse_alloc();
bzero(&ke0->ke_startzero,
RANGEOF(struct kse, ke_startzero, ke_endzero));
newtd->td_proc = td->td_proc;
newtd->td_ucred = crhold(td->td_ucred);
/* Set up our machine context. */
cpu_set_upcall(td0, td);
error = set_mcontext(td0, &ctx.uc_mcontext);
cpu_set_upcall(newtd, td);
error = set_mcontext(newtd, &ctx.uc_mcontext);
if (error != 0) {
kse_free(ke0);
thread_free(td0);
ksegrp_free(newkg);
thread_free(newtd);
crfree(td->td_ucred);
goto out;
}
/* Link the thread and kse into the ksegrp and make it runnable. */
PROC_LOCK(td->td_proc);
td->td_proc->p_flag |= P_HADTHREADS;
newtd->td_sigmask = td->td_sigmask;
mtx_lock_spin(&sched_lock);
ksegrp_link(newkg, p);
thread_link(newtd, newkg);
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
sched_init_concurrency(newkg);
thread_link(td0, td->td_ksegrp);
kse_link(ke0, td->td_ksegrp);
/* let the scheduler know about these things. */
mtx_lock_spin(&sched_lock);
sched_fork_ksegrp(td, newkg);
sched_fork_thread(td, newtd);
/* Bind this thread and kse together. */
td0->td_kse = ke0;
ke0->ke_thread = td0;
sched_fork_kse(td, ke0);
sched_fork_thread(td, td0);
TD_SET_CAN_RUN(td0);
TD_SET_CAN_RUN(newtd);
if ((uap->flags & THR_SUSPENDED) == 0)
setrunqueue(td0, SRQ_BORING);
setrunqueue(newtd, SRQ_BORING);
mtx_unlock_spin(&sched_lock);
@ -216,12 +166,15 @@ thr_exit(struct thread *td, struct thr_exit_args *uap)
mtx_lock_spin(&sched_lock);
/*
* This unlocks proc and doesn't return unless this is the last
* thread.
* Shutting down last thread in the proc. This will actually
* call exit() in the trampoline when it returns.
*/
thr_exit1();
if (p->p_numthreads != 1) {
thread_exit();
/* NOTREACHED */
}
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
return (0);
}

View File

@ -50,7 +50,6 @@ __FBSDID("$FreeBSD$");
* KSEGRP related storage.
*/
static uma_zone_t ksegrp_zone;
static uma_zone_t kse_zone;
static uma_zone_t thread_zone;
/* DEBUG ONLY */
@ -63,7 +62,7 @@ int max_threads_per_proc = 1500;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
&max_threads_per_proc, 0, "Limit on threads per proc");
int max_groups_per_proc = 500;
int max_groups_per_proc = 1500;
SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
&max_groups_per_proc, 0, "Limit on thread groups per proc");
@ -76,19 +75,10 @@ int virtual_cpu;
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
struct mtx kse_zombie_lock;
MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
void kse_purge(struct proc *p, struct thread *td);
void kse_purge_group(struct thread *td);
/* move to proc.h */
extern void kseinit(void);
extern void kse_GC(void);
static int
sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
{
@ -198,6 +188,7 @@ thread_dtor(void *mem, int size, void *arg)
/* NOTREACHED */
}
#endif
sched_newthread(td);
}
/*
@ -252,6 +243,7 @@ thread_init(void *mem, int size, int flags)
td->td_sleepqueue = sleepq_alloc();
td->td_turnstile = turnstile_alloc();
td->td_sched = (struct td_sched *)&td[1];
sched_newthread(td);
return (0);
}
@ -286,19 +278,6 @@ thread_fini(void *mem, int size)
mtx_unlock(&tid_lock);
}
/*
* Initialize type-stable parts of a kse (when newly created).
*/
static int
kse_init(void *mem, int size, int flags)
{
struct kse *ke;
ke = (struct kse *)mem;
ke->ke_sched = (struct ke_sched *)&ke[1];
return (0);
}
/*
* Initialize type-stable parts of a ksegrp (when newly created).
*/
@ -309,46 +288,10 @@ ksegrp_init(void *mem, int size, int flags)
kg = (struct ksegrp *)mem;
kg->kg_sched = (struct kg_sched *)&kg[1];
/* sched_newksegrp(kg); */
return (0);
}
/*
* KSE is linked into kse group.
*/
void
kse_link(struct kse *ke, struct ksegrp *kg)
{
struct proc *p = kg->kg_proc;
TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
kg->kg_kses++;
ke->ke_state = KES_UNQUEUED;
ke->ke_proc = p;
ke->ke_ksegrp = kg;
ke->ke_thread = NULL;
ke->ke_oncpu = NOCPU;
ke->ke_flags = 0;
}
void
kse_unlink(struct kse *ke)
{
struct ksegrp *kg;
mtx_assert(&sched_lock, MA_OWNED);
kg = ke->ke_ksegrp;
TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
if (ke->ke_state == KES_IDLE) {
TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
kg->kg_idle_kses--;
}
--kg->kg_kses;
/*
* Aggregate stats from the KSE
*/
kse_stash(ke);
}
void
ksegrp_link(struct ksegrp *kg, struct proc *p)
{
@ -356,8 +299,6 @@ ksegrp_link(struct ksegrp *kg, struct proc *p)
TAILQ_INIT(&kg->kg_threads);
TAILQ_INIT(&kg->kg_runq); /* links with td_runq */
TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */
TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */
TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */
TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */
kg->kg_proc = p;
/*
@ -366,15 +307,16 @@ ksegrp_link(struct ksegrp *kg, struct proc *p)
*/
kg->kg_numthreads = 0;
kg->kg_runnable = 0;
kg->kg_kses = 0;
kg->kg_runq_kses = 0; /* XXXKSE change name */
kg->kg_idle_kses = 0;
kg->kg_numupcalls = 0;
/* link it in now that it's consistent */
p->p_numksegrps++;
TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
}
/*
* Called from:
* thread-exit()
*/
void
ksegrp_unlink(struct ksegrp *kg)
{
@ -382,7 +324,6 @@ ksegrp_unlink(struct ksegrp *kg)
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses"));
KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
p = kg->kg_proc;
@ -391,16 +332,18 @@ ksegrp_unlink(struct ksegrp *kg)
/*
* Aggregate stats from the KSE
*/
ksegrp_stash(kg);
}
/*
* For a newly created process,
* link up all the structures and its initial threads etc.
* called from:
* {arch}/{arch}/machdep.c ia64_init(), init386() etc.
* proc_dtor() (should go away)
* proc_init()
*/
void
proc_linkup(struct proc *p, struct ksegrp *kg,
struct kse *ke, struct thread *td)
proc_linkup(struct proc *p, struct ksegrp *kg, struct thread *td)
{
TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */
@ -410,7 +353,6 @@ proc_linkup(struct proc *p, struct ksegrp *kg,
p->p_numthreads = 0;
ksegrp_link(kg, p);
kse_link(ke, kg);
thread_link(td, kg);
}
@ -429,10 +371,7 @@ threadinit(void)
ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
NULL, NULL, ksegrp_init, NULL,
UMA_ALIGN_CACHE, 0);
kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
NULL, NULL, kse_init, NULL,
UMA_ALIGN_CACHE, 0);
kseinit();
kseinit(); /* set up kse specific stuff e.g. upcall zone*/
}
/*
@ -446,17 +385,6 @@ thread_stash(struct thread *td)
mtx_unlock_spin(&kse_zombie_lock);
}
/*
* Stash an embarasingly extra kse into the zombie kse queue.
*/
void
kse_stash(struct kse *ke)
{
mtx_lock_spin(&kse_zombie_lock);
TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
mtx_unlock_spin(&kse_zombie_lock);
}
/*
* Stash an embarasingly extra ksegrp into the zombie ksegrp queue.
*/
@ -475,7 +403,6 @@ void
thread_reap(void)
{
struct thread *td_first, *td_next;
struct kse *ke_first, *ke_next;
struct ksegrp *kg_first, * kg_next;
/*
@ -483,16 +410,12 @@ thread_reap(void)
* we really don't care about the next instant..
*/
if ((!TAILQ_EMPTY(&zombie_threads))
|| (!TAILQ_EMPTY(&zombie_kses))
|| (!TAILQ_EMPTY(&zombie_ksegrps))) {
mtx_lock_spin(&kse_zombie_lock);
td_first = TAILQ_FIRST(&zombie_threads);
ke_first = TAILQ_FIRST(&zombie_kses);
kg_first = TAILQ_FIRST(&zombie_ksegrps);
if (td_first)
TAILQ_INIT(&zombie_threads);
if (ke_first)
TAILQ_INIT(&zombie_kses);
if (kg_first)
TAILQ_INIT(&zombie_ksegrps);
mtx_unlock_spin(&kse_zombie_lock);
@ -503,18 +426,17 @@ thread_reap(void)
thread_free(td_first);
td_first = td_next;
}
while (ke_first) {
ke_next = TAILQ_NEXT(ke_first, ke_procq);
kse_free(ke_first);
ke_first = ke_next;
}
while (kg_first) {
kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
ksegrp_free(kg_first);
kg_first = kg_next;
}
/*
* there will always be a thread on the list if one of these
* is there.
*/
kse_GC();
}
kse_GC();
}
/*
@ -526,15 +448,6 @@ ksegrp_alloc(void)
return (uma_zalloc(ksegrp_zone, M_WAITOK));
}
/*
* Allocate a kse.
*/
struct kse *
kse_alloc(void)
{
return (uma_zalloc(kse_zone, M_WAITOK));
}
/*
* Allocate a thread.
*/
@ -554,15 +467,6 @@ ksegrp_free(struct ksegrp *td)
uma_zfree(ksegrp_zone, td);
}
/*
* Deallocate a kse.
*/
void
kse_free(struct kse *td)
{
uma_zfree(kse_zone, td);
}
/*
* Deallocate a thread.
*/
@ -594,92 +498,150 @@ thread_free(struct thread *td)
* Of course in the end, they end up coming here through exit1
* anyhow.. After fixing 'thr' to play by the rules we should be able
* to merge these two functions together.
*
* called from:
* exit1()
* kse_exit()
* thr_exit()
* thread_user_enter()
* thread_userret()
* thread_suspend_check()
*/
void
thread_exit(void)
{
struct thread *td;
struct kse *ke;
struct proc *p;
struct ksegrp *kg;
td = curthread;
kg = td->td_ksegrp;
p = td->td_proc;
ke = td->td_kse;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT(p != NULL, ("thread exiting without a process"));
KASSERT(ke != NULL, ("thread exiting without a kse"));
KASSERT(kg != NULL, ("thread exiting without a kse group"));
mtx_assert(&Giant, MA_NOTOWNED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p != NULL, ("thread exiting without a process"));
KASSERT(kg != NULL, ("thread exiting without a kse group"));
CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
(long)p->p_pid, p->p_comm);
mtx_assert(&Giant, MA_NOTOWNED);
if (td->td_standin != NULL) {
/*
* Note that we don't need to free the cred here as it
* is done in thread_reap().
*/
thread_stash(td->td_standin);
td->td_standin = NULL;
}
/*
* drop FPU & debug register state storage, or any other
* architecture specific resources that
* would not be on a new untouched process.
*/
cpu_thread_exit(td); /* XXXSMP */
/*
* The thread is exiting. scheduler can release its stuff
* and collect stats etc.
*/
sched_thread_exit(td);
/*
* The last thread is left attached to the process
* So that the whole bundle gets recycled. Skip
* all this stuff.
* all this stuff if we never had threads.
* EXIT clears all sign of other threads when
* it goes to single threading, so the last thread always
* takes the short path.
*/
if (p->p_numthreads > 1) {
thread_unlink(td);
if (p->p_maxthrwaits)
wakeup(&p->p_numthreads);
/*
* The test below is NOT true if we are the
* sole exiting thread. P_STOPPED_SNGL is unset
* in exit1() after it is the only survivor.
*/
if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
if (p->p_numthreads == p->p_suspcount) {
thread_unsuspend_one(p->p_singlethread);
}
}
if (p->p_flag & P_HADTHREADS) {
if (p->p_numthreads > 1) {
thread_unlink(td);
/*
* Because each upcall structure has an owner thread,
* owner thread exits only when process is in exiting
* state, so upcall to userland is no longer needed,
* deleting upcall structure is safe here.
* So when all threads in a group is exited, all upcalls
* in the group should be automatically freed.
*/
if (td->td_upcall)
/* XXX first arg not used in 4BSD or ULE */
sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
/*
* as we are exiting there is room for another
* to be created.
*/
if (p->p_maxthrwaits)
wakeup(&p->p_numthreads);
/*
* The test below is NOT true if we are the
* sole exiting thread. P_STOPPED_SNGL is unset
* in exit1() after it is the only survivor.
*/
if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
if (p->p_numthreads == p->p_suspcount) {
thread_unsuspend_one(p->p_singlethread);
}
}
/*
* Because each upcall structure has an owner thread,
* owner thread exits only when process is in exiting
* state, so upcall to userland is no longer needed,
* deleting upcall structure is safe here.
* So when all threads in a group is exited, all upcalls
* in the group should be automatically freed.
* XXXKSE This is a KSE thing and should be exported
* there somehow.
*/
upcall_remove(td);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
ke->ke_state = KES_UNQUEUED;
ke->ke_thread = NULL;
/*
* Decide what to do with the KSE attached to this thread.
*/
if (ke->ke_flags & KEF_EXIT) {
kse_unlink(ke);
if (kg->kg_kses == 0) {
/*
* If the thread we unlinked above was the last one,
* then this ksegrp should go away too.
*/
if (kg->kg_numthreads == 0) {
/*
* let the scheduler know about this in case
* it needs to recover stats or resources.
* Theoretically we could let
* sched_exit_ksegrp() do the equivalent of
* setting the concurrency to 0
* but don't do it yet to avoid changing
* the existing scheduler code until we
* are ready.
* We supply a random other ksegrp
* as the recipient of any built up
* cpu usage etc. (If the scheduler wants it).
* XXXKSE
* This is probably not fair so think of
* a better answer.
*/
sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
sched_set_concurrency(kg, 0); /* XXX TEMP */
ksegrp_unlink(kg);
ksegrp_stash(kg);
}
PROC_UNLOCK(p);
td->td_ksegrp = NULL;
PCPU_SET(deadthread, td);
} else {
/*
* The last thread is exiting.. but not through exit()
* what should we do?
* Theoretically this can't happen
* exit1() - clears threading flags before coming here
* kse_exit() - treats last thread specially
* thr_exit() - treats last thread specially
* thread_user_enter() - only if more exist
* thread_userret() - only if more exist
* thread_suspend_check() - only if more exist
*/
panic ("thread_exit: Last thread exiting on its own");
}
else
kse_reassign(ke);
PROC_UNLOCK(p);
td->td_kse = NULL;
#if 0
td->td_proc = NULL;
#endif
td->td_ksegrp = NULL;
td->td_last_kse = NULL;
PCPU_SET(deadthread, td);
} else {
/*
* non threaded process comes here.
* This includes an EX threaded process that is coming
* here via exit1(). (exit1 dethreads the proc first).
*/
PROC_UNLOCK(p);
}
td->td_state = TDS_INACTIVE;
@ -703,10 +665,13 @@ thread_wait(struct proc *p)
KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
FOREACH_THREAD_IN_PROC(p, td) {
if (td->td_standin != NULL) {
crfree(td->td_ucred);
td->td_ucred = NULL;
thread_free(td->td_standin);
td->td_standin = NULL;
}
cpu_thread_clean(td);
crfree(td->td_ucred);
}
thread_reap(); /* check for zombie threads etc. */
}
@ -718,6 +683,10 @@ thread_wait(struct proc *p)
*
* Note that we do not link to the proc's ucred here.
* The thread is linked as if running but no KSE assigned.
* Called from:
* proc_linkup()
* thread_schedule_upcall()
* thr_create()
*/
void
thread_link(struct thread *td, struct ksegrp *kg)
@ -728,10 +697,8 @@ thread_link(struct thread *td, struct ksegrp *kg)
td->td_state = TDS_INACTIVE;
td->td_proc = p;
td->td_ksegrp = kg;
td->td_last_kse = NULL;
td->td_flags = 0;
td->td_kflags = 0;
td->td_kse = NULL;
LIST_INIT(&td->td_contested);
callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
@ -741,6 +708,10 @@ thread_link(struct thread *td, struct ksegrp *kg)
kg->kg_numthreads++;
}
/*
* Called from:
* thread_exit()
*/
void
thread_unlink(struct thread *td)
{
@ -753,73 +724,7 @@ thread_unlink(struct thread *td)
TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
kg->kg_numthreads--;
/* could clear a few other things here */
}
/*
* Purge a ksegrp resource. When a ksegrp is preparing to
* exit, it calls this function.
*/
void
kse_purge_group(struct thread *td)
{
struct ksegrp *kg;
struct kse *ke;
kg = td->td_ksegrp;
KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__));
while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
KASSERT(ke->ke_state == KES_IDLE,
("%s: wrong idle KSE state", __func__));
kse_unlink(ke);
}
KASSERT((kg->kg_kses == 1),
("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses));
KASSERT((kg->kg_numupcalls == 0),
("%s: ksegrp still has %d upcall datas",
__func__, kg->kg_numupcalls));
}
/*
* Purge a process's KSE resource. When a process is preparing to
* exit, it calls kse_purge to release any extra KSE resources in
* the process.
*/
void
kse_purge(struct proc *p, struct thread *td)
{
struct ksegrp *kg;
struct kse *ke;
KASSERT(p->p_numthreads == 1, ("bad thread number"));
while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
p->p_numksegrps--;
/*
* There is no ownership for KSE, after all threads
* in the group exited, it is possible that some KSEs
* were left in idle queue, gc them now.
*/
while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
KASSERT(ke->ke_state == KES_IDLE,
("%s: wrong idle KSE state", __func__));
TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
kg->kg_idle_kses--;
TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
kg->kg_kses--;
kse_stash(ke);
}
KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
("ksegrp has wrong kg_kses: %d", kg->kg_kses));
KASSERT((kg->kg_numupcalls == 0),
("%s: ksegrp still has %d upcall datas",
__func__, kg->kg_numupcalls));
if (kg != td->td_ksegrp)
ksegrp_stash(kg);
}
TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
p->p_numksegrps++;
/* Must NOT clear links to proc and ksegrp! */
}
/*
@ -849,7 +754,7 @@ thread_single(int force_exit)
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT((td != NULL), ("curthread is NULL"));
if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1)
if ((p->p_flag & P_HADTHREADS) == 0)
return (0);
/* Is someone already single threading? */
@ -924,11 +829,17 @@ thread_single(int force_exit)
remaining = p->p_numthreads - p->p_suspcount;
}
if (force_exit == SINGLE_EXIT) {
if (td->td_upcall)
upcall_remove(td);
kse_purge(p, td);
upcall_remove(td);
p->p_flag &= ~(P_SA|P_HADTHREADS);
td->td_mailbox = NULL;
td->td_pflags &= ~TDP_SA;
p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
p->p_singlethread = NULL;
sched_set_concurrency(td->td_ksegrp, 1);
mtx_unlock_spin(&sched_lock);
} else {
mtx_unlock_spin(&sched_lock);
}
mtx_unlock_spin(&sched_lock);
return (0);
}
@ -1000,10 +911,7 @@ thread_suspend_check(int return_instead)
* Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
*/
if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
if (p->p_flag & P_SA)
thread_exit();
else
thr_exit1();
thread_exit();
}
/*
@ -1084,6 +992,10 @@ thread_unsuspend(struct proc *p)
}
}
/*
* End the single threading mode..
* Part of this is duplicated in thread-single in the SINGLE_EXIT case.
*/
void
thread_single_end(void)
{

View File

@ -35,6 +35,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#define kse td_sched
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@ -64,29 +66,69 @@ __FBSDID("$FreeBSD$");
#endif
#define NICE_WEIGHT 1 /* Priorities per nice level. */
struct ke_sched {
int ske_cpticks; /* (j) Ticks of cpu time. */
struct runq *ske_runq; /* runq the kse is currently on */
/*
* The schedulable entity that can be given a context to run.
* A process may have several of these. Probably one per processor
* but posibly a few more. In this universe they are grouped
* with a KSEG that contains the priority and niceness
* for the group.
*/
struct kse {
TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */
TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */
TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
struct thread *ke_thread; /* (*) Active associated thread. */
fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
u_char ke_oncpu; /* (j) Which cpu we are on. */
char ke_rqindex; /* (j) Run queue index. */
enum {
KES_THREAD = 0x0, /* slaved to thread state */
KES_ONRUNQ
} ke_state; /* (j) KSE status. */
int ke_cpticks; /* (j) Ticks of cpu time. */
struct runq *ke_runq; /* runq the kse is currently on */
int ke_pinned; /* nested count of pinned to a cpu */
};
#define ke_runq ke_sched->ske_runq
#define ke_cpticks ke_sched->ske_cpticks
#define KEF_BOUND KEF_SCHED1
#define ke_proc ke_thread->td_proc
#define ke_ksegrp ke_thread->td_ksegrp
#define td_kse td_sched
/* flags kept in td_flags */
#define TDF_DIDRUN TDF_SCHED0 /* KSE actually ran. */
#define TDF_EXIT TDF_SCHED1 /* KSE is being killed. */
#define TDF_BOUND TDF_SCHED2
#define ke_flags ke_thread->td_flags
#define KEF_DIDRUN TDF_DIDRUN /* KSE actually ran. */
#define KEF_EXIT TDF_EXIT /* KSE is being killed. */
#define KEF_BOUND TDF_BOUND /* stuck to one CPU */
#define SKE_RUNQ_PCPU(ke) \
((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
struct kg_sched {
struct thread *skg_last_assigned; /* (j) Last thread assigned to */
/* the system scheduler. */
int skg_avail_opennings; /* (j) Num KSEs requested in group. */
int skg_concurrency; /* (j) Num KSEs requested in group. */
int skg_runq_kses; /* (j) Num KSEs on runq. */
};
#define kg_last_assigned kg_sched->skg_last_assigned
#define kg_avail_opennings kg_sched->skg_avail_opennings
#define kg_concurrency kg_sched->skg_concurrency
#define kg_runq_kses kg_sched->skg_runq_kses
/*
* KSE_CAN_MIGRATE macro returns true if the kse can migrate between
* cpus.
*/
#define KSE_CAN_MIGRATE(ke) \
((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
static struct ke_sched ke_sched;
((ke)->ke_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = NULL;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = NULL;
static struct kse kse0;
static struct kg_sched kg_sched0;
static int sched_tdcnt; /* Total runnable threads in the system. */
static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
@ -94,6 +136,9 @@ static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
static struct callout roundrobin_callout;
static void slot_fill(struct ksegrp *kg);
static struct kse *sched_choose(void); /* XXX Should be thread * */
static void setup_runqs(void);
static void roundrobin(void *arg);
static void schedcpu(void);
@ -213,7 +258,7 @@ maybe_resched(struct thread *td)
{
mtx_assert(&sched_lock, MA_OWNED);
if (td->td_priority < curthread->td_priority && curthread->td_kse)
if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
}
@ -353,7 +398,8 @@ schedcpu(void)
p->p_swtime++;
FOREACH_KSEGRP_IN_PROC(p, kg) {
awake = 0;
FOREACH_KSE_IN_GROUP(kg, ke) {
FOREACH_THREAD_IN_GROUP(kg, td) {
ke = td->td_kse;
/*
* Increment sleep time (if sleeping). We
* ignore overflow, as above.
@ -366,7 +412,7 @@ schedcpu(void)
awake = 1;
ke->ke_flags &= ~KEF_DIDRUN;
} else if ((ke->ke_state == KES_THREAD) &&
(TD_IS_RUNNING(ke->ke_thread))) {
(TD_IS_RUNNING(td))) {
awake = 1;
/* Do not clear KEF_DIDRUN */
} else if (ke->ke_flags & KEF_DIDRUN) {
@ -517,6 +563,28 @@ sched_setup(void *dummy)
}
/* External interfaces start here */
/*
* Very early in the boot some setup of scheduler-specific
* parts of proc0 and of some scheduler resources needs to be done.
* Called from:
* proc0_init()
*/
void
schedinit(void)
{
/*
* Set up the scheduler specific parts of proc0.
*/
proc0.p_sched = NULL; /* XXX */
ksegrp0.kg_sched = &kg_sched0;
thread0.td_sched = &kse0;
kse0.ke_thread = &thread0;
kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
kse0.ke_state = KES_THREAD;
kg_sched0.skg_concurrency = 1;
kg_sched0.skg_avail_opennings = 0; /* we are already running */
}
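/*
 * Compressed sketch of the boot-time ordering the code above assumes
 * (illustration only: proc_linkup() is actually called from the MD
 * startup code changed elsewhere in this diff, and schedinit() from
 * proc0_init() as noted above; the helper name is hypothetical).
 */
static void
boot_sched_sketch(void)
{
	/* Link the statically allocated thread0 and ksegrp0 into proc0. */
	proc_linkup(&proc0, &ksegrp0, &thread0);
	/* Then attach kse0 and kg_sched0 as their scheduler-private data. */
	schedinit();
}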
int
sched_runnable(void)
{
@ -579,16 +647,10 @@ sched_clock(struct thread *td)
void
sched_exit(struct proc *p, struct thread *td)
{
sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}
void
sched_exit_kse(struct kse *ke, struct thread *child)
{
}
void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
{
@ -605,17 +667,10 @@ sched_exit_thread(struct thread *td, struct thread *child)
}
void
sched_fork(struct thread *td, struct proc *p1)
sched_fork(struct thread *td, struct thread *childtd)
{
sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}
void
sched_fork_kse(struct thread *td, struct kse *child)
{
child->ke_cpticks = 0;
sched_fork_ksegrp(td, childtd->td_ksegrp);
sched_fork_thread(td, childtd);
}
void
@ -626,8 +681,9 @@ sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
}
void
sched_fork_thread(struct thread *td, struct thread *child)
sched_fork_thread(struct thread *td, struct thread *childtd)
{
sched_newthread(childtd);
}
void
@ -687,14 +743,21 @@ sched_switch(struct thread *td, struct thread *newtd)
p = td->td_proc;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((ke->ke_state == KES_THREAD), ("sched_switch: kse state?"));
if ((p->p_flag & P_NOLOAD) == 0)
sched_tdcnt--;
if (newtd != NULL && (newtd->td_proc->p_flag & P_NOLOAD) == 0)
sched_tdcnt++;
/*
* The thread we are about to run needs to be counted as if it had been
* added to the run queue and selected.
*/
if (newtd) {
newtd->td_ksegrp->kg_avail_opennings--;
newtd->td_kse->ke_flags |= KEF_DIDRUN;
TD_SET_RUNNING(newtd);
}
td->td_lastcpu = td->td_oncpu;
td->td_last_kse = ke;
td->td_flags &= ~TDF_NEEDRESCHED;
td->td_pflags &= ~TDP_OWEPREEMPT;
td->td_oncpu = NOCPU;
@ -706,16 +769,19 @@ sched_switch(struct thread *td, struct thread *newtd)
*/
if (td == PCPU_GET(idlethread))
TD_SET_CAN_RUN(td);
else if (TD_IS_RUNNING(td)) {
/* Put us back on the run queue (kse and all). */
setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
} else if (p->p_flag & P_SA) {
/*
* We will not be on the run queue. So we must be
* sleeping or similar. As it's available,
* someone else can use the KSE if they need it.
*/
kse_reassign(ke);
else {
td->td_ksegrp->kg_avail_opennings++;
if (TD_IS_RUNNING(td)) {
/* Put us back on the run queue (kse and all). */
setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
} else if (p->p_flag & P_HADTHREADS) {
/*
* We will not be on the run queue. So we must be
* sleeping or similar. Since our slot is now free,
* another thread in the group can use it if needed.
*/
slot_fill(td->td_ksegrp);
}
}
if (newtd == NULL)
newtd = choosethread();
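/*
 * Rough sketch of what the slot handling above relies on.  This is an
 * assumption about kern_switch.c (included at the bottom of this file,
 * not shown in this hunk), and the details and _sketch name are
 * illustrative only: freed slots are handed to threads still waiting on
 * the ksegrp's own run queue.
 */
static void
slot_fill_sketch(struct ksegrp *kg)
{
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	while (kg->kg_avail_opennings > 0) {
		/* Find the first thread not yet handed to the system queue. */
		if ((td = kg->kg_last_assigned) != NULL)
			td = TAILQ_NEXT(td, td_runq);
		else
			td = TAILQ_FIRST(&kg->kg_runq);
		if (td == NULL)
			break;			/* no more waiting threads */
		kg->kg_last_assigned = td;
		kg->kg_avail_opennings--;	/* this slot is now in use */
		sched_add(td, SRQ_BORING);
	}
}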
@ -750,7 +816,7 @@ forward_wakeup(int cpunum)
mtx_assert(&sched_lock, MA_OWNED);
CTR0(KTR_SMP, "forward_wakeup()");
CTR0(KTR_RUNQ, "forward_wakeup()");
if ((!forward_wakeup_enabled) ||
(forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
@ -838,9 +904,6 @@ sched_add(struct thread *td, int flags)
ke = td->td_kse;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_add: No KSE on thread"));
KASSERT(ke->ke_state != KES_ONRUNQ,
("sched_add: kse %p (%s) already in run queue", ke,
ke->ke_proc->p_comm));
@ -974,10 +1037,6 @@ sched_choose(void)
ke->ke_state = KES_THREAD;
ke->ke_ksegrp->kg_runq_kses--;
KASSERT((ke->ke_thread != NULL),
("sched_choose: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_choose: No KSE on thread"));
KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
("sched_choose: process swapped out"));
}
@ -1041,15 +1100,10 @@ sched_load(void)
return (sched_tdcnt);
}
int
sched_sizeof_kse(void)
{
return (sizeof(struct kse) + sizeof(struct ke_sched));
}
int
sched_sizeof_ksegrp(void)
{
return (sizeof(struct ksegrp));
return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}
int
sched_sizeof_proc(void)
@ -1059,7 +1113,7 @@ sched_sizeof_proc(void)
int
sched_sizeof_thread(void)
{
return (sizeof(struct thread));
return (sizeof(struct thread) + sizeof(struct kse));
}
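/*
 * Sketch of what the size above implies.  This is an inference, since
 * the thread allocation code itself is not in this hunk: the per-thread
 * scheduler data (struct kse here) is expected to live in the same
 * allocation, immediately after the struct thread, so no separate
 * allocation or free is needed for it.
 */
static __inline struct kse *
td_sched_area_sketch(struct thread *td)
{
	return ((struct kse *)(td + 1));
}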
fixpt_t
@ -1068,10 +1122,9 @@ sched_pctcpu(struct thread *td)
struct kse *ke;
ke = td->td_kse;
if (ke == NULL)
ke = td->td_last_kse;
if (ke)
return (ke->ke_pctcpu);
return (ke->ke_pctcpu);
return (0);
}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"

View File

@ -29,6 +29,8 @@ __FBSDID("$FreeBSD$");
#include <opt_sched.h>
#define kse td_sched
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
@ -89,54 +91,83 @@ SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
preempt_warning)
#endif
/*
* The schedulable entity that can be given a context to run.
* A process may have several of these. Probably one per processor
* but possibly a few more. In this universe they are grouped
* with a KSEG that contains the priority and niceness
* for the group.
*/
struct kse {
TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of threads in ke_ksegrp. */
TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of threads in this state.*/
TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
int ke_flags; /* (j) KEF_* flags. */
struct thread *ke_thread; /* (*) Active associated thread. */
fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
u_char ke_oncpu; /* (j) Which cpu we are on. */
char ke_rqindex; /* (j) Run queue index. */
enum {
KES_THREAD = 0x0, /* slaved to thread state */
KES_ONRUNQ
} ke_state; /* (j) thread sched specific status. */
int ke_slptime;
int ke_pinned;
int ke_slice;
struct runq *ke_runq;
u_char ke_cpu; /* CPU that we have affinity for. */
/* The following variables are only used for pctcpu calculation */
int ke_ltick; /* Last tick that we were running on */
int ke_ftick; /* First tick that we were running on */
int ke_ticks; /* Tick count */
};
#define td_kse td_sched
#define td_slptime td_kse->ke_slptime
#define ke_proc ke_thread->td_proc
#define ke_ksegrp ke_thread->td_ksegrp
/* flags kept in ke_flags */
#define KEF_SCHED0 0x00001 /* For scheduler-specific use. */
#define KEF_SCHED1 0x00002 /* For scheduler-specific use. */
#define KEF_SCHED2 0x00004 /* For scheduler-specific use. */
#define KEF_SCHED3 0x00008 /* For scheduler-specific use. */
#define KEF_DIDRUN 0x02000 /* Thread actually ran. */
#define KEF_EXIT 0x04000 /* Thread is being killed. */
/*
* These datastructures are allocated within their parent datastructure but
* are scheduler specific.
*/
struct ke_sched {
int ske_slice;
struct runq *ske_runq;
/* The following variables are only used for pctcpu calculation */
int ske_ltick; /* Last tick that we were running on */
int ske_ftick; /* First tick that we were running on */
int ske_ticks; /* Tick count */
/* CPU that we have affinity for. */
u_char ske_cpu;
};
#define ke_slice ke_sched->ske_slice
#define ke_runq ke_sched->ske_runq
#define ke_ltick ke_sched->ske_ltick
#define ke_ftick ke_sched->ske_ftick
#define ke_ticks ke_sched->ske_ticks
#define ke_cpu ke_sched->ske_cpu
#define ke_assign ke_procq.tqe_next
#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */
#define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */
#define KEF_XFERABLE KEF_SCHED2 /* KSE was added as transferable. */
#define KEF_HOLD KEF_SCHED3 /* KSE is temporarily bound. */
#define KEF_ASSIGNED KEF_SCHED0 /* Thread is being migrated. */
#define KEF_BOUND KEF_SCHED1 /* Thread can not migrate. */
#define KEF_XFERABLE KEF_SCHED2 /* Thread was added as transferable. */
#define KEF_HOLD KEF_SCHED3 /* Thread is temporarily bound. */
struct kg_sched {
struct thread *skg_last_assigned; /* (j) Last thread assigned to */
/* the system scheduler */
int skg_slptime; /* Number of ticks we vol. slept */
int skg_runtime; /* Number of ticks we were running */
int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
int skg_concurrency; /* (j) Num threads requested in group.*/
int skg_runq_threads; /* (j) Num threads on runq. */
};
#define kg_slptime kg_sched->skg_slptime
#define kg_runtime kg_sched->skg_runtime
#define kg_last_assigned kg_sched->skg_last_assigned
#define kg_avail_opennings kg_sched->skg_avail_opennings
#define kg_concurrency kg_sched->skg_concurrency
#define kg_runq_threads kg_sched->skg_runq_threads
#define kg_runtime kg_sched->skg_runtime
#define kg_slptime kg_sched->skg_slptime
struct td_sched {
int std_slptime;
};
#define td_slptime td_sched->std_slptime
struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;
struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;
static struct kse kse0;
static struct kg_sched kg_sched0;
/*
* The priority is primarily determined by the interactivity score. Thus, we
@ -191,7 +222,7 @@ struct td_sched *thread0_sched = &td_sched;
(SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
/*
* This macro determines whether or not the kse belongs on the current or
* This macro determines whether or not the thread belongs on the current or
* next run queue.
*/
#define SCHED_INTERACTIVE(kg) \
@ -274,6 +305,8 @@ static struct kseq kseq_cpu;
#define KSEQ_CPU(x) (&kseq_cpu)
#endif
static void slot_fill(struct ksegrp *kg);
static struct kse *sched_choose(void); /* XXX Should be thread * */
static void sched_add_internal(struct thread *td, int preemptive);
static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
@ -1105,6 +1138,28 @@ sched_interact_score(struct ksegrp *kg)
}
/*
* Very early in the boot some setup of scheduler-specific
* parts of proc0 and of some scheduler resources needs to be done.
* Called from:
* proc0_init()
*/
void
schedinit(void)
{
/*
* Set up the scheduler specific parts of proc0.
*/
ksegrp0.kg_sched = &kg_sched0;
proc0.p_sched = NULL; /* XXX */
thread0.td_kse = &kse0;
kse0.ke_thread = &thread0;
kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
kse0.ke_state = KES_THREAD;
kg_sched0.skg_concurrency = 1;
kg_sched0.skg_avail_opennings = 0; /* we are already running */
}
/*
* This is only somewhat accurate since given many processes of the same
* priority they will switch when their slices run out, which will be
@ -1178,12 +1233,20 @@ sched_switch(struct thread *td, struct thread *newtd)
ke = td->td_kse;
td->td_last_kse = ke;
td->td_lastcpu = td->td_oncpu;
td->td_oncpu = NOCPU;
td->td_flags &= ~TDF_NEEDRESCHED;
td->td_pflags &= ~TDP_OWEPREEMPT;
/*
* If we bring in a thread, then account for it as if it had
* been added to the run queue and then chosen.
*/
if (newtd) {
newtd->td_ksegrp->kg_avail_opennings--;
newtd->td_kse->ke_flags |= KEF_DIDRUN;
TD_SET_RUNNING(newtd);
}
/*
* If the KSE has been assigned it may be in the process of switching
* to the new cpu. This is the case in sched_bind().
@ -1191,24 +1254,29 @@ sched_switch(struct thread *td, struct thread *newtd)
if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
if (td == PCPU_GET(idlethread)) {
TD_SET_CAN_RUN(td);
} else if (TD_IS_RUNNING(td)) {
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
/*
* Don't allow the kse to migrate from a preemption.
*/
ke->ke_flags |= KEF_HOLD;
setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
} else {
if (ke->ke_runq) {
/* We are ending our run so make our slot available again */
td->td_ksegrp->kg_avail_opennings++;
if (TD_IS_RUNNING(td)) {
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
} else if ((td->td_flags & TDF_IDLETD) == 0)
kdb_backtrace();
/*
* We will not be on the run queue. So we must be
* sleeping or similar.
*/
if (td->td_proc->p_flag & P_SA)
kse_reassign(ke);
/*
* Don't allow the thread to migrate
* from a preemption.
*/
ke->ke_flags |= KEF_HOLD;
setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
} else {
if (ke->ke_runq) {
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
} else if ((td->td_flags & TDF_IDLETD) == 0)
kdb_backtrace();
/*
* We will not be on the run queue.
* So we must be sleeping or similar.
*/
if (td->td_proc->p_flag & P_HADTHREADS)
slot_fill(td->td_ksegrp);
}
}
}
if (newtd != NULL)
@ -1237,7 +1305,8 @@ sched_nice(struct proc *p, int nice)
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_pri_class == PRI_TIMESHARE) {
FOREACH_KSE_IN_GROUP(kg, ke) {
FOREACH_THREAD_IN_GROUP(kg, td) {
ke = td->td_kse;
if (ke->ke_runq == NULL)
continue;
kseq = KSEQ_CPU(ke->ke_cpu);
@ -1262,8 +1331,8 @@ sched_sleep(struct thread *td)
td->td_slptime = ticks;
td->td_base_pri = td->td_priority;
CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
td->td_kse, td->td_slptime);
CTR2(KTR_ULE, "sleep thread %p (tick: %d)",
td, td->td_slptime);
}
void
@ -1289,10 +1358,8 @@ sched_wakeup(struct thread *td)
sched_interact_update(kg);
}
sched_priority(kg);
if (td->td_kse)
sched_slice(td->td_kse);
CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
td->td_kse, hzticks);
sched_slice(td->td_kse);
CTR2(KTR_ULE, "wakeup thread %p (%d ticks)", td, hzticks);
td->td_slptime = 0;
}
setrunqueue(td, SRQ_BORING);
@ -1303,37 +1370,20 @@ sched_wakeup(struct thread *td)
* priority.
*/
void
sched_fork(struct thread *td, struct proc *p1)
sched_fork(struct thread *td, struct thread *childtd)
{
mtx_assert(&sched_lock, MA_OWNED);
p1->p_nice = td->td_proc->p_nice;
sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}
void
sched_fork_kse(struct thread *td, struct kse *child)
{
struct kse *ke = td->td_kse;
child->ke_slice = 1; /* Attempt to quickly learn interactivity. */
child->ke_cpu = ke->ke_cpu;
child->ke_runq = NULL;
/* Grab our parents cpu estimation information. */
child->ke_ticks = ke->ke_ticks;
child->ke_ltick = ke->ke_ltick;
child->ke_ftick = ke->ke_ftick;
sched_fork_ksegrp(td, childtd->td_ksegrp);
sched_fork_thread(td, childtd);
}
void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
struct ksegrp *kg = td->td_ksegrp;
PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED);
child->kg_slptime = kg->kg_slptime;
child->kg_runtime = kg->kg_runtime;
@ -1350,6 +1400,20 @@ sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
void
sched_fork_thread(struct thread *td, struct thread *child)
{
struct kse *ke;
struct kse *ke2;
sched_newthread(child);
ke = td->td_kse;
ke2 = child->td_kse;
ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
ke2->ke_cpu = ke->ke_cpu;
ke2->ke_runq = NULL;
/* Grab our parents cpu estimation information. */
ke2->ke_ticks = ke->ke_ticks;
ke2->ke_ltick = ke->ke_ltick;
ke2->ke_ftick = ke->ke_ftick;
}
void
@ -1357,6 +1421,7 @@ sched_class(struct ksegrp *kg, int class)
{
struct kseq *kseq;
struct kse *ke;
struct thread *td;
int nclass;
int oclass;
@ -1366,7 +1431,8 @@ sched_class(struct ksegrp *kg, int class)
nclass = PRI_BASE(class);
oclass = PRI_BASE(kg->kg_pri_class);
FOREACH_KSE_IN_GROUP(kg, ke) {
FOREACH_THREAD_IN_GROUP(kg, td) {
ke = td->td_kse;
if (ke->ke_state != KES_ONRUNQ &&
ke->ke_state != KES_THREAD)
continue;
@ -1404,19 +1470,15 @@ sched_class(struct ksegrp *kg, int class)
/*
* Return some of the child's priority and interactivity to the parent.
* We avoid sched_exit_thread() here, so we need not decide which
* thread in the parent gets the honour, since it isn't used anyway.
*/
void
sched_exit(struct proc *p, struct thread *td)
sched_exit(struct proc *p, struct thread *childtd)
{
mtx_assert(&sched_lock, MA_OWNED);
sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
}
void
sched_exit_kse(struct kse *ke, struct thread *td)
{
kseq_load_rem(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
}
void
@ -1428,8 +1490,9 @@ sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
}
void
sched_exit_thread(struct thread *td, struct thread *child)
sched_exit_thread(struct thread *td, struct thread *childtd)
{
kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
}
void
@ -1483,8 +1546,8 @@ sched_clock(struct thread *td)
if (td->td_flags & TDF_IDLETD)
return;
CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
CTR4(KTR_ULE, "Tick thread %p (slice: %d, slptime: %d, runtime: %d)",
td, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
/*
* We only do slicing code for TIMESHARE ksegrps.
*/
@ -1581,8 +1644,8 @@ sched_choose(void)
ke->ke_state = KES_THREAD;
if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
ke, ke->ke_runq, ke->ke_slice,
CTR4(KTR_ULE, "Run thread %p from %p (slice: %d, pri: %d)",
ke->ke_thread, ke->ke_runq, ke->ke_slice,
ke->ke_thread->td_priority);
}
return (ke);
@ -1627,10 +1690,6 @@ sched_add_internal(struct thread *td, int preemptive)
if (ke->ke_flags & KEF_ASSIGNED)
return;
kseq = KSEQ_SELF();
KASSERT((ke->ke_thread != NULL),
("sched_add: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_add: No KSE on thread"));
KASSERT(ke->ke_state != KES_ONRUNQ,
("sched_add: kse %p (%s) already in run queue", ke,
ke->ke_proc->p_comm));
@ -1715,7 +1774,7 @@ sched_add_internal(struct thread *td, int preemptive)
curthread->td_flags |= TDF_NEEDRESCHED;
if (preemptive && maybe_preempt(td))
return;
ke->ke_ksegrp->kg_runq_kses++;
ke->ke_ksegrp->kg_runq_threads++;
ke->ke_state = KES_ONRUNQ;
kseq_runq_add(kseq, ke);
@ -1742,7 +1801,7 @@ sched_rem(struct thread *td)
("sched_rem: KSE not on run queue"));
ke->ke_state = KES_THREAD;
ke->ke_ksegrp->kg_runq_kses--;
ke->ke_ksegrp->kg_runq_threads--;
kseq = KSEQ_CPU(ke->ke_cpu);
kseq_runq_rem(kseq, ke);
kseq_load_rem(kseq, ke);
@ -1795,7 +1854,7 @@ sched_bind(struct thread *td, int cpu)
return;
/* sched_rem without the runq_remove */
ke->ke_state = KES_THREAD;
ke->ke_ksegrp->kg_runq_kses--;
ke->ke_ksegrp->kg_runq_threads--;
kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
kseq_notify(ke, cpu);
/* When we return from mi_switch we'll be on the correct cpu. */
@ -1826,12 +1885,6 @@ sched_load(void)
#endif
}
int
sched_sizeof_kse(void)
{
return (sizeof(struct kse) + sizeof(struct ke_sched));
}
int
sched_sizeof_ksegrp(void)
{
@ -1849,3 +1902,5 @@ sched_sizeof_thread(void)
{
return (sizeof(struct thread) + sizeof(struct td_sched));
}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"

View File

@ -2011,7 +2011,7 @@ init386(first)
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
#ifdef PC98
/*

View File

@ -2011,7 +2011,7 @@ init386(first)
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
#ifdef PC98
/*

View File

@ -279,7 +279,7 @@ powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp)
/*
* Start initializing proc0 and thread0.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
proc0.p_uarea = (struct user *)uarea0;
proc0.p_stats = &proc0.p_uarea->u_stats;
thread0.td_frame = &frame0;

View File

@ -279,7 +279,7 @@ powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp)
/*
* Start initializing proc0 and thread0.
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
proc0.p_uarea = (struct user *)uarea0;
proc0.p_stats = &proc0.p_uarea->u_stats;
thread0.td_frame = &frame0;

View File

@ -342,7 +342,7 @@ sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
/*
* Initialize proc0 stuff (p_contested needs to be done early).
*/
proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
proc_linkup(&proc0, &ksegrp0, &thread0);
proc0.p_md.md_sigtramp = NULL;
proc0.p_md.md_utrap = NULL;
proc0.p_uarea = (struct user *)uarea0;

View File

@ -142,14 +142,13 @@ struct pargs {
* q - td_contested lock
* r - p_peers lock
* x - created at fork, only changes during single threading in exec
* z - zombie threads/kse/ksegroup lock
* z - zombie threads/ksegroup lock
*
* If the locking key specifies two identifiers (for example, p_pptr) then
* either lock is sufficient for read access, but both locks must be held
* for write access.
*/
struct ithd;
struct ke_sched;
struct kg_sched;
struct nlminfo;
struct p_sched;
@ -159,7 +158,7 @@ struct trapframe;
struct turnstile;
/*
* Here we define the four structures used for process information.
* Here we define the three structures used for process information.
*
* The first is the thread. It might be thought of as a "Kernel
* Schedulable Entity Context".
@ -171,40 +170,26 @@ struct turnstile;
* load balancing. Each of these is associated with a kernel stack
* and a pcb.
*
* It is important to remember that a particular thread structure only
* exists as long as the system call or kernel entrance (e.g. by pagefault)
* It is important to remember that a particular thread structure may only
* exist for the duration of the system call or kernel entrance (e.g. by pagefault)
* which it is currently executing. It should therefore NEVER be referenced
* by pointers in long lived structures that live longer than a single
* request. If several threads complete their work at the same time,
* they will all rewind their stacks to the user boundary, report their
* completion state, and all but one will be freed. That last one will
* be kept to provide a kernel stack and pcb for the NEXT syscall or kernel
* entrance. (basically to save freeing and then re-allocating it) The KSE
* keeps a cached thread available to allow it to quickly
* entrance. (basically to save freeing and then re-allocating it) The existing
* thread keeps a cached spare thread available to allow it to quickly
* get one when it needs a new one. There is also a system
* cache of free threads. Threads have priority and partake in priority
* inheritance schemes.
*/
struct thread;
/*
* The second structure is the Kernel Schedulable Entity. (KSE)
* It represents the ability to take a slot in the scheduler queue.
* As long as this is scheduled, it could continue to run any threads that
* are assigned to the KSEGRP (see later) until either it runs out
* of runnable threads of high enough priority, or CPU.
* It runs on one CPU and is assigned a quantum of time. When a thread is
* blocked, The KSE continues to run and will search for another thread
* in a runnable state amongst those it has. It May decide to return to user
* mode with a new 'empty' thread if there are no runnable threads.
* Threads are temporarily associated with a KSE for scheduling reasons.
*/
struct kse;
/*
* The KSEGRP is allocated resources across a number of CPUs.
* (Including a number of CPUxQUANTA. It parcels these QUANTA up among
* its KSEs, each of which should be running in a different CPU.
* its threads, each of which should be running in a different CPU.
* BASE priority and total available quanta are properties of a KSEGRP.
* Multiple KSEGRPs in a single process compete against each other
* for total quanta in the same way that a forked child competes against
@ -227,8 +212,8 @@ struct proc;
With a single run queue used by all processors:
RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD
| / []---THREAD
KSEG---THREAD--THREAD--THREAD []
\ \ []---THREAD
KSEG---THREAD--THREAD--THREAD []
[]---THREAD---THREAD
(processors run THREADs from the KSEG until they are exhausted or
@ -271,8 +256,6 @@ struct thread {
int td_flags; /* (j) TDF_* flags. */
int td_inhibitors; /* (j) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
struct kse *td_last_kse; /* (j) Previous value of td_kse. */
struct kse *td_kse; /* (j) Current KSE if any. */
int td_dupfd; /* (k) Ret value from fdopen. XXX */
void *td_wchan; /* (j) Sleep address. */
const char *td_wmesg; /* (j) Reason for sleep. */
@ -368,10 +351,10 @@ struct thread {
#define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */
#define TDF_UNUSED22 0x00400000 /* --available -- */
#define TDF_UNUSED23 0x00800000 /* --available -- */
#define TDF_SCHED1 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED3 0x04000000 /* Reserved for scheduler private use */
#define TDF_SCHED4 0x08000000 /* Reserved for scheduler private use */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
#define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */
/*
* "Private" flags kept in td_pflags:
@ -454,45 +437,6 @@ struct thread {
#define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ
#define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN
/*
* The schedulable entity that can be given a context to run.
* A process may have several of these. Probably one per processor
* but posibly a few more. In this universe they are grouped
* with a KSEG that contains the priority and niceness
* for the group.
*/
struct kse {
struct proc *ke_proc; /* (*) Associated process. */
struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */
TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */
TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */
TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
#define ke_startzero ke_flags
int ke_flags; /* (j) KEF_* flags. */
struct thread *ke_thread; /* (*) Active associated thread. */
fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
u_char ke_oncpu; /* (j) Which cpu we are on. */
char ke_rqindex; /* (j) Run queue index. */
enum {
KES_UNUSED = 0x0,
KES_IDLE,
KES_ONRUNQ,
KES_UNQUEUED, /* in transit */
KES_THREAD /* slaved to thread state */
} ke_state; /* (j) KSE status. */
#define ke_endzero ke_sched
struct ke_sched *ke_sched; /* (*) Scheduler-specific data. */
};
/* flags kept in ke_flags */
#define KEF_SCHED0 0x00001 /* For scheduler-specific use. */
#define KEF_SCHED1 0x00002 /* For scheduler-specific use. */
#define KEF_SCHED2 0X00004 /* For scheduler-specific use. */
#define KEF_SCHED3 0x00008 /* For scheduler-specific use. */
#define KEF_DIDRUN 0x02000 /* KSE actually ran. */
#define KEF_EXIT 0x04000 /* KSE is being killed. */
/*
* The upcall management structure.
* The upcall is used when returning to userland. If a thread does not have
@ -520,8 +464,6 @@ struct kse_upcall {
struct ksegrp {
struct proc *kg_proc; /* (*) Process that contains this KSEG. */
TAILQ_ENTRY(ksegrp) kg_ksegrp; /* (*) Queue of KSEGs in kg_proc. */
TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */
TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) All idle KSEs. */
TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */
TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads */
TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */
@ -529,10 +471,7 @@ struct ksegrp {
#define kg_startzero kg_estcpu
u_int kg_estcpu; /* (j) Sum of the same field in KSEs. */
u_int kg_slptime; /* (j) How long completely blocked. */
struct thread *kg_last_assigned; /* (j) Last thread assigned to a KSE. */
int kg_runnable; /* (j) Num runnable threads on queue. */
int kg_runq_kses; /* (j) Num KSEs on runq. */
int kg_idle_kses; /* (j) Num KSEs on iq. */
int kg_numupcalls; /* (j) Num upcalls. */
int kg_upsleeps; /* (c) Num threads in kse_release(). */
struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */
@ -545,14 +484,12 @@ struct ksegrp {
u_char kg_user_pri; /* (j) User pri from estcpu and nice. */
#define kg_endcopy kg_numthreads
int kg_numthreads; /* (j) Num threads in total. */
int kg_kses; /* (j) Num KSEs in group. */
int kg_concurrency; /* (j) Num KSEs requested in group. */
struct kg_sched *kg_sched; /* (*) Scheduler-specific data. */
};
/*
* The old fashioned process. May have multiple threads, KSEGRPs
* and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD.
* and KSEs. Starts off with a single embedded KSEGRP and THREAD.
*/
struct proc {
LIST_ENTRY(proc) p_list; /* (d) List of all processes. */
@ -562,7 +499,7 @@ struct proc {
struct ucred *p_ucred; /* (c) Process owner's identity. */
struct filedesc *p_fd; /* (b) Ptr to open files structure. */
struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */
/* Accumulated stats for all KSEs? */
/* Accumulated stats for all threads? */
struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */
struct plimit *p_limit; /* (c) Process limits. */
struct vm_object *p_upages_obj; /* (a) Upages object. */
@ -575,7 +512,7 @@ struct proc {
int p_sflag; /* (j) PS_* flags. */
enum {
PRS_NEW = 0, /* In creation */
PRS_NORMAL, /* KSEs can be run. */
PRS_NORMAL, /* threads can be run. */
PRS_ZOMBIE
} p_state; /* (j/c) S* process status. */
pid_t p_pid; /* (b) Process identifier. */
@ -742,8 +679,6 @@ MALLOC_DECLARE(M_ZOMBIE);
TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp)
#define FOREACH_THREAD_IN_GROUP(kg, td) \
TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist)
#define FOREACH_KSE_IN_GROUP(kg, ke) \
TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist)
#define FOREACH_UPCALL_IN_GROUP(kg, ku) \
TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link)
#define FOREACH_THREAD_IN_PROC(p, td) \
@ -752,8 +687,6 @@ MALLOC_DECLARE(M_ZOMBIE);
/* XXXKSE the lines below should probably only be used in 1:1 code */
#define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads)
#define FIRST_KSEGRP_IN_PROC(p) TAILQ_FIRST(&(p)->p_ksegrps)
#define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&(kg)->kg_kseq)
#define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p))
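/*
 * Illustration (hypothetical helper, not part of this change): with the
 * KSE queues gone, per-group iteration is now done over the threads
 * themselves via FOREACH_THREAD_IN_GROUP(), as the schedcpu() and
 * sched_nice() hunks above do.  The kernel already tracks this count in
 * kg_numthreads, so this only shows the pattern.
 */
static __inline int
nthreads_in_group_sketch(struct ksegrp *kg)
{
	struct thread *td;
	int n;

	n = 0;
	FOREACH_THREAD_IN_GROUP(kg, td)
		n++;
	return (n);
}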
/*
* We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t,
@ -855,7 +788,6 @@ extern struct mtx ppeers_lock;
extern struct proc proc0; /* Process slot for swapper. */
extern struct thread thread0; /* Primary thread in proc0. */
extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0. */
extern struct kse kse0; /* Primary kse in proc0. */
extern struct vmspace vmspace0; /* VM space for proc0. */
extern int hogticks; /* Limit on kernel cpu hogs. */
extern int nprocs, maxproc; /* Current and max number of procs. */
@ -905,8 +837,7 @@ void pargs_free(struct pargs *pa);
void pargs_hold(struct pargs *pa);
void procinit(void);
void threadinit(void);
void proc_linkup(struct proc *p, struct ksegrp *kg,
struct kse *ke, struct thread *td);
void proc_linkup(struct proc *p, struct ksegrp *kg, struct thread *td);
void proc_reparent(struct proc *child, struct proc *newparent);
int securelevel_ge(struct ucred *cr, int level);
int securelevel_gt(struct ucred *cr, int level);
@ -932,9 +863,8 @@ void cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
struct ksegrp *ksegrp_alloc(void);
void ksegrp_free(struct ksegrp *kg);
void ksegrp_stash(struct ksegrp *kg);
struct kse *kse_alloc(void);
void kse_free(struct kse *ke);
void kse_stash(struct kse *ke);
void kse_GC(void);
void kseinit(void);
void cpu_set_upcall(struct thread *td, struct thread *td0);
void cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku);
void cpu_thread_clean(struct thread *);
@ -943,9 +873,6 @@ void cpu_thread_setup(struct thread *td);
void cpu_thread_siginfo(int sig, u_long code, siginfo_t *si);
void cpu_thread_swapin(struct thread *);
void cpu_thread_swapout(struct thread *);
void kse_reassign(struct kse *ke);
void kse_link(struct kse *ke, struct ksegrp *kg);
void kse_unlink(struct kse *ke);
void ksegrp_link(struct ksegrp *kg, struct proc *p);
void ksegrp_unlink(struct ksegrp *kg);
void thread_signal_add(struct thread *td, int sig);
@ -978,7 +905,8 @@ void upcall_remove(struct thread *td);
void upcall_stash(struct kse_upcall *ke);
void thread_sanity_check(struct thread *td, char *);
void thread_stopped(struct proc *p);
void thread_switchout(struct thread *td);
struct thread *thread_switchout(struct thread *td, int flags,
struct thread *newtd);
void thread_continued(struct proc *p);
void thr_exit1(void);
#endif /* _KERNEL */

View File

@ -46,7 +46,7 @@ int sched_runnable(void);
* Proc related scheduling hooks.
*/
void sched_exit(struct proc *p, struct thread *childtd);
void sched_fork(struct thread *td, struct proc *child);
void sched_fork(struct thread *td, struct thread *childtd);
/*
* KSE Groups contain scheduling priority information. They record the
@ -74,7 +74,6 @@ void sched_wakeup(struct thread *td);
* Threads are moved on and off of run queues
*/
void sched_add(struct thread *td, int flags);
struct kse *sched_choose(void); /* XXX Should be thread * */
void sched_clock(struct thread *td);
void sched_rem(struct thread *td);
@ -87,26 +86,15 @@ static __inline void sched_pin(void);
void sched_unbind(struct thread *td);
static __inline void sched_unpin(void);
/*
* These interfaces will eventually be removed.
*/
void sched_exit_kse(struct kse *ke, struct thread *childtd);
void sched_fork_kse(struct thread *td, struct kse *child);
/*
* These procedures tell the process data structure allocation code how
* many bytes to actually allocate.
*/
int sched_sizeof_kse(void);
int sched_sizeof_ksegrp(void);
int sched_sizeof_proc(void);
int sched_sizeof_thread(void);
extern struct ke_sched *kse0_sched;
extern struct kg_sched *ksegrp0_sched;
extern struct p_sched *proc0_sched;
extern struct td_sched *thread0_sched;
static __inline void
sched_pin(void)
{
@ -119,4 +107,13 @@ sched_unpin(void)
curthread->td_pinned--;
}
/* temporarily here */
void schedinit(void);
void sched_destroyproc(struct proc *p);
void sched_init_concurrency(struct ksegrp *kg);
void sched_set_concurrency(struct ksegrp *kg, int concurrency);
void sched_schedinit(void);
void sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td);
void sched_thread_exit(struct thread *td);
void sched_newthread(struct thread *td);
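/*
 * Hedged usage sketch (hypothetical caller and helper name, not part of
 * this commit): a threading facility that wants a ksegrp to be able to
 * occupy up to "n" run-queue slots at once, and a freshly created thread
 * wired into the scheduler, would go through the hooks above roughly
 * like this.
 */
static __inline void
sched_group_grow_sketch(struct ksegrp *kg, struct thread *newtd, int n)
{
	sched_set_concurrency(kg, n);	/* allow up to n concurrent slots */
	sched_newthread(newtd);		/* attach per-thread scheduler data */
}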
#endif /* !_SYS_SCHED_H_ */