From 4e32b7b3cc7d282700ca3df60c5378d57e65ab14 Mon Sep 17 00:00:00 2001 From: David Xu Date: Wed, 20 Dec 2006 04:40:39 +0000 Subject: [PATCH] Add a lwpid field into per-cpu structure, the lwpid represents current running thread's id on each cpu. This allows us to add in-kernel adaptive spin for user level mutex. While spinning in user space is possible, without correct thread running state exported from kernel, it can hardly be implemented efficiently without wasting cpu cycles; however, exporting thread running state is unlikely to be implemented soon, as interfaces would have to be designed and stabilized. This implementation is transparent to user space, and it can be disabled dynamically. With this change, mutex ping-pong program's performance is improved massively on SMP machine. Performance of mysql super-smack select benchmark is increased about 7% on Intel dual dual-core2 Xeon machine; it indicates that on systems which have a bunch of cpus and where system-call overhead is low (athlon64, opteron, and core-2 are known to be fast), the adaptive spin does help performance. Added sysctls: kern.threads.umtx_dflt_spins if the sysctl value is non-zero, a zero umutex.m_spincount will cause the sysctl value to be used as the spin cycle count. kern.threads.umtx_max_spins the sysctl sets the upper limit of the spin cycle count. 
Tested on: Athlon64 X2 3800+, Dual Xeon 5130 --- sys/amd64/amd64/cpu_switch.S | 8 +++-- sys/amd64/amd64/genassym.c | 2 ++ sys/amd64/amd64/machdep.c | 1 + sys/i386/i386/genassym.c | 2 ++ sys/i386/i386/machdep.c | 1 + sys/i386/i386/swtch.s | 2 ++ sys/ia64/ia64/machdep.c | 3 ++ sys/kern/kern_umtx.c | 70 ++++++++++++++++++++++++++++++++++++ sys/sys/pcpu.h | 1 + sys/sys/umtx.h | 3 +- 10 files changed, 89 insertions(+), 4 deletions(-) diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index ea48bcd56b14..9daf6b42ba0f 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -206,6 +206,11 @@ sw1: movq %rbx, (%rax) movq %rbx, PCPU(RSP0) + movl TD_TID(%rsi), %eax + movq %r8, PCPU(CURPCB) + movl %eax, PCPU(CURTID) + movq %rsi, PCPU(CURTHREAD) /* into next thread */ + /* Restore context. */ movq PCB_RBX(%r8),%rbx movq PCB_RSP(%r8),%rsp @@ -217,9 +222,6 @@ sw1: movq PCB_RIP(%r8),%rax movq %rax,(%rsp) - movq %r8, PCPU(CURPCB) - movq %rsi, PCPU(CURTHREAD) /* into next thread */ - /* Test if debug registers should be restored. 
*/ testl $PCB_DBREGS,PCB_FLAGS(%r8) jz 1f diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index e4fd1961ac19..6e6f67c0269e 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -79,6 +79,7 @@ ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); +ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); @@ -190,6 +191,7 @@ ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); +ASSYM(PC_CURTID, offsetof(struct pcpu, pc_curtid)); ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index c7dd71f36c89..905b4903a228 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1164,6 +1164,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); + PCPU_SET(curtid, thread0.td_tid); PCPU_SET(tssp, &common_tss[0]); /* diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index ca24402ae486..bf15298a7c07 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -84,6 +84,7 @@ ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); +ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); @@ -198,6 +199,7 @@ ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); 
ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); +ASSYM(PC_CURTID, offsetof(struct pcpu, pc_curtid)); #ifdef DEV_APIC ASSYM(LA_VER, offsetof(struct LAPIC, version)); diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index ef64b95ecb0b..4aed4ceaa199 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -2107,6 +2107,7 @@ init386(first) PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); + PCPU_SET(curtid, thread0.td_tid); /* * Initialize mutexes. diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index d4f50fedc95c..5df5959c15a7 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -245,7 +245,9 @@ sw1: popfl movl %edx, PCPU(CURPCB) + movl TD_TID(%ecx),%eax movl %ecx, PCPU(CURTHREAD) /* into next thread */ + movl %eax, PCPU(CURTID) /* * Determine the LDT to use and load it if is the default one and diff --git a/sys/ia64/ia64/machdep.c b/sys/ia64/ia64/machdep.c index 7c2e1f5bd7b1..9fe44a19722b 100644 --- a/sys/ia64/ia64/machdep.c +++ b/sys/ia64/ia64/machdep.c @@ -371,6 +371,7 @@ cpu_switch(struct thread *old, struct thread *new) oldpcb->pcb_current_pmap = pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); + PCPU_SET(curtid, new->td_tid); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif @@ -391,6 +392,7 @@ cpu_throw(struct thread *old __unused, struct thread *new) newpcb = new->td_pcb; (void)pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); + PCPU_SET(curtid, new->td_tid); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif @@ -609,6 +611,7 @@ ia64_init(void) ia64_set_k4((u_int64_t)pcpup); pcpu_init(pcpup, 0, sizeof(pcpu0)); PCPU_SET(curthread, &thread0); + PCPU_SET(curtid, thread0.td_tid); /* * Initialize the console before we print anything out. 
diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index 645f02df8643..a43b4d4c4e9b 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -51,6 +52,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #ifdef COMPAT_IA32 #include #endif @@ -190,6 +193,13 @@ static int umtx_pi_allocated; SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); +SYSCTL_DECL(_kern_threads); +static int umtx_dflt_spins = 0; +SYSCTL_INT(_kern_threads, OID_AUTO, umtx_dflt_spins, CTLFLAG_RW, + &umtx_dflt_spins, 0, "default umtx spin count"); +static int umtx_max_spins = 3000; +SYSCTL_INT(_kern_threads, OID_AUTO, umtx_max_spins, CTLFLAG_RW, + &umtx_max_spins, 0, "max umtx spin count"); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); @@ -1012,16 +1022,33 @@ _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo, { struct umtx_q *uq; uint32_t owner, old, id; +#ifdef SMP + int spincount; +#endif int error = 0; id = td->td_tid; uq = td->td_umtxq; +#ifdef SMP + if (smp_cpus > 1) { + spincount = fuword32(&m->m_spincount); + if (spincount == 0) + spincount = umtx_dflt_spins; + if (spincount > umtx_max_spins) + spincount = umtx_max_spins; + } else + spincount = 0; +#endif + /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { +#ifdef SMP +try_unowned: +#endif /* * Try the uncontested case. This should be done in userland. */ @@ -1037,6 +1064,9 @@ _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo, /* If no one owns it but it is contested try to acquire it. 
*/ if (owner == UMUTEX_CONTESTED) { +#ifdef SMP +try_contested: +#endif owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); @@ -1058,6 +1088,46 @@ _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo, if (try != 0) return (EBUSY); +#ifdef SMP + if (spincount > 0 && (owner & ~UMUTEX_CONTESTED) != id) { + int i, found = 0; + struct pcpu *pcpu = NULL; + + /* Look for a cpu the owner is running on */ + for (i = 0; i < MAXCPU; i++) { + if (CPU_ABSENT(i)) + continue; + pcpu = pcpu_find(i); + if ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid) { + found = 1; + break; + } + } + + if (__predict_false(!found)) + goto end_spin; + + while ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid && + (owner & ~UMUTEX_CONTESTED) != id) { + if (--spincount <= 0) + break; + if ((td->td_flags & + (TDF_NEEDRESCHED|TDF_ASTPENDING|TDF_NEEDSIGCHK)) || + P_SHOULDSTOP(td->td_proc)) + break; + owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner)); + if (owner == UMUTEX_UNOWNED) + goto try_unowned; + if (owner == UMUTEX_CONTESTED) + goto try_contested; + cpu_spinwait(); + } + } +end_spin: + spincount = 0; + +#endif + /* * If we caught a signal, we have retried and now * exit immediately. diff --git a/sys/sys/pcpu.h b/sys/sys/pcpu.h index 00a3cac97c3e..7f4504279243 100644 --- a/sys/sys/pcpu.h +++ b/sys/sys/pcpu.h @@ -74,6 +74,7 @@ struct pcpu { PCPU_MD_FIELDS; struct vmmeter pc_cnt; /* VM stats counters */ struct device *pc_device; + lwpid_t pc_curtid; }; SLIST_HEAD(cpuhead, pcpu); diff --git a/sys/sys/umtx.h b/sys/sys/umtx.h index e7e534afa974..eaea8f6a2ad6 100644 --- a/sys/sys/umtx.h +++ b/sys/sys/umtx.h @@ -57,7 +57,8 @@ struct umutex { volatile __lwpid_t m_owner; /* Owner of the mutex */ uint32_t m_flags; /* Flags of the mutex */ uint32_t m_ceilings[2]; /* Priority protect ceiling */ - uint32_t m_spare[4]; /* Spare space */ + uint32_t m_spincount; /* Max spinning cycle */ + uint32_t m_spare[3]; }; struct ucond {