From 58909b74b9f576e29a9dde2618420242bbae4a29 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 7 Sep 2013 15:16:30 +0000 Subject: [PATCH] Micro-optimize cpu_search(), allowing the compiler to use a more efficient inline ffsl() implementation, when it is available, instead of the homegrown iteration. On a dual-E5645 amd64 system (2x6x2 cores) under heavy I/O load, this reduces the time spent inside cpu_search() from 19% to 13%, while IOPS increased by 5%. --- sys/kern/sched_ule.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 95105d80475e..cba9d804a3ab 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -667,10 +667,14 @@ cpu_search(const struct cpu_group *cg, struct cpu_search *low, } /* Iterate through the child CPU groups and then remaining CPUs. */ - for (i = cg->cg_children, cpu = mp_maxid; i >= 0; ) { + for (i = cg->cg_children, cpu = mp_maxid; ; ) { if (i == 0) { +#ifdef HAVE_INLINE_FFSL + cpu = CPU_FFS(&cpumask) - 1; +#else while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask)) cpu--; +#endif if (cpu < 0) break; child = NULL; @@ -695,6 +699,7 @@ cpu_search(const struct cpu_group *cg, struct cpu_search *low, break; } } else { /* Handle child CPU. */ + CPU_CLR(cpu, &cpumask); tdq = TDQ_CPU(cpu); load = tdq->tdq_load * 256; rndptr = DPCPU_PTR(randomval); @@ -742,8 +747,11 @@ cpu_search(const struct cpu_group *cg, struct cpu_search *low, i--; if (i == 0 && CPU_EMPTY(&cpumask)) break; - } else + } +#ifndef HAVE_INLINE_FFSL + else cpu--; +#endif } return (total); }