ddab052725
Clang emits SSE instructions on amd64 in the common path of pthread_mutex_unlock. If the thread does not otherwise use SSE, this usage incurs a context-switch of the FPU/SSE state, which reduces the performance of multiple real-world applications by a non-trivial amount (3-5% in one application). Instead of this change, I experimented with eagerly switching the FPU state at context-switch time. This did not help. Most of the cost seems to be in the read/write of memory--as kib@ stated--and not in the #NM handling. I tested on machines with and without XSAVEOPT. One counter-argument to this change is that most applications already use SIMD, and the number of applications and amount of SIMD usage are only increasing. This is absolutely true. I agree that--in general and in principle--this change is in the wrong direction. However, there are applications that do not use enough SSE to offset the extra context-switch cost. SSE does not provide a clear benefit in the current libthr code with the current compiler, but it does provide a clear loss in some cases. Therefore, disabling SSE in libthr is a non-loss for most, and a gain for some. I refrained from disabling SSE in libc--as was suggested--because I can't make the above argument for libc. It provides a wide variety of code; each case should be analyzed separately. https://lists.freebsd.org/pipermail/freebsd-current/2015-March/055193.html Suggestions from: dim, jmg, rpaulo Approved by: kib (mentor) MFC after: 2 weeks Sponsored by: Dell Inc.
310 lines
10 KiB
Makefile
310 lines
10 KiB
Makefile
# $FreeBSD$
|
|
|
|
# Set default CPU compile flags and baseline CPUTYPE for each arch. The
|
|
# compile flags must support the minimum CPU type for each architecture but
|
|
# may tune support for more advanced processors.
|
|
|
|
.if !defined(CPUTYPE) || empty(CPUTYPE)
|
|
_CPUCFLAGS =
|
|
. if ${MACHINE_CPUARCH} == "aarch64"
|
|
MACHINE_CPU = arm64
|
|
. elif ${MACHINE_CPUARCH} == "amd64"
|
|
MACHINE_CPU = amd64 sse2 sse mmx
|
|
. elif ${MACHINE_CPUARCH} == "arm"
|
|
MACHINE_CPU = arm
|
|
. elif ${MACHINE_CPUARCH} == "i386"
|
|
MACHINE_CPU = i486
|
|
. elif ${MACHINE_CPUARCH} == "mips"
|
|
MACHINE_CPU = mips
|
|
. elif ${MACHINE_CPUARCH} == "powerpc"
|
|
MACHINE_CPU = aim
|
|
. elif ${MACHINE_CPUARCH} == "sparc64"
|
|
MACHINE_CPU = ultrasparc
|
|
. endif
|
|
.else
|
|
|
|
# Handle aliases (not documented in make.conf to avoid user confusion
|
|
# between e.g. i586 and pentium)
|
|
|
|
. if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
|
|
. if ${CPUTYPE} == "barcelona"
|
|
CPUTYPE = amdfam10
|
|
. elif ${CPUTYPE} == "core-avx2"
|
|
CPUTYPE = haswell
|
|
. elif ${CPUTYPE} == "core-avx-i"
|
|
CPUTYPE = ivybridge
|
|
. elif ${CPUTYPE} == "corei7-avx"
|
|
CPUTYPE = sandybridge
|
|
. elif ${CPUTYPE} == "corei7"
|
|
CPUTYPE = nehalem
|
|
. elif ${CPUTYPE} == "slm"
|
|
CPUTYPE = silvermont
|
|
. elif ${CPUTYPE} == "atom"
|
|
CPUTYPE = bonnell
|
|
. elif ${CPUTYPE} == "core"
|
|
CPUTYPE = prescott
|
|
. endif
|
|
. if ${MACHINE_CPUARCH} == "amd64"
|
|
. if ${CPUTYPE} == "prescott"
|
|
CPUTYPE = nocona
|
|
. endif
|
|
. else
|
|
. if ${CPUTYPE} == "k7"
|
|
CPUTYPE = athlon
|
|
. elif ${CPUTYPE} == "p4"
|
|
CPUTYPE = pentium4
|
|
. elif ${CPUTYPE} == "p4m"
|
|
CPUTYPE = pentium4m
|
|
. elif ${CPUTYPE} == "p3"
|
|
CPUTYPE = pentium3
|
|
. elif ${CPUTYPE} == "p3m"
|
|
CPUTYPE = pentium3m
|
|
. elif ${CPUTYPE} == "p-m"
|
|
CPUTYPE = pentium-m
|
|
. elif ${CPUTYPE} == "p2"
|
|
CPUTYPE = pentium2
|
|
. elif ${CPUTYPE} == "i686"
|
|
CPUTYPE = pentiumpro
|
|
. elif ${CPUTYPE} == "i586/mmx"
|
|
CPUTYPE = pentium-mmx
|
|
. elif ${CPUTYPE} == "i586"
|
|
CPUTYPE = pentium
|
|
. endif
|
|
. endif
|
|
. elif ${MACHINE_ARCH} == "sparc64"
|
|
. if ${CPUTYPE} == "us"
|
|
CPUTYPE = ultrasparc
|
|
. elif ${CPUTYPE} == "us3"
|
|
CPUTYPE = ultrasparc3
|
|
. endif
|
|
. endif
|
|
|
|
###############################################################################
|
|
# Logic to set up correct gcc optimization flag. This must be included
|
|
# after /etc/make.conf so it can react to the local value of CPUTYPE
|
|
# defined therein. Consult:
|
|
# http://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html
|
|
# http://gcc.gnu.org/onlinedocs/gcc/IA_002d64-Options.html
|
|
# http://gcc.gnu.org/onlinedocs/gcc/RS_002f6000-and-PowerPC-Options.html
|
|
# http://gcc.gnu.org/onlinedocs/gcc/MIPS-Options.html
|
|
# http://gcc.gnu.org/onlinedocs/gcc/SPARC-Options.html
|
|
# http://gcc.gnu.org/onlinedocs/gcc/i386-and-x86_002d64-Options.html
|
|
|
|
. if ${MACHINE_CPUARCH} == "i386"
|
|
. if ${CPUTYPE} == "crusoe"
|
|
_CPUCFLAGS = -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0
|
|
. elif ${CPUTYPE} == "k5"
|
|
_CPUCFLAGS = -march=pentium
|
|
. elif ${CPUTYPE} == "c7"
|
|
_CPUCFLAGS = -march=c3-2
|
|
. else
|
|
_CPUCFLAGS = -march=${CPUTYPE}
|
|
. endif
|
|
. elif ${MACHINE_CPUARCH} == "amd64"
|
|
_CPUCFLAGS = -march=${CPUTYPE}
|
|
. elif ${MACHINE_CPUARCH} == "arm"
|
|
. if ${CPUTYPE} == "xscale"
|
|
#XXX: gcc doesn't seem to like -mcpu=xscale, and dies while rebuilding itself
|
|
#_CPUCFLAGS = -mcpu=xscale
|
|
_CPUCFLAGS = -march=armv5te -D__XSCALE__
|
|
. elif ${CPUTYPE} == "armv6"
|
|
_CPUCFLAGS = -march=${CPUTYPE} -DARM_ARCH_6=1
|
|
. elif ${CPUTYPE} == "cortexa"
|
|
_CPUCFLAGS = -march=armv7 -DARM_ARCH_6=1 -mfpu=vfp
|
|
. else
|
|
_CPUCFLAGS = -mcpu=${CPUTYPE}
|
|
. endif
|
|
. elif ${MACHINE_ARCH} == "powerpc"
|
|
. if ${CPUTYPE} == "e500"
|
|
_CPUCFLAGS = -Wa,-me500 -msoft-float
|
|
. else
|
|
_CPUCFLAGS = -mcpu=${CPUTYPE} -mno-powerpc64
|
|
. endif
|
|
. elif ${MACHINE_ARCH} == "powerpc64"
|
|
_CPUCFLAGS = -mcpu=${CPUTYPE}
|
|
. elif ${MACHINE_CPUARCH} == "mips"
|
|
. if ${CPUTYPE} == "mips32"
|
|
_CPUCFLAGS = -march=mips32
|
|
. elif ${CPUTYPE} == "mips32r2"
|
|
_CPUCFLAGS = -march=mips32r2
|
|
. elif ${CPUTYPE} == "mips64"
|
|
_CPUCFLAGS = -march=mips64
|
|
. elif ${CPUTYPE} == "mips64r2"
|
|
_CPUCFLAGS = -march=mips64r2
|
|
. elif ${CPUTYPE} == "mips4kc"
|
|
_CPUCFLAGS = -march=4kc
|
|
. elif ${CPUTYPE} == "mips24kc"
|
|
_CPUCFLAGS = -march=24kc
|
|
. endif
|
|
. elif ${MACHINE_ARCH} == "sparc64"
|
|
. if ${CPUTYPE} == "v9"
|
|
_CPUCFLAGS = -mcpu=v9
|
|
. elif ${CPUTYPE} == "ultrasparc"
|
|
_CPUCFLAGS = -mcpu=ultrasparc
|
|
. elif ${CPUTYPE} == "ultrasparc3"
|
|
_CPUCFLAGS = -mcpu=ultrasparc3
|
|
. endif
|
|
. endif
|
|
|
|
# Set up the list of CPU features based on the CPU type. This is an
|
|
# unordered list to make it easy for client makefiles to test for the
|
|
# presence of a CPU feature.
|
|
|
|
. if ${MACHINE_CPUARCH} == "i386"
|
|
. if ${CPUTYPE} == "bdver4"
|
|
MACHINE_CPU = xop avx2 avx sse42 sse41 ssse3 sse4a sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "bdver3" || ${CPUTYPE} == "bdver2" || \
|
|
${CPUTYPE} == "bdver1"
|
|
MACHINE_CPU = xop avx sse42 sse41 ssse3 sse4a sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "btver2"
|
|
MACHINE_CPU = avx sse42 sse41 ssse3 sse4a sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "btver1"
|
|
MACHINE_CPU = ssse3 sse4a sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "amdfam10"
|
|
MACHINE_CPU = athlon-xp athlon k7 3dnow sse4a sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "opteron-sse3" || ${CPUTYPE} == "athlon64-sse3"
|
|
MACHINE_CPU = athlon-xp athlon k7 3dnow sse3 sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "opteron" || ${CPUTYPE} == "athlon64" || \
|
|
${CPUTYPE} == "athlon-fx"
|
|
MACHINE_CPU = athlon-xp athlon k7 3dnow sse2 sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "athlon-mp" || ${CPUTYPE} == "athlon-xp" || \
|
|
${CPUTYPE} == "athlon-4"
|
|
MACHINE_CPU = athlon-xp athlon k7 3dnow sse mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "athlon" || ${CPUTYPE} == "athlon-tbird"
|
|
MACHINE_CPU = athlon k7 3dnow mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "k6-3" || ${CPUTYPE} == "k6-2" || ${CPUTYPE} == "geode"
|
|
MACHINE_CPU = 3dnow mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "k6"
|
|
MACHINE_CPU = mmx k6 k5 i586
|
|
. elif ${CPUTYPE} == "k5"
|
|
MACHINE_CPU = k5 i586
|
|
. elif ${CPUTYPE} == "skylake" || ${CPUTYPE} == "knl"
|
|
MACHINE_CPU = avx512 avx2 avx sse42 sse41 ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "broadwell" || ${CPUTYPE} == "haswell"
|
|
MACHINE_CPU = avx2 avx sse42 sse41 ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "ivybridge" || ${CPUTYPE} == "sandybridge"
|
|
MACHINE_CPU = avx sse42 sse41 ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "westmere" || ${CPUTYPE} == "nehalem" || \
|
|
${CPUTYPE} == "silvermont"
|
|
MACHINE_CPU = sse42 sse41 ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "penryn"
|
|
MACHINE_CPU = sse41 ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "core2" || ${CPUTYPE} == "bonnell"
|
|
MACHINE_CPU = ssse3 sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "yonah" || ${CPUTYPE} == "prescott"
|
|
MACHINE_CPU = sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "pentium4" || ${CPUTYPE} == "pentium4m" || \
|
|
${CPUTYPE} == "pentium-m"
|
|
MACHINE_CPU = sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "pentium3" || ${CPUTYPE} == "pentium3m"
|
|
MACHINE_CPU = sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "pentium2"
|
|
MACHINE_CPU = i686 mmx i586
|
|
. elif ${CPUTYPE} == "pentiumpro"
|
|
MACHINE_CPU = i686 i586
|
|
. elif ${CPUTYPE} == "pentium-mmx"
|
|
MACHINE_CPU = mmx i586
|
|
. elif ${CPUTYPE} == "pentium"
|
|
MACHINE_CPU = i586
|
|
. elif ${CPUTYPE} == "c7"
|
|
MACHINE_CPU = sse3 sse2 sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "c3-2"
|
|
MACHINE_CPU = sse i686 mmx i586
|
|
. elif ${CPUTYPE} == "c3"
|
|
MACHINE_CPU = 3dnow mmx i586
|
|
. elif ${CPUTYPE} == "winchip2"
|
|
MACHINE_CPU = 3dnow mmx
|
|
. elif ${CPUTYPE} == "winchip-c6"
|
|
MACHINE_CPU = mmx
|
|
. endif
|
|
MACHINE_CPU += i486
|
|
. elif ${MACHINE_CPUARCH} == "amd64"
|
|
. if ${CPUTYPE} == "bdver4"
|
|
MACHINE_CPU = xop avx2 avx sse42 sse41 ssse3 sse4a sse3
|
|
. elif ${CPUTYPE} == "bdver3" || ${CPUTYPE} == "bdver2" || \
|
|
${CPUTYPE} == "bdver1"
|
|
MACHINE_CPU = xop avx sse42 sse41 ssse3 sse4a sse3
|
|
. elif ${CPUTYPE} == "btver2"
|
|
MACHINE_CPU = avx sse42 sse41 ssse3 sse4a sse3
|
|
. elif ${CPUTYPE} == "btver1"
|
|
MACHINE_CPU = ssse3 sse4a sse3
|
|
. elif ${CPUTYPE} == "amdfam10"
|
|
MACHINE_CPU = k8 3dnow sse4a sse3
|
|
. elif ${CPUTYPE} == "opteron-sse3" || ${CPUTYPE} == "athlon64-sse3" || \
|
|
${CPUTYPE} == "k8-sse3"
|
|
MACHINE_CPU = k8 3dnow sse3
|
|
. elif ${CPUTYPE} == "opteron" || ${CPUTYPE} == "athlon64" || \
|
|
${CPUTYPE} == "athlon-fx" || ${CPUTYPE} == "k8"
|
|
MACHINE_CPU = k8 3dnow
|
|
. elif ${CPUTYPE} == "skylake" || ${CPUTYPE} == "knl"
|
|
MACHINE_CPU = avx512 avx2 avx sse42 sse41 ssse3 sse3
|
|
. elif ${CPUTYPE} == "broadwell" || ${CPUTYPE} == "haswell"
|
|
MACHINE_CPU = avx2 avx sse42 sse41 ssse3 sse3
|
|
. elif ${CPUTYPE} == "ivybridge" || ${CPUTYPE} == "sandybridge"
|
|
MACHINE_CPU = avx sse42 sse41 ssse3 sse3
|
|
. elif ${CPUTYPE} == "westmere" || ${CPUTYPE} == "nehalem" || \
|
|
${CPUTYPE} == "silvermont"
|
|
MACHINE_CPU = sse42 sse41 ssse3 sse3
|
|
. elif ${CPUTYPE} == "penryn"
|
|
MACHINE_CPU = sse41 ssse3 sse3
|
|
. elif ${CPUTYPE} == "core2" || ${CPUTYPE} == "bonnell"
|
|
MACHINE_CPU = ssse3 sse3
|
|
. elif ${CPUTYPE} == "nocona"
|
|
MACHINE_CPU = sse3
|
|
. endif
|
|
MACHINE_CPU += amd64 sse2 sse mmx
|
|
. elif ${MACHINE_ARCH} == "powerpc"
|
|
. if ${CPUTYPE} == "e500"
|
|
MACHINE_CPU = booke
|
|
. endif
|
|
. elif ${MACHINE_ARCH} == "sparc64"
|
|
. if ${CPUTYPE} == "v9"
|
|
MACHINE_CPU = v9
|
|
. elif ${CPUTYPE} == "ultrasparc"
|
|
MACHINE_CPU = v9 ultrasparc
|
|
. elif ${CPUTYPE} == "ultrasparc3"
|
|
MACHINE_CPU = v9 ultrasparc ultrasparc3
|
|
. endif
|
|
. endif
|
|
.endif
|
|
|
|
.if ${MACHINE_CPUARCH} == "mips"
|
|
CFLAGS += -G0
|
|
.endif
|
|
|
|
.if ${MACHINE_ARCH} == "armv6"
|
|
_CPUCFLAGS += -mfloat-abi=softfp
|
|
.endif
|
|
|
|
# NB: COPTFLAGS is handled in /usr/src/sys/conf/kern.pre.mk
|
|
|
|
.if !defined(NO_CPU_CFLAGS)
|
|
CFLAGS += ${_CPUCFLAGS}
|
|
.endif
|
|
|
|
#
|
|
# Prohibit the compiler from emitting SIMD instructions.
|
|
# These flags are added to CFLAGS in areas where the extra context-switch
|
|
# cost outweighs the advantages of SIMD instructions.
|
|
#
|
|
# gcc:
|
|
# Setting -mno-mmx implies -mno-3dnow
|
|
# Setting -mno-sse implies -mno-sse2, -mno-sse3, -mno-ssse3 and -mfpmath=387
|
|
#
|
|
# clang:
|
|
# Setting -mno-mmx implies -mno-3dnow and -mno-3dnowa
|
|
# Setting -mno-sse implies -mno-sse2, -mno-sse3, -mno-ssse3, -mno-sse41 and
|
|
# -mno-sse42
|
|
# (-mfpmath= is not supported)
|
|
#
|
|
.if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64"
|
|
CFLAGS_NO_SIMD.clang= -mno-avx
|
|
CFLAGS_NO_SIMD= -mno-mmx -mno-sse
|
|
.endif
|
|
CFLAGS_NO_SIMD += ${CFLAGS_NO_SIMD.${COMPILER_TYPE}}
|
|
|
|
# Add in any architecture-specific CFLAGS.
|
|
# These come from make.conf or the command line or the environment.
|
|
CFLAGS += ${CFLAGS.${MACHINE_ARCH}}
|
|
CXXFLAGS += ${CXXFLAGS.${MACHINE_ARCH}}
|