Remove unneeded data dependency, currently imposed by atomic_load_acq(9), on its source, for x86.

Right now, atomic_load_acq() on x86 is sequentially consistent with
other atomics; the code ensures this by issuing a Store/Load barrier,
implemented as a locked nop on the load's source.  Provide a separate
primitive, __storeload_barrier(), implemented as a locked nop on a
cpu-private variable, and put it before the load.  This keeps the
seq_cst semantics but avoids the false dependency that the locked
write to the source would otherwise impose on its later uses.

Note that the seq_cst property of the x86 atomic_load_acq() is not
documented and is not provided by the atomics implementations on other
architectures, although some kernel code relies on the behaviour.
This commit does not intend to change that.

Reviewed by:	alc
Discussed with:	bde
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	2 weeks
commit 7626d062c3 (parent af38028d11)
Author: Konstantin Belousov
Date:   2015-06-28 05:04:08 +00:00

5 changed files with 160 additions and 105 deletions
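
For illustration only (this sketch is not part of the commit; the
function names and the plain static scratch word are hypothetical
stand-ins for the per-CPU monitor buffer used in the real patch), the
difference between the old and new atomic_load_acq() code paths on an
SMP amd64 kernel looks roughly like this:

#include <sys/types.h>

/*
 * Old approach: the Store/Load barrier is a locked cmpxchg on the
 * load's own source.  It drains the store buffer, but it also runs a
 * locked read-modify-write cycle on *p, so later users of *p pick up
 * a false dependency even though the stored value never changes.
 */
static __inline u_int
load_acq_old(volatile u_int *p)
{
	u_int res;

	__asm __volatile("lock; cmpxchgl %0,%1"
	    : "=a" (res), "+m" (*p)
	    : : "memory", "cc");
	return (res);
}

/*
 * New approach: the locked no-op ("lock addl $0,mem") targets a
 * private scratch word, draining the store buffer without touching
 * *p; the load itself is then an ordinary MOV.
 */
static __inline u_int
load_acq_new(volatile u_int *p)
{
	static u_int scratch;	/* stand-in for the per-CPU pc_monitorbuf */

	__asm __volatile("lock; addl $0,%0"
	    : "+m" (scratch)
	    : : "memory", "cc");
	return (*p);
}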


@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#undef ATOMIC_ASM
/* Make atomic.h generate public functions */
static __inline void __storeload_barrier(void);
#define WANT_FUNCTIONS
#define static
#undef __inline


@@ -93,6 +93,8 @@ _Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread),
"OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
_Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
"OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
"OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
struct savefpu *
get_pcb_user_save_td(struct thread *td)


@@ -85,7 +85,7 @@ u_long atomic_fetchadd_long(volatile u_long *p, u_long v);
int atomic_testandset_int(volatile u_int *p, u_int v);
int atomic_testandset_long(volatile u_long *p, u_int v);
#define ATOMIC_LOAD(TYPE, LOP) \
#define ATOMIC_LOAD(TYPE) \
u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p)
#define ATOMIC_STORE(TYPE) \
void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -245,54 +245,80 @@ atomic_testandset_long(volatile u_long *p, u_int v)
* We assume that a = b will do atomic loads and stores. Due to the
* IA32 memory model, a simple store guarantees release semantics.
*
* However, loads may pass stores, so for atomic_load_acq we have to
* ensure a Store/Load barrier to do the load in SMP kernels. We use
* "lock cmpxchg" as recommended by the AMD Software Optimization
* Guide, and not mfence. For UP kernels, however, the cache of the
* single processor is always consistent, so we only need to take care
* of the compiler.
* However, a load may pass a store if they are performed on distinct
* addresses, so for atomic_load_acq we introduce a Store/Load barrier
* before the load in SMP kernels. We use "lock addl $0,mem", as
* recommended by the AMD Software Optimization Guide, and not mfence.
* In the kernel, we use a private per-cpu cache line as the target
* for the locked addition, to avoid introducing false data
* dependencies. In userspace, a word in the red zone on the stack
* (-8(%rsp)) is utilized.
*
* For UP kernels, however, the memory of the single processor is
* always consistent, so we only need to stop the compiler from
* reordering accesses in a way that violates the semantics of acquire
* and release.
*/
#define ATOMIC_STORE(TYPE) \
static __inline void \
atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
{ \
__compiler_membar(); \
*p = v; \
} \
#if defined(_KERNEL)
/*
* OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
*
* The open-coded number is used instead of the symbolic expression to
* avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
* An assertion in amd64/vm_machdep.c ensures that the value is correct.
*/
#define OFFSETOF_MONITORBUF 0x180
#if defined(SMP)
static __inline void
__storeload_barrier(void)
{
__asm __volatile("lock; addl $0,%%gs:%0"
: "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
}
#else /* _KERNEL && UP */
static __inline void
__storeload_barrier(void)
{
__compiler_membar();
}
#endif /* SMP */
#else /* !_KERNEL */
static __inline void
__storeload_barrier(void)
{
__asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
}
#endif /* _KERNEL*/
#define ATOMIC_LOAD(TYPE) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE res; \
\
__storeload_barrier(); \
res = *p; \
__compiler_membar(); \
return (res); \
} \
struct __hack
#if defined(_KERNEL) && !defined(SMP)
#define ATOMIC_LOAD(TYPE, LOP) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE tmp; \
\
tmp = *p; \
__compiler_membar(); \
return (tmp); \
} \
#define ATOMIC_STORE(TYPE) \
static __inline void \
atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
{ \
\
__compiler_membar(); \
*p = v; \
} \
struct __hack
#else /* !(_KERNEL && !SMP) */
#define ATOMIC_LOAD(TYPE, LOP) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE res; \
\
__asm __volatile(MPLOCKED LOP \
: "=a" (res), /* 0 */ \
"+m" (*p) /* 1 */ \
: : "memory", "cc"); \
return (res); \
} \
struct __hack
#endif /* _KERNEL && !SMP */
#endif /* KLD_MODULE || !__GNUCLIKE_ASM */
ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v);
@@ -315,20 +341,19 @@ ATOMIC_ASM(clear, long, "andq %1,%0", "ir", ~v);
ATOMIC_ASM(add, long, "addq %1,%0", "ir", v);
ATOMIC_ASM(subtract, long, "subq %1,%0", "ir", v);
ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
ATOMIC_LOAD(int, "cmpxchgl %0,%1");
ATOMIC_LOAD(long, "cmpxchgq %0,%1");
#define ATOMIC_LOADSTORE(TYPE) \
ATOMIC_LOAD(TYPE); \
ATOMIC_STORE(TYPE)
ATOMIC_STORE(char);
ATOMIC_STORE(short);
ATOMIC_STORE(int);
ATOMIC_STORE(long);
ATOMIC_LOADSTORE(char);
ATOMIC_LOADSTORE(short);
ATOMIC_LOADSTORE(int);
ATOMIC_LOADSTORE(long);
#undef ATOMIC_ASM
#undef ATOMIC_LOAD
#undef ATOMIC_STORE
#undef ATOMIC_LOADSTORE
#ifndef WANT_FUNCTIONS
/* Read the current value and store a new value in the destination. */
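
As an aside on the seq_cst behaviour mentioned in the commit message,
the classic store-buffering pattern shows why a full Store/Load
barrier, and not merely a compiler barrier, must precede the acquire
load; a minimal sketch in terms of the existing atomic(9) API (the
observer functions are hypothetical):

#include <sys/types.h>
#include <machine/atomic.h>

static volatile u_int x, y;

/* Runs on one CPU. */
static void
observer0(u_int *r0)
{
	atomic_store_rel_int(&x, 1);
	/* The Store/Load barrier keeps this load from completing while
	 * the store to x still sits in the local store buffer. */
	*r0 = atomic_load_acq_int(&y);
}

/* Runs concurrently on another CPU. */
static void
observer1(u_int *r1)
{
	atomic_store_rel_int(&y, 1);
	*r1 = atomic_load_acq_int(&x);
}

/*
 * With the barrier in atomic_load_acq_int(), the outcome
 * *r0 == 0 && *r1 == 0 is impossible.  With only a release store and
 * a plain acquire load (no Store/Load barrier), x86 would allow both
 * loads to observe 0.
 */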


@@ -111,6 +111,8 @@ _Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread),
"OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
_Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
"OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
"OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
static void cpu_reset_real(void);
#ifdef SMP


@@ -87,7 +87,7 @@ int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
u_int atomic_fetchadd_int(volatile u_int *p, u_int v);
int atomic_testandset_int(volatile u_int *p, u_int v);
#define ATOMIC_LOAD(TYPE, LOP) \
#define ATOMIC_LOAD(TYPE) \
u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p)
#define ATOMIC_STORE(TYPE) \
void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -228,54 +228,79 @@ atomic_testandset_int(volatile u_int *p, u_int v)
* We assume that a = b will do atomic loads and stores. Due to the
* IA32 memory model, a simple store guarantees release semantics.
*
* However, loads may pass stores, so for atomic_load_acq we have to
* ensure a Store/Load barrier to do the load in SMP kernels. We use
* "lock cmpxchg" as recommended by the AMD Software Optimization
* Guide, and not mfence. For UP kernels, however, the cache of the
* single processor is always consistent, so we only need to take care
* of the compiler.
* However, a load may pass a store if they are performed on distinct
* addresses, so for atomic_load_acq we introduce a Store/Load barrier
* before the load in SMP kernels. We use "lock addl $0,mem", as
* recommended by the AMD Software Optimization Guide, and not mfence.
* In the kernel, we use a private per-cpu cache line as the target
* for the locked addition, to avoid introducing false data
* dependencies. In userspace, a word at the top of the stack is
* utilized.
*
* For UP kernels, however, the memory of the single processor is
* always consistent, so we only need to stop the compiler from
* reordering accesses in a way that violates the semantics of acquire
* and release.
*/
#define ATOMIC_STORE(TYPE) \
static __inline void \
atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
{ \
__compiler_membar(); \
*p = v; \
} \
#if defined(_KERNEL)
/*
* OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
*
* The open-coded number is used instead of the symbolic expression to
* avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
* An assertion in i386/vm_machdep.c ensures that the value is correct.
*/
#define OFFSETOF_MONITORBUF 0x180
#if defined(SMP)
static __inline void
__storeload_barrier(void)
{
__asm __volatile("lock; addl $0,%%fs:%0"
: "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
}
#else /* _KERNEL && UP */
static __inline void
__storeload_barrier(void)
{
__compiler_membar();
}
#endif /* SMP */
#else /* !_KERNEL */
static __inline void
__storeload_barrier(void)
{
__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
}
#endif /* _KERNEL*/
#define ATOMIC_LOAD(TYPE) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE res; \
\
__storeload_barrier(); \
res = *p; \
__compiler_membar(); \
return (res); \
} \
struct __hack
#if defined(_KERNEL) && !defined(SMP)
#define ATOMIC_LOAD(TYPE, LOP) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE tmp; \
\
tmp = *p; \
__compiler_membar(); \
return (tmp); \
} \
#define ATOMIC_STORE(TYPE) \
static __inline void \
atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
{ \
\
__compiler_membar(); \
*p = v; \
} \
struct __hack
#else /* !(_KERNEL && !SMP) */
#define ATOMIC_LOAD(TYPE, LOP) \
static __inline u_##TYPE \
atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
{ \
u_##TYPE res; \
\
__asm __volatile(MPLOCKED LOP \
: "=a" (res), /* 0 */ \
"+m" (*p) /* 1 */ \
: : "memory", "cc"); \
return (res); \
} \
struct __hack
#endif /* _KERNEL && !SMP */
#ifdef _KERNEL
#ifdef WANT_FUNCTIONS
@@ -511,19 +536,19 @@ ATOMIC_ASM(clear, long, "andl %1,%0", "ir", ~v);
ATOMIC_ASM(add, long, "addl %1,%0", "ir", v);
ATOMIC_ASM(subtract, long, "subl %1,%0", "ir", v);
ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
ATOMIC_LOAD(int, "cmpxchgl %0,%1");
ATOMIC_LOAD(long, "cmpxchgl %0,%1");
#define ATOMIC_LOADSTORE(TYPE) \
ATOMIC_LOAD(TYPE); \
ATOMIC_STORE(TYPE)
ATOMIC_STORE(char);
ATOMIC_STORE(short);
ATOMIC_STORE(int);
ATOMIC_STORE(long);
ATOMIC_LOADSTORE(char);
ATOMIC_LOADSTORE(short);
ATOMIC_LOADSTORE(int);
ATOMIC_LOADSTORE(long);
#undef ATOMIC_ASM
#undef ATOMIC_LOAD
#undef ATOMIC_STORE
#undef ATOMIC_LOADSTORE
#ifndef WANT_FUNCTIONS