In total store order which we use for running the kernel and all of the userland

atomic operations behave as if they were followed by a memory barrier, so there's no need to include one in the acquire variants of atomic(9). Removing these results in a small performance improvement; specifically, this is sufficient to compensate for the performance loss observed in the worldstone benchmark when using SCHED_ULE instead of SCHED_4BSD. This change is inspired by Linux even more radically doing the equivalent thing some time ago. Thanks go to Peter Jeremy for additional testing.
svn path=/head/; revision=225889
2011-10-01 00:11:03 +00:00 · 2011-10-01 00:11:03 +00:00 · fafda37b15 · 2020-12-20 02:59:44 +00:00
commit fafda37b15
parent 9a91e2aa2e
1 changed files with 9 additions and 8 deletions
--- a/sys/sparc64/include/atomic.h
+++ b/sys/sparc64/include/atomic.h
@@ -74,12 +74,16 @@
 *
 * the return value of cas is used to avoid the extra reload.
 *
- * The memory barriers provided by the acq and rel variants are intended
- * to be sufficient for use of relaxed memory ordering.  Due to the
- * suggested assembly syntax of the membar operands containing a #
- * character, they cannot be used in macros.  The cmask and mmask bits
+ * We only include a memory barrier in the rel variants as in total store
+ * order which we use for running the kernel and all of the userland atomic
+ * loads and stores behave as if the were followed by a membar with a mask
+ * of #LoadLoad | #LoadStore | #StoreStore.  In order to be also sufficient
+ * for use of relaxed memory ordering, the atomic_cas() in the acq variants 
+ * additionally would have to be followed by a membar #LoadLoad | #LoadStore.
+ * Due to the suggested assembly syntax of the membar operands containing a
+ * # character, they cannot be used in macros.  The cmask and mmask bits thus
 * are hard coded in machine/cpufunc.h and used here through macros.
- * Hopefully sun will choose not to change the bit numbers.
+ * Hopefully the bit numbers won't change in the future.
 */

 #define	itype(sz)	uint ## sz ## _t
@@ -93,7 +97,6 @@
 #define	atomic_cas_acq(p, e, s, sz) ({					\
 	itype(sz) v;							\
 	v = atomic_cas(p, e, s, sz);					\
-	membar(LoadLoad | LoadStore);					\
 	v;								\
 })

@@ -118,7 +121,6 @@
 #define	atomic_op_acq(p, op, v, sz) ({					\
 	itype(sz) t;							\
 	t = atomic_op(p, op, v, sz);					\
-	membar(LoadLoad | LoadStore);					\
 	t;								\
 })

@@ -135,7 +137,6 @@
 #define	atomic_load_acq(p, sz) ({					\
 	itype(sz) v;							\
 	v = atomic_load(p, sz);						\
-	membar(LoadLoad | LoadStore);					\
 	v;								\
 })