From 7a966f2ded110856471f7090b54500a0f324949c Mon Sep 17 00:00:00 2001
From: Andrew Gallatin <gallatin@FreeBSD.org>
Date: Wed, 30 Oct 2002 01:41:44 +0000
Subject: [PATCH] Remove extraneous memory barriers, and correct the placement
 of a few others. This provides a 30% reduction in system time and a 6%
 reduction in wallclock time for a make buildworld on my xp1000 (one 21264).

FWIW, I've been running this for nearly 2 months without problems.

Portions submitted by: ticso, jhb
Tested by: jhb (ds20 dual 21264)
---
 sys/alpha/alpha/atomic.s   |  8 --------
 sys/alpha/include/atomic.h | 18 ++++--------------
 2 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/sys/alpha/alpha/atomic.s b/sys/alpha/alpha/atomic.s
index cab7f23b7121..03b866f28381 100644
--- a/sys/alpha/alpha/atomic.s
+++ b/sys/alpha/alpha/atomic.s
@@ -40,7 +40,6 @@ LEAF(atomic_set_8, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_set_8)
@@ -55,7 +54,6 @@ LEAF(atomic_clear_8, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_clear_8)
@@ -70,7 +68,6 @@ LEAF(atomic_add_8, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_add_8)
@@ -85,7 +82,6 @@ LEAF(atomic_subtract_8, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_subtract_8)
@@ -100,7 +96,6 @@ LEAF(atomic_set_16, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_set_16)
@@ -115,7 +110,6 @@ LEAF(atomic_clear_16, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_clear_16)
@@ -130,7 +124,6 @@ LEAF(atomic_add_16, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_add_16)
@@ -145,7 +138,6 @@ LEAF(atomic_subtract_16, 2)
 	or	t2, t0, t0
 	stq_c	t0, 0(t1)
 	beq	t0, 1f
-	mb
 	RET
 1:	br	0b
 	END(atomic_subtract_16)
diff --git a/sys/alpha/include/atomic.h b/sys/alpha/include/atomic.h
index 3fe8c92967fe..ba92cd6ac735 100644
--- a/sys/alpha/include/atomic.h
+++ b/sys/alpha/include/atomic.h
@@ -56,7 +56,6 @@ static __inline void atomic_set_32(volatile u_int32_t *p, u_int32_t v)
 		"bis %0, %3, %0\n\t"		/* calculate new value */
 		"stl_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -76,7 +75,6 @@ static __inline void atomic_clear_32(volatile u_int32_t *p, u_int32_t v)
 		"bic %0, %2, %0\n\t"		/* calculate new value */
 		"stl_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -96,7 +94,6 @@ static __inline void atomic_add_32(volatile u_int32_t *p, u_int32_t v)
 		"addl %0, %2, %0\n\t"		/* calculate new value */
 		"stl_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -116,7 +113,6 @@ static __inline void atomic_subtract_32(volatile u_int32_t *p, u_int32_t v)
 		"subl %0, %2, %0\n\t"		/* calculate new value */
 		"stl_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -139,7 +135,7 @@ static __inline u_int32_t atomic_readandclear_32(volatile u_int32_t *addr)
 		"beq %1,2f\n\t"		/* if the store failed, spin */
 		"br 3f\n"		/* it worked, exit */
 		"2:\tbr 1b\n"		/* *addr not updated, loop */
-		"3:\tmb\n"		/* it worked */
+		"3:\n"			/* it worked */
 		: "=&r"(result), "=&r"(temp), "+m" (*addr)
 		:
 		: "memory");
@@ -158,7 +154,6 @@ static __inline void atomic_set_64(volatile u_int64_t *p, u_int64_t v)
 		"bis %0, %2, %0\n\t"		/* calculate new value */
 		"stq_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -178,7 +173,6 @@ static __inline void atomic_clear_64(volatile u_int64_t *p, u_int64_t v)
 		"bic %0, %2, %0\n\t"		/* calculate new value */
 		"stq_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -198,7 +192,6 @@ static __inline void atomic_add_64(volatile u_int64_t *p, u_int64_t v)
 		"addq %0, %2, %0\n\t"		/* calculate new value */
 		"stq_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -218,7 +211,6 @@ static __inline void atomic_subtract_64(volatile u_int64_t *p, u_int64_t v)
 		"subq %0, %2, %0\n\t"		/* calculate new value */
 		"stq_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 2f\n\t"		/* spin if failed */
-		"mb\n\t"			/* drain to memory */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"2:\tbr 1b\n"			/* try again */
 		".previous\n"
@@ -241,7 +233,7 @@ static __inline u_int64_t atomic_readandclear_64(volatile u_int64_t *addr)
 		"beq %1,2f\n\t"		/* if the store failed, spin */
 		"br 3f\n"		/* it worked, exit */
 		"2:\tbr 1b\n"		/* *addr not updated, loop */
-		"3:\tmb\n"		/* it worked */
+		"3:\n"			/* it worked */
 		: "=&r"(result), "=&r"(temp), "+m" (*addr)
 		:
 		: "memory");
@@ -277,7 +269,7 @@ static __inline void							\
 atomic_##NAME##_acq_##WIDTH(volatile u_int##WIDTH##_t *p, u_int##WIDTH##_t v)\
 {									\
 	atomic_##NAME##_##WIDTH(p, v);					\
-	/* alpha_mb(); */						\
+	alpha_mb(); 							\
 }									\
 									\
 static __inline void							\
@@ -291,7 +283,7 @@ static __inline void							\
 atomic_##NAME##_acq_##TYPE(volatile u_int##WIDTH##_t *p, u_int##WIDTH##_t v)\
 {									\
 	atomic_##NAME##_##WIDTH(p, v);					\
-	/* alpha_mb(); */						\
+	alpha_mb();							\
 }									\
 									\
 static __inline void							\
@@ -382,7 +374,6 @@ atomic_cmpset_32(volatile u_int32_t* p, u_int32_t cmpval, u_int32_t newval)
 		"mov %3, %0\n\t"		/* value to store */
 		"stl_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 3f\n\t"		/* if it failed, spin */
-		"mb\n\t"			/* drain to memory */
 		"2:\n"				/* done */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"3:\tbr 1b\n"			/* try again */
@@ -413,7 +404,6 @@ atomic_cmpset_64(volatile u_int64_t* p, u_int64_t cmpval, u_int64_t newval)
 		"mov %3, %0\n\t"		/* value to store */
 		"stq_c %0, %1\n\t"		/* attempt to store */
 		"beq %0, 3f\n\t"		/* if it failed, spin */
-		"mb\n\t"			/* drain to memory */
 		"2:\n"				/* done */
 		".section .text3,\"ax\"\n"	/* improve branch prediction */
 		"3:\tbr 1b\n"			/* try again */