Allow assembling skein_block_asm.s with clang

GNU as seems to allow macro arguments without the '\' but clang is more
strict in that regard.
This change makes the source code compatible with LLVM's but does not yet
change the build system or rename it to .S.

The new code assembles identically with GNU as 2.17.50.

Reviewed By:	emaste
Differential Revision: https://reviews.freebsd.org/D25143
This commit is contained in:
Alex Richardson 2020-06-05 13:54:13 +00:00
parent 3fe733f375
commit 7de010f2ee
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=361833

View File

@ -10,7 +10,9 @@
#
.text
.altmacro
#ifndef __clang__
.psize 0,128 #list file has no page boundaries
#endif
#
_MASK_ALL_ = (256+512+1024) #all three algorithm bits
_MAX_FRAME_ = 240
@ -240,9 +242,8 @@ RC_1024_7_7 = 20
# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
#
.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM
.if _RCNT_ #is there anything to do?
rolq $_RCNT_,%\reg
.if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM #is there anything to do?
rolq $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
.endif
.endm
#
@ -334,7 +335,7 @@ __STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#----------------------------------------------------------------
#
.macro Reset_Stack
addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??)
addq $LOCAL_SIZE,%rsp #get rid of locals (wipe?)
.irp _reg_,r15,r14,r13,r12,rbx,rbp
popq %\_reg_ #restore caller's regs
_PushCnt_ = _PushCnt_ - 1
@ -724,22 +725,22 @@ C_label Skein_256_Unroll_Cnt
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
addReg r\rn0, r\rn1
RotL64 r\rn1, 512,%((_Rn_) % 8),0
RotL64 r\rn1, 512,%((\_Rn_) % 8),0
xorReg r\rn1, r\rn0
op1
\op1
addReg r\rn2, r\rn3
RotL64 r\rn3, 512,%((_Rn_) % 8),1
RotL64 r\rn3, 512,%((\_Rn_) % 8),1
xorReg r\rn3, r\rn2
op2
\op2
addReg r\rn4, r\rn5
RotL64 r\rn5, 512,%((_Rn_) % 8),2
RotL64 r\rn5, 512,%((\_Rn_) % 8),2
xorReg r\rn5, r\rn4
op3
\op3
addReg r\rn6, r\rn7
RotL64 r\rn7, 512,%((_Rn_) % 8),3
RotL64 r\rn7, 512,%((\_Rn_) % 8),3
xorReg r\rn7, r\rn6
op4
Skein_Debug_Round 512,%(_Rn_+1),-4
\op4
Skein_Debug_Round 512,%(\_Rn_+1),-4
#
.endm #R_512_OneRound
#
@ -749,11 +750,11 @@ C_label Skein_256_Unroll_Cnt
.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8)
.if (SKEIN_ASM_UNROLL && 512)
# here for fully unrolled case.
_II_ = ((_RR_)/4) + 1 #key injection counter
R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
_II_ = ((\_RR_)/4) + 1 #key injection counter
R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
# inject the key schedule
addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
addReg r11, rax
@ -766,10 +767,10 @@ C_label Skein_256_Unroll_Cnt
.else
# here for looping case #"rotate" key/tweak schedule (move up on stack)
incq %rdi #bump key injection counter
R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
# inject the key schedule
addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8
addReg r11, rax
@ -814,9 +815,9 @@ Skein_512_block_loop:
movq %rbx,ksTwk+ 8+F_O(%rbp)
movq %rcx,ksTwk+16+F_O(%rbp)
.irp _Rn_,8,9,10,11,12,13,14,15
movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
xorq %r\_Rn_,%rdx #compute overall parity
movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
.endr #load state into %r8 ..%r15, compute parity
movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
@ -853,7 +854,7 @@ Skein_512_block_loop:
.if _SKEIN_DEBUG
.irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output
movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
movq %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
.endr
Skein_Debug_Block 512 #debug dump
@ -891,12 +892,12 @@ _Rbase_ = _Rbase_+1
#################
# feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
.irp _Rn_,8,9,10,11,12,13,14,15
.if (_Rn_ == 8)
.if (\_Rn_ == 8)
movq $FIRST_MASK64,%rbx
.endif
xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result
.if (_Rn_ == 14)
xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result
.if (\_Rn_ == 14)
andq TWEAK+ 8(%rdi),%rbx
.endif
.endr
@ -917,7 +918,7 @@ Skein_Debug_Round_512:
pushq %rsi #save two regs for BLK_BITS-specific parms
pushq %rdi
.irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it
movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
movq %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
.endr
movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
movq $512,%rdi #now <rdi,rsi,rdx> are set for the call
@ -968,50 +969,50 @@ rIdx_offs = tmpStk_1024
#
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
addReg \reg0 , \reg1 #perform the MIX
RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_
RotL64 \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
xorReg \reg1 , \reg0
.if ((_RN0_) && 3) == 3 #time to do key injection?
.if ((\_RN0_) && 3) == 3 #time to do key injection?
.if _SKEIN_DEBUG
movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round
movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection)
movq %\reg0 , xDebug_1024+8*\w0(%rsp) #save intermediate values for Debug_Round
movq %\reg1 , xDebug_1024+8*\w1(%rsp) # (before inline key injection)
.endif
_II_ = ((_RN0_)/4)+1 #injection count
_II_ = ((\_RN0_)/4)+1 #injection count
.if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection
addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
.if w1 == 13 #tweak injection
addq ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
addq ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
.if \w1 == 13 #tweak injection
addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
.elseif w0 == 14
.elseif \w0 == 14
addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
.elseif w1 == 15
.elseif \w1 == 15
addq $_II_, %\reg1 #(injection counter)
.endif
.else #here to do looping key injection
.if (w0 == 0)
movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index
.if (\w0 == 0)
movq %rdi, X_stk+8*\w0(%rsp) #if so, store N0 so we can use reg as index
movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi
.else
addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection
addq ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
.endif
.if w1 == 13 #tweak injection
.if \w1 == 13 #tweak injection
addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
.elseif w0 == 14
.elseif \w0 == 14
addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
.elseif w1 == 15
.elseif \w1 == 15
addReg \reg1,rdi,,,1 #(injection counter)
.endif
addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection
addq ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
.endif
.endif
# insert the op provided, .if any
op1
\op1
.endm
#################
# MACRO: four rounds for 1024-bit blocks
#
.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4)
# should be here with X4 set properly, X6 stored on stack
_Rn_ = (_RR_) + 0
_Rn_ = (\_RR_) + 0
r1024_Mix 0, 1,rdi,rsi,_Rn_,0
r1024_Mix 2, 3,rbp,rax,_Rn_,1
r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
@ -1023,7 +1024,7 @@ _Rn_ = (_RR_) + 0
.if _SKEIN_DEBUG
Skein_Debug_Round 1024,%(_Rn_+1)
.endif
_Rn_ = (_RR_) + 1
_Rn_ = (\_RR_) + 1
r1024_Mix 0, 9,rdi,r9 ,_Rn_,0
r1024_Mix 2,13,rbp,r13,_Rn_,1
r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
@ -1035,7 +1036,7 @@ _Rn_ = (_RR_) + 1
.if _SKEIN_DEBUG
Skein_Debug_Round 1024,%(_Rn_+1)
.endif
_Rn_ = (_RR_) + 2
_Rn_ = (\_RR_) + 2
r1024_Mix 0, 7,rdi,rdx,_Rn_,0
r1024_Mix 2, 5,rbp,rbx,_Rn_,1
r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
@ -1047,7 +1048,7 @@ _Rn_ = (_RR_) + 2
.if _SKEIN_DEBUG
Skein_Debug_Round 1024,%(_Rn_+1)
.endif
_Rn_ = (_RR_) + 3
_Rn_ = (\_RR_) + 3
r1024_Mix 0,15,rdi,r15,_Rn_,0
r1024_Mix 2,11,rbp,r11,_Rn_,1
r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
@ -1116,13 +1117,13 @@ Skein1024_block_loop:
# the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
.irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
movq X_VARS+8*_rN_(%rdi),%r14 #get state word
movq 8*_rN_(%rsi),%r15 #get msg word
movq X_VARS+8*\_rN_(%rdi),%r14 #get state word
movq 8*\_rN_(%rsi),%r15 #get msg word
xorq %r14,%rax #update key schedule overall parity
movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack
movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy
movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack
movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy
addq %r15,%r14 #do the initial key injection
movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack
movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack
.endr
# now process the rest, using the "real" registers
# (MUST do it in reverse order to inject tweaks r8/r9 first)
@ -1135,9 +1136,9 @@ _oo_ = o1K_\_rr_ #offset assocated with the register
movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
addq %rcx,%\_rr_ #do the initial key injection
.if _oo_ == 13 #do the initial tweak injection
addReg _rr_,r8 # (only in words 13/14)
addReg \_rr_,r8 # (only in words 13/14)
.elseif _oo_ == 14
addReg _rr_,r9
addReg \_rr_,r9
.endif
.endr
movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity