From f04861597061f22f5ce027edb0a18577441201cc Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 30 Nov 2014 18:09:02 +0100 Subject: [PATCH 1/3] Rewrite field assembly to match the C version --- src/field_5x52_asm.asm | 832 ++++++++++++++++++++++------------------- 1 file changed, 446 insertions(+), 386 deletions(-) diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm index 5e785f76305..11f12eeb2dd 100644 --- a/src/field_5x52_asm.asm +++ b/src/field_5x52_asm.asm @@ -1,4 +1,11 @@ - ;; Added by Diederik Huys, March 2013 + ;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille + ;; Distributed under the MIT software license, see the accompanying + ;; file COPYING or http://www.opensource.org/licenses/mit-license.php. + + ;; Changelog: + ;; * March 2013, Diederik Huys: Original version + ;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel + ;; multiplication algorithm ;; ;; Provided public procedures: ;; secp256k1_fe_mul_inner @@ -24,14 +31,12 @@ ;; ;; INTERNAL: rdx:rax = multiplication accumulator ;; r9:r8 = c - ;; r10-r13 = t0-t3 - ;; r14 = b.n[0] / t4 - ;; r15 = b.n[1] / t5 - ;; rbx = b.n[2] / t6 - ;; rcx = b.n[3] / t7 - ;; rbp = Constant 0FFFFFFFFFFFFFh / t8 - ;; rsi = b.n / b.n[4] / t9 - + ;; r10:r14 = a0-a4 + ;; rcx:rbx = d + ;; rbp = R + ;; rdi = t? + ;; r15 = b->n + ;; rsi = r->n GLOBAL SYM(secp256k1_fe_mul_inner) ALIGN 32 SYM(secp256k1_fe_mul_inner): @@ -41,263 +46,256 @@ SYM(secp256k1_fe_mul_inner): push r13 push r14 push r15 - push rdx - mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until - ; b.n[0] is no longer needed, then we reassign - ; r14 to t4 - ;; c=a.n[0] * b.n[0] - mov rax,[rdi+0*8] ; load a.n[0] - mov rbp,0FFFFFFFFFFFFFh - mul r14 ; rdx:rax=a.n[0]*b.n[0] - mov r15,[rsi+1*8] - mov r10,rbp ; load modulus into target register for t0 - mov r8,rax - and r10,rax ; only need lower qword of c - shrd r8,rdx,52 - xor r9,r9 ; c < 2^64, so we ditch the HO part + mov r10,[rdi+0*8] + mov r11,[rdi+1*8] + mov r12,[rdi+2*8] + mov r13,[rdi+3*8] + mov r14,[rdi+4*8] + mov rbp,01000003D10h + mov r15,rsi + mov rsi,rdx - ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0] - mov rax,[rdi+0*8] - mul r15 - add r8,rax - adc r9,rdx - - mov rax,[rdi+1*8] - mul r14 - mov r11,rbp - mov rbx,[rsi+2*8] - add r8,rax - adc r9,rdx - and r11,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[0 1 2] * b.n[2 1 0] - mov rax,[rdi+0*8] - mul rbx - add r8,rax - adc r9,rdx - - mov rax,[rdi+1*8] - mul r15 - add r8,rax - adc r9,rdx - - mov rax,[rdi+2*8] + ;; d += a3 * b0 + mov rax,[r15+0*8] + mul r13 + mov rbx,rax + mov rcx,rdx + ;; d += a2 * b1 + mov rax,[r15+1*8] + mul r12 + add rbx,rax + adc rcx,rdx + ;; d += a1 * b2 + mov rax,[r15+2*8] + mul r11 + add rbx,rax + adc rcx,rdx + ;; d = a0 * b3 + mov rax,[r15+3*8] + mul r10 + add rbx,rax + adc rcx,rdx + ;; c = a4 * b4 + mov rax,[r15+4*8] mul r14 - mov r12,rbp - mov rcx,[rsi+3*8] - add r8,rax - adc r9,rdx - and r12,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0] - mov rax,[rdi+0*8] - mul rcx - add r8,rax - adc r9,rdx - - mov rax,[rdi+1*8] - mul rbx - add r8,rax - adc r9,rdx - - mov rax,[rdi+2*8] - mul r15 - add r8,rax - adc r9,rdx - - mov rax,[rdi+3*8] - mul r14 - mov r13,rbp - mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer - add r8,rax - adc r9,rdx - and r13,r8 - - shrd r8,r9,52 - xor r9,r9 - - - ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0] - mov rax,[rdi+0*8] - mul rsi - add r8,rax - adc r9,rdx - - mov rax,[rdi+1*8] - mul rcx - add r8,rax - adc r9,rdx - - mov rax,[rdi+2*8] - mul rbx - add r8,rax - adc r9,rdx - - mov 
rax,[rdi+3*8] - mul r15 - add r8,rax - adc r9,rdx - - mov rax,[rdi+4*8] - mul r14 - mov r14,rbp ; load modulus into t4 and destroy a.n[0] - add r8,rax - adc r9,rdx - and r14,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1] - mov rax,[rdi+1*8] - mul rsi - add r8,rax - adc r9,rdx - - mov rax,[rdi+2*8] - mul rcx - add r8,rax - adc r9,rdx - - mov rax,[rdi+3*8] - mul rbx - add r8,rax - adc r9,rdx - - mov rax,[rdi+4*8] - mul r15 - mov r15,rbp - add r8,rax - adc r9,rdx - - and r15,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[2 3 4] * b.n[4 3 2] - mov rax,[rdi+2*8] - mul rsi - add r8,rax - adc r9,rdx - - mov rax,[rdi+3*8] - mul rcx - add r8,rax - adc r9,rdx - - mov rax,[rdi+4*8] - mul rbx - mov rbx,rbp - add r8,rax - adc r9,rdx - - and rbx,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[3 4] * b.n[4 3] - mov rax,[rdi+3*8] - mul rsi - add r8,rax - adc r9,rdx - - mov rax,[rdi+4*8] - mul rcx - mov rcx,rbp - add r8,rax - adc r9,rdx - and rcx,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[4] * b.n[4] - mov rax,[rdi+4*8] - mul rsi - ;; mov rbp,rbp ; modulus already there! - add r8,rax - adc r9,rdx - and rbp,r8 - shrd r8,r9,52 - xor r9,r9 - - mov rsi,r8 ; load c into t9 and destroy b.n[4] - - ;; ******************************************************* -common_exit_norm: - mov rdi,01000003D10h ; load constant - - mov rax,r15 ; get t5 - mul rdi - add rax,r10 ; +t0 - adc rdx,0 - mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers! - mov r8,rax ; +c - and r10,rax - shrd r8,rdx,52 - xor r9,r9 - - mov rax,rbx ; get t6 - mul rdi - add rax,r11 ; +t1 - adc rdx,0 - mov r11,0FFFFFFFFFFFFFh ; modulus - add r8,rax ; +c - adc r9,rdx - and r11,r8 - shrd r8,r9,52 - xor r9,r9 - - mov rax,rcx ; get t7 - mul rdi - add rax,r12 ; +t2 - adc rdx,0 - pop rbx ; retrieve pointer to this.n - mov r12,0FFFFFFFFFFFFFh ; modulus - add r8,rax ; +c - adc r9,rdx - and r12,r8 - mov [rbx+2*8],r12 ; mov into this.n[2] - shrd r8,r9,52 - xor r9,r9 - - mov rax,rbp ; get t8 - mul rdi - add rax,r13 ; +t3 - adc rdx,0 - mov r13,0FFFFFFFFFFFFFh ; modulus - add r8,rax ; +c - adc r9,rdx - and r13,r8 - mov [rbx+3*8],r13 ; -> this.n[3] - shrd r8,r9,52 - xor r9,r9 - - mov rax,rsi ; get t9 - mul rdi - add rax,r14 ; +t4 - adc rdx,0 - mov r14,0FFFFFFFFFFFFh ; !!! - add r8,rax ; +c - adc r9,rdx - and r14,r8 - mov [rbx+4*8],r14 ; -> this.n[4] - shrd r8,r9,48 ; !!! 
- xor r9,r9 - - mov rax,01000003D1h - mul r8 - add rax,r10 - adc rdx,0 - mov r10,0FFFFFFFFFFFFFh ; modulus mov r8,rax - and rax,r10 - shrd r8,rdx,52 - mov [rbx+0*8],rax ; -> this.n[0] - add r8,r11 - mov [rbx+1*8],r8 ; -> this.n[1] + mov r9,rdx + ;; d += (c & M) * R + mov rdx,0fffffffffffffh + and rax,rdx + mul rbp + add rbx,rax + adc rcx,rdx + ;; c >>= 52 (r8 only) + shrd r8,r9,52 + ;; t3 (stack) = d & M + mov rdi,rbx + mov rdx,0fffffffffffffh + and rdi,rdx + push rdi + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; d += a4 * b0 + mov rax,[r15+0*8] + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += a3 * b1 + mov rax,[r15+1*8] + mul r13 + add rbx,rax + adc rcx,rdx + ;; d += a2 * b2 + mov rax,[r15+2*8] + mul r12 + add rbx,rax + adc rcx,rdx + ;; d += a1 * b3 + mov rax,[r15+3*8] + mul r11 + add rbx,rax + adc rcx,rdx + ;; d += a0 * b4 + mov rax,[r15+4*8] + mul r10 + add rbx,rax + adc rcx,rdx + ;; d += c * R + mov rax,r8 + mul rbp + add rbx,rax + adc rcx,rdx + ;; t4 = d & M (rdi) + mov rdi,rbx + mov rdx,0fffffffffffffh + and rdi,rdx + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; tx = t4 >> 48 (rbp, overwrites R) + mov rbp,rdi + shr rbp,48 + ;; t4 &= (M >> 4) (stack) + mov rax,0ffffffffffffh + and rdi,rax + push rdi + ;; c = a0 * b0 + mov rax,[r15+0*8] + mul r10 + mov r8,rax + mov r9,rdx + ;; d += a4 * b1 + mov rax,[r15+1*8] + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += a3 * b2 + mov rax,[r15+2*8] + mul r13 + add rbx,rax + adc rcx,rdx + ;; d += a2 * b3 + mov rax,[r15+3*8] + mul r12 + add rbx,rax + adc rcx,rdx + ;; d += a1 * b4 + mov rax,[r15+4*8] + mul r11 + add rbx,rax + adc rcx,rdx + ;; u0 = d & M (rdi) + mov rdi,rbx + mov rdx,0fffffffffffffh + and rdi,rdx + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; u0 = (u0 << 4) | tx (rdi) + shl rdi,4 + or rdi,rbp + ;; c += u0 * (R >> 4) + mov rax,01000003D1h + mul rdi + add r8,rax + adc r9,rdx + ;; r[0] = c & M + mov rax,r8 + mov rdx,0fffffffffffffh + and rax,rdx + mov [rsi+0*8],rax + ;; c >>= 52 + shrd r8,r9,52 + mov r9,0 + ;; c += a1 * b0 + mov rax,[r15+0*8] + mul r11 + add r8,rax + adc r9,rdx + ;; c += a0 * b1 + mov rax,[r15+1*8] + mul r10 + add r8,rax + adc r9,rdx + ;; d += a4 * b2 + mov rax,[r15+2*8] + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += a3 * b3 + mov rax,[r15+3*8] + mul r13 + add rbx,rax + adc rcx,rdx + ;; d += a2 * b4 + mov rax,[r15+4*8] + mul r12 + add rbx,rax + adc rcx,rdx + ;; restore rdp = R + mov rbp,01000003D10h + ;; c += (d & M) * R + mov rax,rbx + mov rdx,0fffffffffffffh + and rax,rdx + mul rbp + add r8,rax + adc r9,rdx + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; r[1] = c & M + mov rax,r8 + mov rdx,0fffffffffffffh + and rax,rdx + mov [rsi+8*1],rax + ;; c >>= 52 + shrd r8,r9,52 + mov r9,0 + ;; c += a2 * b0 + mov rax,[r15+0*8] + mul r12 + add r8,rax + adc r9,rdx + ;; c += a1 * b1 + mov rax,[r15+1*8] + mul r11 + add r8,rax + adc r9,rdx + ;; c += a0 * b2 (last use of r10 = a0) + mov rax,[r15+2*8] + mul r10 + add r8,rax + adc r9,rdx + ;; fetch t3 (r10, overwrites a0),t4 (rdi) + pop rdi + pop r10 + ;; d += a4 * b3 + mov rax,[r15+3*8] + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += a3 * b4 + mov rax,[r15+4*8] + mul r13 + add rbx,rax + adc rcx,rdx + ;; c += (d & M) * R + mov rax,rbx + mov rdx,0fffffffffffffh + and rax,rdx + mul rbp + add r8,rax + adc r9,rdx + ;; d >>= 52 (rbx only) + shrd rbx,rcx,52 + ;; r[2] = c & M + mov rax,r8 + mov rdx,0fffffffffffffh + and rax,rdx + mov [rsi+2*8],rax + ;; c >>= 52 + shrd r8,r9,52 + mov r9,0 + ;; c += t3 + add r8,r10 + ;; c += d * R + mov rax,rbx + mul rbp + add r8,rax + adc r9,rdx + ;; r[3] = 
c & M + mov rax,r8 + mov rdx,0fffffffffffffh + and rax,rdx + mov [rsi+3*8],rax + ;; c >>= 52 (r8 only) + shrd r8,r9,52 + ;; c += t4 (r8 only) + add r8,rdi + ;; r[4] = c + mov [rsi+4*8],r8 pop r15 pop r14 @@ -311,16 +309,14 @@ common_exit_norm: ;; PROC ExSetSquare ;; Register Layout: ;; INPUT: rdi = a.n - ;; rsi = this.a + ;; rsi = r.n ;; INTERNAL: rdx:rax = multiplication accumulator ;; r9:r8 = c - ;; r10-r13 = t0-t3 - ;; r14 = a.n[0] / t4 - ;; r15 = a.n[1] / t5 - ;; rbx = a.n[2] / t6 - ;; rcx = a.n[3] / t7 - ;; rbp = 0FFFFFFFFFFFFFh / t8 - ;; rsi = a.n[4] / t9 + ;; r10:r14 = a0-a4 + ;; rcx:rbx = d + ;; rbp = R + ;; rdi = t? + ;; r15 = M GLOBAL SYM(secp256k1_fe_sqr_inner) ALIGN 32 SYM(secp256k1_fe_sqr_inner): @@ -330,140 +326,204 @@ SYM(secp256k1_fe_sqr_inner): push r13 push r14 push r15 - push rsi - mov rbp,0FFFFFFFFFFFFFh - - ;; c=a.n[0] * a.n[0] - mov r14,[rdi+0*8] ; r14=a.n[0] - mov r10,rbp ; modulus + mov r10,[rdi+0*8] + mov r11,[rdi+1*8] + mov r12,[rdi+2*8] + mov r13,[rdi+3*8] + mov r14,[rdi+4*8] + mov rbp,01000003D10h + mov r15,0fffffffffffffh + + ;; d = (a0*2) * a3 + lea rax,[r10*2] + mul r13 + mov rbx,rax + mov rcx,rdx + ;; d += (a1*2) * a2 + lea rax,[r11*2] + mul r12 + add rbx,rax + adc rcx,rdx + ;; c = a4 * a4 mov rax,r14 - mul rax - mov r15,[rdi+1*8] ; a.n[1] - add r14,r14 ; r14=2*a.n[0] + mul r14 mov r8,rax - and r10,rax ; only need lower qword - shrd r8,rdx,52 - xor r9,r9 - - ;; c+=2*a.n[0] * a.n[1] - mov rax,r14 ; r14=2*a.n[0] - mul r15 - mov rbx,[rdi+2*8] ; rbx=a.n[2] - mov r11,rbp ; modulus - add r8,rax - adc r9,rdx - and r11,r8 + mov r9,rdx + ;; d += (c & M) * R + and rax,r15 + mul rbp + add rbx,rax + adc rcx,rdx + ;; c >>= 52 (r8 only) shrd r8,r9,52 - xor r9,r9 - - ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] - mov rax,r14 - mul rbx + ;; t3 (stack) = d & M + mov rdi,rbx + and rdi,r15 + push rdi + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; a4 *= 2 + add r14,r14 + ;; d += a0 * a4 + mov rax,r10 + mul r14 + add rbx,rax + adc rcx,rdx + ;; d+= (a1*2) * a3 + lea rax,[r11*2] + mul r13 + add rbx,rax + adc rcx,rdx + ;; d += a2 * a2 + mov rax,r12 + mul r12 + add rbx,rax + adc rcx,rdx + ;; d += c * R + mov rax,r8 + mul rbp + add rbx,rax + adc rcx,rdx + ;; t4 = d & M (rdi) + mov rdi,rbx + and rdi,r15 + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; tx = t4 >> 48 (rbp, overwrites constant) + mov rbp,rdi + shr rbp,48 + ;; t4 &= (M >> 4) (stack) + mov rax,0ffffffffffffh + and rdi,rax + push rdi + ;; c = a0 * a0 + mov rax,r10 + mul r10 + mov r8,rax + mov r9,rdx + ;; d += a1 * a4 + mov rax,r11 + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += (a2*2) * a3 + lea rax,[r12*2] + mul r13 + add rbx,rax + adc rcx,rdx + ;; u0 = d & M (rdi) + mov rdi,rbx + and rdi,r15 + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; u0 = (u0 << 4) | tx (rdi) + shl rdi,4 + or rdi,rbp + ;; c += u0 * (R >> 4) + mov rax,01000003D1h + mul rdi add r8,rax adc r9,rdx - - mov rax,r15 - mov r12,rbp ; modulus - mul rax - mov rcx,[rdi+3*8] ; rcx=a.n[3] - add r15,r15 ; r15=a.n[1]*2 - add r8,rax - adc r9,rdx - and r12,r8 ; only need lower dword + ;; r[0] = c & M + mov rax,r8 + and rax,r15 + mov [rsi+0*8],rax + ;; c >>= 52 shrd r8,r9,52 - xor r9,r9 - - ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] - mov rax,r14 - mul rcx + mov r9,0 + ;; a0 *= 2 + add r10,r10 + ;; c += a0 * a1 + mov rax,r10 + mul r11 add r8,rax adc r9,rdx - - mov rax,r15 ; rax=2*a.n[1] - mov r13,rbp ; modulus - mul rbx - mov rsi,[rdi+4*8] ; rsi=a.n[4] - add r8,rax - adc r9,rdx - and r13,r8 - shrd r8,r9,52 - xor r9,r9 - - ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] - mov 
rax,r14 ; last time we need 2*a.n[0] - mul rsi - add r8,rax - adc r9,rdx - - mov rax,r15 - mul rcx - mov r14,rbp ; modulus - add r8,rax - adc r9,rdx - + ;; d += a2 * a4 + mov rax,r12 + mul r14 + add rbx,rax + adc rcx,rdx + ;; d += a3 * a3 + mov rax,r13 + mul r13 + add rbx,rax + adc rcx,rdx + ;; load R in rbp + mov rbp,01000003D10h + ;; c += (d & M) * R mov rax,rbx - mul rax - add rbx,rbx ; rcx=2*a.n[2] + and rax,r15 + mul rbp add r8,rax adc r9,rdx - and r14,r8 + ;; d >>= 52 + shrd rbx,rcx,52 + mov rcx,0 + ;; r[1] = c & M + mov rax,r8 + and rax,r15 + mov [rsi+8*1],rax + ;; c >>= 52 shrd r8,r9,52 - xor r9,r9 - - ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] - mov rax,r15 ; last time we need 2*a.n[1] - mul rsi + mov r9,0 + ;; c += a0 * a2 (last use of r10) + mov rax,r10 + mul r12 add r8,rax adc r9,rdx - + ;; fetch t3 (r10, overwrites a0),t4 (rdi) + pop rdi + pop r10 + ;; c += a1 * a1 + mov rax,r11 + mul r11 + add r8,rax + adc r9,rdx + ;; d += a3 * a4 + mov rax,r13 + mul r14 + add rbx,rax + adc rcx,rdx + ;; c += (d & M) * R mov rax,rbx - mul rcx - mov r15,rbp ; modulus + and rax,r15 + mul rbp add r8,rax adc r9,rdx - and r15,r8 + ;; d >>= 52 (rbx only) + shrd rbx,rcx,52 + ;; r[2] = c & M + mov rax,r8 + and rax,r15 + mov [rsi+2*8],rax + ;; c >>= 52 shrd r8,r9,52 - xor r9,r9 - - ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] - mov rax,rbx ; last time we need 2*a.n[2] - mul rsi + mov r9,0 + ;; c += t3 + add r8,r10 + ;; c += d * R + mov rax,rbx + mul rbp add r8,rax adc r9,rdx - - mov rax,rcx ; a.n[3] - mul rax - mov rbx,rbp ; modulus - add r8,rax - adc r9,rdx - and rbx,r8 ; only need lower dword - lea rax,[2*rcx] + ;; r[3] = c & M + mov rax,r8 + and rax,r15 + mov [rsi+3*8],rax + ;; c >>= 52 (r8 only) shrd r8,r9,52 - xor r9,r9 + ;; c += t4 (r8 only) + add r8,rdi + ;; r[4] = c + mov [rsi+4*8],r8 - ;; c+=2*a.n[3]*a.n[4] - mul rsi - mov rcx,rbp ; modulus - add r8,rax - adc r9,rdx - and rcx,r8 ; only need lower dword - shrd r8,r9,52 - xor r9,r9 - - ;; c+=a.n[4]*a.n[4] - mov rax,rsi - mul rax - ;; mov rbp,rbp ; modulus is already there! 
- add r8,rax - adc r9,rdx - and rbp,r8 - shrd r8,r9,52 - xor r9,r9 - - mov rsi,r8 - - ;; ******************************************************* - jmp common_exit_norm - end - - + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + ret From 67935050e133c53fcc96be0e129abd193c5946ea Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Tue, 2 Dec 2014 17:47:32 +0100 Subject: [PATCH 2/3] Convert YASM code into inline assembly --- Makefile.am | 24 +- build-aux/m4/bitcoin_secp.m4 | 42 +-- configure.ac | 1 - nasm_lt.sh | 57 ---- src/field_5x52_asm.asm | 529 ----------------------------------- src/field_5x52_asm_impl.h | 495 +++++++++++++++++++++++++++++++- 6 files changed, 506 insertions(+), 642 deletions(-) delete mode 100755 nasm_lt.sh delete mode 100644 src/field_5x52_asm.asm diff --git a/Makefile.am b/Makefile.am index dbf1790f341..6e42c64f3f1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,12 +1,6 @@ ACLOCAL_AMFLAGS = -I build-aux/m4 lib_LTLIBRARIES = libsecp256k1.la -if USE_ASM -COMMON_LIB = libsecp256k1_common.la -else -COMMON_LIB = -endif -noinst_LTLIBRARIES = $(COMMON_LIB) include_HEADERS = include/secp256k1.h noinst_HEADERS = noinst_HEADERS += src/scalar.h @@ -47,13 +41,9 @@ noinst_HEADERS += src/field_impl.h pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libsecp256k1.pc -if USE_ASM -libsecp256k1_common_la_SOURCES = src/field_5x52_asm.asm -endif - libsecp256k1_la_SOURCES = src/secp256k1.c libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include $(SECP_INCLUDES) -libsecp256k1_la_LIBADD = $(COMMON_LIB) $(SECP_LIBS) +libsecp256k1_la_LIBADD = $(SECP_LIBS) noinst_PROGRAMS = @@ -66,7 +56,7 @@ bench_sign_SOURCES = src/bench_sign.c bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS) bench_sign_LDFLAGS = -static bench_inv_SOURCES = src/bench_inv.c -bench_inv_LDADD = $(COMMON_LIB) $(SECP_LIBS) +bench_inv_LDADD = $(SECP_LIBS) bench_inv_LDFLAGS = -static bench_inv_CPPFLAGS = $(SECP_INCLUDES) endif @@ -75,15 +65,9 @@ if USE_TESTS noinst_PROGRAMS += tests tests_SOURCES = src/tests.c tests_CPPFLAGS = -DVERIFY $(SECP_INCLUDES) $(SECP_TEST_INCLUDES) -tests_LDADD = $(COMMON_LIB) $(SECP_LIBS) $(SECP_TEST_LIBS) +tests_LDADD = $(SECP_LIBS) $(SECP_TEST_LIBS) tests_LDFLAGS = -static TESTS = tests endif -EXTRA_DIST = autogen.sh nasm_lt.sh - -#x86_64 only -if USE_ASM -.asm.lo: - $(LIBTOOL) --mode=compile --tag YASM $(srcdir)/nasm_lt.sh $(YASM) -f $(YASM_BINFMT) $(YAFLAGS) -I$(srcdir) -I. 
$< -o $@ -endif +EXTRA_DIST = autogen.sh diff --git a/build-aux/m4/bitcoin_secp.m4 b/build-aux/m4/bitcoin_secp.m4 index 4ca28f99cfa..7163b948090 100644 --- a/build-aux/m4/bitcoin_secp.m4 +++ b/build-aux/m4/bitcoin_secp.m4 @@ -11,38 +11,16 @@ fi dnl AC_DEFUN([SECP_64BIT_ASM_CHECK],[ -if test x"$host_cpu" == x"x86_64"; then - AC_CHECK_PROG(YASM, yasm, yasm) -else - if test x"$set_field" = x"64bit_asm"; then - AC_MSG_ERROR([$set_field field support explicitly requested but is not compatible with this host]) - fi -fi -if test x$YASM = x; then - if test x"$set_field" = x"64bit_asm"; then - AC_MSG_ERROR([$set_field field support explicitly requested but yasm was not found]) - fi - has_64bit_asm=no -else - case x"$host_os" in - xdarwin*) - YASM_BINFMT=macho64 - ;; - x*-gnux32) - YASM_BINFMT=elfx32 - ;; - *) - YASM_BINFMT=elf64 - ;; - esac - if $YASM -f help | grep -q $YASM_BINFMT; then - has_64bit_asm=yes - else - if test x"$set_field" = x"64bit_asm"; then - AC_MSG_ERROR([$set_field field support explicitly requested but yasm doesn't support $YASM_BINFMT format]) - fi - AC_MSG_WARN([yasm too old for $YASM_BINFMT format]) - has_64bit_asm=no +AC_MSG_CHECKING(for x86_64 assembly availability) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include ]],[[ + uint64_t a = 11, tmp; + __asm__ __volatile__("movq $0x100000000,%1; mulq %%rsi" : "+a"(a) : "S"(tmp) : "cc", "%rdx"); + ]])],[has_64bit_asm=yes],[has_64bit_asm=no]) +AC_MSG_RESULT([$has_64bit_asm]) +if test x"$set_field" == x"64bit_asm"; then + if test x"$has_64bit_asm" == x"no"; then + AC_MSG_ERROR([$set_field field support explicitly requested but no x86_64 assembly available]) fi fi ]) diff --git a/configure.ac b/configure.ac index 6e6fccd7fdd..60a54051576 100644 --- a/configure.ac +++ b/configure.ac @@ -283,7 +283,6 @@ AC_SUBST(SECP_INCLUDES) AC_SUBST(SECP_LIBS) AC_SUBST(SECP_TEST_LIBS) AC_SUBST(SECP_TEST_INCLUDES) -AC_SUBST(YASM_BINFMT) AM_CONDITIONAL([USE_ASM], [test x"$set_field" == x"64bit_asm"]) AM_CONDITIONAL([USE_TESTS], [test x"$use_tests" != x"no"]) AM_CONDITIONAL([USE_BENCHMARK], [test x"$use_benchmark" != x"no"]) diff --git a/nasm_lt.sh b/nasm_lt.sh deleted file mode 100755 index 6cd73294c06..00000000000 --- a/nasm_lt.sh +++ /dev/null @@ -1,57 +0,0 @@ -#! /bin/sh -command="" -infile="" -o_opt=no -pic=no -while [ $# -gt 0 ]; do - case "$1" in - -DPIC|-fPIC|-fpic|-Kpic|-KPIC) - if [ "$pic" != "yes" ] ; then - command="$command -DPIC" - pic=yes - fi - ;; - -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \ - -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64) - # it's a file format specifier for nasm. - command="$command $1" - ;; - -f*) - # maybe a code-generation flag for gcc. - ;; - -[Ii]*) - incdir=`echo "$1" | sed 's/^-[Ii]//'` - if [ "x$incdir" = x -a "x$2" != x ] ; then - case "$2" in - -*) ;; - *) incdir="$2"; shift;; - esac - fi - if [ "x$incdir" != x ] ; then - # In the case of NASM, the trailing slash is necessary. - incdir=`echo "$incdir" | sed 's%/*$%/%'` - command="$command -I$incdir" - fi - ;; - -o*) - o_opt=yes - command="$command $1" - ;; - *.asm) - infile=$1 - command="$command $1" - ;; - *) - command="$command $1" - ;; - esac - shift -done -if [ "$o_opt" != yes ] ; then - # By default, NASM creates an output file - # in the same directory as the input file. 
- outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o" - command="$command $outfile" -fi -echo $command -exec $command diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm deleted file mode 100644 index 11f12eeb2dd..00000000000 --- a/src/field_5x52_asm.asm +++ /dev/null @@ -1,529 +0,0 @@ - ;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille - ;; Distributed under the MIT software license, see the accompanying - ;; file COPYING or http://www.opensource.org/licenses/mit-license.php. - - ;; Changelog: - ;; * March 2013, Diederik Huys: Original version - ;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel - ;; multiplication algorithm - ;; - ;; Provided public procedures: - ;; secp256k1_fe_mul_inner - ;; secp256k1_fe_sqr_inner - ;; - ;; Needed tools: YASM (http://yasm.tortall.net) - ;; - ;; - - BITS 64 - -%ifidn __OUTPUT_FORMAT__,macho64 -%define SYM(x) _ %+ x -%else -%define SYM(x) x -%endif - - ;; Procedure ExSetMult - ;; Register Layout: - ;; INPUT: rdi = a->n - ;; rsi = b->n - ;; rdx = r->a - ;; - ;; INTERNAL: rdx:rax = multiplication accumulator - ;; r9:r8 = c - ;; r10:r14 = a0-a4 - ;; rcx:rbx = d - ;; rbp = R - ;; rdi = t? - ;; r15 = b->n - ;; rsi = r->n - GLOBAL SYM(secp256k1_fe_mul_inner) - ALIGN 32 -SYM(secp256k1_fe_mul_inner): - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - mov r10,[rdi+0*8] - mov r11,[rdi+1*8] - mov r12,[rdi+2*8] - mov r13,[rdi+3*8] - mov r14,[rdi+4*8] - mov rbp,01000003D10h - mov r15,rsi - mov rsi,rdx - - ;; d += a3 * b0 - mov rax,[r15+0*8] - mul r13 - mov rbx,rax - mov rcx,rdx - ;; d += a2 * b1 - mov rax,[r15+1*8] - mul r12 - add rbx,rax - adc rcx,rdx - ;; d += a1 * b2 - mov rax,[r15+2*8] - mul r11 - add rbx,rax - adc rcx,rdx - ;; d = a0 * b3 - mov rax,[r15+3*8] - mul r10 - add rbx,rax - adc rcx,rdx - ;; c = a4 * b4 - mov rax,[r15+4*8] - mul r14 - mov r8,rax - mov r9,rdx - ;; d += (c & M) * R - mov rdx,0fffffffffffffh - and rax,rdx - mul rbp - add rbx,rax - adc rcx,rdx - ;; c >>= 52 (r8 only) - shrd r8,r9,52 - ;; t3 (stack) = d & M - mov rdi,rbx - mov rdx,0fffffffffffffh - and rdi,rdx - push rdi - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; d += a4 * b0 - mov rax,[r15+0*8] - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += a3 * b1 - mov rax,[r15+1*8] - mul r13 - add rbx,rax - adc rcx,rdx - ;; d += a2 * b2 - mov rax,[r15+2*8] - mul r12 - add rbx,rax - adc rcx,rdx - ;; d += a1 * b3 - mov rax,[r15+3*8] - mul r11 - add rbx,rax - adc rcx,rdx - ;; d += a0 * b4 - mov rax,[r15+4*8] - mul r10 - add rbx,rax - adc rcx,rdx - ;; d += c * R - mov rax,r8 - mul rbp - add rbx,rax - adc rcx,rdx - ;; t4 = d & M (rdi) - mov rdi,rbx - mov rdx,0fffffffffffffh - and rdi,rdx - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; tx = t4 >> 48 (rbp, overwrites R) - mov rbp,rdi - shr rbp,48 - ;; t4 &= (M >> 4) (stack) - mov rax,0ffffffffffffh - and rdi,rax - push rdi - ;; c = a0 * b0 - mov rax,[r15+0*8] - mul r10 - mov r8,rax - mov r9,rdx - ;; d += a4 * b1 - mov rax,[r15+1*8] - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += a3 * b2 - mov rax,[r15+2*8] - mul r13 - add rbx,rax - adc rcx,rdx - ;; d += a2 * b3 - mov rax,[r15+3*8] - mul r12 - add rbx,rax - adc rcx,rdx - ;; d += a1 * b4 - mov rax,[r15+4*8] - mul r11 - add rbx,rax - adc rcx,rdx - ;; u0 = d & M (rdi) - mov rdi,rbx - mov rdx,0fffffffffffffh - and rdi,rdx - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; u0 = (u0 << 4) | tx (rdi) - shl rdi,4 - or rdi,rbp - ;; c += u0 * (R >> 4) - mov rax,01000003D1h - mul rdi - add r8,rax - adc r9,rdx - ;; r[0] = c & M - mov rax,r8 - mov 
rdx,0fffffffffffffh - and rax,rdx - mov [rsi+0*8],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; c += a1 * b0 - mov rax,[r15+0*8] - mul r11 - add r8,rax - adc r9,rdx - ;; c += a0 * b1 - mov rax,[r15+1*8] - mul r10 - add r8,rax - adc r9,rdx - ;; d += a4 * b2 - mov rax,[r15+2*8] - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += a3 * b3 - mov rax,[r15+3*8] - mul r13 - add rbx,rax - adc rcx,rdx - ;; d += a2 * b4 - mov rax,[r15+4*8] - mul r12 - add rbx,rax - adc rcx,rdx - ;; restore rdp = R - mov rbp,01000003D10h - ;; c += (d & M) * R - mov rax,rbx - mov rdx,0fffffffffffffh - and rax,rdx - mul rbp - add r8,rax - adc r9,rdx - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; r[1] = c & M - mov rax,r8 - mov rdx,0fffffffffffffh - and rax,rdx - mov [rsi+8*1],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; c += a2 * b0 - mov rax,[r15+0*8] - mul r12 - add r8,rax - adc r9,rdx - ;; c += a1 * b1 - mov rax,[r15+1*8] - mul r11 - add r8,rax - adc r9,rdx - ;; c += a0 * b2 (last use of r10 = a0) - mov rax,[r15+2*8] - mul r10 - add r8,rax - adc r9,rdx - ;; fetch t3 (r10, overwrites a0),t4 (rdi) - pop rdi - pop r10 - ;; d += a4 * b3 - mov rax,[r15+3*8] - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += a3 * b4 - mov rax,[r15+4*8] - mul r13 - add rbx,rax - adc rcx,rdx - ;; c += (d & M) * R - mov rax,rbx - mov rdx,0fffffffffffffh - and rax,rdx - mul rbp - add r8,rax - adc r9,rdx - ;; d >>= 52 (rbx only) - shrd rbx,rcx,52 - ;; r[2] = c & M - mov rax,r8 - mov rdx,0fffffffffffffh - and rax,rdx - mov [rsi+2*8],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; c += t3 - add r8,r10 - ;; c += d * R - mov rax,rbx - mul rbp - add r8,rax - adc r9,rdx - ;; r[3] = c & M - mov rax,r8 - mov rdx,0fffffffffffffh - and rax,rdx - mov [rsi+3*8],rax - ;; c >>= 52 (r8 only) - shrd r8,r9,52 - ;; c += t4 (r8 only) - add r8,rdi - ;; r[4] = c - mov [rsi+4*8],r8 - - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp - ret - - - ;; PROC ExSetSquare - ;; Register Layout: - ;; INPUT: rdi = a.n - ;; rsi = r.n - ;; INTERNAL: rdx:rax = multiplication accumulator - ;; r9:r8 = c - ;; r10:r14 = a0-a4 - ;; rcx:rbx = d - ;; rbp = R - ;; rdi = t? 
- ;; r15 = M - GLOBAL SYM(secp256k1_fe_sqr_inner) - ALIGN 32 -SYM(secp256k1_fe_sqr_inner): - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - mov r10,[rdi+0*8] - mov r11,[rdi+1*8] - mov r12,[rdi+2*8] - mov r13,[rdi+3*8] - mov r14,[rdi+4*8] - mov rbp,01000003D10h - mov r15,0fffffffffffffh - - ;; d = (a0*2) * a3 - lea rax,[r10*2] - mul r13 - mov rbx,rax - mov rcx,rdx - ;; d += (a1*2) * a2 - lea rax,[r11*2] - mul r12 - add rbx,rax - adc rcx,rdx - ;; c = a4 * a4 - mov rax,r14 - mul r14 - mov r8,rax - mov r9,rdx - ;; d += (c & M) * R - and rax,r15 - mul rbp - add rbx,rax - adc rcx,rdx - ;; c >>= 52 (r8 only) - shrd r8,r9,52 - ;; t3 (stack) = d & M - mov rdi,rbx - and rdi,r15 - push rdi - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; a4 *= 2 - add r14,r14 - ;; d += a0 * a4 - mov rax,r10 - mul r14 - add rbx,rax - adc rcx,rdx - ;; d+= (a1*2) * a3 - lea rax,[r11*2] - mul r13 - add rbx,rax - adc rcx,rdx - ;; d += a2 * a2 - mov rax,r12 - mul r12 - add rbx,rax - adc rcx,rdx - ;; d += c * R - mov rax,r8 - mul rbp - add rbx,rax - adc rcx,rdx - ;; t4 = d & M (rdi) - mov rdi,rbx - and rdi,r15 - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; tx = t4 >> 48 (rbp, overwrites constant) - mov rbp,rdi - shr rbp,48 - ;; t4 &= (M >> 4) (stack) - mov rax,0ffffffffffffh - and rdi,rax - push rdi - ;; c = a0 * a0 - mov rax,r10 - mul r10 - mov r8,rax - mov r9,rdx - ;; d += a1 * a4 - mov rax,r11 - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += (a2*2) * a3 - lea rax,[r12*2] - mul r13 - add rbx,rax - adc rcx,rdx - ;; u0 = d & M (rdi) - mov rdi,rbx - and rdi,r15 - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; u0 = (u0 << 4) | tx (rdi) - shl rdi,4 - or rdi,rbp - ;; c += u0 * (R >> 4) - mov rax,01000003D1h - mul rdi - add r8,rax - adc r9,rdx - ;; r[0] = c & M - mov rax,r8 - and rax,r15 - mov [rsi+0*8],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; a0 *= 2 - add r10,r10 - ;; c += a0 * a1 - mov rax,r10 - mul r11 - add r8,rax - adc r9,rdx - ;; d += a2 * a4 - mov rax,r12 - mul r14 - add rbx,rax - adc rcx,rdx - ;; d += a3 * a3 - mov rax,r13 - mul r13 - add rbx,rax - adc rcx,rdx - ;; load R in rbp - mov rbp,01000003D10h - ;; c += (d & M) * R - mov rax,rbx - and rax,r15 - mul rbp - add r8,rax - adc r9,rdx - ;; d >>= 52 - shrd rbx,rcx,52 - mov rcx,0 - ;; r[1] = c & M - mov rax,r8 - and rax,r15 - mov [rsi+8*1],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; c += a0 * a2 (last use of r10) - mov rax,r10 - mul r12 - add r8,rax - adc r9,rdx - ;; fetch t3 (r10, overwrites a0),t4 (rdi) - pop rdi - pop r10 - ;; c += a1 * a1 - mov rax,r11 - mul r11 - add r8,rax - adc r9,rdx - ;; d += a3 * a4 - mov rax,r13 - mul r14 - add rbx,rax - adc rcx,rdx - ;; c += (d & M) * R - mov rax,rbx - and rax,r15 - mul rbp - add r8,rax - adc r9,rdx - ;; d >>= 52 (rbx only) - shrd rbx,rcx,52 - ;; r[2] = c & M - mov rax,r8 - and rax,r15 - mov [rsi+2*8],rax - ;; c >>= 52 - shrd r8,r9,52 - mov r9,0 - ;; c += t3 - add r8,r10 - ;; c += d * R - mov rax,rbx - mul rbp - add r8,rax - adc r9,rdx - ;; r[3] = c & M - mov rax,r8 - and rax,r15 - mov [rsi+3*8],rax - ;; c >>= 52 (r8 only) - shrd r8,r9,52 - ;; c += t4 (r8 only) - add r8,rdi - ;; r[4] = c - mov [rsi+4*8],r8 - - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp - ret diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h index f29605b11b9..23857cded3a 100644 --- a/src/field_5x52_asm_impl.h +++ b/src/field_5x52_asm_impl.h @@ -1,13 +1,502 @@ /********************************************************************** - * Copyright (c) 2013 Pieter Wuille * + * Copyright (c) 
2013-2014 Diederik Huys, Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or http://www.opensource.org/licenses/mit-license.php.* **********************************************************************/ +/** + * Changelog: + * - March 2013, Diederik Huys: original version + * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm + * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly + */ + #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_ #define _SECP256K1_FIELD_INNER5X52_IMPL_H_ -void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r); -void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r); +SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) { +/** + * Registers: rdx:rax = multiplication accumulator + * r9:r8 = c + * r15:rcx = d + * r10-r14 = a0-a4 + * rbx = b + * %2 = r + * %0 = a / t? + * rbp = R (0x1000003d10) + */ +__asm__ __volatile__( + "pushq %%rbp\n" + + "movq 0(%0),%%r10\n" + "movq 8(%0),%%r11\n" + "movq 16(%0),%%r12\n" + "movq 24(%0),%%r13\n" + "movq 32(%0),%%r14\n" + "movq $0x1000003d10,%%rbp\n" + + /* d += a3 * b0 */ + "movq 0(%%rbx),%%rax\n" + "mulq %%r13\n" + "movq %%rax,%%rcx\n" + "movq %%rdx,%%r15\n" + /* d += a2 * b1 */ + "movq 8(%%rbx),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a1 * b2 */ + "movq 16(%%rbx),%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d = a0 * b3 */ + "movq 24(%%rbx),%%rax\n" + "mulq %%r10\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* c = a4 * b4 */ + "movq 32(%%rbx),%%rax\n" + "mulq %%r14\n" + "movq %%rax,%%r8\n" + "movq %%rdx,%%r9\n" + /* d += (c & M) * R */ + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* c >>= 52 (%%r8 only) */ + "shrdq $52,%%r9,%%r8\n" + /* t3 (stack) = d & M */ + "movq %%rcx,%0\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%0\n" + "pushq %0\n" + /* d >>= 52 */ + "shrdq $52,%%r15,%%rcx\n" + "xorq %%r15,%%r15\n" + /* d += a4 * b0 */ + "movq 0(%%rbx),%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a3 * b1 */ + "movq 8(%%rbx),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a2 * b2 */ + "movq 16(%%rbx),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a1 * b3 */ + "movq 24(%%rbx),%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a0 * b4 */ + "movq 32(%%rbx),%%rax\n" + "mulq %%r10\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += c * R */ + "movq %%r8,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* t4 = d & M (%0) */ + "movq %%rcx,%0\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%0\n" + /* d >>= 52 */ + "shrdq $52,%%r15,%%rcx\n" + "xorq %%r15,%%r15\n" + /* tx = t4 >> 48 (%%rbp, overwrites R) */ + "movq %0,%%rbp\n" + "shrq $48,%%rbp\n" + /* t4 &= (M >> 4) (stack) */ + "movq $0xffffffffffff,%%rax\n" + "andq %%rax,%0\n" + "pushq %0\n" + /* c = a0 * b0 */ + "movq 0(%%rbx),%%rax\n" + "mulq %%r10\n" + "movq %%rax,%%r8\n" + "movq %%rdx,%%r9\n" + /* d += a4 * b1 */ + "movq 8(%%rbx),%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a3 * b2 */ + "movq 16(%%rbx),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rcx\n" + "adcq 
%%rdx,%%r15\n" + /* d += a2 * b3 */ + "movq 24(%%rbx),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a1 * b4 */ + "movq 32(%%rbx),%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* u0 = d & M (%0) */ + "movq %%rcx,%0\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%0\n" + /* d >>= 52 */ + "shrdq $52,%%r15,%%rcx\n" + "xorq %%r15,%%r15\n" + /* u0 = (u0 << 4) | tx (%0) */ + "shlq $4,%0\n" + "orq %%rbp,%0\n" + /* c += u0 * (R >> 4) */ + "movq $0x1000003d1,%%rax\n" + "mulq %0\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* r[0] = c & M */ + "movq %%r8,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "movq %%rax,0(%2)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* c += a1 * b0 */ + "movq 0(%%rbx),%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* c += a0 * b1 */ + "movq 8(%%rbx),%%rax\n" + "mulq %%r10\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d += a4 * b2 */ + "movq 16(%%rbx),%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a3 * b3 */ + "movq 24(%%rbx),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a2 * b4 */ + "movq 32(%%rbx),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* restore rdp = R */ + "movq $0x1000003d10,%%rbp\n" + /* c += (d & M) * R */ + "movq %%rcx,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d >>= 52 */ + "shrdq $52,%%r15,%%rcx\n" + "xorq %%r15,%%r15\n" + /* r[1] = c & M */ + "movq %%r8,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "movq %%rax,8(%2)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* c += a2 * b0 */ + "movq 0(%%rbx),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* c += a1 * b1 */ + "movq 8(%%rbx),%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* c += a0 * b2 (last use of %%r10 = a0) */ + "movq 16(%%rbx),%%rax\n" + "mulq %%r10\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* fetch t3 (%%r10, overwrites a0),t4 (%0) */ + "popq %0\n" + "popq %%r10\n" + /* d += a4 * b3 */ + "movq 24(%%rbx),%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* d += a3 * b4 */ + "movq 32(%%rbx),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rcx\n" + "adcq %%rdx,%%r15\n" + /* c += (d & M) * R */ + "movq %%rcx,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d >>= 52 (%%rcx only) */ + "shrdq $52,%%r15,%%rcx\n" + /* r[2] = c & M */ + "movq %%r8,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "movq %%rax,16(%2)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* c += t3 */ + "addq %%r10,%%r8\n" + /* c += d * R */ + "movq %%rcx,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* r[3] = c & M */ + "movq %%r8,%%rax\n" + "movq $0xfffffffffffff,%%rdx\n" + "andq %%rdx,%%rax\n" + "movq %%rax,24(%2)\n" + /* c >>= 52 (%%r8 only) */ + "shrdq $52,%%r9,%%r8\n" + /* c += t4 (%%r8 only) */ + "addq %0,%%r8\n" + /* r[4] = c */ + "movq %%r8,32(%2)\n" + + "popq %%rbp\n" +: "+S"(a) +: "b"(b), "D"(r) +: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" +); +} + +SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { +/** + * Registers: rdx:rax = 
multiplication accumulator + * r9:r8 = c + * rcx:rbx = d + * r10-r14 = a0-a4 + * r15 = M (0xfffffffffffff) + * %1 = r + * %0 = a / t? + * rbp = R (0x1000003d10) + */ +__asm__ __volatile__( + "pushq %%rbp\n" + + "movq 0(%0),%%r10\n" + "movq 8(%0),%%r11\n" + "movq 16(%0),%%r12\n" + "movq 24(%0),%%r13\n" + "movq 32(%0),%%r14\n" + "movq $0x1000003d10,%%rbp\n" + "movq $0xfffffffffffff,%%r15\n" + + /* d = (a0*2) * a3 */ + "leaq (%%r10,%%r10,1),%%rax\n" + "mulq %%r13\n" + "movq %%rax,%%rbx\n" + "movq %%rdx,%%rcx\n" + /* d += (a1*2) * a2 */ + "leaq (%%r11,%%r11,1),%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* c = a4 * a4 */ + "movq %%r14,%%rax\n" + "mulq %%r14\n" + "movq %%rax,%%r8\n" + "movq %%rdx,%%r9\n" + /* d += (c & M) * R */ + "andq %%r15,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* c >>= 52 (%%r8 only) */ + "shrdq $52,%%r9,%%r8\n" + /* t3 (stack) = d & M */ + "movq %%rbx,%0\n" + "andq %%r15,%0\n" + "pushq %0\n" + /* d >>= 52 */ + "shrdq $52,%%rcx,%%rbx\n" + "xorq %%rcx,%%rcx\n" + /* a4 *= 2 */ + "addq %%r14,%%r14\n" + /* d += a0 * a4 */ + "movq %%r10,%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* d+= (a1*2) * a3 */ + "leaq (%%r11,%%r11,1),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* d += a2 * a2 */ + "movq %%r12,%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* d += c * R */ + "movq %%r8,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* t4 = d & M (%0) */ + "movq %%rbx,%0\n" + "andq %%r15,%0\n" + /* d >>= 52 */ + "shrdq $52,%%rcx,%%rbx\n" + "xorq %%rcx,%%rcx\n" + /* tx = t4 >> 48 (%%rbp, overwrites constant) */ + "movq %0,%%rbp\n" + "shrq $48,%%rbp\n" + /* t4 &= (M >> 4) (stack) */ + "movq $0xffffffffffff,%%rax\n" + "andq %%rax,%0\n" + "pushq %0\n" + /* c = a0 * a0 */ + "movq %%r10,%%rax\n" + "mulq %%r10\n" + "movq %%rax,%%r8\n" + "movq %%rdx,%%r9\n" + /* d += a1 * a4 */ + "movq %%r11,%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* d += (a2*2) * a3 */ + "leaq (%%r12,%%r12,1),%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* u0 = d & M (%0) */ + "movq %%rbx,%0\n" + "andq %%r15,%0\n" + /* d >>= 52 */ + "shrdq $52,%%rcx,%%rbx\n" + "xorq %%rcx,%%rcx\n" + /* u0 = (u0 << 4) | tx (%0) */ + "shlq $4,%0\n" + "orq %%rbp,%0\n" + /* c += u0 * (R >> 4) */ + "movq $0x1000003d1,%%rax\n" + "mulq %0\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* r[0] = c & M */ + "movq %%r8,%%rax\n" + "andq %%r15,%%rax\n" + "movq %%rax,0(%1)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* a0 *= 2 */ + "addq %%r10,%%r10\n" + /* c += a0 * a1 */ + "movq %%r10,%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d += a2 * a4 */ + "movq %%r12,%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* d += a3 * a3 */ + "movq %%r13,%%rax\n" + "mulq %%r13\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* load R in %%rbp */ + "movq $0x1000003d10,%%rbp\n" + /* c += (d & M) * R */ + "movq %%rbx,%%rax\n" + "andq %%r15,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d >>= 52 */ + "shrdq $52,%%rcx,%%rbx\n" + "xorq %%rcx,%%rcx\n" + /* r[1] = c & M */ + "movq %%r8,%%rax\n" + "andq %%r15,%%rax\n" + "movq %%rax,8(%1)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* c += a0 * a2 (last use of %%r10) */ + "movq %%r10,%%rax\n" + "mulq %%r12\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* 
fetch t3 (%%r10, overwrites a0),t4 (%0) */ + "popq %0\n" + "popq %%r10\n" + /* c += a1 * a1 */ + "movq %%r11,%%rax\n" + "mulq %%r11\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d += a3 * a4 */ + "movq %%r13,%%rax\n" + "mulq %%r14\n" + "addq %%rax,%%rbx\n" + "adcq %%rdx,%%rcx\n" + /* c += (d & M) * R */ + "movq %%rbx,%%rax\n" + "andq %%r15,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* d >>= 52 (%%rbx only) */ + "shrdq $52,%%rcx,%%rbx\n" + /* r[2] = c & M */ + "movq %%r8,%%rax\n" + "andq %%r15,%%rax\n" + "movq %%rax,16(%1)\n" + /* c >>= 52 */ + "shrdq $52,%%r9,%%r8\n" + "xorq %%r9,%%r9\n" + /* c += t3 */ + "addq %%r10,%%r8\n" + /* c += d * R */ + "movq %%rbx,%%rax\n" + "mulq %%rbp\n" + "addq %%rax,%%r8\n" + "adcq %%rdx,%%r9\n" + /* r[3] = c & M */ + "movq %%r8,%%rax\n" + "andq %%r15,%%rax\n" + "movq %%rax,24(%1)\n" + /* c >>= 52 (%%r8 only) */ + "shrdq $52,%%r9,%%r8\n" + /* c += t4 (%%r8 only) */ + "addq %0,%%r8\n" + /* r[4] = c */ + "movq %%r8,32(%1)\n" + + "popq %%rbp\n" +: "+S"(a) +: "D"(r) +: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" +); +} #endif From b2c9681c6fdd6a0280e1a40426db76d0e732e8d8 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Tue, 2 Dec 2014 17:51:55 +0100 Subject: [PATCH 3/3] Make {mul,sqr}_inner use the same argument order as {mul,sqr} --- src/field_10x26_impl.h | 8 ++++---- src/field_5x52_asm_impl.h | 4 ++-- src/field_5x52_impl.h | 4 ++-- src/field_5x52_int128_impl.h | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index c4403fba22d..390a2bcbc24 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -271,7 +271,7 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1 #define VERIFY_BITS(x, n) do { } while(0) #endif -SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b, uint32_t *r) { +SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) { VERIFY_BITS(a[0], 30); VERIFY_BITS(a[1], 30); VERIFY_BITS(a[2], 30); @@ -598,7 +598,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin /* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ } -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) { +SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) { VERIFY_BITS(a[0], 30); VERIFY_BITS(a[1], 30); VERIFY_BITS(a[2], 30); @@ -879,7 +879,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s secp256k1_fe_verify(b); VERIFY_CHECK(r != b); #endif - secp256k1_fe_mul_inner(a->n, b->n, r->n); + secp256k1_fe_mul_inner(r->n, a->n, b->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; @@ -892,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) { VERIFY_CHECK(a->magnitude <= 8); secp256k1_fe_verify(a); #endif - secp256k1_fe_sqr_inner(a->n, r->n); + secp256k1_fe_sqr_inner(r->n, a->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h index 23857cded3a..af12fd30d2c 100644 --- a/src/field_5x52_asm_impl.h +++ b/src/field_5x52_asm_impl.h @@ -14,7 +14,7 @@ #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_ #define _SECP256K1_FIELD_INNER5X52_IMPL_H_ -SECP256K1_INLINE static void 
secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) { +SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { /** * Registers: rdx:rax = multiplication accumulator * r9:r8 = c @@ -284,7 +284,7 @@ __asm__ __volatile__( ); } -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { +SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { /** * Registers: rdx:rax = multiplication accumulator * r9:r8 = c diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 75b210eaf68..4b833db3628 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -255,7 +255,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s secp256k1_fe_verify(b); VERIFY_CHECK(r != b); #endif - secp256k1_fe_mul_inner(a->n, b->n, r->n); + secp256k1_fe_mul_inner(r->n, a->n, b->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; @@ -268,7 +268,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) { VERIFY_CHECK(a->magnitude <= 8); secp256k1_fe_verify(a); #endif - secp256k1_fe_sqr_inner(a->n, r->n); + secp256k1_fe_sqr_inner(r->n, a->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h index e552fb4319e..ec631833cfa 100644 --- a/src/field_5x52_int128_impl.h +++ b/src/field_5x52_int128_impl.h @@ -15,7 +15,7 @@ #define VERIFY_BITS(x, n) do { } while(0) #endif -SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) { +SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { VERIFY_BITS(a[0], 56); VERIFY_BITS(a[1], 56); VERIFY_BITS(a[2], 56); @@ -152,7 +152,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ } -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) { +SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { VERIFY_BITS(a[0], 56); VERIFY_BITS(a[1], 56); VERIFY_BITS(a[2], 56);
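For readers following the hunks above: the new mul/sqr routines all lean on the same two constants and the same carry/fold step. Field elements in the 5x52 representation are five limbs of 52 bits, and p = 2^256 - 0x1000003D1, so 2^256 = 0x1000003D1 and 2^260 = 0x1000003D10 (mod p); that is where the "M" mask and the "R" multiplier loaded into rbp come from. Below is a minimal C sketch of the recurring fold step, assuming a compiler with unsigned __int128 support; fold_step is an illustrative name, not a function from the patch or the library.

    /* A sketch, not part of the patch: M is the 52-bit limb mask and R is
     * the fold-back multiplier (2^260 mod p) written as 0x1000003d10 in the
     * assembly above. */
    #include <stdint.h>

    #define M 0xFFFFFFFFFFFFFULL  /* 2^52 - 1 */
    #define R 0x1000003D10ULL     /* 2^260 mod p, with p = 2^256 - 0x1000003D1 */

    /* One carry/fold step as it recurs throughout the routines above:
     *   c += (d & M) * R;  r[i] = c & M;  c >>= 52;  d >>= 52;
     * c and d are the 128-bit accumulators the asm keeps in r9:r8 and
     * rcx:rbx (r15:rcx in the inline-asm version). */
    void fold_step(unsigned __int128 *c, unsigned __int128 *d, uint64_t *limb_out) {
        *c += (unsigned __int128)((uint64_t)*d & M) * R;  /* fold low 52 bits of d into c */
        *limb_out = (uint64_t)*c & M;                     /* emit one 52-bit result limb   */
        *c >>= 52;
        *d >>= 52;
    }

The `u0 * (R >> 4)` step uses 0x1000003D1 (2^256 mod p) instead, because u0 carries the four extra bits split off the 48-bit top limb. Running the c and d accumulators in parallel is what lets the rewritten code fold the high half of the product into the low half as it is produced, rather than the compute-then-reduce two-pass structure of the original 2013 version.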