internal/radix51: Use a macro for the reduction/carry propagation

This requires changing the registers used for feSquare, but it removes
a gigantic block of functionally equivalent assembly.
Yawning Angel 2021-02-09 14:38:08 +00:00
parent e283be7744
commit a8296fe54a
1 changed file with 118 additions and 146 deletions
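
The macro introduced below leans on one identity. The field prime is p = 2^255 - 19, and radix-2^51 limb i holds bits 51*i .. 51*i+50, so limb 4 ends at bit 254 and any carry out of it sits at bit 255 or above. Such a carry c can therefore be folded back into limb 0 as 19*c:

    2^255 ≡ 19 (mod p)  =>  c*2^255 + r ≡ 19*c + r (mod p)

This is why both passes of the reduction end with an IMUL3Q $19 into the bottom limb.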

@@ -5,8 +5,55 @@
// +build amd64,!purego
#include "textflag.h"
// reduce64 reduces the intermediaries stored in rsi, rbp, r8 .. r15.
//
// Inputs: rsi, rbp, r8 .. r15.
// Clobbers: rax, rdx
#define reduce64() \
MOVQ $2251799813685247, AX \ // (1<<51) - 1
SHLQ $13, SI, BP \ // r01 = shld with r00
ANDQ AX, SI \ // r00 &= mask51
SHLQ $13, R8, R9 \ // r11 = shld with r10
ANDQ AX, R8 \ // r10 &= mask51
ADDQ BP, R8 \ // r10 += r01
SHLQ $13, R10, R11 \ // r21 = shld with r20
ANDQ AX, R10 \ // r20 &= mask51
ADDQ R9, R10 \ // r20 += r11
SHLQ $13, R12, R13 \ // r31 = shld with r30
ANDQ AX, R12 \ // r30 &= mask51
ADDQ R11, R12 \ // r30 += r21
SHLQ $13, R14, R15 \ // r41 = shld with r40
ANDQ AX, R14 \ // r40 &= mask51
ADDQ R13, R14 \ // r40 += r31
IMUL3Q $19, R15, R15 \ // r41 = r41*19
ADDQ R15, SI \ // r00 += r41
\
MOVQ SI, DX \ // rdx <-- r00
SHRQ $51, DX \ // rdx <-- r00 >> 51
ADDQ DX, R8 \ // r10 += r00 >> 51
MOVQ R8, DX \ // rdx <-- r10
SHRQ $51, DX \ // rdx <-- r10 >> 51
ANDQ AX, SI \ // r00 &= mask51
ADDQ DX, R10 \ // r20 += r10 >> 51
MOVQ R10, DX \ // rdx <-- r20
SHRQ $51, DX \ // rdx <-- r20 >> 51
ANDQ AX, R8 \ // r10 &= mask51
ADDQ DX, R12 \ // r30 += r20 >> 51
MOVQ R12, DX \ // rdx <-- r30
SHRQ $51, DX \ // rdx <-- r30 >> 51
ANDQ AX, R10 \ // r20 &= mask51
ADDQ DX, R14 \ // r40 += r30 >> 51
MOVQ R14, DX \ // rdx <-- r40
SHRQ $51, DX \ // rdx <-- r40 >> 51
ANDQ AX, R12 \ // r30 &= mask51
IMUL3Q $19, DX, DX \ // rdx <-- (r40 >> 51) * 19
ADDQ DX, SI \ // r00 += (r40 >> 51) * 19
ANDQ AX, R14 \ // r40 &= mask51
// func feMulAmd64(out, a, b *FieldElement, useBMI2 bool)
TEXT ·feMulAmd64(SB),$0-25
TEXT ·feMulAmd64(SB), NOSPLIT|NOFRAME, $0-25
MOVQ a+8(FP), BX
MOVQ b+16(FP), CX
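
For orientation, here is a minimal pure-Go sketch of what reduce64 computes, assuming each of the five 128-bit intermediaries is held as a {lo, hi} pair of 64-bit words, matching the register pairs (rsi, rbp), (r8, r9), (r10, r11), (r12, r13), (r14, r15). The names (reduce64Sketch, limb, carry) are illustrative, not from the repository:

package main

import "fmt"

const mask51 = (1 << 51) - 1 // the $2251799813685247 constant loaded into AX

// reduce64Sketch mirrors the reduce64 macro. r[i] is the i-th intermediary
// as {lo, hi}; each is below 2^115, so everything above bit 51 fits in a
// single 64-bit carry word.
func reduce64Sketch(r [5][2]uint64) [5]uint64 {
	var limb, carry [5]uint64
	for i := range r {
		lo, hi := r[i][0], r[i][1]
		carry[i] = hi<<13 | lo>>51 // SHLQ $13, lo, hi (double-precision shift)
		limb[i] = lo & mask51      // ANDQ mask51
	}

	// Fold each carry into the next limb up; the top carry wraps around
	// to limb 0 multiplied by 19, since 2^255 = 19 mod p.
	limb[1] += carry[0]
	limb[2] += carry[1]
	limb[3] += carry[2]
	limb[4] += carry[3]
	limb[0] += carry[4] * 19

	// Second pass: the additions above can push limbs slightly past 51
	// bits, so propagate the carries once more.
	limb[1] += limb[0] >> 51
	limb[0] &= mask51
	limb[2] += limb[1] >> 51
	limb[1] &= mask51
	limb[3] += limb[2] >> 51
	limb[2] &= mask51
	limb[4] += limb[3] >> 51
	limb[3] &= mask51
	limb[0] += (limb[4] >> 51) * 19
	limb[4] &= mask51

	return limb
}

func main() {
	fmt.Println(reduce64Sketch([5][2]uint64{{1 << 52, 0}})) // [0 2 0 0 0]
}

Each SHLQ $13, lo, hi is a double-precision shift: a limb keeps only 51 bits, so the 13 spare bits of the low word plus the whole high word form the carry into the next limb.
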
@@ -270,45 +317,7 @@ mul_reduce:
ADCQ DX, R15
mul_reduce:
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, SI, BP // r01 = shld with r00
ANDQ AX, SI // r00 &= mask51
SHLQ $13, R8, R9 // r11 = shld with r10
ANDQ AX, R8 // r10 &= mask51
ADDQ BP, R8 // r10 += r01
SHLQ $13, R10, R11 // r21 = shld with r20
ANDQ AX, R10 // r20 &= mask51
ADDQ R9, R10 // r20 += r11
SHLQ $13, R12, R13 // r31 = shld with r30
ANDQ AX, R12 // r30 &= mask51
ADDQ R11, R12 // r30 += r21
SHLQ $13, R14, R15 // r41 = shld with r40
ANDQ AX, R14 // r40 &= mask51
ADDQ R13, R14 // r40 += r31
IMUL3Q $19, R15, R15 // r41 = r41*19
ADDQ R15, SI // r00 += r41
MOVQ SI, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R8 // r10 += r00 >> 51
MOVQ R8, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, SI // r00 &= mask51
ADDQ DX, R10 // r20 += r10 >> 51
MOVQ R10, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R8 // r10 &= mask51
ADDQ DX, R12 // r30 += r20 >> 51
MOVQ R12, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R10 // r20 &= mask51
ADDQ DX, R14 // r40 += r30 >> 51
MOVQ R14, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R12 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, SI // r00 += (r40 >> 51) * 19
ANDQ AX, R14 // r40 &= mask51
reduce64()
MOVQ out+0(FP), DI
MOVQ SI, 0(DI)
@@ -319,146 +328,109 @@ mul_reduce:
RET
// func feSquare(out, x *FieldElement)
TEXT ·feSquare(SB),4,$0-16
MOVQ out+0(FP), DI
MOVQ x+8(FP), SI
TEXT ·feSquare(SB), NOSPLIT|NOFRAME, $0-16
MOVQ x+8(FP), BX
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
MOVQ 0(SI), AX
MULQ 0(SI)
MOVQ AX, CX // r00
MOVQ DX, R8 // r01
MOVQ 0(BX), AX
MULQ 0(BX)
MOVQ AX, SI // r00
MOVQ DX, BP // r01
MOVQ 8(SI), DX
MOVQ 8(BX), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, CX
ADCQ DX, R8
MULQ 32(BX)
ADDQ AX, SI
ADCQ DX, BP
MOVQ 16(SI), DX
MOVQ 16(BX), DX
IMUL3Q $38, DX, AX
MULQ 24(SI)
ADDQ AX, CX
ADCQ DX, R8
MULQ 24(BX)
ADDQ AX, SI
ADCQ DX, BP
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
MOVQ 0(SI), AX
MOVQ 0(BX), AX
SHLQ $1, AX
MULQ 8(SI)
MOVQ AX, R9 // r10
MOVQ DX, R10 // r11
MULQ 8(BX)
MOVQ AX, R8 // r10
MOVQ DX, R9 // r11
MOVQ 16(SI), DX
MOVQ 16(BX), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R9
ADCQ DX, R10
MULQ 32(BX)
ADDQ AX, R8
ADCQ DX, R9
MOVQ 24(SI), DX
MOVQ 24(BX), DX
IMUL3Q $19, DX, AX
MULQ 24(SI)
ADDQ AX, R9
ADCQ DX, R10
MULQ 24(BX)
ADDQ AX, R8
ADCQ DX, R9
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
MOVQ 0(SI), AX
MOVQ 0(BX), AX
SHLQ $1, AX
MULQ 16(SI)
MOVQ AX, R11 // r20
MOVQ DX, R12 // r21
MULQ 16(BX)
MOVQ AX, R10 // r20
MOVQ DX, R11 // r21
MOVQ 8(SI), AX
MULQ 8(SI)
ADDQ AX, R11
ADCQ DX, R12
MOVQ 8(BX), AX
MULQ 8(BX)
ADDQ AX, R10
ADCQ DX, R11
MOVQ 24(SI), DX
MOVQ 24(BX), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R11
ADCQ DX, R12
MULQ 32(BX)
ADDQ AX, R10
ADCQ DX, R11
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
MOVQ 0(SI), AX
MOVQ 0(BX), AX
SHLQ $1, AX
MULQ 24(SI)
MOVQ AX, R13 // r30
MOVQ DX, R14 // r31
MULQ 24(BX)
MOVQ AX, R12 // r30
MOVQ DX, R13 // r31
MOVQ 8(SI), AX
MOVQ 8(BX), AX
SHLQ $1, AX
MULQ 16(SI)
ADDQ AX, R13
ADCQ DX, R14
MULQ 16(BX)
ADDQ AX, R12
ADCQ DX, R13
MOVQ 32(SI), DX
MOVQ 32(BX), DX
IMUL3Q $19, DX, AX
MULQ 32(SI)
ADDQ AX, R13
ADCQ DX, R14
MULQ 32(BX)
ADDQ AX, R12
ADCQ DX, R13
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
MOVQ 0(SI), AX
MOVQ 0(BX), AX
SHLQ $1, AX
MULQ 32(SI)
MOVQ AX, R15 // r40
MOVQ DX, BX // r41
MULQ 32(BX)
MOVQ AX, R14 // r40
MOVQ DX, R15 // r41
MOVQ 8(SI), AX
MOVQ 8(BX), AX
SHLQ $1, AX
MULQ 24(SI)
ADDQ AX, R15
ADCQ DX, BX
MULQ 24(BX)
ADDQ AX, R14
ADCQ DX, R15
MOVQ 16(SI), AX
MULQ 16(SI)
ADDQ AX, R15
ADCQ DX, BX
MOVQ 16(BX), AX
MULQ 16(BX)
ADDQ AX, R14
ADCQ DX, R15
// Reduce
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, CX, R8 // r01 = shld with r00
ANDQ AX, CX // r00 &= mask51
SHLQ $13, R9, R10 // r11 = shld with r10
ANDQ AX, R9 // r10 &= mask51
ADDQ R8, R9 // r10 += r01
SHLQ $13, R11, R12 // r21 = shld with r20
ANDQ AX, R11 // r20 &= mask51
ADDQ R10, R11 // r20 += r11
SHLQ $13, R13, R14 // r31 = shld with r30
ANDQ AX, R13 // r30 &= mask51
ADDQ R12, R13 // r30 += r21
SHLQ $13, R15, BX // r41 = shld with r40
ANDQ AX, R15 // r40 &= mask51
ADDQ R14, R15 // r40 += r31
IMUL3Q $19, BX, DX // r41 = r41*19
ADDQ DX, CX // r00 += r41
reduce64()
MOVQ CX, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R9 // r10 += r00 >> 51
MOVQ R9, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, CX // r00 &= mask51
ADDQ DX, R11 // r20 += r10 >> 51
MOVQ R11, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R9 // r10 &= mask51
ADDQ DX, R13 // r30 += r20 >> 51
MOVQ R13, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R11 // r20 &= mask51
ADDQ DX, R15 // r40 += r30 >> 51
MOVQ R15, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R13 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, CX // r00 += (r40 >> 51) * 19
ANDQ AX, R15 // r40 &= mask51
MOVQ out+0(FP), DI
MOVQ SI, 0(DI)
MOVQ R8, 8(DI)
MOVQ R10, 16(DI)
MOVQ R12, 24(DI)
MOVQ R14, 32(DI)
MOVQ CX, 0(DI)
MOVQ R9, 8(DI)
MOVQ R11, 16(DI)
MOVQ R13, 24(DI)
MOVQ R15, 32(DI)
RET
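
The r0..r4 comments in feSquare spell out the schoolbook squaring schedule in radix 2^51: a cross term x_i*x_j with i+j >= 5 wraps around and picks up a factor of 19, and every cross term occurs twice, which is where the 2, 19, and 38 = 2*19 multipliers come from. A pure-Go sketch of the same schedule, with math/bits supplying the 64x64 -> 128 products the MULQ instructions compute; the helper names are illustrative:

package main

import (
	"fmt"
	"math/bits"
)

// addMul accumulates x*y into the 128-bit accumulator {hi, lo},
// like the MULQ/ADDQ/ADCQ triples in the assembly.
func addMul(hi, lo, x, y uint64) (uint64, uint64) {
	h, l := bits.Mul64(x, y)
	lo, c := bits.Add64(lo, l, 0)
	hi, _ = bits.Add64(hi, h, c)
	return hi, lo
}

// squareSchedule computes the five 128-bit intermediaries of feSquare,
// before reduction; x holds radix-2^51 limbs. The {lo, hi} pairs feed
// directly into the reduce64 sketch above.
func squareSchedule(x [5]uint64) (r [5][2]uint64) {
	// r0 = x0*x0 + x1*38*x4 + x2*38*x3
	h, l := bits.Mul64(x[0], x[0])
	h, l = addMul(h, l, 38*x[1], x[4])
	h, l = addMul(h, l, 38*x[2], x[3])
	r[0] = [2]uint64{l, h}
	// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
	h, l = bits.Mul64(2*x[0], x[1])
	h, l = addMul(h, l, 38*x[2], x[4])
	h, l = addMul(h, l, 19*x[3], x[3])
	r[1] = [2]uint64{l, h}
	// r2 = x0*2*x2 + x1*x1 + x3*38*x4
	h, l = bits.Mul64(2*x[0], x[2])
	h, l = addMul(h, l, x[1], x[1])
	h, l = addMul(h, l, 38*x[3], x[4])
	r[2] = [2]uint64{l, h}
	// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
	h, l = bits.Mul64(2*x[0], x[3])
	h, l = addMul(h, l, 2*x[1], x[2])
	h, l = addMul(h, l, 19*x[4], x[4])
	r[3] = [2]uint64{l, h}
	// r4 = x0*2*x4 + x1*2*x3 + x2*x2
	h, l = bits.Mul64(2*x[0], x[4])
	h, l = addMul(h, l, 2*x[1], x[3])
	h, l = addMul(h, l, x[2], x[2])
	r[4] = [2]uint64{l, h}
	return r
}

func main() {
	fmt.Println(squareSchedule([5]uint64{1, 2, 3, 4, 5}))
}

A natural sanity check for the register reshuffle in this commit is that feSquare(out, x) keeps agreeing with feMulAmd64(out, x, x, useBMI2) on random field elements: both paths now share reduce64 but build the intermediaries differently.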