double-width output seems to work

This commit is contained in:
George Tankersley 2018-05-28 00:00:00 +00:00
parent a12f6e84a9
commit a4358ce581
3 changed files with 56 additions and 85 deletions

View File

@ -382,6 +382,13 @@ func BenchmarkFeMul51(b *testing.B) {
}
}
func BenchmarkFeMulADX(b *testing.B) {
var h [10]uint64
for i := 0; i < b.N; i++ {
field.FeMulADX(&h, &radix51A, &radix51A)
}
}
func BenchmarkFeSquare51(b *testing.B) {
var h field.FieldElement
for i := 0; i < b.N; i++ {

View File

@ -15,7 +15,10 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0
// The first diagonal sets up the accumulators.
MOVQ 0(BP), DX // rdx <-- y0
MULXQ 0(SI), R8, R9 // r0 <-- x0*y0
MOVQ R8, 0(DI)
MULXQ 8(SI), R10, R11 // r1 <-- x1*y0
ADDQ R9, R10
MULXQ 16(SI), R12, R13 // r2 <-- x2*y0
@ -25,17 +28,6 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0
MULXQ 32(SI), BX, CX // r4 <-- x4*y0
ADCQ R15, BX
// CX is R[5], so we multiply by 19 add it to R[0]
ADCQ $0, CX
IMUL3Q $19, CX, AX
ADDQ AX, R8
// Now we have R8,R10,R12,R14,BX as R[4:0] and CX handled by our reduction
// identity. Since we can use offset addressing directly with ADD
// instructions, store the accumulators in the output to free up registers
// for more MULX results.
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
@ -44,9 +36,11 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0
XORQ AX, AX // clear flags
MOVQ 8(BP), DX // rdx <-- y1
MULXQ 0(SI), R10, R11 // r1 <-- x0*y1
//ADCXQ $0, R10 // this is a NOP
ADOXQ 8(DI), R10
MOVQ R10, 8(DI)
MULXQ 8(SI), R12, R13 // r2 <-- x1*y1
ADCXQ R11, R12
ADOXQ 16(DI), R12
@ -56,132 +50,102 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0
MULXQ 24(SI), BX, CX // r4 <-- x3*y1
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 32(SI), R8, R9 // r0 = r5*19 <-- 19*(x4*y1)
MULXQ 32(SI), R8, R9 // r5 <-- x4*y1
ADCXQ CX, R8
ADOXQ 0(DI), R8
// Consolidate both carry chains in R9 then add it to R[6] mapped to R[1]
// by the reduction identity.
ADCXQ AX, R9
ADOXQ AX, R9
IMUL3Q $19, R9, AX
ADDQ AX, R10
ADOXQ 40(DI), R8
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
MOVQ R8, 40(DI)
XORQ AX, AX // clear flags
MOVQ 16(BP), DX // rdx <-- y2
MULXQ 0(SI), R12, R13 // r2 <-- x0*y2
//ADCXQ $0, R12 // this is a NOP
ADOXQ 16(DI), R12
MOVQ R12, 16(DI)
MULXQ 8(SI), R14, R15 // r3 <-- x1*y2
ADCXQ R13, R14
ADOXQ 24(DI), R14
MULXQ 16(SI), BX, CX // r4 <-- x2*y2
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 24(SI), R8, R9 // r0 = r5*19 <-- 19*(x3*y2)
MULXQ 24(SI), R8, R9 // r5 <-- x3*y2
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
MULXQ 32(SI), R10, R11 // r1 = r6*19 <-- 19*(x4*y2)
ADOXQ 40(DI), R8
MULXQ 32(SI), R10, R11 // r6 <-- x4*y2
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
// Consolidate both carry chains in R11 then add it to R[7] mapped to R[2].
ADCXQ AX, R11
ADOXQ AX, R11
IMUL3Q $19, R11, AX
ADDQ AX, R12
ADOXQ 48(DI), R10
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
MOVQ R8, 40(DI)
MOVQ R10, 48(DI)
XORQ AX, AX // clear flags
MOVQ 24(BP), DX // rdx <-- y3
MULXQ 0(SI), R14, R15 // r3 <-- x0*y3
//ADCXQ $0, R14 // this is a NOP
ADOXQ 24(DI), R14
MOVQ R14, 24(DI)
MULXQ 8(SI), BX, CX // r4 <-- x1*y3
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 16(SI), R8, R9 // r0 = r5*19 <-- 19*(x2*y3)
MULXQ 16(SI), R8, R9 // r5 <-- x2*y3
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
MULXQ 24(SI), R10, R11 // r1 = r6*19 <-- 19*(x3*y3)
ADOXQ 40(DI), R8
MULXQ 24(SI), R10, R11 // r6 <-- x3*y3
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
MULXQ 32(SI), R12, R13 // r2 = r7*19 <-- 19*(x4*y3)
ADOXQ 48(DI), R10
MULXQ 32(SI), R12, R13 //r7 <-- x4*y3
ADCXQ R11, R12
IMUL3Q $19, R12, R12
ADOXQ 16(DI), R12
// Consolidate both carry chains in R13 then add it to R[8] mapped to R[3].
ADCXQ AX, R13
ADOXQ AX, R13
IMUL3Q $19, R13, AX
ADDQ AX, R14
ADOXQ 56(DI), R12
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
MOVQ R8, 40(DI)
MOVQ R10, 48(DI)
MOVQ R12, 56(DI)
XORQ AX, AX // clear flags
MOVQ 32(BP), DX // rdx <-- y4
MULXQ 0(SI), BX, CX // r4 <-- x0*y4
//ADCXQ $0, BX // this is a NOP
ADOXQ 32(DI), BX
MOVQ BX, 32(DI)
MULXQ 8(SI), R8, R9 // r0 = r5*19 <-- 19*(x1*y4)
MULXQ 8(SI), R8, R9 // r5 <-- x1*y4
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
ADOXQ 40(DI), R8
MULXQ 16(SI), R10, R11 // r1 = r6*19 <-- 19*(x2*y4)
MULXQ 16(SI), R10, R11 // r6 <-- x2*y4
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
ADOXQ 48(DI), R10
MULXQ 24(SI), R12, R13 // r2 = r7*19 <-- 19*(x3*y4)
MULXQ 24(SI), R12, R13 // r7 <-- x3*y4
ADCXQ R11, R12
IMUL3Q $19, R12, R12
ADOXQ 16(DI), R12
ADOXQ 56(DI), R12
MULXQ 32(SI), R14, R15 // r3 = r8*19 <-- 19*(x4*y4)
MULXQ 32(SI), R14, R15 // r8 <-- x4*y4
ADCXQ R13, R14
IMUL3Q $19, R14, R14
ADOXQ 24(DI), R14
ADOXQ 64(DI), R14
// Consolidate both carry chains in R15 then add it to R[9] mapped to R[4]
// Consolidate both carry chains in R15, our final output.
ADCXQ AX, R15
ADOXQ AX, R15
IMUL3Q $19, R15, AX
ADDQ AX, BX
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
MOVQ R8, 40(DI)
MOVQ R10, 48(DI)
MOVQ R12, 56(DI)
MOVQ R14, 64(DI)
MOVQ R15, 72(DI)
RET

View File

@ -195,9 +195,9 @@ func TestFeInvert(t *testing.T) {
}
func TestFeMulADX(t *testing.T) {
var x FieldElement = [5]uint64{0, 0, 0, 0, 0}
x[0] = 0x4e645be9215a2
var y FieldElement = [5]uint64{268435456, 268435456, 268435456, 268435456, 268435456}
var x FieldElement = [5]uint64{1, 0, 0, 1, 16384}
// x[0] = 0x4e645be9215a2
var y FieldElement = [5]uint64{16384, 16384, 16384, 16384, 16384}
var z [10]uint64
FeMulADX(&z, &x, &y)