diff --git a/ed25519_test.go b/ed25519_test.go index c8a28b3..d92b269 100644 --- a/ed25519_test.go +++ b/ed25519_test.go @@ -382,6 +382,13 @@ func BenchmarkFeMul51(b *testing.B) { } } +func BenchmarkFeMulADX(b *testing.B) { + var h [10]uint64 + for i := 0; i < b.N; i++ { + field.FeMulADX(&h, &radix51A, &radix51A) + } +} + func BenchmarkFeSquare51(b *testing.B) { var h field.FieldElement for i := 0; i < b.N; i++ { diff --git a/internal/radix51/fe_mul_amd64_adx.s b/internal/radix51/fe_mul_amd64_adx.s index a0e540d..7f2075e 100644 --- a/internal/radix51/fe_mul_amd64_adx.s +++ b/internal/radix51/fe_mul_amd64_adx.s @@ -15,7 +15,10 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0 // The first diagonal sets up the accumulators. MOVQ 0(BP), DX // rdx <-- y0 + MULXQ 0(SI), R8, R9 // r0 <-- x0*y0 + MOVQ R8, 0(DI) + MULXQ 8(SI), R10, R11 // r1 <-- x1*y0 ADDQ R9, R10 MULXQ 16(SI), R12, R13 // r2 <-- x2*y0 @@ -25,17 +28,6 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0 MULXQ 32(SI), BX, CX // r4 <-- x4*y0 ADCQ R15, BX - // CX is R[5], so we multiply by 19 add it to R[0] - ADCQ $0, CX - IMUL3Q $19, CX, AX - ADDQ AX, R8 - - // Now we have R8,R10,R12,R14,BX as R[4:0] and CX handled by our reduction - // identity. Since we can use offset addressing directly with ADD - // instructions, store the accumulators in the output to free up registers - // for more MULX results. - - MOVQ R8, 0(DI) MOVQ R10, 8(DI) MOVQ R12, 16(DI) MOVQ R14, 24(DI) @@ -44,9 +36,11 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0 XORQ AX, AX // clear flags MOVQ 8(BP), DX // rdx <-- y1 + MULXQ 0(SI), R10, R11 // r1 <-- x0*y1 - //ADCXQ $0, R10 // this is a NOP ADOXQ 8(DI), R10 + MOVQ R10, 8(DI) + MULXQ 8(SI), R12, R13 // r2 <-- x1*y1 ADCXQ R11, R12 ADOXQ 16(DI), R12 @@ -56,132 +50,102 @@ TEXT ·FeMulADX(SB),NOSPLIT,$0 MULXQ 24(SI), BX, CX // r4 <-- x3*y1 ADCXQ R15, BX ADOXQ 32(DI), BX - MULXQ 32(SI), R8, R9 // r0 = r5*19 <-- 19*(x4*y1) + MULXQ 32(SI), R8, R9 // r5 <-- x4*y1 ADCXQ CX, R8 - ADOXQ 0(DI), R8 - - // Consolidate both carry chains in R9 then add it to R[6] mapped to R[1] - // by the reduction identity. - ADCXQ AX, R9 - ADOXQ AX, R9 - IMUL3Q $19, R9, AX - ADDQ AX, R10 + ADOXQ 40(DI), R8 // Update accumulators - MOVQ R8, 0(DI) - MOVQ R10, 8(DI) MOVQ R12, 16(DI) MOVQ R14, 24(DI) MOVQ BX, 32(DI) + MOVQ R8, 40(DI) XORQ AX, AX // clear flags MOVQ 16(BP), DX // rdx <-- y2 + MULXQ 0(SI), R12, R13 // r2 <-- x0*y2 - //ADCXQ $0, R12 // this is a NOP ADOXQ 16(DI), R12 + MOVQ R12, 16(DI) + MULXQ 8(SI), R14, R15 // r3 <-- x1*y2 ADCXQ R13, R14 ADOXQ 24(DI), R14 MULXQ 16(SI), BX, CX // r4 <-- x2*y2 ADCXQ R15, BX ADOXQ 32(DI), BX - MULXQ 24(SI), R8, R9 // r0 = r5*19 <-- 19*(x3*y2) + MULXQ 24(SI), R8, R9 // r5 <-- x3*y2 ADCXQ CX, R8 - IMUL3Q $19, R8, R8 - ADOXQ 0(DI), R8 - MULXQ 32(SI), R10, R11 // r1 = r6*19 <-- 19*(x4*y2) + ADOXQ 40(DI), R8 + MULXQ 32(SI), R10, R11 // r6 <-- x4*y2 ADCXQ R9, R10 - IMUL3Q $19, R10, R10 - ADOXQ 8(DI), R10 - - // Consolidate both carry chains in R11 then add it to R[7] mapped to R[2]. - ADCXQ AX, R11 - ADOXQ AX, R11 - IMUL3Q $19, R11, AX - ADDQ AX, R12 + ADOXQ 48(DI), R10 // Update accumulators - MOVQ R8, 0(DI) - MOVQ R10, 8(DI) - MOVQ R12, 16(DI) MOVQ R14, 24(DI) MOVQ BX, 32(DI) + MOVQ R8, 40(DI) + MOVQ R10, 48(DI) XORQ AX, AX // clear flags MOVQ 24(BP), DX // rdx <-- y3 + MULXQ 0(SI), R14, R15 // r3 <-- x0*y3 - //ADCXQ $0, R14 // this is a NOP ADOXQ 24(DI), R14 + MOVQ R14, 24(DI) + MULXQ 8(SI), BX, CX // r4 <-- x1*y3 ADCXQ R15, BX ADOXQ 32(DI), BX - MULXQ 16(SI), R8, R9 // r0 = r5*19 <-- 19*(x2*y3) + MULXQ 16(SI), R8, R9 // r5 <-- x2*y3 ADCXQ CX, R8 - IMUL3Q $19, R8, R8 - ADOXQ 0(DI), R8 - MULXQ 24(SI), R10, R11 // r1 = r6*19 <-- 19*(x3*y3) + ADOXQ 40(DI), R8 + MULXQ 24(SI), R10, R11 // r6 <-- x3*y3 ADCXQ R9, R10 - IMUL3Q $19, R10, R10 - ADOXQ 8(DI), R10 - MULXQ 32(SI), R12, R13 // r2 = r7*19 <-- 19*(x4*y3) + ADOXQ 48(DI), R10 + MULXQ 32(SI), R12, R13 //r7 <-- x4*y3 ADCXQ R11, R12 - IMUL3Q $19, R12, R12 - ADOXQ 16(DI), R12 - - // Consolidate both carry chains in R13 then add it to R[8] mapped to R[3]. - ADCXQ AX, R13 - ADOXQ AX, R13 - IMUL3Q $19, R13, AX - ADDQ AX, R14 + ADOXQ 56(DI), R12 // Update accumulators - MOVQ R8, 0(DI) - MOVQ R10, 8(DI) - MOVQ R12, 16(DI) - MOVQ R14, 24(DI) MOVQ BX, 32(DI) + MOVQ R8, 40(DI) + MOVQ R10, 48(DI) + MOVQ R12, 56(DI) XORQ AX, AX // clear flags - + MOVQ 32(BP), DX // rdx <-- y4 MULXQ 0(SI), BX, CX // r4 <-- x0*y4 - //ADCXQ $0, BX // this is a NOP ADOXQ 32(DI), BX + MOVQ BX, 32(DI) - MULXQ 8(SI), R8, R9 // r0 = r5*19 <-- 19*(x1*y4) + MULXQ 8(SI), R8, R9 // r5 <-- x1*y4 ADCXQ CX, R8 - IMUL3Q $19, R8, R8 - ADOXQ 0(DI), R8 + ADOXQ 40(DI), R8 - MULXQ 16(SI), R10, R11 // r1 = r6*19 <-- 19*(x2*y4) + MULXQ 16(SI), R10, R11 // r6 <-- x2*y4 ADCXQ R9, R10 - IMUL3Q $19, R10, R10 - ADOXQ 8(DI), R10 + ADOXQ 48(DI), R10 - MULXQ 24(SI), R12, R13 // r2 = r7*19 <-- 19*(x3*y4) + MULXQ 24(SI), R12, R13 // r7 <-- x3*y4 ADCXQ R11, R12 - IMUL3Q $19, R12, R12 - ADOXQ 16(DI), R12 + ADOXQ 56(DI), R12 - MULXQ 32(SI), R14, R15 // r3 = r8*19 <-- 19*(x4*y4) + MULXQ 32(SI), R14, R15 // r8 <-- x4*y4 ADCXQ R13, R14 - IMUL3Q $19, R14, R14 - ADOXQ 24(DI), R14 + ADOXQ 64(DI), R14 - // Consolidate both carry chains in R15 then add it to R[9] mapped to R[4] + // Consolidate both carry chains in R15, our final output. ADCXQ AX, R15 ADOXQ AX, R15 - IMUL3Q $19, R15, AX - ADDQ AX, BX - // Update accumulators - MOVQ R8, 0(DI) - MOVQ R10, 8(DI) - MOVQ R12, 16(DI) - MOVQ R14, 24(DI) - MOVQ BX, 32(DI) + MOVQ R8, 40(DI) + MOVQ R10, 48(DI) + MOVQ R12, 56(DI) + MOVQ R14, 64(DI) + MOVQ R15, 72(DI) RET diff --git a/internal/radix51/fe_test.go b/internal/radix51/fe_test.go index 017c441..af86e74 100644 --- a/internal/radix51/fe_test.go +++ b/internal/radix51/fe_test.go @@ -195,9 +195,9 @@ func TestFeInvert(t *testing.T) { } func TestFeMulADX(t *testing.T) { - var x FieldElement = [5]uint64{0, 0, 0, 0, 0} - x[0] = 0x4e645be9215a2 - var y FieldElement = [5]uint64{268435456, 268435456, 268435456, 268435456, 268435456} + var x FieldElement = [5]uint64{1, 0, 0, 1, 16384} + // x[0] = 0x4e645be9215a2 + var y FieldElement = [5]uint64{16384, 16384, 16384, 16384, 16384} var z [10]uint64 FeMulADX(&z, &x, &y)