carry-prop bugs

This commit is contained in:
George Tankersley 2018-05-27 00:00:00 +00:00
parent 699df1ec8a
commit a12f6e84a9
3 changed files with 161 additions and 31 deletions

View File

@ -10,4 +10,4 @@ package radix51
func FeMul(out, a, b *FieldElement)
// go:noescape
func FeMulADX(out, a, b *FieldElement)
func FeMulADX(out *[10]uint64, a, b *FieldElement)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2017 George Tankersley. All rights reserved.
// Copyright (c) 2018 George Tankersley. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
@ -6,52 +6,182 @@
#include "textflag.h"
// Each quadword accumulator uses two registers
#define r00 R8
#define r01 R9
#define r10 R10
#define r11 R11
#define r20 R12
#define r21 R13
#define r30 R14
#define r31 R15
#define r40 BX
#define r41 CX
// func FeMulADX(zp *uint64, xp *uint64, yp *uint64)
TEXT ·FeMulADX(SB),NOSPLIT,$0
MOVQ zp+0(FP), DI
MOVQ xp+8(FP), SI
MOVQ yp+16(FP), BP
// Clear flags
XORQ AX, AX
// The first diagonal sets up the accumulators.
// DX is the implicit second operand of MULX
MOVQ 0(BP), DX // rdx <-- y0
MULXQ 0(SI), R8, R9 // r0 <-- x0*y0
MULXQ 8(SI), R10, R11 // r1 <-- x1*y0
ADDQ R9, R10
ADCQ $0, R11
MULXQ 16(SI), R12, R13 // r2 <-- x2*y0
ADDQ R11, R12
ADCQ $0, R13
ADCQ R11, R12
MULXQ 24(SI), R14, R15 // r3 <-- x3*y0
ADDQ R13, R14
ADCQ $0, R15
ADCQ R13, R14
MULXQ 32(SI), BX, CX // r4 <-- x4*y0
ADDQ R15, BX
ADCQ R15, BX
// CX is R[5], so we multiply by 19 add it to R[0]
ADCQ $0, CX
IMUL3Q $19, CX, AX
ADDQ AX, R8
// Now we have R8,R10,R12,R14,BX as R[4:0] and CX handled by our reduction
// identity. Since we can use offset addressing directly with ADD
// instructions, store the accumulators in the output to free up registers
// for more MULX results.
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
XORQ AX, AX // clear flags
MOVQ 8(BP), DX // rdx <-- y1
MULXQ 0(SI), R10, R11 // r1 <-- x0*y1
//ADCXQ $0, R10 // this is a NOP
ADOXQ 8(DI), R10
MULXQ 8(SI), R12, R13 // r2 <-- x1*y1
ADCXQ R11, R12
ADOXQ 16(DI), R12
MULXQ 16(SI), R14, R15 // r3 <-- x2*y1
ADCXQ R13, R14
ADOXQ 24(DI), R14
MULXQ 24(SI), BX, CX // r4 <-- x3*y1
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 32(SI), R8, R9 // r0 = r5*19 <-- 19*(x4*y1)
ADCXQ CX, R8
ADOXQ 0(DI), R8
// Consolidate both carry chains in R9 then add it to R[6] mapped to R[1]
// by the reduction identity.
ADCXQ AX, R9
ADOXQ AX, R9
IMUL3Q $19, R9, AX
ADDQ AX, R10
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
XORQ AX, AX // clear flags
MOVQ 16(BP), DX // rdx <-- y2
MULXQ 0(SI), R12, R13 // r2 <-- x0*y2
//ADCXQ $0, R12 // this is a NOP
ADOXQ 16(DI), R12
MULXQ 8(SI), R14, R15 // r3 <-- x1*y2
ADCXQ R13, R14
ADOXQ 24(DI), R14
MULXQ 16(SI), BX, CX // r4 <-- x2*y2
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 24(SI), R8, R9 // r0 = r5*19 <-- 19*(x3*y2)
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
MULXQ 32(SI), R10, R11 // r1 = r6*19 <-- 19*(x4*y2)
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
// Consolidate both carry chains in R11 then add it to R[7] mapped to R[2].
ADCXQ AX, R11
ADOXQ AX, R11
IMUL3Q $19, R11, AX
ADDQ AX, R12
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
XORQ AX, AX // clear flags
MOVQ 24(BP), DX // rdx <-- y3
MULXQ 0(SI), R14, R15 // r3 <-- x0*y3
//ADCXQ $0, R14 // this is a NOP
ADOXQ 24(DI), R14
MULXQ 8(SI), BX, CX // r4 <-- x1*y3
ADCXQ R15, BX
ADOXQ 32(DI), BX
MULXQ 16(SI), R8, R9 // r0 = r5*19 <-- 19*(x2*y3)
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
MULXQ 24(SI), R10, R11 // r1 = r6*19 <-- 19*(x3*y3)
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
MULXQ 32(SI), R12, R13 // r2 = r7*19 <-- 19*(x4*y3)
ADCXQ R11, R12
IMUL3Q $19, R12, R12
ADOXQ 16(DI), R12
// Consolidate both carry chains in R13 then add it to R[8] mapped to R[3].
ADCXQ AX, R13
ADOXQ AX, R13
IMUL3Q $19, R13, AX
ADDQ AX, R14
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
XORQ AX, AX // clear flags
MOVQ 32(BP), DX // rdx <-- y4
MULXQ 0(SI), BX, CX // r4 <-- x0*y4
//ADCXQ $0, BX // this is a NOP
ADOXQ 32(DI), BX
MULXQ 8(SI), R8, R9 // r0 = r5*19 <-- 19*(x1*y4)
ADCXQ CX, R8
IMUL3Q $19, R8, R8
ADOXQ 0(DI), R8
MULXQ 16(SI), R10, R11 // r1 = r6*19 <-- 19*(x2*y4)
ADCXQ R9, R10
IMUL3Q $19, R10, R10
ADOXQ 8(DI), R10
MULXQ 24(SI), R12, R13 // r2 = r7*19 <-- 19*(x3*y4)
ADCXQ R11, R12
IMUL3Q $19, R12, R12
ADOXQ 16(DI), R12
MULXQ 32(SI), R14, R15 // r3 = r8*19 <-- 19*(x4*y4)
ADCXQ R13, R14
IMUL3Q $19, R14, R14
ADOXQ 24(DI), R14
// Consolidate both carry chains in R15 then add it to R[9] mapped to R[4]
ADCXQ AX, R15
ADOXQ AX, R15
IMUL3Q $19, R15, AX
ADDQ AX, BX
// Update accumulators
MOVQ R8, 0(DI)
MOVQ R10, 8(DI)
MOVQ R12, 16(DI)
MOVQ R14, 24(DI)
MOVQ BX, 32(DI)
RET

View File

@ -197,8 +197,8 @@ func TestFeInvert(t *testing.T) {
func TestFeMulADX(t *testing.T) {
var x FieldElement = [5]uint64{0, 0, 0, 0, 0}
x[0] = 0x4e645be9215a2
var y FieldElement = [5]uint64{268435456, 0, 0, 0, 0}
var z FieldElement
var y FieldElement = [5]uint64{268435456, 268435456, 268435456, 268435456, 268435456}
var z [10]uint64
FeMulADX(&z, &x, &y)
fmt.Printf("%v\n", z)