internal/radix51: restructure according to golang.org/wiki/TargetSpecific

This commit is contained in:
Filippo Valsorda 2019-11-30 22:46:35 -04:00 committed by George Tankersley
parent f33454717c
commit 6db84dcfdc
11 changed files with 258 additions and 286 deletions

View File

@ -3,8 +3,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// GF(2^255-19) field arithmetic in radix 2^51 representation. This code is a
// port of the public domain amd64-51-30k version of ed25519 from SUPERCOP.
// Package radix51 implements GF(2^255-19) field arithmetic in radix 2^51
// representation. This code is a port of the public domain amd64-51-30k version
// of ed25519 from SUPERCOP.
//
// The interface works similarly to math/big.Int, and all arguments and
// receivers are allowed to alias.
@ -360,3 +361,15 @@ func (v *FieldElement) IsNegative() int {
func (v *FieldElement) Abs(u *FieldElement) *FieldElement {
return v.CondNeg(u, u.IsNegative())
}
// Mul sets v = x * y and returns v.
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
feMul(v, x, y)
return v
}
// Square sets v = x * x and returns v.
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
feSquare(v, x)
return v
}

View File

@ -2,15 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64,!noasm
// +build amd64,!purego
package radix51
// Mul sets v = x * y and returns v.
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
feMul(v, x, y)
return v
}
//go:noescape
func feMul(out, a, b *FieldElement)
//go:noescape
func feSquare(out, x *FieldElement)

View File

@ -2,13 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on assembly generated by PeachPy. Equivalent to the Go in fe_mul.go,
// which was originally based on the amd64-51-30k assembly in SUPERCOP.
// +build amd64,!noasm
// +build amd64,!purego
// func feMul(out, a, b *FieldElement)
TEXT ·feMul(SB),$0-24
// Based on assembly generated by PeachPy. Equivalent to the Go in
// feMulGeneric, which was originally based on the amd64-51-30k
// assembly in SUPERCOP.
MOVQ out+0(FP), DI
MOVQ a+8(FP), BX
MOVQ b+16(FP), CX
@ -200,3 +201,148 @@ TEXT ·feMul(SB),$0-24
MOVQ R12, 24(DI)
MOVQ R14, 32(DI)
RET
// func feSquare(out, x *FieldElement)
TEXT ·feSquare(SB),4,$0-16
MOVQ out+0(FP), DI
MOVQ x+8(FP), SI
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
MOVQ 0(SI), AX
MULQ 0(SI)
MOVQ AX, CX // r00
MOVQ DX, R8 // r01
MOVQ 8(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, CX
ADCQ DX, R8
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 24(SI)
ADDQ AX, CX
ADCQ DX, R8
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 8(SI)
MOVQ AX, R9 // r10
MOVQ DX, R10 // r11
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R9
ADCQ DX, R10
MOVQ 24(SI), DX
IMUL3Q $19, DX, AX
MULQ 24(SI)
ADDQ AX, R9
ADCQ DX, R10
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 16(SI)
MOVQ AX, R11 // r20
MOVQ DX, R12 // r21
MOVQ 8(SI), AX
MULQ 8(SI)
ADDQ AX, R11
ADCQ DX, R12
MOVQ 24(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R11
ADCQ DX, R12
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 24(SI)
MOVQ AX, R13 // r30
MOVQ DX, R14 // r31
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 16(SI)
ADDQ AX, R13
ADCQ DX, R14
MOVQ 32(SI), DX
IMUL3Q $19, DX, AX
MULQ 32(SI)
ADDQ AX, R13
ADCQ DX, R14
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 32(SI)
MOVQ AX, R15 // r40
MOVQ DX, BX // r41
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 24(SI)
ADDQ AX, R15
ADCQ DX, BX
MOVQ 16(SI), AX
MULQ 16(SI)
ADDQ AX, R15
ADCQ DX, BX
// Reduce
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, CX, R8 // r01 = shld with r00
ANDQ AX, CX // r00 &= mask51
SHLQ $13, R9, R10 // r11 = shld with r10
ANDQ AX, R9 // r10 &= mask51
ADDQ R8, R9 // r10 += r01
SHLQ $13, R11, R12 // r21 = shld with r20
ANDQ AX, R11 // r20 &= mask51
ADDQ R10, R11 // r20 += r11
SHLQ $13, R13, R14 // r31 = shld with r30
ANDQ AX, R13 // r30 &= mask51
ADDQ R12, R13 // r30 += r21
SHLQ $13, R15, BX // r41 = shld with r40
ANDQ AX, R15 // r40 &= mask51
ADDQ R14, R15 // r40 += r31
IMUL3Q $19, BX, DX // r41 = r41*19
ADDQ DX, CX // r00 += r41
MOVQ CX, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R9 // r10 += r00 >> 51
MOVQ R9, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, CX // r00 &= mask51
ADDQ DX, R11 // r20 += r10 >> 51
MOVQ R11, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R9 // r10 &= mask51
ADDQ DX, R13 // r30 += r20 >> 51
MOVQ R13, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R11 // r20 &= mask51
ADDQ DX, R15 // r40 += r30 >> 51
MOVQ R15, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R13 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, CX // r00 += (r40 >> 51) *19
ANDQ AX, R15 // r40 &= mask51
MOVQ CX, 0(DI)
MOVQ R9, 8(DI)
MOVQ R11, 16(DI)
MOVQ R13, 24(DI)
MOVQ R15, 32(DI)
RET

View File

@ -2,12 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 noasm
package radix51
// Mul sets v = x * y and returns v.
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
func feMulGeneric(v, x, y *FieldElement) {
x0 := x[0]
x1 := x[1]
x2 := x[2]
@ -104,5 +101,79 @@ func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
// finally from r_4 to r_0 . Each of these carries is done as one copy, one
// right shift by 51, one logical and with 2^51 1, and one addition.
*v = FieldElement{r00, r10, r20, r30, r40}
return v.carryPropagate1().carryPropagate2()
v.carryPropagate1().carryPropagate2()
}
func feSquareGeneric(v, x *FieldElement) {
// Squaring needs only 15 mul instructions. Some inputs are multiplied by 2;
// this is combined with multiplication by 19 where possible. The coefficient
// reduction after squaring is the same as for multiplication.
x0 := x[0]
x1 := x[1]
x2 := x[2]
x3 := x[3]
x4 := x[4]
x0_2 := x0 << 1
x1_2 := x1 << 1
x1_38 := x1 * 38
x2_38 := x2 * 38
x3_38 := x3 * 38
x3_19 := x3 * 19
x4_19 := x4 * 19
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
r00, r01 := madd64(0, 0, x0, x0)
r00, r01 = madd64(r00, r01, x1_38, x4)
r00, r01 = madd64(r00, r01, x2_38, x3)
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
r10, r11 := madd64(0, 0, x0_2, x1)
r10, r11 = madd64(r10, r11, x2_38, x4)
r10, r11 = madd64(r10, r11, x3_19, x3)
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
r20, r21 := madd64(0, 0, x0_2, x2)
r20, r21 = madd64(r20, r21, x1, x1)
r20, r21 = madd64(r20, r21, x3_38, x4)
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
r30, r31 := madd64(0, 0, x0_2, x3)
r30, r31 = madd64(r30, r31, x1_2, x2)
r30, r31 = madd64(r30, r31, x4_19, x4)
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
r40, r41 := madd64(0, 0, x0_2, x4)
r40, r41 = madd64(r40, r41, x1_2, x3)
r40, r41 = madd64(r40, r41, x2, x2)
// Same reduction
r01 = (r01 << 13) | (r00 >> 51)
r00 &= maskLow51Bits
r11 = (r11 << 13) | (r10 >> 51)
r10 &= maskLow51Bits
r10 += r01
r21 = (r21 << 13) | (r20 >> 51)
r20 &= maskLow51Bits
r20 += r11
r31 = (r31 << 13) | (r30 >> 51)
r30 &= maskLow51Bits
r30 += r21
r41 = (r41 << 13) | (r40 >> 51)
r40 &= maskLow51Bits
r40 += r31
r41 *= 19
r00 += r41
*v = FieldElement{r00, r10, r20, r30, r40}
v.carryPropagate1().carryPropagate2()
}

View File

@ -0,0 +1,11 @@
// Copyright (c) 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 purego
package radix51
func feMul(v, x, y *FieldElement) { feMulGeneric(v, x, y) }
func feSquare(v, x *FieldElement) { feSquareGeneric(v, x) }

View File

@ -1,82 +0,0 @@
// Copyright (c) 2017 George Tankersley. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 noasm
package radix51
// Square sets v = x * x and returns v.
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
// Squaring needs only 15 mul instructions. Some inputs are multiplied by 2;
// this is combined with multiplication by 19 where possible. The coefficient
// reduction after squaring is the same as for multiplication.
x0 := x[0]
x1 := x[1]
x2 := x[2]
x3 := x[3]
x4 := x[4]
x0_2 := x0 << 1
x1_2 := x1 << 1
x1_38 := x1 * 38
x2_38 := x2 * 38
x3_38 := x3 * 38
x3_19 := x3 * 19
x4_19 := x4 * 19
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
r00, r01 := madd64(0, 0, x0, x0)
r00, r01 = madd64(r00, r01, x1_38, x4)
r00, r01 = madd64(r00, r01, x2_38, x3)
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
r10, r11 := madd64(0, 0, x0_2, x1)
r10, r11 = madd64(r10, r11, x2_38, x4)
r10, r11 = madd64(r10, r11, x3_19, x3)
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
r20, r21 := madd64(0, 0, x0_2, x2)
r20, r21 = madd64(r20, r21, x1, x1)
r20, r21 = madd64(r20, r21, x3_38, x4)
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
r30, r31 := madd64(0, 0, x0_2, x3)
r30, r31 = madd64(r30, r31, x1_2, x2)
r30, r31 = madd64(r30, r31, x4_19, x4)
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
r40, r41 := madd64(0, 0, x0_2, x4)
r40, r41 = madd64(r40, r41, x1_2, x3)
r40, r41 = madd64(r40, r41, x2, x2)
// Same reduction
r01 = (r01 << 13) | (r00 >> 51)
r00 &= maskLow51Bits
r11 = (r11 << 13) | (r10 >> 51)
r10 &= maskLow51Bits
r10 += r01
r21 = (r21 << 13) | (r20 >> 51)
r20 &= maskLow51Bits
r20 += r11
r31 = (r31 << 13) | (r30 >> 51)
r30 &= maskLow51Bits
r30 += r21
r41 = (r41 << 13) | (r40 >> 51)
r40 &= maskLow51Bits
r40 += r31
r41 *= 19
r00 += r41
*v = FieldElement{r00, r10, r20, r30, r40}
return v.carryPropagate1().carryPropagate2()
}

View File

@ -1,16 +0,0 @@
// Copyright (c) 2017 George Tankersley. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64,!noasm
package radix51
// Square sets v = x * x and returns v.
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
feSquare(v, x)
return v
}
//go:noescape
func feSquare(out, x *FieldElement)

View File

@ -1,150 +0,0 @@
// Copyright (c) 2017 George Tankersley. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64,!noasm
// func feSquare(out, x *FieldElement)
TEXT ·feSquare(SB),4,$0-16
MOVQ out+0(FP), DI
MOVQ x+8(FP), SI
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
MOVQ 0(SI), AX
MULQ 0(SI)
MOVQ AX, CX // r00
MOVQ DX, R8 // r01
MOVQ 8(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, CX
ADCQ DX, R8
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 24(SI)
ADDQ AX, CX
ADCQ DX, R8
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 8(SI)
MOVQ AX, R9 // r10
MOVQ DX, R10 // r11
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R9
ADCQ DX, R10
MOVQ 24(SI), DX
IMUL3Q $19, DX, AX
MULQ 24(SI)
ADDQ AX, R9
ADCQ DX, R10
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 16(SI)
MOVQ AX, R11 // r20
MOVQ DX, R12 // r21
MOVQ 8(SI), AX
MULQ 8(SI)
ADDQ AX, R11
ADCQ DX, R12
MOVQ 24(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R11
ADCQ DX, R12
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 24(SI)
MOVQ AX, R13 // r30
MOVQ DX, R14 // r31
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 16(SI)
ADDQ AX, R13
ADCQ DX, R14
MOVQ 32(SI), DX
IMUL3Q $19, DX, AX
MULQ 32(SI)
ADDQ AX, R13
ADCQ DX, R14
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 32(SI)
MOVQ AX, R15 // r40
MOVQ DX, BX // r41
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 24(SI)
ADDQ AX, R15
ADCQ DX, BX
MOVQ 16(SI), AX
MULQ 16(SI)
ADDQ AX, R15
ADCQ DX, BX
// Reduce
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, CX, R8 // r01 = shld with r00
ANDQ AX, CX // r00 &= mask51
SHLQ $13, R9, R10 // r11 = shld with r10
ANDQ AX, R9 // r10 &= mask51
ADDQ R8, R9 // r10 += r01
SHLQ $13, R11, R12 // r21 = shld with r20
ANDQ AX, R11 // r20 &= mask51
ADDQ R10, R11 // r20 += r11
SHLQ $13, R13, R14 // r31 = shld with r30
ANDQ AX, R13 // r30 &= mask51
ADDQ R12, R13 // r30 += r21
SHLQ $13, R15, BX // r41 = shld with r40
ANDQ AX, R15 // r40 &= mask51
ADDQ R14, R15 // r40 += r31
IMUL3Q $19, BX, DX // r41 = r41*19
ADDQ DX, CX // r00 += r41
MOVQ CX, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R9 // r10 += r00 >> 51
MOVQ R9, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, CX // r00 &= mask51
ADDQ DX, R11 // r20 += r10 >> 51
MOVQ R11, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R9 // r10 &= mask51
ADDQ DX, R13 // r30 += r20 >> 51
MOVQ R13, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R11 // r20 &= mask51
ADDQ DX, R15 // r40 += r30 >> 51
MOVQ R15, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R13 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, CX // r00 += (r40 >> 51) *19
ANDQ AX, R15 // r40 &= mask51
MOVQ CX, 0(DI)
MOVQ R9, 8(DI)
MOVQ R11, 16(DI)
MOVQ R13, 24(DI)
MOVQ R15, 32(DI)
RET

View File

@ -1,18 +0,0 @@
// Copyright (c) 2017 George Tankersley. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package radix51
import "unsafe"
// mul64x64 multiples two 64-bit numbers and adds them to two accumulators.
// This function is written to ensure it inlines. I am so sorry.
func mul64x64(lo, hi, a, b uint64) (ol uint64, oh uint64) {
t1 := (a>>32)*(b&0xFFFFFFFF) + ((a & 0xFFFFFFFF) * (b & 0xFFFFFFFF) >> 32)
t2 := (a&0xFFFFFFFF)*(b>>32) + (t1 & 0xFFFFFFFF)
ol = (a * b) + lo
cmp := ol < lo
oh = hi + (a>>32)*(b>>32) + t1>>32 + t2>>32 + uint64(*(*byte)(unsafe.Pointer(&cmp)))
return
}

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build go1.12
// +build go1.13
package radix51

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !go1.12
// +build !go1.13
package radix51