mirror of https://github.com/gtank/ristretto255
internal/radix51: restructure according to golang.org/wiki/TargetSpecific
This commit is contained in:
parent
f33454717c
commit
b6e6e6cff6
|
@ -3,8 +3,9 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// GF(2^255-19) field arithmetic in radix 2^51 representation. This code is a
|
||||
// port of the public domain amd64-51-30k version of ed25519 from SUPERCOP.
|
||||
// Package radix51 implements GF(2^255-19) field arithmetic in radix 2^51
|
||||
// representation. This code is a port of the public domain amd64-51-30k version
|
||||
// of ed25519 from SUPERCOP.
|
||||
//
|
||||
// The interface works similarly to math/big.Int, and all arguments and
|
||||
// receivers are allowed to alias.
|
||||
|
@ -360,3 +361,15 @@ func (v *FieldElement) IsNegative() int {
|
|||
func (v *FieldElement) Abs(u *FieldElement) *FieldElement {
|
||||
return v.CondNeg(u, u.IsNegative())
|
||||
}
|
||||
|
||||
// Mul sets v = x * y and returns v.
|
||||
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
|
||||
feMul(v, x, y)
|
||||
return v
|
||||
}
|
||||
|
||||
// Square sets v = x * x and returns v.
|
||||
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
|
||||
feSquare(v, x)
|
||||
return v
|
||||
}
|
||||
|
|
|
@ -2,15 +2,12 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64,!noasm
|
||||
// +build amd64,!purego
|
||||
|
||||
package radix51
|
||||
|
||||
// Mul sets v = x * y and returns v.
|
||||
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
|
||||
feMul(v, x, y)
|
||||
return v
|
||||
}
|
||||
|
||||
// feMul sets out = a * b. It is declared here without a Go body; the body is
// the ·feMul TEXT symbol in the package's amd64 assembly. The noescape
// directive below tells the compiler the pointer arguments do not escape.
//go:noescape
|
||||
func feMul(out, a, b *FieldElement)
|
||||
|
||||
// feSquare sets out = x * x. It is declared here without a Go body; the body
// is the ·feSquare TEXT symbol in the package's amd64 assembly. The noescape
// directive below tells the compiler the pointer arguments do not escape.
//go:noescape
|
||||
func feSquare(out, x *FieldElement)
|
|
@ -2,13 +2,14 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Based on assembly generated by PeachPy. Equivalent to the Go in fe_mul.go,
|
||||
// which was originally based on the amd64-51-30k assembly in SUPERCOP.
|
||||
|
||||
// +build amd64,!noasm
|
||||
// +build amd64,!purego
|
||||
|
||||
// func feMul(out, a, b *FieldElement)
|
||||
TEXT ·feMul(SB),$0-24
|
||||
// Based on assembly generated by PeachPy. Equivalent to the Go in
|
||||
// feMulGeneric, which was originally based on the amd64-51-30k
|
||||
// assembly in SUPERCOP.
|
||||
|
||||
MOVQ out+0(FP), DI
|
||||
MOVQ a+8(FP), BX
|
||||
MOVQ b+16(FP), CX
|
||||
|
@ -200,3 +201,148 @@ TEXT ·feMul(SB),$0-24
|
|||
MOVQ R12, 24(DI)
|
||||
MOVQ R14, 32(DI)
|
||||
RET
|
||||
|
||||
// func feSquare(out, x *FieldElement)
|
||||
// feSquare writes x*x to out, with both operands held as five 64-bit limbs at
// byte offsets 0, 8, 16, 24 and 32. Each result limb r0..r4 is first built as a
// 128-bit sum in a (low, high) register pair and then reduced to five limbs.
// The formulas in the section comments match feSquareGeneric.
//
// TEXT flag 4 is the numeric value of NOSPLIT; $0-16 means no local stack
// frame and 16 bytes of arguments (the two pointers).
TEXT ·feSquare(SB),4,$0-16
|
||||
// DI = out, SI = x for the remainder of the routine.
MOVQ out+0(FP), DI
|
||||
MOVQ x+8(FP), SI
|
||||
|
||||
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
|
||||
MOVQ 0(SI), AX
|
||||
MULQ 0(SI)
|
||||
MOVQ AX, CX // r00
|
||||
MOVQ DX, R8 // r01
|
||||
|
||||
MOVQ 8(SI), DX
|
||||
// AX = 38*x1; the MULQ that follows leaves the 128-bit product with x4 in
// DX:AX, which ADDQ/ADCQ then fold into the r01:r00 pair with carry. The same
// scale / multiply / add-with-carry pattern is used for every later term.
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
|
||||
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
|
||||
MOVQ 0(SI), AX
|
||||
// AX = 2*x0: the doubling for the symmetric cross term is applied to one
// operand before the widening multiply.
SHLQ $1, AX
|
||||
MULQ 8(SI)
|
||||
MOVQ AX, R9 // r10
|
||||
MOVQ DX, R10 // r11
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
|
||||
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
MOVQ AX, R11 // r20
|
||||
MOVQ DX, R12 // r21
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
MULQ 8(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
|
||||
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
MOVQ AX, R13 // r30
|
||||
MOVQ DX, R14 // r31
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
|
||||
MOVQ 32(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
|
||||
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 32(SI)
|
||||
MOVQ AX, R15 // r40
|
||||
MOVQ DX, BX // r41
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
|
||||
MOVQ 16(SI), AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
|
||||
// Reduce
|
||||
// Each 128-bit pair is split at bit 51: the three-operand SHLQ shifts the
// high register left by 13 while pulling in the top 13 bits of the low
// register, the low register is then masked to 51 bits, and the previous
// limb's high part is added in. The high part of r4 wraps around into r0
// multiplied by 19.
MOVQ $2251799813685247, AX // (1<<51) - 1
|
||||
SHLQ $13, CX, R8 // r01 = shld with r00
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
SHLQ $13, R9, R10 // r11 = shld with r10
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ R8, R9 // r10 += r01
|
||||
SHLQ $13, R11, R12 // r21 = shld with r20
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ R10, R11 // r20 += r11
|
||||
SHLQ $13, R13, R14 // r31 = shld with r30
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
ADDQ R12, R13 // r30 += r21
|
||||
SHLQ $13, R15, BX // r41 = shld with r40
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
ADDQ R14, R15 // r40 += r31
|
||||
IMUL3Q $19, BX, DX // r41 = r41*19
|
||||
ADDQ DX, CX // r00 += r41
|
||||
|
||||
// Carry chain r0 -> r1 -> r2 -> r3 -> r4 -> r0: DX holds the bits above 51 of
// the current limb, which are added to the next limb before the current one
// is masked. The carry out of r4 is multiplied by 19 and added to r0 after r0
// has already been masked; r0 is not masked again afterwards.
MOVQ CX, DX // rdx <-- r00
|
||||
SHRQ $51, DX // rdx <-- r00 >> 51
|
||||
ADDQ DX, R9 // r10 += r00 >> 51
|
||||
MOVQ R9, DX // rdx <-- r10
|
||||
SHRQ $51, DX // rdx <-- r10 >> 51
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
ADDQ DX, R11 // r20 += r10 >> 51
|
||||
MOVQ R11, DX // rdx <-- r20
|
||||
SHRQ $51, DX // rdx <-- r20 >> 51
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ DX, R13 // r30 += r20 >> 51
|
||||
MOVQ R13, DX // rdx <-- r30
|
||||
SHRQ $51, DX // rdx <-- r30 >> 51
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ DX, R15 // r40 += r30 >> 51
|
||||
MOVQ R15, DX // rdx <-- r40
|
||||
SHRQ $51, DX // rdx <-- r40 >> 51
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
|
||||
ADDQ DX, CX // r00 += (r40 >> 51) *19
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
|
||||
// Store the five result limbs to out.
MOVQ CX, 0(DI)
|
||||
MOVQ R9, 8(DI)
|
||||
MOVQ R11, 16(DI)
|
||||
MOVQ R13, 24(DI)
|
||||
MOVQ R15, 32(DI)
|
||||
RET
|
|
@ -2,12 +2,9 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 noasm
|
||||
|
||||
package radix51
|
||||
|
||||
// Mul sets v = x * y and returns v.
|
||||
func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
|
||||
func feMulGeneric(v, x, y *FieldElement) {
|
||||
x0 := x[0]
|
||||
x1 := x[1]
|
||||
x2 := x[2]
|
||||
|
@ -104,5 +101,79 @@ func (v *FieldElement) Mul(x, y *FieldElement) *FieldElement {
|
|||
// finally from r_4 to r_0. Each of these carries is done as one copy, one
|
||||
// right shift by 51, one logical and with 2^51 − 1, and one addition.
|
||||
*v = FieldElement{r00, r10, r20, r30, r40}
|
||||
return v.carryPropagate1().carryPropagate2()
|
||||
v.carryPropagate1().carryPropagate2()
|
||||
}
|
||||
|
||||
func feSquareGeneric(v, x *FieldElement) {
|
||||
// Squaring needs only 15 mul instructions. Some inputs are multiplied by 2;
|
||||
// this is combined with multiplication by 19 where possible. The coefficient
|
||||
// reduction after squaring is the same as for multiplication.
|
||||
|
||||
x0 := x[0]
|
||||
x1 := x[1]
|
||||
x2 := x[2]
|
||||
x3 := x[3]
|
||||
x4 := x[4]
|
||||
|
||||
x0_2 := x0 << 1
|
||||
x1_2 := x1 << 1
|
||||
|
||||
x1_38 := x1 * 38
|
||||
x2_38 := x2 * 38
|
||||
x3_38 := x3 * 38
|
||||
|
||||
x3_19 := x3 * 19
|
||||
x4_19 := x4 * 19
|
||||
|
||||
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
|
||||
r00, r01 := madd64(0, 0, x0, x0)
|
||||
r00, r01 = madd64(r00, r01, x1_38, x4)
|
||||
r00, r01 = madd64(r00, r01, x2_38, x3)
|
||||
|
||||
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
|
||||
r10, r11 := madd64(0, 0, x0_2, x1)
|
||||
r10, r11 = madd64(r10, r11, x2_38, x4)
|
||||
r10, r11 = madd64(r10, r11, x3_19, x3)
|
||||
|
||||
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
|
||||
r20, r21 := madd64(0, 0, x0_2, x2)
|
||||
r20, r21 = madd64(r20, r21, x1, x1)
|
||||
r20, r21 = madd64(r20, r21, x3_38, x4)
|
||||
|
||||
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
|
||||
r30, r31 := madd64(0, 0, x0_2, x3)
|
||||
r30, r31 = madd64(r30, r31, x1_2, x2)
|
||||
r30, r31 = madd64(r30, r31, x4_19, x4)
|
||||
|
||||
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
|
||||
r40, r41 := madd64(0, 0, x0_2, x4)
|
||||
r40, r41 = madd64(r40, r41, x1_2, x3)
|
||||
r40, r41 = madd64(r40, r41, x2, x2)
|
||||
|
||||
// Same reduction
|
||||
|
||||
r01 = (r01 << 13) | (r00 >> 51)
|
||||
r00 &= maskLow51Bits
|
||||
|
||||
r11 = (r11 << 13) | (r10 >> 51)
|
||||
r10 &= maskLow51Bits
|
||||
r10 += r01
|
||||
|
||||
r21 = (r21 << 13) | (r20 >> 51)
|
||||
r20 &= maskLow51Bits
|
||||
r20 += r11
|
||||
|
||||
r31 = (r31 << 13) | (r30 >> 51)
|
||||
r30 &= maskLow51Bits
|
||||
r30 += r21
|
||||
|
||||
r41 = (r41 << 13) | (r40 >> 51)
|
||||
r40 &= maskLow51Bits
|
||||
r40 += r31
|
||||
|
||||
r41 *= 19
|
||||
r00 += r41
|
||||
|
||||
*v = FieldElement{r00, r10, r20, r30, r40}
|
||||
v.carryPropagate1().carryPropagate2()
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
// Copyright (c) 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 purego
|
||||
|
||||
package radix51
|
||||
|
||||
// feMul sets v = x * y by delegating to the portable Go implementation; this
// file is built when the target is not amd64 or the purego tag is set.
func feMul(v, x, y *FieldElement) {
	feMulGeneric(v, x, y)
}
|
||||
|
||||
// feSquare sets v = x * x by delegating to the portable Go implementation;
// this file is built when the target is not amd64 or the purego tag is set.
func feSquare(v, x *FieldElement) {
	feSquareGeneric(v, x)
}
|
|
@ -1,82 +0,0 @@
|
|||
// Copyright (c) 2017 George Tankersley. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 noasm
|
||||
|
||||
package radix51
|
||||
|
||||
// Square sets v = x * x and returns v.
|
||||
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
|
||||
// Squaring needs only 15 mul instructions. Some inputs are multiplied by 2;
|
||||
// this is combined with multiplication by 19 where possible. The coefficient
|
||||
// reduction after squaring is the same as for multiplication.
|
||||
|
||||
x0 := x[0]
|
||||
x1 := x[1]
|
||||
x2 := x[2]
|
||||
x3 := x[3]
|
||||
x4 := x[4]
|
||||
|
||||
x0_2 := x0 << 1
|
||||
x1_2 := x1 << 1
|
||||
|
||||
x1_38 := x1 * 38
|
||||
x2_38 := x2 * 38
|
||||
x3_38 := x3 * 38
|
||||
|
||||
x3_19 := x3 * 19
|
||||
x4_19 := x4 * 19
|
||||
|
||||
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
|
||||
r00, r01 := madd64(0, 0, x0, x0)
|
||||
r00, r01 = madd64(r00, r01, x1_38, x4)
|
||||
r00, r01 = madd64(r00, r01, x2_38, x3)
|
||||
|
||||
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
|
||||
r10, r11 := madd64(0, 0, x0_2, x1)
|
||||
r10, r11 = madd64(r10, r11, x2_38, x4)
|
||||
r10, r11 = madd64(r10, r11, x3_19, x3)
|
||||
|
||||
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
|
||||
r20, r21 := madd64(0, 0, x0_2, x2)
|
||||
r20, r21 = madd64(r20, r21, x1, x1)
|
||||
r20, r21 = madd64(r20, r21, x3_38, x4)
|
||||
|
||||
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
|
||||
r30, r31 := madd64(0, 0, x0_2, x3)
|
||||
r30, r31 = madd64(r30, r31, x1_2, x2)
|
||||
r30, r31 = madd64(r30, r31, x4_19, x4)
|
||||
|
||||
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
|
||||
r40, r41 := madd64(0, 0, x0_2, x4)
|
||||
r40, r41 = madd64(r40, r41, x1_2, x3)
|
||||
r40, r41 = madd64(r40, r41, x2, x2)
|
||||
|
||||
// Same reduction
|
||||
|
||||
r01 = (r01 << 13) | (r00 >> 51)
|
||||
r00 &= maskLow51Bits
|
||||
|
||||
r11 = (r11 << 13) | (r10 >> 51)
|
||||
r10 &= maskLow51Bits
|
||||
r10 += r01
|
||||
|
||||
r21 = (r21 << 13) | (r20 >> 51)
|
||||
r20 &= maskLow51Bits
|
||||
r20 += r11
|
||||
|
||||
r31 = (r31 << 13) | (r30 >> 51)
|
||||
r30 &= maskLow51Bits
|
||||
r30 += r21
|
||||
|
||||
r41 = (r41 << 13) | (r40 >> 51)
|
||||
r40 &= maskLow51Bits
|
||||
r40 += r31
|
||||
|
||||
r41 *= 19
|
||||
r00 += r41
|
||||
|
||||
*v = FieldElement{r00, r10, r20, r30, r40}
|
||||
return v.carryPropagate1().carryPropagate2()
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
// Copyright (c) 2017 George Tankersley. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64,!noasm
|
||||
|
||||
package radix51
|
||||
|
||||
// Square sets v = x * x and returns v.
|
||||
func (v *FieldElement) Square(x *FieldElement) *FieldElement {
|
||||
feSquare(v, x)
|
||||
return v
|
||||
}
|
||||
|
||||
// feSquare sets out = x * x. It is declared here without a Go body; the body
// is the ·feSquare TEXT symbol in the package's amd64 assembly. The noescape
// directive below tells the compiler the pointer arguments do not escape.
//go:noescape
|
||||
func feSquare(out, x *FieldElement)
|
|
@ -1,150 +0,0 @@
|
|||
// Copyright (c) 2017 George Tankersley. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64,!noasm
|
||||
|
||||
// func feSquare(out, x *FieldElement)
|
||||
// feSquare writes x*x to out, with both operands held as five 64-bit limbs at
// byte offsets 0, 8, 16, 24 and 32. Each result limb r0..r4 is first built as a
// 128-bit sum in a (low, high) register pair and then reduced to five limbs.
//
// TEXT flag 4 is the numeric value of NOSPLIT; $0-16 means no local stack
// frame and 16 bytes of arguments (the two pointers).
TEXT ·feSquare(SB),4,$0-16
|
||||
// DI = out, SI = x for the remainder of the routine.
MOVQ out+0(FP), DI
|
||||
MOVQ x+8(FP), SI
|
||||
|
||||
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
|
||||
MOVQ 0(SI), AX
|
||||
MULQ 0(SI)
|
||||
MOVQ AX, CX // r00
|
||||
MOVQ DX, R8 // r01
|
||||
|
||||
MOVQ 8(SI), DX
|
||||
// AX = 38*x1; the MULQ that follows leaves the 128-bit product with x4 in
// DX:AX, which ADDQ/ADCQ then fold into the r01:r00 pair with carry. The same
// scale / multiply / add-with-carry pattern is used for every later term.
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
|
||||
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
|
||||
MOVQ 0(SI), AX
|
||||
// AX = 2*x0: the doubling for the symmetric cross term is applied to one
// operand before the widening multiply.
SHLQ $1, AX
|
||||
MULQ 8(SI)
|
||||
MOVQ AX, R9 // r10
|
||||
MOVQ DX, R10 // r11
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
|
||||
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
MOVQ AX, R11 // r20
|
||||
MOVQ DX, R12 // r21
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
MULQ 8(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
|
||||
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
MOVQ AX, R13 // r30
|
||||
MOVQ DX, R14 // r31
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
|
||||
MOVQ 32(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
|
||||
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 32(SI)
|
||||
MOVQ AX, R15 // r40
|
||||
MOVQ DX, BX // r41
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
|
||||
MOVQ 16(SI), AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
|
||||
// Reduce
|
||||
// Each 128-bit pair is split at bit 51: the three-operand SHLQ shifts the
// high register left by 13 while pulling in the top 13 bits of the low
// register, the low register is then masked to 51 bits, and the previous
// limb's high part is added in. The high part of r4 wraps around into r0
// multiplied by 19.
MOVQ $2251799813685247, AX // (1<<51) - 1
|
||||
SHLQ $13, CX, R8 // r01 = shld with r00
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
SHLQ $13, R9, R10 // r11 = shld with r10
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ R8, R9 // r10 += r01
|
||||
SHLQ $13, R11, R12 // r21 = shld with r20
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ R10, R11 // r20 += r11
|
||||
SHLQ $13, R13, R14 // r31 = shld with r30
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
ADDQ R12, R13 // r30 += r21
|
||||
SHLQ $13, R15, BX // r41 = shld with r40
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
ADDQ R14, R15 // r40 += r31
|
||||
IMUL3Q $19, BX, DX // r41 = r41*19
|
||||
ADDQ DX, CX // r00 += r41
|
||||
|
||||
// Carry chain r0 -> r1 -> r2 -> r3 -> r4 -> r0: DX holds the bits above 51 of
// the current limb, which are added to the next limb before the current one
// is masked. The carry out of r4 is multiplied by 19 and added to r0 after r0
// has already been masked; r0 is not masked again afterwards.
MOVQ CX, DX // rdx <-- r00
|
||||
SHRQ $51, DX // rdx <-- r00 >> 51
|
||||
ADDQ DX, R9 // r10 += r00 >> 51
|
||||
MOVQ R9, DX // rdx <-- r10
|
||||
SHRQ $51, DX // rdx <-- r10 >> 51
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
ADDQ DX, R11 // r20 += r10 >> 51
|
||||
MOVQ R11, DX // rdx <-- r20
|
||||
SHRQ $51, DX // rdx <-- r20 >> 51
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ DX, R13 // r30 += r20 >> 51
|
||||
MOVQ R13, DX // rdx <-- r30
|
||||
SHRQ $51, DX // rdx <-- r30 >> 51
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ DX, R15 // r40 += r30 >> 51
|
||||
MOVQ R15, DX // rdx <-- r40
|
||||
SHRQ $51, DX // rdx <-- r40 >> 51
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
|
||||
ADDQ DX, CX // r00 += (r40 >> 51) *19
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
|
||||
// Store the five result limbs to out.
MOVQ CX, 0(DI)
|
||||
MOVQ R9, 8(DI)
|
||||
MOVQ R11, 16(DI)
|
||||
MOVQ R13, 24(DI)
|
||||
MOVQ R15, 32(DI)
|
||||
RET
|
|
@ -1,18 +0,0 @@
|
|||
// Copyright (c) 2017 George Tankersley. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package radix51
|
||||
|
||||
import "unsafe"
|
||||
|
||||
// mul64x64 multiplies the two 64-bit numbers a and b and adds the 128-bit
// product to the accumulator pair (lo, hi), returning the updated low and high
// words. Overflow out of the high word is discarded.
//
// The original author kept this function small so that it inlines; the
// carry is obtained without a branch by reading the comparison result as a
// byte.
func mul64x64(lo, hi, a, b uint64) (ol uint64, oh uint64) {
	const low32 = 0xFFFFFFFF

	// Split both operands into 32-bit halves.
	aLo, aHi := a&low32, a>>32
	bLo, bHi := b&low32, b>>32

	// Middle partial products, each folded with the carry-in from the
	// partial product below it.
	mid1 := aHi*bLo + (aLo*bLo)>>32
	mid2 := aLo*bHi + (mid1 & low32)

	// Low word: wrapping product plus the low accumulator. A wrapped sum is
	// smaller than lo, which is exactly the carry into the high word.
	ol = a*b + lo
	wrapped := ol < lo
	carry := uint64(*(*byte)(unsafe.Pointer(&wrapped)))

	oh = hi + aHi*bHi + mid1>>32 + mid2>>32 + carry
	return ol, oh
}
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.12
|
||||
// +build go1.13
|
||||
|
||||
package radix51
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.12
|
||||
// +build !go1.13
|
||||
|
||||
package radix51
|
||||
|
||||
|
|
Loading…
Reference in New Issue