diff --git a/LICENSE b/LICENSE
index a42f161..cb25973 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 Copyright (c) 2009 The Go Authors. All rights reserved.
 Copyright (c) 2017 George Tankersley. All rights reserved.
 Copyright (c) 2019 Henry de Valence. All rights reserved.
+Copyright (c) 2021 Oasis Labs Inc. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
diff --git a/go.mod b/go.mod
index 30a573b..eccb1de 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module github.com/gtank/ristretto255
 
 go 1.11
+
+require golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..1225630
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk=
+golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
diff --git a/internal/radix51/bench_test.go b/internal/radix51/bench_test.go
index f5c05fc..86cdd92 100644
--- a/internal/radix51/bench_test.go
+++ b/internal/radix51/bench_test.go
@@ -38,3 +38,12 @@ func BenchmarkMul32(b *testing.B) {
 		x.Mul32(&x, 0xaa42aa42)
 	}
 }
+
+func BenchmarkSquare(b *testing.B) {
+	var x radix51.FieldElement
+	x.Add(radix51.One, radix51.One)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		x.Square(&x)
+	}
+}
diff --git a/internal/radix51/fe_amd64.go b/internal/radix51/fe_amd64.go
index 8327caa..b23e6ba 100644
--- a/internal/radix51/fe_amd64.go
+++ b/internal/radix51/fe_amd64.go
@@ -1,4 +1,5 @@
 // Copyright (c) 2017 George Tankersley. All rights reserved.
+// Copyright (c) 2021 Oasis Labs Inc. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,8 +7,31 @@
 
 package radix51
 
-//go:noescape
-func feMul(out, a, b *FieldElement)
+import "golang.org/x/sys/cpu"
+
+var useBMI2 bool
+
+// In an ideal world, this would have a function for basic amd64
+// assembly, and one that uses BMI2. The inliner fails to inline a
+// function consisting of a single if/else statement, killing the
+// performance gained by using BMI2 in the first place.
+//
+// This horrendous inliner behavior was last checked on Go 1.15.7.
 
 //go:noescape
-func feSquare(out, x *FieldElement)
+func feMulAmd64(out, a, b *FieldElement, useBMI2 bool)
+
+//go:noescape
+func feSquareAmd64(out, x *FieldElement, useBMI2 bool)
+
+func feMul(out, a, b *FieldElement) {
+	feMulAmd64(out, a, b, useBMI2)
+}
+
+func feSquare(out, x *FieldElement) {
+	feSquareAmd64(out, x, useBMI2)
+}
+
+func init() {
+	useBMI2 = cpu.Initialized && cpu.X86.HasBMI2
+}
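
As an aside, the runtime dispatch above hangs off one package-level bool set at init. A minimal, self-contained probe of the same golang.org/x/sys/cpu API (the program itself is illustrative, not part of the patch):

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// cpu.Initialized reports whether the feature flags were actually
	// populated; on amd64 they are filled in at package init, so this
	// expression mirrors the init() in fe_amd64.go.
	fmt.Println("BMI2:", cpu.Initialized && cpu.X86.HasBMI2)
}
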
diff --git a/internal/radix51/fe_amd64.s b/internal/radix51/fe_amd64.s
index dfc95e6..293fb04 100644
--- a/internal/radix51/fe_amd64.s
+++ b/internal/radix51/fe_amd64.s
@@ -1,19 +1,181 @@
 // Copyright (c) 2017 George Tankersley. All rights reserved.
+// Copyright (c) 2021 Oasis Labs Inc. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // +build amd64,!purego
 
-// func feMul(out, a, b *FieldElement)
-TEXT ·feMul(SB),$0-24
+#include "textflag.h"
+
+// reduce64 reduces the 128-bit intermediate values stored in the register
+// pairs (rsi, rbp), (r8, r9), (r10, r11), (r12, r13), (r14, r15).
+//
+// Inputs: rsi, rbp, r8 .. r15.
+// Clobbers: rax, rdx
+#define reduce64() \
+	MOVQ   $2251799813685247, AX \ // (1<<51) - 1
+	SHLQ   $13, SI, BP           \ // r01 = shld with r00
+	ANDQ   AX, SI                \ // r00 &= mask51
+	SHLQ   $13, R8, R9           \ // r11 = shld with r10
+	ANDQ   AX, R8                \ // r10 &= mask51
+	ADDQ   BP, R8                \ // r10 += r01
+	SHLQ   $13, R10, R11         \ // r21 = shld with r20
+	ANDQ   AX, R10               \ // r20 &= mask51
+	ADDQ   R9, R10               \ // r20 += r11
+	SHLQ   $13, R12, R13         \ // r31 = shld with r30
+	ANDQ   AX, R12               \ // r30 &= mask51
+	ADDQ   R11, R12              \ // r30 += r21
+	SHLQ   $13, R14, R15         \ // r41 = shld with r40
+	ANDQ   AX, R14               \ // r40 &= mask51
+	ADDQ   R13, R14              \ // r40 += r31
+	IMUL3Q $19, R15, R15         \ // r41 = r41*19
+	ADDQ   R15, SI               \ // r00 += r41
+	                             \
+	MOVQ   SI, DX                \ // rdx <-- r00
+	SHRQ   $51, DX               \ // rdx <-- r00 >> 51
+	ADDQ   DX, R8                \ // r10 += r00 >> 51
+	MOVQ   R8, DX                \ // rdx <-- r10
+	SHRQ   $51, DX               \ // rdx <-- r10 >> 51
+	ANDQ   AX, SI                \ // r00 &= mask51
+	ADDQ   DX, R10               \ // r20 += r10 >> 51
+	MOVQ   R10, DX               \ // rdx <-- r20
+	SHRQ   $51, DX               \ // rdx <-- r20 >> 51
+	ANDQ   AX, R8                \ // r10 &= mask51
+	ADDQ   DX, R12               \ // r30 += r20 >> 51
+	MOVQ   R12, DX               \ // rdx <-- r30
+	SHRQ   $51, DX               \ // rdx <-- r30 >> 51
+	ANDQ   AX, R10               \ // r20 &= mask51
+	ADDQ   DX, R14               \ // r40 += r30 >> 51
+	MOVQ   R14, DX               \ // rdx <-- r40
+	SHRQ   $51, DX               \ // rdx <-- r40 >> 51
+	ANDQ   AX, R12               \ // r30 &= mask51
+	IMUL3Q $19, DX, DX           \ // rdx <-- (r40 >> 51) * 19
+	ADDQ   DX, SI                \ // r00 += (r40 >> 51) * 19
+	ANDQ   AX, R14                 // r40 &= mask51
+
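The carry flow in reduce64 is easier to audit in Go. The following stand-alone rendition is illustrative only (the function, its signature, and the [5]uint64 result are not part of this patch); each rX0/rX1 argument pair corresponds to one 128-bit column sum that the macro keeps in a register pair:

package main

import "fmt"

const mask51 = (1 << 51) - 1

// reduce64 mirrors the assembly macro: shift each high word left by 13 so it
// lands above bit 51 of its low word, fold the topmost limb back into limb 0
// with a factor of 19 (2^255 = 19 mod p), then run one carry pass.
func reduce64(r00, r01, r10, r11, r20, r21, r30, r31, r40, r41 uint64) [5]uint64 {
	r01 = r01<<13 | r00>>51 // SHLQ $13, SI, BP
	r00 &= mask51
	r11 = r11<<13 | r10>>51
	r10 &= mask51
	r10 += r01
	r21 = r21<<13 | r20>>51
	r20 &= mask51
	r20 += r11
	r31 = r31<<13 | r30>>51
	r30 &= mask51
	r30 += r21
	r41 = r41<<13 | r40>>51
	r40 &= mask51
	r40 += r31
	r00 += r41 * 19

	// One carry pass, with the final carry multiplied by 19 and folded
	// back into the lowest limb.
	r10 += r00 >> 51
	r00 &= mask51
	r20 += r10 >> 51
	r10 &= mask51
	r30 += r20 >> 51
	r20 &= mask51
	r40 += r30 >> 51
	r30 &= mask51
	r00 += (r40 >> 51) * 19
	r40 &= mask51

	return [5]uint64{r00, r10, r20, r30, r40}
}

func main() {
	// 2^52 in limb 0 carries into limb 1: expect [0 2 0 0 0].
	fmt.Println(reduce64(1<<52, 0, 0, 0, 0, 0, 0, 0, 0, 0))
}
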
+// func feMulAmd64(out, a, b *FieldElement, useBMI2 bool)
+TEXT ·feMulAmd64(SB), NOSPLIT|NOFRAME, $0-25
+	MOVQ a+8(FP), BX
+	MOVQ b+16(FP), CX
+
+	MOVBQZX useBMI2+24(FP), DX
+	TESTQ   DX, DX
+	JZ      mul_vanilla
+
+	// This codepath uses BMI2 to shave off a number of instructions,
+	// for a slight performance gain.
+
+	// r0 = a0*b0
+	// r1 = a1*b0
+	// r2 = a2*b0
+	// r3 = a3*b0
+	// r4 = a4*b0
+	MOVQ  0(CX), DX        // rdx <- b0
+	MULXQ 32(BX), R14, R15 // r40, r41 <- a4*b0
+	MULXQ 24(BX), R12, R13 // r30, r31 <- a3*b0
+	MULXQ 16(BX), R10, R11 // r20, r21 <- a2*b0
+	MULXQ 8(BX), R8, R9    // r10, r11 <- a1*b0
+	MULXQ 0(BX), SI, BP    // r00, r01 <- a0*b0
+
+	// r0 += a4*b1_19
+	// r1 += a0*b1
+	// r2 += a1*b1
+	// r3 += a2*b1
+	// r4 += a3*b1
+	MOVQ   8(CX), DX      // rdx <- b1
+	MULXQ  24(BX), AX, DI // rdi, rax <- a3*b1
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	MULXQ  16(BX), AX, DI // rdi, rax <- a2*b1
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+	MULXQ  8(BX), AX, DI  // rdi, rax <- a1*b1
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+	MULXQ  0(BX), AX, DI  // rdi, rax <- a0*b1
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+	IMUL3Q $19, DX, DX    // rdx <- b1*19
+	MULXQ  32(BX), AX, DI // rdi, rax <- a4*b1_19
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+
+	// r0 += a3*b2_19
+	// r1 += a4*b2_19
+	// r2 += a0*b2
+	// r3 += a1*b2
+	// r4 += a2*b2
+	MOVQ   16(CX), DX     // rdx <- b2
+	MULXQ  16(BX), AX, DI // rdi, rax <- a2*b2
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	MULXQ  8(BX), AX, DI  // rdi, rax <- a1*b2
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+	MULXQ  0(BX), AX, DI  // rdi, rax <- a0*b2
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+	IMUL3Q $19, DX, DX    // rdx <- b2*19
+	MULXQ  32(BX), AX, DI // rdi, rax <- a4*b2_19
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+	MULXQ  24(BX), AX, DI // rdi, rax <- a3*b2_19
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+
+	// r0 += a2*b3_19
+	// r1 += a3*b3_19
+	// r2 += a4*b3_19
+	// r3 += a0*b3
+	// r4 += a1*b3
+	MOVQ   24(CX), DX     // rdx <- b3
+	MULXQ  8(BX), AX, DI  // rdi, rax <- a1*b3
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	MULXQ  0(BX), AX, DI  // rdi, rax <- a0*b3
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+	IMUL3Q $19, DX, DX    // rdx <- b3*19
+	MULXQ  32(BX), AX, DI // rdi, rax <- a4*b3_19
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+	MULXQ  24(BX), AX, DI // rdi, rax <- a3*b3_19
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+	MULXQ  16(BX), AX, DI // rdi, rax <- a2*b3_19
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+
+	// r0 += a1*b4_19
+	// r1 += a2*b4_19
+	// r2 += a3*b4_19
+	// r3 += a4*b4_19
+	// r4 += a0*b4
+	MOVQ   32(CX), DX     // rdx <- b4
+	MULXQ  0(BX), AX, DI  // rdi, rax <- a0*b4
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	IMUL3Q $19, DX, DX    // rdx <- b4*19
+	MULXQ  32(BX), AX, DI // rdi, rax <- a4*b4_19
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+	MULXQ  24(BX), AX, DI // rdi, rax <- a3*b4_19
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+	MULXQ  16(BX), AX, DI // rdi, rax <- a2*b4_19
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+	MULXQ  8(BX), AX, DI  // rdi, rax <- a1*b4_19
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+
+	JMP mul_reduce
+
+mul_vanilla:
 	// Based on assembly generated by PeachPy. Equivalent to the Go in
 	// feMulGeneric, which was originally based on the amd64-51-30k
 	// assembly in SUPERCOP.
-	MOVQ out+0(FP), DI
-	MOVQ a+8(FP), BX
-	MOVQ b+16(FP), CX
-
 	// Calculate r0
 	MOVQ 0(BX), AX // rax <-- x0
 	MULQ 0(CX)     // rdx, rax <-- x0*y0
@@ -154,47 +316,10 @@ TEXT ·feMul(SB),$0-24
 	ADDQ AX, R14
 	ADCQ DX, R15
 
+mul_reduce:
+	reduce64()
-	MOVQ $2251799813685247, AX // (1<<51) - 1
-	SHLQ $13, SI, BP           // r01 = shld with r00
-	ANDQ AX, SI                // r00 &= mask51
-	SHLQ $13, R8, R9           // r11 = shld with r10
-	ANDQ AX, R8                // r10 &= mask51
-	ADDQ BP, R8                // r10 += r01
-	SHLQ $13, R10, R11         // r21 = shld with r20
-	ANDQ AX, R10               // r20 &= mask51
-	ADDQ R9, R10               // r20 += r11
-	SHLQ $13, R12, R13         // r31 = shld with r30
-	ANDQ AX, R12               // r30 &= mask51
-	ADDQ R11, R12              // r30 += r21
-	SHLQ $13, R14, R15         // r41 = shld with r40
-	ANDQ AX, R14               // r40 &= mask51
-	ADDQ R13, R14              // r40 += r31
-	IMUL3Q $19, R15, R15       // r41 = r41*19
-	ADDQ R15, SI               // r00 += r41
-
-	MOVQ SI, DX                // rdx <-- r00
-	SHRQ $51, DX               // rdx <-- r00 >> 51
-	ADDQ DX, R8                // r10 += r00 >> 51
-	MOVQ R8, DX                // rdx <-- r10
-	SHRQ $51, DX               // rdx <-- r10 >> 51
-	ANDQ AX, SI                // r00 &= mask51
-	ADDQ DX, R10               // r20 += r10 >> 51
-	MOVQ R10, DX               // rdx <-- r20
-	SHRQ $51, DX               // rdx <-- r20 >> 51
-	ANDQ AX, R8                // r10 &= mask51
-	ADDQ DX, R12               // r30 += r20 >> 51
-	MOVQ R12, DX               // rdx <-- r30
-	SHRQ $51, DX               // rdx <-- r30 >> 51
-	ANDQ AX, R10               // r20 &= mask51
-	ADDQ DX, R14               // r40 += r30 >> 51
-	MOVQ R14, DX               // rdx <-- r40
-	SHRQ $51, DX               // rdx <-- r40 >> 51
-	ANDQ AX, R12               // r30 &= mask51
-	IMUL3Q $19, DX, DX         // rdx <-- (r40 >> 51) * 19
-	ADDQ DX, SI                // r00 += (r40 >> 51) * 19
-	ANDQ AX, R14               // r40 &= mask51
-
+	MOVQ out+0(FP), DI
 	MOVQ SI, 0(DI)
 	MOVQ R8, 8(DI)
 	MOVQ R10, 16(DI)
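
Both codepaths of feMulAmd64 accumulate the same five column sums; what BMI2 buys is MULXQ, a 64x64->128 multiply that takes one operand implicitly from rdx and, unlike MULQ, leaves the flags untouched, so the ADDQ/ADCQ accumulation chains are never disturbed. As a cross-check, here is the column accumulation in portable Go (illustrative names, not part of the patch; math/bits.Mul64 is the analog of MULXQ):

package main

import (
	"fmt"
	"math/bits"
)

// mulColumns accumulates the five 128-bit column sums r0..r4 of a*b in
// radix-2^51, folding every term that wraps past limb 4 back in with a
// factor of 19, exactly as the per-limb comment blocks above describe.
// Assumes reduced limbs (< 2^52 or so) so b[j]*19 cannot overflow.
func mulColumns(a, b *[5]uint64) (lo, hi [5]uint64) {
	acc := func(k int, x, y uint64) {
		h, l := bits.Mul64(x, y) // what MULXQ computes
		var c uint64
		lo[k], c = bits.Add64(lo[k], l, 0) // ADDQ
		hi[k] += h + c                     // ADCQ
	}
	for i := 0; i < 5; i++ {
		for j := 0; j < 5; j++ {
			if i+j < 5 {
				acc(i+j, a[i], b[j])
			} else {
				acc(i+j-5, a[i], b[j]*19) // e.g. r0 += a4*b1_19
			}
		}
	}
	return
}

func main() {
	a := [5]uint64{2, 0, 0, 0, 0}
	b := [5]uint64{3, 0, 0, 0, 0}
	lo, _ := mulColumns(&a, &b)
	fmt.Println(lo) // [6 0 0 0 0]
}
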
@@ -202,147 +327,190 @@ TEXT ·feMul(SB),$0-24
 	MOVQ R14, 32(DI)
 	RET
 
-// func feSquare(out, x *FieldElement)
-TEXT ·feSquare(SB),4,$0-16
-	MOVQ out+0(FP), DI
-	MOVQ x+8(FP), SI
+// func feSquareAmd64(out, x *FieldElement, useBMI2 bool)
+TEXT ·feSquareAmd64(SB), NOSPLIT|NOFRAME, $0-17
+	MOVQ x+8(FP), BX
+
+	// Pick the appropriate implementation, based on whether the caller
+	// thinks BMI2 is supported or not.
+	MOVBQZX useBMI2+16(FP), DX
+	TESTQ   DX, DX
+	JZ      square_vanilla
+
+	// This codepath uses BMI2 to shave off a number of instructions,
+	// for a slight performance gain.
+
+	// r0 = a0*a0
+	// r1 = 2*a0*a1
+	// r2 = 2*a0*a2
+	// r3 = 2*a0*a3
+	// r4 = 2*a0*a4
+	MOVQ  0(BX), DX        // rdx <- a0
+	MULXQ DX, SI, BP       // r00, r01 <- a0*a0
+	SHLQ  $1, DX           // rdx *= 2 (d0 = 2*a0)
+	MULXQ 8(BX), R8, R9    // r10, r11 <- d0*a1
+	MULXQ 16(BX), R10, R11 // r20, r21 <- d0*a2
+	MULXQ 24(BX), R12, R13 // r30, r31 <- d0*a3
+	MULXQ 32(BX), R14, R15 // r40, r41 <- d0*a4
+
+	// r2 += a1*a1
+	// r3 += 2*a1*a2
+	// r4 += 2*a1*a3
+	MOVQ   8(BX), DX      // rdx <- a1
+	MULXQ  DX, AX, DI     // rdi, rax = a1*a1
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+	SHLQ   $1, DX         // rdx *= 2 (d1 = 2*a1)
+	MULXQ  16(BX), AX, DI // rdi, rax = d1*a2
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+	MULXQ  24(BX), AX, DI // rdi, rax = d1*a3
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	IMUL3Q $19, DX, DX    // rdx *= 19 (d1_38 = 2*19*a1)
+	MULXQ  32(BX), AX, DI // rdi, rax = d1_38*a4
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+
+	// r4 += a2*a2
+	// r0 += 2*19*a2*a3
+	// r1 += 2*19*a2*a4
+	MOVQ   16(BX), DX     // rdx <- a2
+	MULXQ  DX, AX, DI     // rdi, rax = a2*a2
+	ADDQ   AX, R14        // r40 += rax
+	ADCQ   DI, R15        // r41 += rdi + cf
+	IMUL3Q $38, DX, DX    // rdx *= 2*19 (d2_38 = 2*19*a2)
+	MULXQ  24(BX), AX, DI // rdi, rax = d2_38*a3
+	ADDQ   AX, SI         // r00 += rax
+	ADCQ   DI, BP         // r01 += rdi + cf
+	MULXQ  32(BX), AX, DI // rdi, rax = d2_38*a4
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+
+	// r1 += 19*a3*a3
+	// r2 += 2*19*a4*a3
+	MOVQ   24(BX), DX     // rdx <- a3
+	IMUL3Q $19, DX, DX    // rdx *= 19 (d3_19 = 19*a3)
+	MULXQ  24(BX), AX, DI // rdi, rax = d3_19*a3
+	ADDQ   AX, R8         // r10 += rax
+	ADCQ   DI, R9         // r11 += rdi + cf
+	SHLQ   $1, DX         // rdx *= 2 (d3_38 = 2*19*a3)
+	MULXQ  32(BX), AX, DI // rdi, rax = d3_38*a4
+	ADDQ   AX, R10        // r20 += rax
+	ADCQ   DI, R11        // r21 += rdi + cf
+
+	// r3 += 19*a4*a4
+	MOVQ   32(BX), DX     // rdx <- a4
+	IMUL3Q $19, DX, DX    // rdx *= 19 (d4_19 = 19*a4)
+	MULXQ  32(BX), AX, DI // rdi, rax = d4_19*a4
+	ADDQ   AX, R12        // r30 += rax
+	ADCQ   DI, R13        // r31 += rdi + cf
+
+	JMP square_reduce
+
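Squaring halves the multiply count by symmetry: each cross term ai*aj with i != j occurs twice, so one operand is pre-doubled (d1, d3_38, ...), and any term wrapping past limb 4 picks up the usual factor of 19, which is where the 38 = 2*19 scalings come from. The generic path below computes exactly these columns; as a portable Go cross-check (illustrative, not part of the patch):

package main

import (
	"fmt"
	"math/bits"
)

// squareColumns mirrors the r0..r4 formulas in the comments above and in the
// vanilla path (e.g. r0 = x0*x0 + x1*38*x4 + x2*38*x3). Assumes reduced
// limbs so the small constant scalings cannot overflow 64 bits.
func squareColumns(x *[5]uint64) (lo, hi [5]uint64) {
	acc := func(k int, a, b uint64) {
		h, l := bits.Mul64(a, b)
		var c uint64
		lo[k], c = bits.Add64(lo[k], l, 0)
		hi[k] += h + c
	}
	acc(0, x[0], x[0])
	acc(0, x[1]*38, x[4])
	acc(0, x[2]*38, x[3])

	acc(1, x[0]*2, x[1])
	acc(1, x[2]*38, x[4])
	acc(1, x[3]*19, x[3])

	acc(2, x[0]*2, x[2])
	acc(2, x[1], x[1])
	acc(2, x[3]*38, x[4])

	acc(3, x[0]*2, x[3])
	acc(3, x[1]*2, x[2])
	acc(3, x[4]*19, x[4])

	acc(4, x[0]*2, x[4])
	acc(4, x[1]*2, x[3])
	acc(4, x[2], x[2])
	return
}

func main() {
	x := [5]uint64{3, 0, 0, 0, 0}
	lo, _ := squareColumns(&x)
	fmt.Println(lo[0]) // 9
}
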
+square_vanilla:
 	// r0 = x0*x0 + x1*38*x4 + x2*38*x3
-	MOVQ 0(SI), AX
-	MULQ 0(SI)
-	MOVQ AX, CX // r00
-	MOVQ DX, R8 // r01
+	MOVQ 0(BX), AX
+	MULQ 0(BX)
+	MOVQ AX, SI // r00
+	MOVQ DX, BP // r01
 
-	MOVQ 8(SI), DX
+	MOVQ 8(BX), DX
 	IMUL3Q $38, DX, AX
-	MULQ 32(SI)
-	ADDQ AX, CX
-	ADCQ DX, R8
+	MULQ 32(BX)
+	ADDQ AX, SI
+	ADCQ DX, BP
 
-	MOVQ 16(SI), DX
+	MOVQ 16(BX), DX
 	IMUL3Q $38, DX, AX
-	MULQ 24(SI)
-	ADDQ AX, CX
-	ADCQ DX, R8
+	MULQ 24(BX)
+	ADDQ AX, SI
+	ADCQ DX, BP
 
 	// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
-	MOVQ 0(SI), AX
+	MOVQ 0(BX), AX
 	SHLQ $1, AX
-	MULQ 8(SI)
-	MOVQ AX, R9  // r10
-	MOVQ DX, R10 // r11
+	MULQ 8(BX)
+	MOVQ AX, R8 // r10
+	MOVQ DX, R9 // r11
 
-	MOVQ 16(SI), DX
+	MOVQ 16(BX), DX
 	IMUL3Q $38, DX, AX
-	MULQ 32(SI)
-	ADDQ AX, R9
-	ADCQ DX, R10
+	MULQ 32(BX)
+	ADDQ AX, R8
+	ADCQ DX, R9
 
-	MOVQ 24(SI), DX
+	MOVQ 24(BX), DX
 	IMUL3Q $19, DX, AX
-	MULQ 24(SI)
-	ADDQ AX, R9
-	ADCQ DX, R10
+	MULQ 24(BX)
+	ADDQ AX, R8
+	ADCQ DX, R9
 
 	// r2 = x0*2*x2 + x1*x1 + x3*38*x4
-	MOVQ 0(SI), AX
+	MOVQ 0(BX), AX
 	SHLQ $1, AX
-	MULQ 16(SI)
-	MOVQ AX, R11 // r20
-	MOVQ DX, R12 // r21
+	MULQ 16(BX)
+	MOVQ AX, R10 // r20
+	MOVQ DX, R11 // r21
 
-	MOVQ 8(SI), AX
-	MULQ 8(SI)
-	ADDQ AX, R11
-	ADCQ DX, R12
+	MOVQ 8(BX), AX
+	MULQ 8(BX)
+	ADDQ AX, R10
+	ADCQ DX, R11
 
-	MOVQ 24(SI), DX
+	MOVQ 24(BX), DX
 	IMUL3Q $38, DX, AX
-	MULQ 32(SI)
-	ADDQ AX, R11
-	ADCQ DX, R12
+	MULQ 32(BX)
+	ADDQ AX, R10
+	ADCQ DX, R11
 
 	// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
-	MOVQ 0(SI), AX
+	MOVQ 0(BX), AX
 	SHLQ $1, AX
-	MULQ 24(SI)
-	MOVQ AX, R13 // r30
-	MOVQ DX, R14 // r31
+	MULQ 24(BX)
+	MOVQ AX, R12 // r30
+	MOVQ DX, R13 // r31
 
-	MOVQ 8(SI), AX
+	MOVQ 8(BX), AX
 	SHLQ $1, AX
-	MULQ 16(SI)
-	ADDQ AX, R13
-	ADCQ DX, R14
+	MULQ 16(BX)
+	ADDQ AX, R12
+	ADCQ DX, R13
 
-	MOVQ 32(SI), DX
+	MOVQ 32(BX), DX
 	IMUL3Q $19, DX, AX
-	MULQ 32(SI)
-	ADDQ AX, R13
-	ADCQ DX, R14
+	MULQ 32(BX)
+	ADDQ AX, R12
+	ADCQ DX, R13
 
 	// r4 = x0*2*x4 + x1*2*x3 + x2*x2
-	MOVQ 0(SI), AX
+	MOVQ 0(BX), AX
 	SHLQ $1, AX
-	MULQ 32(SI)
-	MOVQ AX, R15 // r40
-	MOVQ DX, BX  // r41
+	MULQ 32(BX)
+	MOVQ AX, R14 // r40
+	MOVQ DX, R15 // r41
 
-	MOVQ 8(SI), AX
+	MOVQ 8(BX), AX
 	SHLQ $1, AX
-	MULQ 24(SI)
-	ADDQ AX, R15
-	ADCQ DX, BX
+	MULQ 24(BX)
+	ADDQ AX, R14
+	ADCQ DX, R15
 
-	MOVQ 16(SI), AX
-	MULQ 16(SI)
-	ADDQ AX, R15
-	ADCQ DX, BX
+	MOVQ 16(BX), AX
+	MULQ 16(BX)
+	ADDQ AX, R14
+	ADCQ DX, R15
 
+square_reduce:
 	// Reduce
-	MOVQ $2251799813685247, AX // (1<<51) - 1
-	SHLQ $13, CX, R8           // r01 = shld with r00
-	ANDQ AX, CX                // r00 &= mask51
-	SHLQ $13, R9, R10          // r11 = shld with r10
-	ANDQ AX, R9                // r10 &= mask51
-	ADDQ R8, R9                // r10 += r01
-	SHLQ $13, R11, R12         // r21 = shld with r20
-	ANDQ AX, R11               // r20 &= mask51
-	ADDQ R10, R11              // r20 += r11
-	SHLQ $13, R13, R14         // r31 = shld with r30
-	ANDQ AX, R13               // r30 &= mask51
-	ADDQ R12, R13              // r30 += r21
-	SHLQ $13, R15, BX          // r41 = shld with r40
-	ANDQ AX, R15               // r40 &= mask51
-	ADDQ R14, R15              // r40 += r31
-	IMUL3Q $19, BX, DX         // r41 = r41*19
-	ADDQ DX, CX                // r00 += r41
+	reduce64()
 
-	MOVQ CX, DX  // rdx <-- r00
-	SHRQ $51, DX // rdx <-- r00 >> 51
-	ADDQ DX, R9  // r10 += r00 >> 51
-	MOVQ R9, DX  // rdx <-- r10
-	SHRQ $51, DX // rdx <-- r10 >> 51
-	ANDQ AX, CX  // r00 &= mask51
-	ADDQ DX, R11 // r20 += r10 >> 51
-	MOVQ R11, DX // rdx <-- r20
-	SHRQ $51, DX // rdx <-- r20 >> 51
-	ANDQ AX, R9  // r10 &= mask51
-	ADDQ DX, R13 // r30 += r20 >> 51
-	MOVQ R13, DX // rdx <-- r30
-	SHRQ $51, DX // rdx <-- r30 >> 51
-	ANDQ AX, R11 // r20 &= mask51
-	ADDQ DX, R15 // r40 += r30 >> 51
-	MOVQ R15, DX // rdx <-- r40
-	SHRQ $51, DX // rdx <-- r40 >> 51
-	ANDQ AX, R13 // r30 &= mask51
-	IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
-	ADDQ DX, CX  // r00 += (r40 >> 51) * 19
-	ANDQ AX, R15 // r40 &= mask51
+	MOVQ out+0(FP), DI
+	MOVQ SI, 0(DI)
+	MOVQ R8, 8(DI)
+	MOVQ R10, 16(DI)
+	MOVQ R12, 24(DI)
+	MOVQ R14, 32(DI)
-	MOVQ CX, 0(DI)
-	MOVQ R9, 8(DI)
-	MOVQ R11, 16(DI)
-	MOVQ R13, 24(DI)
-	MOVQ R15, 32(DI)
 	RET
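
With BenchmarkSquare in place alongside the existing multiply benchmarks, the assembly and pure-Go implementations can be compared directly; the purego build tag seen in the // +build line above selects the generic code. For example (paths as in this repository; whether the BMI2 path runs is decided at init from the host CPU):

	go test -bench 'Mul|Square' ./internal/radix51/
	go test -tags purego -bench 'Mul|Square' ./internal/radix51/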