Replace x/crypto/ed25519 code with github.com/gtank/ed25519

Code pulled from commit 0a030f62c0 with FeEqual and FeCSwap removed.
2019-01-21 17:43:47 -05:00 · 2019-01-21 17:43:47 -05:00 · 7522470fbc
parent 2156d823cd
commit 7522470fbc
19 changed files with 1481 additions and 3249 deletions
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 Copyright (c) 2009 The Go Authors. All rights reserved.
-Copyright (c) 2019 George Tankersley. All rights reserved.
+Copyright (c) 2017 George Tankersley. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
--- a/internal/edwards25519/const.go
+++ b/internal/edwards25519/const.go
@ -5,7 +5,11 @@

 package edwards25519

-import "math/big"
+import (
+	"math/big"
+
+	"github.com/gtank/ristretto255/internal/edwards25519/internal/radix51"
+)

 var (
 	SQRT_M1           FieldElement
@ -24,9 +28,9 @@ func init() {
 		CONST_D_MINUS_ONE_SQ, _    = new(big.Int).SetString("40440834346308536858101042469323190826248399146238708352240133220865137265952", 10)
 	)

-	feFromBig(&SQRT_M1, CONST_SQRT_M1)
-	feFromBig(&SQRT_AD_MINUS_ONE, CONST_SQRT_AD_MINUS_ONE)
-	feFromBig(&INVSQRT_A_MINUS_D, CONST_INVSQRT_A_MINUS_D)
-	feFromBig(&ONE_MINUS_D_SQ, CONST_ONE_MINUS_D_SQ)
-	feFromBig(&D_MINUS_ONE_SQ, CONST_D_MINUS_ONE_SQ)
+	radix51.FeFromBig(&SQRT_M1, CONST_SQRT_M1)
+	radix51.FeFromBig(&SQRT_AD_MINUS_ONE, CONST_SQRT_AD_MINUS_ONE)
+	radix51.FeFromBig(&INVSQRT_A_MINUS_D, CONST_INVSQRT_A_MINUS_D)
+	radix51.FeFromBig(&ONE_MINUS_D_SQ, CONST_ONE_MINUS_D_SQ)
+	radix51.FeFromBig(&D_MINUS_ONE_SQ, CONST_D_MINUS_ONE_SQ)
 }
--- a/internal/edwards25519/fe.go
+++ b/internal/edwards25519/fe.go
@ -7,51 +7,57 @@ package edwards25519

 import (
 	"crypto/subtle"
-	"math/big"

-	x "github.com/gtank/ristretto255/internal/edwards25519/internal/edwards25519"
+	"github.com/gtank/ristretto255/internal/edwards25519/internal/radix51"
 )

 // FeEqual returns 1 if a and b are equal, and 0 otherwise.
 func FeEqual(a, b *FieldElement) int {
 	var sa, sb [32]byte
-	x.FeToBytes(&sa, a)
-	x.FeToBytes(&sb, b)
+	radix51.FeToBytes(&sa, a)
+	radix51.FeToBytes(&sb, b)
 	return subtle.ConstantTimeCompare(sa[:], sb[:])
 }

 // FeSelect sets out to v if cond == 1, and to u if cond == 0.
 // out, v and u are allowed to overlap.
 func FeSelect(out, v, u *FieldElement, cond int) {
-	x.FeCMove(out, u, int32(cond^1))
-	x.FeCMove(out, v, int32(cond))
+	b := uint64(cond) * 0xffffffffffffffff
+	out[0] = (b & v[0]) | (^b & u[0])
+	out[1] = (b & v[1]) | (^b & u[1])
+	out[2] = (b & v[2]) | (^b & u[2])
+	out[3] = (b & v[3]) | (^b & u[3])
+	out[4] = (b & v[4]) | (^b & u[4])
 }

 // FeCondNeg sets u to -u if cond == 1, and to u if cond == 0.
 func FeCondNeg(u *FieldElement, cond int) {
 	var neg FieldElement
 	FeNeg(&neg, u)
-	x.FeCMove(u, &neg, int32(cond))
+
+	b := uint64(cond) * 0xffffffffffffffff
+	u[0] ^= b & (u[0] ^ neg[0])
+	u[1] ^= b & (u[1] ^ neg[1])
+	u[2] ^= b & (u[2] ^ neg[2])
+	u[3] ^= b & (u[3] ^ neg[3])
+	u[4] ^= b & (u[4] ^ neg[4])
+}
+
+// FeIsNegative returns 1 if u is negative, and 0 otherwise.
+func FeIsNegative(u *FieldElement) int {
+	var b [32]byte
+	radix51.FeToBytes(&b, u)
+	return int(b[0] & 1)
 }

 // FeAbs sets out to |u|. out and u are allowed to overlap.
 func FeAbs(out, u *FieldElement) {
 	var neg FieldElement
 	FeNeg(&neg, u)
-	FeSelect(out, &neg, u, int(FeIsNegative(u)))
+	FeSelect(out, &neg, u, FeIsNegative(u))
 }

-func feFromBig(dst *FieldElement, n *big.Int) {
-	var buf [32]byte
-	nn := n.Bytes()
-	copy(buf[len(buf)-len(nn):], nn)
-	for i := range buf[:len(buf)/2] {
-		buf[i], buf[len(buf)-1] = buf[len(buf)-1], buf[i]
-	}
-	x.FeFromBytes(dst, &buf)
-}
-
-// Copied from second-level internal/edwards25519
+// fePow22523 is from x/crypto/ed25519/internal/edwards25519.
 func fePow22523(out, z *FieldElement) {
 	var t0, t1, t2 FieldElement
 	var i int
--- a/internal/edwards25519/internal/edwards25519/const.go
+++ b/internal/edwards25519/internal/edwards25519/const.go
--- a/internal/edwards25519/internal/edwards25519/edwards25519.go
+++ b/internal/edwards25519/internal/edwards25519/edwards25519.go
--- a/internal/edwards25519/internal/group/const.go
+++ b/internal/edwards25519/internal/group/const.go
@ -0,0 +1,11 @@
+package group
+
+import "github.com/gtank/ristretto255/internal/edwards25519/internal/radix51"
+
+var (
+	// d, a constant in the curve equation
+	D radix51.FieldElement = [5]uint64{929955233495203, 466365720129213, 1662059464998953, 2033849074728123, 1442794654840575}
+
+	// 2*d, used in addition formula
+	D2 radix51.FieldElement = [5]uint64{1859910466990425, 932731440258426, 1072319116312658, 1815898335770999, 633789495995903}
+)
--- a/internal/edwards25519/internal/group/ge.go
+++ b/internal/edwards25519/internal/group/ge.go
@ -0,0 +1,272 @@
+// Implements group logic for the Ed25519 curve.
+
+package group
+
+import (
+	"math/big"
+
+	field "github.com/gtank/ristretto255/internal/edwards25519/internal/radix51"
+)
+
+// From EFD https://hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html
+// An elliptic curve in twisted Edwards form has parameters a, d and coordinates
+// x, y satisfying the following equations:
+//
+//     a * x^2 + y^2 = 1 + d * x^2 * y^2
+//
+// Extended coordinates assume a = -1 and represent x, y as (X, Y, Z, T)
+// satisfying the following equations:
+//
+//     x = X / Z
+//     y = Y / Z
+//     x * y = T / Z
+//
+// This representation was introduced in the HisilWongCarterDawson paper "Twisted
+// Edwards curves revisited" (Asiacrypt 2008).
+type ExtendedGroupElement struct {
+	X, Y, Z, T field.FieldElement
+}
+
+// Converts (x,y) to (X:Y:T:Z) extended coordinates, or "P3" in ref10. As
+// described in "Twisted Edwards Curves Revisited", Hisil-Wong-Carter-Dawson
+// 2008, Section 3.1 (https://eprint.iacr.org/2008/522.pdf)
+// See also https://hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#addition-add-2008-hwcd-3
+func (v *ExtendedGroupElement) FromAffine(x, y *big.Int) {
+	field.FeFromBig(&v.X, x)
+	field.FeFromBig(&v.Y, y)
+	field.FeMul(&v.T, &v.X, &v.Y)
+	field.FeOne(&v.Z)
+}
+
+// Extended coordinates are XYZT with x = X/Z, y = Y/Z, or the "P3"
+// representation in ref10. Extended->affine is the same operation as moving
+// from projective to affine. Per HWCD, it is safe to move from extended to
+// projective by simply ignoring T.
+func (v *ExtendedGroupElement) ToAffine() (*big.Int, *big.Int) {
+	var x, y, zinv field.FieldElement
+
+	field.FeInvert(&zinv, &v.Z)
+	field.FeMul(&x, &v.X, &zinv)
+	field.FeMul(&y, &v.Y, &zinv)
+
+	return field.FeToBig(&x), field.FeToBig(&y)
+}
+
+// Per HWCD, it is safe to move from extended to projective by simply ignoring T.
+func (v *ExtendedGroupElement) ToProjective() *ProjectiveGroupElement {
+	var p ProjectiveGroupElement
+
+	field.FeCopy(&p.X, &v.X)
+	field.FeCopy(&p.Y, &v.Y)
+	field.FeCopy(&p.Z, &v.Z)
+
+	return &p
+}
+
+func (v *ExtendedGroupElement) Zero() *ExtendedGroupElement {
+	field.FeZero(&v.X)
+	field.FeOne(&v.Y)
+	field.FeOne(&v.Z)
+	field.FeZero(&v.T)
+	return v
+}
+
+// This is the same addition formula everyone uses, "add-2008-hwcd-3".
+// https://hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#addition-add-2008-hwcd-3
+// TODO We know Z1=1 and Z2=1 here, so mmadd-2008-hwcd-3 (6M + 1S + 1*k + 9add) could apply
+func (v *ExtendedGroupElement) Add(p1, p2 *ExtendedGroupElement) *ExtendedGroupElement {
+	var tmp1, tmp2, A, B, C, D, E, F, G, H field.FieldElement
+	field.FeSub(&tmp1, &p1.Y, &p1.X) // tmp1 <-- Y1-X1
+	field.FeSub(&tmp2, &p2.Y, &p2.X) // tmp2 <-- Y2-X2
+	field.FeMul(&A, &tmp1, &tmp2)    // A <-- tmp1*tmp2 = (Y1-X1)*(Y2-X2)
+	field.FeAdd(&tmp1, &p1.Y, &p1.X) // tmp1 <-- Y1+X1
+	field.FeAdd(&tmp2, &p2.Y, &p2.X) // tmp2 <-- Y2+X2
+	field.FeMul(&B, &tmp1, &tmp2)    // B <-- tmp1*tmp2 = (Y1+X1)*(Y2+X2)
+	field.FeMul(&tmp1, &p1.T, &p2.T) // tmp1 <-- T1*T2
+	field.FeMul(&C, &tmp1, &D2)      // C <-- tmp1*2d = T1*2d*T2
+	field.FeMul(&tmp1, &p1.Z, &p2.Z) // tmp1 <-- Z1*Z2
+	field.FeAdd(&D, &tmp1, &tmp1)    // D <-- tmp1 + tmp1 = 2*Z1*Z2
+	field.FeSub(&E, &B, &A)          // E <-- B-A
+	field.FeSub(&F, &D, &C)          // F <-- D-C
+	field.FeAdd(&G, &D, &C)          // G <-- D+C
+	field.FeAdd(&H, &B, &A)          // H <-- B+A
+	field.FeMul(&v.X, &E, &F)        // X3 <-- E*F
+	field.FeMul(&v.Y, &G, &H)        // Y3 <-- G*H
+	field.FeMul(&v.T, &E, &H)        // T3 <-- E*H
+	field.FeMul(&v.Z, &F, &G)        // Z3 <-- F*G
+	return v
+}
+
+// This implements the explicit formulas from HWCD Section 3.3, "Dedicated
+// Doubling in [extended coordinates]".
+//
+// Explicit formula is as follows. Cost is 4M + 4S + 1D. For Ed25519, a = -1:
+//
+//       A ← X1^2
+//       B ← Y1^2
+//       C ← 2*Z1^2
+//       D ← a*A
+//       E ← (X1+Y1)^2 − A − B
+//       G ← D+B
+//       F ← G−C
+//       H ← D−B
+//       X3 ← E*F
+//       Y3 ← G*H
+//       T3 ← E*H
+//       Z3 ← F*G
+//
+// In ref10/donna/dalek etc, this is instead handled by a faster
+// mixed-coordinate doubling that results in a "Completed" group element
+// instead of another point in extended coordinates. I have implemented it
+// this way to see if more straightforward code is worth the (hopefully small)
+// performance tradeoff.
+func (v *ExtendedGroupElement) Double() *ExtendedGroupElement {
+	// TODO: Convert to projective coordinates? Section 4.3 mixed doubling?
+	// TODO: make a decision about how these APIs work wrt chaining/smashing
+	// *v = *(v.ToProjective().Double().ToExtended())
+	// return v
+
+	var A, B, C, D, E, F, G, H field.FieldElement
+
+	// A ← X1^2, B ← Y1^2
+	field.FeSquare(&A, &v.X)
+	field.FeSquare(&B, &v.Y)
+
+	// C ← 2*Z1^2
+	field.FeSquare(&C, &v.Z)
+	field.FeAdd(&C, &C, &C) // TODO should probably implement FeSquare2
+
+	// D ← -1*A
+	field.FeNeg(&D, &A) // implemented as subtraction
+
+	// E ← (X1+Y1)^2 − A − B
+	var t0 field.FieldElement
+	field.FeAdd(&t0, &v.X, &v.Y)
+	field.FeSquare(&t0, &t0)
+	field.FeSub(&E, &t0, &A)
+	field.FeSub(&E, &E, &B)
+
+	// G ← D+B
+	field.FeAdd(&G, &D, &B)
+	// F ← G−C
+	field.FeSub(&F, &G, &C)
+	// H ← D−B
+	field.FeSub(&H, &D, &B)
+	// X3 ← E*F
+	field.FeMul(&v.X, &E, &F)
+	// Y3 ← G*H
+	field.FeMul(&v.Y, &G, &H)
+	// T3 ← E*H
+	field.FeMul(&v.T, &E, &H)
+	// Z3 ← F*G
+	field.FeMul(&v.Z, &F, &G)
+
+	return v
+}
+
+// Projective coordinates are XYZ with x = X/Z, y = Y/Z, or the "P2"
+// representation in ref10. This representation has a cheaper doubling formula
+// than extended coordinates.
+type ProjectiveGroupElement struct {
+	X, Y, Z field.FieldElement
+}
+
+func (v *ProjectiveGroupElement) FromAffine(x, y *big.Int) {
+	field.FeFromBig(&v.X, x)
+	field.FeFromBig(&v.Y, y)
+	field.FeOne(&v.Z)
+}
+
+func (v *ProjectiveGroupElement) ToAffine() (*big.Int, *big.Int) {
+	var x, y, zinv field.FieldElement
+
+	field.FeInvert(&zinv, &v.Z)
+	field.FeMul(&x, &v.X, &zinv)
+	field.FeMul(&y, &v.Y, &zinv)
+
+	return field.FeToBig(&x), field.FeToBig(&y)
+}
+
+// HWCD Section 3: "Given (X : Y : Z) in [projective coordinates] passing to
+// [extended coordinates, (X : Y : T : Z)] can be performed in 3M+1S by computing
+// (XZ, YZ, XY, Z^2)"
+func (v *ProjectiveGroupElement) ToExtended() *ExtendedGroupElement {
+	var r ExtendedGroupElement
+
+	field.FeMul(&r.X, &v.X, &v.Z)
+	field.FeMul(&r.Y, &v.Y, &v.Z)
+	field.FeMul(&r.T, &v.X, &v.Y)
+	field.FeSquare(&r.Z, &v.Z)
+
+	return &r
+}
+
+func (v *ProjectiveGroupElement) Zero() *ProjectiveGroupElement {
+	field.FeZero(&v.X)
+	field.FeOne(&v.Y)
+	field.FeOne(&v.Z)
+	return v
+}
+
+// Because we are often converting from affine, we can use "mdbl-2008-bbjlp"
+// which assumes Z1=1. We also assume a = -1.
+//
+// Assumptions: Z1 = 1.
+// Cost: 2M + 4S + 1*a + 7add + 1*2.
+// Source: 2008 BernsteinBirknerJoyeLangePeters
+//         http://eprint.iacr.org/2008/013, plus Z1=1, plus standard simplification.
+// Explicit formulas:
+//
+//       B = (X1+Y1)^2
+//       C = X1^2
+//       D = Y1^2
+//       E = a*C
+//       F = E+D
+//       X3 = (B-C-D)*(F-2)
+//       Y3 = F*(E-D)
+//       Z3 = F^2-2*F
+//
+// This assumption is one reason why this package is internal. For instance, it
+// will not hold throughout a Montgomery ladder, when we convert to projective
+// from possibly arbitrary extended coordinates.
+func (v *ProjectiveGroupElement) DoubleZ1() *ProjectiveGroupElement {
+	// TODO This function is inconsistent with the other ones in that it
+	// returns a copy rather than smashing the receiver. It doesn't matter
+	// because it is always called on ephemeral intermediate values, but should
+	// fix.
+	var p, q ProjectiveGroupElement
+	var t0, t1 field.FieldElement
+
+	p = *v
+
+	// C = X1^2, D = Y1^2
+	field.FeSquare(&t0, &p.X)
+	field.FeSquare(&t1, &p.Y)
+
+	// B = (X1+Y1)^2
+	field.FeAdd(&p.Z, &p.X, &p.Y) // Z is irrelevant but already allocated
+	field.FeSquare(&q.X, &p.Z)
+
+	// E = a*C where a = -1
+	field.FeNeg(&q.Z, &t0)
+
+	// F = E + D
+	field.FeAdd(&p.X, &q.Z, &t1)
+
+	// X3 = (B-C-D)*(F-2)
+	field.FeSub(&p.Y, &q.X, &t0)
+	field.FeSub(&p.Y, &p.Y, &t1)
+	field.FeSub(&p.Z, &p.X, &field.FieldTwo)
+	field.FeMul(&q.X, &p.Y, &p.Z)
+
+	// Y3 = F*(E-D)
+	field.FeSub(&p.Y, &q.Z, &t1)
+	field.FeMul(&q.Y, &p.X, &p.Y)
+
+	// Z3 = F^2 - 2*F
+	field.FeSquare(&q.Z, &p.X)
+	field.FeSub(&q.Z, &q.Z, &p.X)
+	field.FeSub(&q.Z, &q.Z, &p.X)
+
+	return &q
+}
--- a/internal/edwards25519/internal/radix51/const.go
+++ b/internal/edwards25519/internal/radix51/const.go
@ -0,0 +1,17 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Constants used in the implementation of GF(2^255-19) field arithmetic.
+package radix51
+
+const (
+	// The value 2^51-1, used in carry propagation
+	maskLow51Bits = uint64(1)<<51 - 1
+)
+
+var (
+	FieldZero FieldElement = [5]uint64{0, 0, 0, 0, 0}
+	FieldOne  FieldElement = [5]uint64{1, 0, 0, 0, 0}
+	FieldTwo  FieldElement = [5]uint64{2, 0, 0, 0, 0}
+)
--- a/internal/edwards25519/internal/radix51/fe.go
+++ b/internal/edwards25519/internal/radix51/fe.go
@ -0,0 +1,332 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Field arithmetic in radix 2^51 representation. This code is a port of the
+// public domain amd64-51-30k version of ed25519 from SUPERCOP.
+package radix51
+
+import (
+	"math/big"
+	"math/bits"
+)
+
+// FieldElement represents an element of the field GF(2^255-19). An element t
+// represents the integer t[0] + t[1]*2^51 + t[2]*2^102 + t[3]*2^153 +
+// t[4]*2^204.
+type FieldElement [5]uint64
+
+func FeZero(v *FieldElement) {
+	v[0] = 0
+	v[1] = 0
+	v[2] = 0
+	v[3] = 0
+	v[4] = 0
+}
+
+func FeOne(v *FieldElement) {
+	v[0] = 1
+	v[1] = 0
+	v[2] = 0
+	v[3] = 0
+	v[4] = 0
+}
+
+// SetInt sets v to the specified small integer x.
+func SetInt(v *FieldElement, x uint64) {
+	v[0] = x
+	v[1] = 0
+	v[2] = 0
+	v[3] = 0
+	v[4] = 0
+}
+
+func FeReduce(t, v *FieldElement) {
+	// Copy v
+	*t = *v
+
+	// Let v = v[0] + v[1]*2^51 + v[2]*2^102 + v[3]*2^153 + v[4]*2^204
+	// Reduce each limb below 2^51, propagating carries.
+	t[1] += t[0] >> 51
+	t[0] = t[0] & maskLow51Bits
+	t[2] += t[1] >> 51
+	t[1] = t[1] & maskLow51Bits
+	t[3] += t[2] >> 51
+	t[2] = t[2] & maskLow51Bits
+	t[4] += t[3] >> 51
+	t[3] = t[3] & maskLow51Bits
+	t[0] += (t[4] >> 51) * 19
+	t[4] = t[4] & maskLow51Bits
+
+	// We now have a field element t < 2^255, but need t < 2^255-19
+	// TODO Document why this works. It's the elaborate comment about r = h-pq etc etc.
+
+	// Get the carry bit
+	c := (t[0] + 19) >> 51
+	c = (t[1] + c) >> 51
+	c = (t[2] + c) >> 51
+	c = (t[3] + c) >> 51
+	c = (t[4] + c) >> 51
+
+	t[0] += 19 * c
+
+	t[1] += t[0] >> 51
+	t[0] = t[0] & maskLow51Bits
+	t[2] += t[1] >> 51
+	t[1] = t[1] & maskLow51Bits
+	t[3] += t[2] >> 51
+	t[2] = t[2] & maskLow51Bits
+	t[4] += t[3] >> 51
+	t[3] = t[3] & maskLow51Bits
+	// no additional carry
+	t[4] = t[4] & maskLow51Bits
+}
+
+// FeAdd sets out = a + b. Long sequences of additions without reduction that
+// let coefficients grow larger than 54 bits would be a problem. Paper
+// cautions: "do not have such sequences of additions".
+func FeAdd(out, a, b *FieldElement) {
+	out[0] = a[0] + b[0]
+	out[1] = a[1] + b[1]
+	out[2] = a[2] + b[2]
+	out[3] = a[3] + b[3]
+	out[4] = a[4] + b[4]
+}
+
+// FeSub sets out = a - b
+func FeSub(out, a, b *FieldElement) {
+	var t FieldElement
+	t = *b
+
+	// Reduce each limb below 2^51, propagating carries. Ensures that results
+	// fit within the limbs. This would not be required for reduced input.
+	t[1] += t[0] >> 51
+	t[0] = t[0] & maskLow51Bits
+	t[2] += t[1] >> 51
+	t[1] = t[1] & maskLow51Bits
+	t[3] += t[2] >> 51
+	t[2] = t[2] & maskLow51Bits
+	t[4] += t[3] >> 51
+	t[3] = t[3] & maskLow51Bits
+	t[0] += (t[4] >> 51) * 19
+	t[4] = t[4] & maskLow51Bits
+
+	// This is slightly more complicated. Because we use unsigned coefficients, we
+	// first add a multiple of p and then subtract.
+	out[0] = (a[0] + 0xFFFFFFFFFFFDA) - t[0]
+	out[1] = (a[1] + 0xFFFFFFFFFFFFE) - t[1]
+	out[2] = (a[2] + 0xFFFFFFFFFFFFE) - t[2]
+	out[3] = (a[3] + 0xFFFFFFFFFFFFE) - t[3]
+	out[4] = (a[4] + 0xFFFFFFFFFFFFE) - t[4]
+}
+
+// FeNeg sets out = -a
+func FeNeg(out, a *FieldElement) {
+	var t FieldElement
+	FeZero(&t)
+	FeSub(out, &t, a)
+}
+
+// FeInvert sets out = 1/z mod p by calculating z^(p-2), p-2 = 2^255 - 21.
+func FeInvert(out, z *FieldElement) {
+	// Inversion is implemented as exponentiation with exponent p − 2. It uses the
+	// same sequence of 254 squarings and 11 multiplications as [Curve25519].
+	var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t FieldElement
+
+	FeSquare(&z2, z)        // 2
+	FeSquare(&t, &z2)       // 4
+	FeSquare(&t, &t)        // 8
+	FeMul(&z9, &t, z)       // 9
+	FeMul(&z11, &z9, &z2)   // 11
+	FeSquare(&t, &z11)      // 22
+	FeMul(&z2_5_0, &t, &z9) // 2^5 - 2^0 = 31
+
+	FeSquare(&t, &z2_5_0) // 2^6 - 2^1
+	for i := 0; i < 4; i++ {
+		FeSquare(&t, &t) // 2^10 - 2^5
+	}
+	FeMul(&z2_10_0, &t, &z2_5_0) // 2^10 - 2^0
+
+	FeSquare(&t, &z2_10_0) // 2^11 - 2^1
+	for i := 0; i < 9; i++ {
+		FeSquare(&t, &t) // 2^20 - 2^10
+	}
+	FeMul(&z2_20_0, &t, &z2_10_0) // 2^20 - 2^0
+
+	FeSquare(&t, &z2_20_0) // 2^21 - 2^1
+	for i := 0; i < 19; i++ {
+		FeSquare(&t, &t) // 2^40 - 2^20
+	}
+	FeMul(&t, &t, &z2_20_0) // 2^40 - 2^0
+
+	FeSquare(&t, &t) // 2^41 - 2^1
+	for i := 0; i < 9; i++ {
+		FeSquare(&t, &t) // 2^50 - 2^10
+	}
+	FeMul(&z2_50_0, &t, &z2_10_0) // 2^50 - 2^0
+
+	FeSquare(&t, &z2_50_0) // 2^51 - 2^1
+	for i := 0; i < 49; i++ {
+		FeSquare(&t, &t) // 2^100 - 2^50
+	}
+	FeMul(&z2_100_0, &t, &z2_50_0) // 2^100 - 2^0
+
+	FeSquare(&t, &z2_100_0) // 2^101 - 2^1
+	for i := 0; i < 99; i++ {
+		FeSquare(&t, &t) // 2^200 - 2^100
+	}
+	FeMul(&t, &t, &z2_100_0) // 2^200 - 2^0
+
+	FeSquare(&t, &t) // 2^201 - 2^1
+	for i := 0; i < 49; i++ {
+		FeSquare(&t, &t) // 2^250 - 2^50
+	}
+	FeMul(&t, &t, &z2_50_0) // 2^250 - 2^0
+
+	FeSquare(&t, &t) // 2^251 - 2^1
+	FeSquare(&t, &t) // 2^252 - 2^2
+	FeSquare(&t, &t) // 2^253 - 2^3
+	FeSquare(&t, &t) // 2^254 - 2^4
+	FeSquare(&t, &t) // 2^255 - 2^5
+
+	FeMul(out, &t, &z11) // 2^255 - 21
+}
+
+func FeCopy(out, in *FieldElement) {
+	copy(out[:], in[:])
+}
+
+func FeFromBytes(v *FieldElement, x *[32]byte) {
+	v[0] = uint64(x[0])
+	v[0] |= uint64(x[1]) << 8
+	v[0] |= uint64(x[2]) << 16
+	v[0] |= uint64(x[3]) << 24
+	v[0] |= uint64(x[4]) << 32
+	v[0] |= uint64(x[5]) << 40
+	v[0] |= uint64(x[6]&7) << 48
+
+	v[1] = uint64(x[6]) >> 3
+	v[1] |= uint64(x[7]) << 5
+	v[1] |= uint64(x[8]) << 13
+	v[1] |= uint64(x[9]) << 21
+	v[1] |= uint64(x[10]) << 29
+	v[1] |= uint64(x[11]) << 37
+	v[1] |= uint64(x[12]&63) << 45
+
+	v[2] = uint64(x[12]) >> 6
+	v[2] |= uint64(x[13]) << 2
+	v[2] |= uint64(x[14]) << 10
+	v[2] |= uint64(x[15]) << 18
+	v[2] |= uint64(x[16]) << 26
+	v[2] |= uint64(x[17]) << 34
+	v[2] |= uint64(x[18]) << 42
+	v[2] |= uint64(x[19]&1) << 50
+
+	v[3] = uint64(x[19]) >> 1
+	v[3] |= uint64(x[20]) << 7
+	v[3] |= uint64(x[21]) << 15
+	v[3] |= uint64(x[22]) << 23
+	v[3] |= uint64(x[23]) << 31
+	v[3] |= uint64(x[24]) << 39
+	v[3] |= uint64(x[25]&15) << 47
+
+	v[4] = uint64(x[25]) >> 4
+	v[4] |= uint64(x[26]) << 4
+	v[4] |= uint64(x[27]) << 12
+	v[4] |= uint64(x[28]) << 20
+	v[4] |= uint64(x[29]) << 28
+	v[4] |= uint64(x[30]) << 36
+	v[4] |= uint64(x[31]&127) << 44
+}
+
+func FeToBytes(r *[32]byte, v *FieldElement) {
+	var t FieldElement
+	FeReduce(&t, v)
+
+	r[0] = byte(t[0] & 0xff)
+	r[1] = byte((t[0] >> 8) & 0xff)
+	r[2] = byte((t[0] >> 16) & 0xff)
+	r[3] = byte((t[0] >> 24) & 0xff)
+	r[4] = byte((t[0] >> 32) & 0xff)
+	r[5] = byte((t[0] >> 40) & 0xff)
+	r[6] = byte((t[0] >> 48))
+
+	r[6] ^= byte((t[1] << 3) & 0xf8)
+	r[7] = byte((t[1] >> 5) & 0xff)
+	r[8] = byte((t[1] >> 13) & 0xff)
+	r[9] = byte((t[1] >> 21) & 0xff)
+	r[10] = byte((t[1] >> 29) & 0xff)
+	r[11] = byte((t[1] >> 37) & 0xff)
+	r[12] = byte((t[1] >> 45))
+
+	r[12] ^= byte((t[2] << 6) & 0xc0)
+	r[13] = byte((t[2] >> 2) & 0xff)
+	r[14] = byte((t[2] >> 10) & 0xff)
+	r[15] = byte((t[2] >> 18) & 0xff)
+	r[16] = byte((t[2] >> 26) & 0xff)
+	r[17] = byte((t[2] >> 34) & 0xff)
+	r[18] = byte((t[2] >> 42) & 0xff)
+	r[19] = byte((t[2] >> 50))
+
+	r[19] ^= byte((t[3] << 1) & 0xfe)
+	r[20] = byte((t[3] >> 7) & 0xff)
+	r[21] = byte((t[3] >> 15) & 0xff)
+	r[22] = byte((t[3] >> 23) & 0xff)
+	r[23] = byte((t[3] >> 31) & 0xff)
+	r[24] = byte((t[3] >> 39) & 0xff)
+	r[25] = byte((t[3] >> 47))
+
+	r[25] ^= byte((t[4] << 4) & 0xf0)
+	r[26] = byte((t[4] >> 4) & 0xff)
+	r[27] = byte((t[4] >> 12) & 0xff)
+	r[28] = byte((t[4] >> 20) & 0xff)
+	r[29] = byte((t[4] >> 28) & 0xff)
+	r[30] = byte((t[4] >> 36) & 0xff)
+	r[31] = byte((t[4] >> 44))
+}
+
+func FeFromBig(h *FieldElement, num *big.Int) {
+	var buf [32]byte
+
+	offset := 0
+	words := num.Bits()
+	numWords := len(words)
+
+	for n := 0; n < numWords; n++ {
+		word := words[n]
+		for i := 0; i < bits.UintSize/8; i++ {
+			if offset >= len(buf) {
+				break
+			}
+			buf[offset] = byte(word >> uint((i << 3)))
+			offset++
+		}
+	}
+
+	FeFromBytes(h, &buf)
+}
+
+func FeToBig(h *FieldElement) *big.Int {
+	var buf [32]byte
+	FeToBytes(&buf, h) // does a reduction
+
+	numWords := 256 / bits.UintSize
+	words := make([]big.Word, numWords)
+
+	offset := 0
+	byteSize := uint(bits.UintSize >> 3)
+	for n := 0; n < numWords; n++ {
+		word := uint(0)
+		for i := uint(0); i < byteSize; i++ {
+			if offset >= len(buf) {
+				break
+			}
+			word |= uint(buf[offset]) << (i << 3)
+			offset++
+		}
+		words[n] = big.Word(word)
+	}
+
+	out := new(big.Int)
+	return out.SetBits(words)
+}
--- a/internal/edwards25519/internal/radix51/fe_mul.go
+++ b/internal/edwards25519/internal/radix51/fe_mul.go
@ -0,0 +1,126 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 noasm
+
+package radix51
+
+// FeMul sets out = x * y
+func FeMul(out, x, y *FieldElement) {
+	var x0, x1, x2, x3, x4 uint64
+	var y0, y1, y2, y3, y4 uint64
+
+	x0 = x[0]
+	x1 = x[1]
+	x2 = x[2]
+	x3 = x[3]
+	x4 = x[4]
+
+	y0 = y[0]
+	y1 = y[1]
+	y2 = y[2]
+	y3 = y[3]
+	y4 = y[4]
+
+	// Reduction can be carried out simultaneously to multiplication. For
+	// example, we do not compute a coefficient r_5 . Whenever the result of a
+	// mul instruction belongs to r_5 , for example in the multiplication of
+	// x_3*y_2 , we multiply one of the inputs by 19 and add the result to r_0.
+
+	x1_19 := x1 * 19
+	x2_19 := x2 * 19
+	x3_19 := x3 * 19
+	x4_19 := x4 * 19
+
+	// calculate r0 = x0*y0 + 19*(x1*y4 + x2*y3 + x3*y2 + x4*y1)
+	r00, r01 := mul64x64(0, 0, x0, y0)
+	r00, r01 = mul64x64(r00, r01, x1_19, y4)
+	r00, r01 = mul64x64(r00, r01, x2_19, y3)
+	r00, r01 = mul64x64(r00, r01, x3_19, y2)
+	r00, r01 = mul64x64(r00, r01, x4_19, y1)
+
+	// calculate r1 = x0*y1 + x1*y0 + 19*(x2*y4 + x3*y3 + x4*y2)
+	r10, r11 := mul64x64(0, 0, x0, y1)
+	r10, r11 = mul64x64(r10, r11, x1, y0)
+	r10, r11 = mul64x64(r10, r11, x2_19, y4)
+	r10, r11 = mul64x64(r10, r11, x3_19, y3)
+	r10, r11 = mul64x64(r10, r11, x4_19, y2)
+
+	// calculate r2 = x0*y2 + x1*y1 + x2*y0 + 19*(x3*y4 + x4*y3)
+	r20, r21 := mul64x64(0, 0, x0, y2)
+	r20, r21 = mul64x64(r20, r21, x1, y1)
+	r20, r21 = mul64x64(r20, r21, x2, y0)
+	r20, r21 = mul64x64(r20, r21, x3_19, y4)
+	r20, r21 = mul64x64(r20, r21, x4_19, y3)
+
+	// calculate r3 = x0*y3 + x1*y2 + x2*y1 + x3*y0 + 19*x4*y4
+	r30, r31 := mul64x64(0, 0, x0, y3)
+	r30, r31 = mul64x64(r30, r31, x1, y2)
+	r30, r31 = mul64x64(r30, r31, x2, y1)
+	r30, r31 = mul64x64(r30, r31, x3, y0)
+	r30, r31 = mul64x64(r30, r31, x4_19, y4)
+
+	// calculate r4 = x0*y4 + x1*y3 + x2*y2 + x3*y1 + x4*y0
+	r40, r41 := mul64x64(0, 0, x0, y4)
+	r40, r41 = mul64x64(r40, r41, x1, y3)
+	r40, r41 = mul64x64(r40, r41, x2, y2)
+	r40, r41 = mul64x64(r40, r41, x3, y1)
+	r40, r41 = mul64x64(r40, r41, x4, y0)
+
+	// After the multiplication we need to reduce (carry) the 5 coefficients to
+	// obtain a result with coefficients that are at most slightly larger than
+	// 2^51 . Denote the two registers holding coefficient r_0 as r_00 and r_01
+	// with r_0 = 2^64*r_01 + r_00 . Similarly denote the two registers holding
+	// coefficient r_1 as r_10 and r_11 . We first shift r_01 left by 13, while
+	// shifting in the most significant bits of r_00 (shld instruction) and
+	// then compute the logical and of r_00 with 2^51 − 1. We do the same with
+	// r_10 and r_11 and add r_01 into r_10 after the logical and with 2^51 −
+	// 1. We proceed this way for coefficients r_2,...,r_4; register r_41 is
+	// multiplied by 19 before adding it to r_00 .
+
+	r01 = (r01 << 13) | (r00 >> 51)
+	r00 &= maskLow51Bits
+
+	r11 = (r11 << 13) | (r10 >> 51)
+	r10 &= maskLow51Bits
+	r10 += r01
+
+	r21 = (r21 << 13) | (r20 >> 51)
+	r20 &= maskLow51Bits
+	r20 += r11
+
+	r31 = (r31 << 13) | (r30 >> 51)
+	r30 &= maskLow51Bits
+	r30 += r21
+
+	r41 = (r41 << 13) | (r40 >> 51)
+	r40 &= maskLow51Bits
+	r40 += r31
+
+	r41 *= 19
+	r00 += r41
+
+	// Now all 5 coefficients fit into 64-bit registers but are still too large
+	// to be used as input to another multiplication. We therefore carry from
+	// r_0 to r_1 , from r_1 to r_2 , from r_2 to r_3 , from r_3 to r_4 , and
+	// finally from r_4 to r_0 . Each of these carries is done as one copy, one
+	// right shift by 51, one logical and with 2^51 − 1, and one addition.
+
+	r10 += r00 >> 51
+	r00 &= maskLow51Bits
+	r20 += r10 >> 51
+	r10 &= maskLow51Bits
+	r30 += r20 >> 51
+	r20 &= maskLow51Bits
+	r40 += r30 >> 51
+	r30 &= maskLow51Bits
+	r00 += (r40 >> 51) * 19
+	r40 &= maskLow51Bits
+
+	out[0] = r00
+	out[1] = r10
+	out[2] = r20
+	out[3] = r30
+	out[4] = r40
+}
--- a/internal/edwards25519/internal/radix51/fe_mul_amd64.go
+++ b/internal/edwards25519/internal/radix51/fe_mul_amd64.go
@ -0,0 +1,10 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!noasm
+
+package radix51
+
+//go:noescape
+func FeMul(out, a, b *FieldElement) // sets out = a * b; body is in fe_mul_amd64.s
--- a/internal/edwards25519/internal/radix51/fe_mul_amd64.s
+++ b/internal/edwards25519/internal/radix51/fe_mul_amd64.s
@ -0,0 +1,202 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Based on assembly generated by PeachPy. Equivalent to the Go in fe_mul.go,
+// which was originally based on the amd64-51-30k assembly in SUPERCOP.
+
+// +build amd64,!noasm
+
+// func FeMul(outp *uint64, xp *uint64, yp *uint64)
+TEXT ·FeMul(SB),$0-24
+	MOVQ outp+0(FP), DI
+	MOVQ xp+8(FP), BX
+	MOVQ yp+16(FP), CX
+
+	// Calculate r0
+	MOVQ 0(BX), AX     // rax <-- x0
+	MULQ 0(CX)         // rdx, rax <-- x0*y0
+	MOVQ AX, SI        // r00 = rax
+	MOVQ DX, BP        // r01 = rdx
+
+	MOVQ 8(BX), DX     // rdx <-- x1
+	IMUL3Q $19, DX, AX // rax <-- x1*19
+	MULQ 32(CX)        // rdx, rax <-- x1_19*y4
+	ADDQ AX, SI        // r00 += rax
+	ADCQ DX, BP        // r01 += rdx
+
+	MOVQ 16(BX), DX    // rdx <-- x2
+	IMUL3Q $19, DX, AX // rax <-- x2*19
+	MULQ 24(CX)        // rdx, rax <-- x2_19*y3
+	ADDQ AX, SI        // r00 += rax
+	ADCQ DX, BP        // r01 += rdx
+
+	MOVQ 24(BX), DX    // rdx <-- x3
+	IMUL3Q $19, DX, AX // rax <-- x3*19
+	MULQ 16(CX)        // rdx, rax <-- x3_19 * y2
+	ADDQ AX, SI        // r00 += rax
+	ADCQ DX, BP        // r01 += rdx
+
+	MOVQ 32(BX), DX    // rdx <-- x4
+	IMUL3Q $19, DX, AX // rax <-- x4*19
+	MULQ 8(CX)         // rdx rax <-- x4_19*y1
+	ADDQ AX, SI        // r00 += rax
+	ADCQ DX, BP        // r01 += rdx
+
+	// Calculate r1
+	MOVQ 0(BX), AX
+	MULQ 8(CX)
+	MOVQ AX, R8 // r10
+	MOVQ DX, R9 // r11
+
+	MOVQ 8(BX), AX
+	MULQ 0(CX)
+	ADDQ AX, R8
+	ADCQ DX, R9
+
+	MOVQ 16(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 32(CX)
+	ADDQ AX, R8
+	ADCQ DX, R9
+
+	MOVQ 24(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 24(CX)
+	ADDQ AX, R8
+	ADCQ DX, R9
+
+	MOVQ 32(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 16(CX)
+	ADDQ AX, R8
+	ADCQ DX, R9
+
+	// Calculate r2
+	MOVQ 0(BX), AX
+	MULQ 16(CX)
+	MOVQ AX, R10 // r20
+	MOVQ DX, R11 // r21
+
+	MOVQ 8(BX), AX
+	MULQ 8(CX)
+	ADDQ AX, R10
+	ADCQ DX, R11
+
+	MOVQ 16(BX), AX
+	MULQ 0(CX)
+	ADDQ AX, R10
+	ADCQ DX, R11
+
+	MOVQ 24(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 32(CX)
+	ADDQ AX, R10
+	ADCQ DX, R11
+
+	MOVQ 32(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 24(CX)
+	ADDQ AX, R10
+	ADCQ DX, R11
+
+	// Calculate r3
+	MOVQ 0(BX), AX
+	MULQ 24(CX)
+	MOVQ AX, R12 // r30
+	MOVQ DX, R13 // r31
+
+	MOVQ 8(BX), AX
+	MULQ 16(CX)
+	ADDQ AX, R12
+	ADCQ DX, R13
+
+	MOVQ 16(BX), AX
+	MULQ 8(CX)
+	ADDQ AX, R12
+	ADCQ DX, R13
+
+	MOVQ 24(BX), AX
+	MULQ 0(CX)
+	ADDQ AX, R12
+	ADCQ DX, R13
+
+	MOVQ 32(BX), DX
+	IMUL3Q $19, DX, AX
+	MULQ 32(CX)
+	ADDQ AX, R12
+	ADCQ DX, R13
+
+	// Calculate r4
+	MOVQ 0(BX), AX
+	MULQ 32(CX)
+	MOVQ AX, R14 // r40
+	MOVQ DX, R15 // r41
+
+	MOVQ 8(BX), AX
+	MULQ 24(CX)
+	ADDQ AX, R14
+	ADCQ DX, R15
+
+	MOVQ 16(BX), AX
+	MULQ 16(CX)
+	ADDQ AX, R14
+	ADCQ DX, R15
+
+	MOVQ 24(BX), AX
+	MULQ 8(CX)
+	ADDQ AX, R14
+	ADCQ DX, R15
+
+	MOVQ 32(BX), AX
+	MULQ 0(CX)
+	ADDQ AX, R14
+	ADCQ DX, R15
+
+
+	MOVQ $2251799813685247, AX // (1<<51) - 1
+	SHLQ $13, SI, BP     // r01 = shld with r00
+	ANDQ AX, SI          // r00 &= mask51
+	SHLQ $13, R8, R9     // r11 = shld with r10
+	ANDQ AX, R8          // r10 &= mask51
+	ADDQ BP, R8          // r10 += r01
+	SHLQ $13, R10, R11   // r21 = shld with r20
+	ANDQ AX, R10         // r20 &= mask51
+	ADDQ R9, R10         // r20 += r11
+	SHLQ $13, R12, R13   // r31 = shld with r30
+	ANDQ AX, R12         // r30 &= mask51
+	ADDQ R11, R12        // r30 += r21
+	SHLQ $13, R14, R15   // r41 = shld with r40
+	ANDQ AX, R14         // r40 &= mask51
+	ADDQ R13, R14        // r40 += r31
+	IMUL3Q $19, R15, R15 // r41 = r41*19
+	ADDQ R15, SI         // r00 += r41
+
+	MOVQ SI, DX          // rdx <-- r00
+	SHRQ $51, DX         // rdx <-- r00 >> 51
+	ADDQ DX, R8          // r10 += r00 >> 51
+	MOVQ R8, DX          // rdx <-- r10
+	SHRQ $51, DX         // rdx <-- r10 >> 51
+	ANDQ AX, SI          // r00 &= mask51
+	ADDQ DX, R10         // r20 += r10 >> 51
+	MOVQ R10, DX         // rdx <-- r20
+	SHRQ $51, DX         // rdx <-- r20 >> 51
+	ANDQ AX, R8          // r10 &= mask51
+	ADDQ DX, R12         // r30 += r20 >> 51
+	MOVQ R12, DX         // rdx <-- r30
+	SHRQ $51, DX         // rdx <-- r30 >> 51
+	ANDQ AX, R10         // r20 &= mask51
+	ADDQ DX, R14         // r40 += r30 >> 51
+	MOVQ R14, DX         // rdx <-- r40
+	SHRQ $51, DX         // rdx <-- r40 >> 51
+	ANDQ AX, R12         // r30 &= mask51
+	IMUL3Q $19, DX, DX   // rdx <-- (r40 >> 51) * 19
+	ADDQ DX, SI          // r00 += (r40 >> 51) *19
+	ANDQ AX, R14         // r40 &= mask51
+
+	MOVQ SI, 0(DI)
+	MOVQ R8, 8(DI)
+	MOVQ R10, 16(DI)
+	MOVQ R12, 24(DI)
+	MOVQ R14, 32(DI)
+	RET
--- a/internal/edwards25519/internal/radix51/fe_square.go
+++ b/internal/edwards25519/internal/radix51/fe_square.go
@ -0,0 +1,98 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 noasm
+
+package radix51
+
+// FeSquare sets out = x*x.
+func FeSquare(out, x *FieldElement) {
+	// A schoolbook product of two 5-limb values needs 25 wide multiplications;
+	// when both operands are equal the cross terms pair up, leaving 15. The
+	// factor 2 of each pair is folded into one operand, together with the
+	// factor 19 wherever the term wraps past the top limb. Carry handling
+	// afterwards is identical to the one used for multiplication.
+
+	x0, x1, x2, x3, x4 := x[0], x[1], x[2], x[3], x[4]
+
+	// Doubled operands for cross terms that do not wrap.
+	x0Dbl := x0 << 1
+	x1Dbl := x1 << 1
+
+	// Operands scaled by 38 = 2*19 for cross terms that wrap.
+	x1By38 := x1 * 38
+	x2By38 := x2 * 38
+	x3By38 := x3 * 38
+
+	// Operands scaled by 19 for squared terms that wrap.
+	x3By19 := x3 * 19
+	x4By19 := x4 * 19
+
+	// r0 = x0*x0 + x1*38*x4 + x2*38*x3
+	r0Lo, r0Hi := mul64x64(0, 0, x0, x0)
+	r0Lo, r0Hi = mul64x64(r0Lo, r0Hi, x1By38, x4)
+	r0Lo, r0Hi = mul64x64(r0Lo, r0Hi, x2By38, x3)
+
+	// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
+	r1Lo, r1Hi := mul64x64(0, 0, x0Dbl, x1)
+	r1Lo, r1Hi = mul64x64(r1Lo, r1Hi, x2By38, x4)
+	r1Lo, r1Hi = mul64x64(r1Lo, r1Hi, x3By19, x3)
+
+	// r2 = x0*2*x2 + x1*x1 + x3*38*x4
+	r2Lo, r2Hi := mul64x64(0, 0, x0Dbl, x2)
+	r2Lo, r2Hi = mul64x64(r2Lo, r2Hi, x1, x1)
+	r2Lo, r2Hi = mul64x64(r2Lo, r2Hi, x3By38, x4)
+
+	// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
+	r3Lo, r3Hi := mul64x64(0, 0, x0Dbl, x3)
+	r3Lo, r3Hi = mul64x64(r3Lo, r3Hi, x1Dbl, x2)
+	r3Lo, r3Hi = mul64x64(r3Lo, r3Hi, x4By19, x4)
+
+	// r4 = x0*2*x4 + x1*2*x3 + x2*x2
+	r4Lo, r4Hi := mul64x64(0, 0, x0Dbl, x4)
+	r4Lo, r4Hi = mul64x64(r4Lo, r4Hi, x1Dbl, x3)
+	r4Lo, r4Hi = mul64x64(r4Lo, r4Hi, x2, x2)
+
+	// Split each 128-bit accumulator at bit 51: everything above the low 51
+	// bits is the carry into the next limb.
+	carry0 := (r0Hi << 13) | (r0Lo >> 51)
+	carry1 := (r1Hi << 13) | (r1Lo >> 51)
+	carry2 := (r2Hi << 13) | (r2Lo >> 51)
+	carry3 := (r3Hi << 13) | (r3Lo >> 51)
+	carry4 := (r4Hi << 13) | (r4Lo >> 51)
+
+	// Keep the low 51 bits of every limb and add the carry of the limb below;
+	// the carry out of the top limb re-enters limb 0 multiplied by 19.
+	l0 := (r0Lo & maskLow51Bits) + carry4*19
+	l1 := (r1Lo & maskLow51Bits) + carry0
+	l2 := (r2Lo & maskLow51Bits) + carry1
+	l3 := (r3Lo & maskLow51Bits) + carry2
+	l4 := (r4Lo & maskLow51Bits) + carry3
+
+	// One more sequential carry pass, again wrapping the top carry with 19.
+	l1 += l0 >> 51
+	l0 &= maskLow51Bits
+	l2 += l1 >> 51
+	l1 &= maskLow51Bits
+	l3 += l2 >> 51
+	l2 &= maskLow51Bits
+	l4 += l3 >> 51
+	l3 &= maskLow51Bits
+	l0 += (l4 >> 51) * 19
+	l4 &= maskLow51Bits
+
+	out[0], out[1], out[2], out[3], out[4] = l0, l1, l2, l3, l4
+}
--- a/internal/edwards25519/internal/radix51/fe_square_amd64.go
+++ b/internal/edwards25519/internal/radix51/fe_square_amd64.go
@ -0,0 +1,10 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!noasm
+
+package radix51
+
+// FeSquare sets out = x*x. The body is implemented in fe_square_amd64.s.
+//
+// The directive below must be written without a space after the slashes;
+// "// go:noescape" is an ordinary comment and is ignored by the compiler.
+//
+//go:noescape
+func FeSquare(out, x *FieldElement)
--- a/internal/edwards25519/internal/radix51/fe_square_amd64.s
+++ b/internal/edwards25519/internal/radix51/fe_square_amd64.s
@ -0,0 +1,150 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!noasm
+
+// func FeSquare(out, x *FieldElement)
+//
+// Sets out = x*x. The argument names used with FP match the Go declaration so
+// that go vet's assembly checks accept them.
+//
+// Register use: DI = out, SI = x. Each limb of the result is accumulated as a
+// 128-bit low:high pair:
+//   r0 = CX:R8, r1 = R9:R10, r2 = R11:R12, r3 = R13:R14, r4 = R15:BX
+// AX and DX are scratch (MULQ leaves its product in DX:AX).
+TEXT ·FeSquare(SB),4,$0-16
+    MOVQ out+0(FP), DI
+    MOVQ x+8(FP), SI
+
+    // r0 = x0*x0 + x1*38*x4 + x2*38*x3
+    MOVQ 0(SI), AX
+    MULQ 0(SI)
+    MOVQ AX, CX // r00
+    MOVQ DX, R8 // r01
+
+    MOVQ 8(SI), DX
+    IMUL3Q $38, DX, AX
+    MULQ 32(SI)
+    ADDQ AX, CX
+    ADCQ DX, R8
+
+    MOVQ 16(SI), DX
+    IMUL3Q $38, DX, AX
+    MULQ 24(SI)
+    ADDQ AX, CX
+    ADCQ DX, R8
+
+    // r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
+    MOVQ 0(SI), AX
+    SHLQ $1, AX
+    MULQ 8(SI)
+    MOVQ AX, R9  // r10
+    MOVQ DX, R10 // r11
+
+    MOVQ 16(SI), DX
+    IMUL3Q $38, DX, AX
+    MULQ 32(SI)
+    ADDQ AX, R9
+    ADCQ DX, R10
+
+    MOVQ 24(SI), DX
+    IMUL3Q $19, DX, AX
+    MULQ 24(SI)
+    ADDQ AX, R9
+    ADCQ DX, R10
+
+    // r2 = x0*2*x2 + x1*x1 + x3*38*x4
+    MOVQ 0(SI), AX
+    SHLQ $1, AX
+    MULQ 16(SI)
+    MOVQ AX, R11 // r20
+    MOVQ DX, R12 // r21
+
+    MOVQ 8(SI), AX
+    MULQ 8(SI)
+    ADDQ AX, R11
+    ADCQ DX, R12
+
+    MOVQ 24(SI), DX
+    IMUL3Q $38, DX, AX
+    MULQ 32(SI)
+    ADDQ AX, R11
+    ADCQ DX, R12
+
+    // r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
+    MOVQ 0(SI), AX
+    SHLQ $1, AX
+    MULQ 24(SI)
+    MOVQ AX, R13 // r30
+    MOVQ DX, R14 // r31
+
+    MOVQ 8(SI), AX
+    SHLQ $1, AX
+    MULQ 16(SI)
+    ADDQ AX, R13
+    ADCQ DX, R14
+
+    MOVQ 32(SI), DX
+    IMUL3Q $19, DX, AX
+    MULQ 32(SI)
+    ADDQ AX, R13
+    ADCQ DX, R14
+
+    // r4 = x0*2*x4 + x1*2*x3 + x2*x2
+    MOVQ 0(SI), AX
+    SHLQ $1, AX
+    MULQ 32(SI)
+    MOVQ AX, R15 // r40
+    MOVQ DX, BX  // r41
+
+    MOVQ 8(SI), AX
+    SHLQ $1, AX
+    MULQ 24(SI)
+    ADDQ AX, R15
+    ADCQ DX, BX
+
+    MOVQ 16(SI), AX
+    MULQ 16(SI)
+    ADDQ AX, R15
+    ADCQ DX, BX
+
+    // Reduce: split every 128-bit accumulator at bit 51, push the upper part
+    // into the next limb, and wrap the top limb's carry into r00 times 19.
+    MOVQ $2251799813685247, AX // (1<<51) - 1
+    SHLQ $13, CX, R8     // r01 = shld with r00
+    ANDQ AX, CX          // r00 &= mask51
+    SHLQ $13, R9, R10    // r11 = shld with r10
+    ANDQ AX, R9          // r10 &= mask51
+    ADDQ R8, R9          // r10 += r01
+    SHLQ $13, R11, R12   // r21 = shld with r20
+    ANDQ AX, R11         // r20 &= mask51
+    ADDQ R10, R11        // r20 += r11
+    SHLQ $13, R13, R14   // r31 = shld with r30
+    ANDQ AX, R13         // r30 &= mask51
+    ADDQ R12, R13        // r30 += r21
+    SHLQ $13, R15, BX    // r41 = shld with r40
+    ANDQ AX, R15         // r40 &= mask51
+    ADDQ R14, R15        // r40 += r31
+    IMUL3Q $19, BX, DX   // rdx <-- r41*19
+    ADDQ DX, CX          // r00 += r41*19
+
+    // Second pass: propagate the remaining carries limb by limb.
+    MOVQ CX, DX          // rdx <-- r00
+    SHRQ $51, DX         // rdx <-- r00 >> 51
+    ADDQ DX, R9          // r10 += r00 >> 51
+    MOVQ R9, DX          // rdx <-- r10
+    SHRQ $51, DX         // rdx <-- r10 >> 51
+    ANDQ AX, CX          // r00 &= mask51
+    ADDQ DX, R11         // r20 += r10 >> 51
+    MOVQ R11, DX         // rdx <-- r20
+    SHRQ $51, DX         // rdx <-- r20 >> 51
+    ANDQ AX, R9          // r10 &= mask51
+    ADDQ DX, R13         // r30 += r20 >> 51
+    MOVQ R13, DX         // rdx <-- r30
+    SHRQ $51, DX         // rdx <-- r30 >> 51
+    ANDQ AX, R11         // r20 &= mask51
+    ADDQ DX, R15         // r40 += r30 >> 51
+    MOVQ R15, DX         // rdx <-- r40
+    SHRQ $51, DX         // rdx <-- r40 >> 51
+    ANDQ AX, R13         // r30 &= mask51
+    IMUL3Q $19, DX, DX   // rdx <-- (r40 >> 51) * 19
+    ADDQ DX, CX          // r00 += (r40 >> 51) *19
+    ANDQ AX, R15         // r40 &= mask51
+
+    // Store the five limbs to out.
+    MOVQ CX, 0(DI)
+    MOVQ R9, 8(DI)
+    MOVQ R11, 16(DI)
+    MOVQ R13, 24(DI)
+    MOVQ R15, 32(DI)
+    RET
--- a/internal/edwards25519/internal/radix51/fe_test.go
+++ b/internal/edwards25519/internal/radix51/fe_test.go
@ -0,0 +1,179 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package radix51
+
+import (
+	"bytes"
+	"crypto/rand"
+	"io"
+	"testing"
+	"unsafe"
+)
+
+// TestMul64to128 checks mul64x64 against precomputed 128-bit results.
+func TestMul64to128(t *testing.T) {
+	// Small operands: the product fits entirely in the low word.
+	lo, hi := mul64x64(0, 0, uint64(5), uint64(5))
+	if lo != 0x19 || hi != 0 {
+		t.Errorf("lo-range wide mult failed, got %d + %d*(2**64)", lo, hi)
+	}
+
+	// 54-bit operands: the product spills into the high word.
+	maxOp := uint64(18014398509481983) // 2^54 - 1
+	lo, hi = mul64x64(0, 0, maxOp, maxOp)
+	if lo != 0xff80000000000001 || hi != 0xfffffffffff {
+		t.Errorf("hi-range wide mult failed, got %d + %d*(2**64)", lo, hi)
+	}
+
+	// Add the same product five times on top of the running accumulator.
+	a, b := uint64(1125899906842661), uint64(2097155)
+	lo, hi = 0, 0
+	for i := 0; i < 5; i++ {
+		lo, hi = mul64x64(lo, hi, a, b)
+	}
+	if lo != 16888498990613035 || hi != 640 {
+		t.Errorf("wrong answer: %d + %d*(2**64)", lo, hi)
+	}
+}
+
+// BenchmarkWideMultInline times the multiply-accumulate computation of
+// mul64x64 written out by hand inside the loop. Compare with
+// BenchmarkWideMultCall, which goes through the function instead.
+func BenchmarkWideMultInline(t *testing.B) {
+	var r0, r1, ol, oh uint64
+	a := uint64(18014398509481983) // 2^54 - 1
+	b := uint64(18014398509481983) // 2^54 - 1
+
+	for i := 0; i < t.N; i++ {
+		// Same statements as the body of mul64x64, with r0/r1 as the accumulator.
+		t1 := (a>>32)*(b&0xFFFFFFFF) + ((a & 0xFFFFFFFF) * (b & 0xFFFFFFFF) >> 32)
+		t2 := (a&0xFFFFFFFF)*(b>>32) + (t1 & 0xFFFFFFFF)
+		ol = (a * b) + r0
+		cmp := ol < r0
+		oh = r1 + (a>>32)*(b>>32) + t1>>32 + t2>>32 + uint64(*(*byte)(unsafe.Pointer(&cmp)))
+
+		// Feed the result back in as the next iteration's accumulator.
+		r1 = oh
+		r0 = ol
+	}
+}
+
+// BenchmarkWideMultCall times repeated multiply-accumulate steps performed
+// through mul64x64, feeding each result back in as the next accumulator.
+func BenchmarkWideMultCall(t *testing.B) {
+	var lo, hi uint64
+	x := uint64(18014398509481983) // 2^54 - 1
+	y := uint64(18014398509481983) // 2^54 - 1
+
+	for n := 0; n < t.N; n++ {
+		lo, hi = mul64x64(lo, hi, x, y)
+	}
+}
+
+// TestFeFromBytesRoundTrip checks that FeFromBytes and FeToBytes undo each
+// other in both directions.
+func TestFeFromBytesRoundTrip(t *testing.T) {
+	var (
+		encoded     [32]byte
+		fe, decoded FieldElement
+	)
+
+	// bytes -> field element -> bytes
+	input := [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+		18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+
+	FeFromBytes(&fe, &input)
+	FeToBytes(&encoded, &fe)
+
+	if !bytes.Equal(input[:], encoded[:]) {
+		t.Error("Bytes<>FE doesn't roundtrip")
+	}
+
+	// field element -> bytes -> field element, using fixed limb values
+	fe = [5]uint64{
+		0x4e645be9215a2,
+		0x4e9654922df12,
+		0x5829e468b0205,
+		0x5e8fca9e0881c,
+		0x5c490f087d796,
+	}
+
+	FeToBytes(&encoded, &fe)
+	FeFromBytes(&decoded, &encoded)
+
+	for i := range fe {
+		if decoded[i] != fe[i] {
+			t.Error("FE<>Bytes doesn't roundtrip")
+		}
+	}
+}
+
+// TestSanity checks self-consistency between FeMul and FeSquare: x*x must
+// equal the square of x, first for the element whose limbs are all 1 and then
+// for a random field element.
+func TestSanity(t *testing.T) {
+	var x FieldElement
+	var x2, x2sq FieldElement
+	// var x2Go, x2sqGo FieldElement
+
+	x = [5]uint64{1, 1, 1, 1, 1}
+	FeMul(&x2, &x, &x)
+	// FeMulGo(&x2Go, &x, &x)
+	FeSquare(&x2sq, &x)
+	// FeSquareGo(&x2sqGo, &x)
+
+	// if !vartimeEqual(x2, x2Go) || !vartimeEqual(x2sq, x2sqGo) || !vartimeEqual(x2, x2sq) {
+	// 	t.Fatalf("all ones failed\nmul.s: %d\nmul.g: %d\nsqr.s: %d\nsqr.g: %d\n", x2, x2Go, x2sq, x2sqGo)
+	// }
+
+	if !vartimeEqual(x2, x2sq) {
+		t.Fatalf("all ones failed\nmul: %x\nsqr: %x\n", x2, x2sq)
+	}
+
+	// Named buf rather than bytes so the imported bytes package is not shadowed.
+	var buf [32]byte
+
+	_, err := io.ReadFull(rand.Reader, buf[:])
+	if err != nil {
+		t.Fatal(err)
+	}
+	FeFromBytes(&x, &buf)
+
+	FeMul(&x2, &x, &x)
+	// FeMulGo(&x2Go, &x, &x)
+	FeSquare(&x2sq, &x)
+	// FeSquareGo(&x2sqGo, &x)
+
+	// if !vartimeEqual(x2, x2Go) || !vartimeEqual(x2sq, x2sqGo) || !vartimeEqual(x2, x2sq) {
+	// 	t.Fatalf("random field element failed\nfe: %x\n\nmul.s: %x\nmul.g: %x\nsqr.s: %x\nsqr.g: %x\n", x, x2, x2Go, x2sq, x2sqGo)
+	// }
+
+	// The input is random, so print it: without it a failure cannot be
+	// reproduced.
+	if !vartimeEqual(x2, x2sq) {
+		t.Fatalf("random field element failed\nfe: %x\nmul: %x\nsqr: %x\n", x, x2, x2sq)
+	}
+}
+
+// vartimeEqual reports whether x and y have identical limbs. It compares the
+// stored representation directly and makes no attempt to be constant time.
+func vartimeEqual(x, y FieldElement) bool {
+	return x == y
+}
+
+// TestFeInvert checks that x multiplied by FeInvert(x) reduces to one, for the
+// element whose limbs are all 1 and for a random field element.
+func TestFeInvert(t *testing.T) {
+	x := FieldElement{1, 1, 1, 1, 1}
+	one := FieldElement{1, 0, 0, 0, 0}
+	var inverse, product FieldElement
+
+	FeInvert(&inverse, &x)
+	FeMul(&product, &x, &inverse)
+	FeReduce(&product, &product)
+
+	if !vartimeEqual(one, product) {
+		t.Errorf("inversion identity failed, got: %x", product)
+	}
+
+	var seed [32]byte
+
+	if _, err := io.ReadFull(rand.Reader, seed[:]); err != nil {
+		t.Fatal(err)
+	}
+	FeFromBytes(&x, &seed)
+
+	FeInvert(&inverse, &x)
+	FeMul(&product, &x, &inverse)
+	FeReduce(&product, &product)
+
+	if !vartimeEqual(one, product) {
+		t.Errorf("random inversion identity failed, got: %x for field element %x", product, x)
+	}
+}
--- a/internal/edwards25519/internal/radix51/mul.go
+++ b/internal/edwards25519/internal/radix51/mul.go
@ -0,0 +1,18 @@
+// Copyright (c) 2017 George Tankersley. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package radix51
+
+import "unsafe"
+
+// mul64x64 multiplies two 64-bit numbers and adds the 128-bit product to the
+// accumulator given as lo (low word) and hi (high word). It returns the
+// updated low and high words; any carry out of the high word is discarded.
+// This function is written to ensure it inlines. I am so sorry.
+func mul64x64(lo, hi, a, b uint64) (ol uint64, oh uint64) {
+	// t1 and t2 collect the 32x32-bit cross products; their upper halves are
+	// folded into the high word below.
+	t1 := (a>>32)*(b&0xFFFFFFFF) + ((a & 0xFFFFFFFF) * (b & 0xFFFFFFFF) >> 32)
+	t2 := (a&0xFFFFFFFF)*(b>>32) + (t1 & 0xFFFFFFFF)
+	ol = (a * b) + lo
+	// ol < lo holds exactly when the addition above wrapped around.
+	cmp := ol < lo
+	// The bool is read back as a byte through unsafe so the carry can be added
+	// without a branch.
+	// NOTE(review): this assumes true is stored as the byte 1 - confirm.
+	oh = hi + (a>>32)*(b>>32) + t1>>32 + t2>>32 + uint64(*(*byte)(unsafe.Pointer(&cmp)))
+	return
+}
--- a/internal/edwards25519/xcrypto.go
+++ b/internal/edwards25519/xcrypto.go
@ -5,14 +5,16 @@

 package edwards25519

-import x "github.com/gtank/ristretto255/internal/edwards25519/internal/edwards25519"
+import (
+	"github.com/gtank/ristretto255/internal/edwards25519/internal/group"
+	"github.com/gtank/ristretto255/internal/edwards25519/internal/radix51"
+)

-// Expose some types and functions from the x/crypto code to ristretto255.
+// Expose some types and functions from the internal packages to ristretto255.

-type ExtendedGroupElement = x.ExtendedGroupElement
-type FieldElement = x.FieldElement
+// ExtendedGroupElement is an alias for group.ExtendedGroupElement.
+type ExtendedGroupElement = group.ExtendedGroupElement
+// FieldElement is an alias for radix51.FieldElement.
+type FieldElement = radix51.FieldElement

-var FeMul = x.FeMul
-var FeSquare = x.FeSquare
-var FeNeg = x.FeNeg
-var FeIsNegative = x.FeIsNegative
+// FeMul, FeSquare and FeNeg re-export the radix51 identifiers of the same
+// name as package-level variables.
+var FeMul = radix51.FeMul
+var FeSquare = radix51.FeSquare
+var FeNeg = radix51.FeNeg
--- a/ristretto255.go
+++ b/ristretto255.go
@ -31,3 +31,13 @@ func (e *Element) Equal(ee *Element) int {

 	return out
 }
+
+// FromUniformBytes is intended to set e to the Element obtained by mapping
+// the 64 bytes of b uniformly and deterministically, for use in hash-to-group
+// operations or to obtain a random element.
+//
+// It panics if len(b) is not 64.
+//
+// NOTE(review): in this revision only the length check is present; e is not
+// modified.
+func (e *Element) FromUniformBytes(b []byte) {
+	const uniformLen = 64
+
+	if got := len(b); got != uniformLen {
+		panic("ristretto255: FromUniformBytes called with a byte slice of length different than 64")
+	}
+}