// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha3
// spongeDirection indicates the direction bytes are flowing through the sponge.
type spongeDirection int
const (
// spongeAbsorbing indicates that the sponge is absorbing input.
spongeAbsorbing spongeDirection = iota
// spongeSqueezing indicates that the sponge is being squeezed.
spongeSqueezing
)
const (
// maxRate is the maximum size of the internal buffer. SHAKE-128
// currently needs the largest buffer (168 bytes).
maxRate = 168
)
// buf returns the portion of storage currently in use as the input/output buffer.
func (d *State) buf() []byte {
return d.storage.asBytes()[d.bufo:d.bufe]
}
type State struct {
// Generic sponge components.
a [25]uint64 // main state of the hash
rate int // the number of bytes of state to use
bufo int // offset of buffer in storage
bufe int // end of buffer in storage
// dsbyte contains the "domain separation" bits and the first bit of
// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
// SHA-3 and SHAKE functions by appending bitstrings to the message.
// Using a little-endian bit-ordering convention, these are "01" for SHA-3
// and "1111" for SHAKE, or 00000010b and 00001111b, respectively. Then the
// padding rule from section 5.1 is applied to pad the message to a multiple
// of the rate, which involves adding a "1" bit, zero or more "0" bits, and
// a final "1" bit. We merge the first "1" bit from the padding into dsbyte,
// giving 00000110b (0x06) and 00011111b (0x1f).
// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf
// "Draft FIPS 202: SHA-3 Standard: Permutation-Based Hash and
// Extendable-Output Functions (May 2014)"
dsbyte byte
storage storageBuf
// Specific to SHA-3 and SHAKE.
outputLen int // the default output size in bytes
state spongeDirection // whether the sponge is absorbing or squeezing
turbo bool // Whether we're using 12 rounds instead of 24
}
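As a quick sanity check on the derivation above, here is a minimal standalone sketch (illustrative, not part of the package) that reconstructs the two merged dsbyte values:
package main

import "fmt"

func main() {
	// Little-endian bit convention: SHA-3's appended "01" occupies bits 0-1
	// as 0x02; SHAKE's "1111" occupies bits 0-3 as 0x0f. OR-ing in the first
	// "1" bit of the pad10*1 rule yields the merged dsbyte values.
	sha3DS := byte(0x02) | 1<<2  // 0x06
	shakeDS := byte(0x0f) | 1<<4 // 0x1f
	fmt.Printf("%02x %02x\n", sha3DS, shakeDS) // prints: 06 1f
}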
// BlockSize returns the rate of the sponge underlying this hash function.
func (d *State) BlockSize() int { return d.rate }
// Size returns the output size of the hash function in bytes.
func (d *State) Size() int { return d.outputLen }
// Reset clears the internal state by zeroing the sponge state and
// the byte buffer, and setting the state to absorbing.
func (d *State) Reset() {
// Zero the permutation's state.
for i := range d.a {
d.a[i] = 0
}
d.state = spongeAbsorbing
d.bufo = 0
d.bufe = 0
}
// clone returns a copy of the state.
func (d *State) clone() *State {
ret := *d
return &ret
}
// permute applies the KeccakF-1600 permutation. It handles
// any input-output buffering.
func (d *State) permute() {
switch d.state {
case spongeAbsorbing:
// If we're absorbing, we need to xor the input into the state
// before applying the permutation.
xorIn(d, d.buf())
d.bufe = 0
d.bufo = 0
KeccakF1600(&d.a, d.turbo)
case spongeSqueezing:
// If we're squeezing, we need to apply the permutation before
// copying more output.
KeccakF1600(&d.a, d.turbo)
d.bufe = d.rate
d.bufo = 0
copyOut(d, d.buf())
}
}
// padAndPermute appends the domain separation bits in dsbyte, applies
// the multi-bitrate 10..1 padding rule, and permutes the state.
func (d *State) padAndPermute(dsbyte byte) {
// Pad with this instance's domain-separator bits. We know that there's
// at least one byte of space in d.buf() because, if it were full,
// permute would have been called to empty it. dsbyte also contains the
// first one bit for the padding. See the comment in the state struct.
zerosStart := d.bufe + 1
d.bufe = d.rate
buf := d.buf()
buf[zerosStart-1] = dsbyte
for i := zerosStart; i < d.rate; i++ {
buf[i] = 0
}
// This adds the final one bit for the padding. Because of the way that
// bits are numbered from the LSB upwards, the final bit is the MSB of
// the last byte.
buf[d.rate-1] ^= 0x80
// Apply the permutation
d.permute()
d.state = spongeSqueezing
d.bufe = d.rate
copyOut(d, buf)
}
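Concretely, for an empty SHAKE128 message the single padded block built by padAndPermute is 0x1f followed by zeros, with 0x80 in the last byte. A standalone sketch of that layout (illustrative only):
package main

import "fmt"

func main() {
	const rate = 168 // SHAKE128 rate in bytes
	block := make([]byte, rate)
	block[0] = 0x1f       // dsbyte: "1111" plus the first pad bit
	block[rate-1] ^= 0x80 // final "1" bit goes in the MSB of the last byte
	fmt.Printf("% x ... % x\n", block[:2], block[rate-2:]) // 1f 00 ... 00 80
}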
// Write absorbs more data into the hash's state. It panics if more data
// is written after output has been read.
func (d *State) Write(p []byte) (written int, err error) {
if d.state != spongeAbsorbing {
panic("sha3: write to sponge after read")
}
written = len(p)
for len(p) > 0 {
bufl := d.bufe - d.bufo
if bufl == 0 && len(p) >= d.rate {
// The fast path; absorb a full "rate" bytes of input and apply the permutation.
xorIn(d, p[:d.rate])
p = p[d.rate:]
KeccakF1600(&d.a, d.turbo)
} else {
// The slow path; buffer the input until we can fill the sponge, and then xor it in.
todo := d.rate - bufl
if todo > len(p) {
todo = len(p)
}
d.bufe += todo
buf := d.buf()
copy(buf[bufl:], p[:todo])
p = p[todo:]
// If the sponge is full, apply the permutation.
if d.bufe == d.rate {
d.permute()
}
}
}
return written, nil
}
// Read squeezes an arbitrary number of bytes from the sponge.
func (d *State) Read(out []byte) (n int, err error) {
// If we're still absorbing, pad and apply the permutation.
if d.state == spongeAbsorbing {
d.padAndPermute(d.dsbyte)
}
n = len(out)
// Now, do the squeezing.
for len(out) > 0 {
buf := d.buf()
n := copy(out, buf)
d.bufo += n
out = out[n:]
// Apply the permutation if we've squeezed the sponge dry.
if d.bufo == d.bufe {
d.permute()
}
}
return
}
// Sum applies padding to the hash state and then squeezes out the desired
// number of output bytes.
func (d *State) Sum(in []byte) []byte {
// Make a copy of the original hash so that caller can keep writing
// and summing.
dup := d.clone()
hash := make([]byte, dup.outputLen)
_, _ = dup.Read(hash)
return append(in, hash...)
}
// IsAbsorbing reports whether the sponge is still in the absorbing phase.
func (d *State) IsAbsorbing() bool {
return d.state == spongeAbsorbing
}
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !gccgo,!appengine
#include "textflag.h"
// func kimd(function code, chain *[200]byte, src []byte)
TEXT ·kimd(SB), NOFRAME|NOSPLIT, $0-40
MOVD function+0(FP), R0
MOVD chain+8(FP), R1
LMG src+16(FP), R2, R3 // R2=base, R3=len
continue:
WORD $0xB93E0002 // KIMD --, R2
BVS continue // continue if interrupted
MOVD $0, R0 // reset R0 for pre-go1.8 compilers
RET
// func klmd(function code, chain *[200]byte, dst, src []byte)
TEXT ·klmd(SB), NOFRAME|NOSPLIT, $0-64
// TODO: SHAKE support
MOVD function+0(FP), R0
MOVD chain+8(FP), R1
LMG dst+16(FP), R2, R3 // R2=base, R3=len
LMG src+40(FP), R4, R5 // R4=base, R5=len
continue:
WORD $0xB93F0024 // KLMD R2, R4
BVS continue // continue if interrupted
MOVD $0, R0 // reset R0 for pre-go1.8 compilers
RET
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha3
// This file defines the ShakeHash interface, and provides
// functions for creating SHAKE and cSHAKE instances, as well as utility
// functions for hashing bytes to arbitrary-length output.
//
// The SHAKE implementation is based on FIPS PUB 202 [1].
// The cSHAKE implementation is based on NIST SP 800-185 [2].
//
// [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
// [2] https://doi.org/10.6028/NIST.SP.800-185
import (
"io"
)
// ShakeHash defines the interface to hash functions that
// support arbitrary-length output.
type ShakeHash interface {
// Write absorbs more data into the hash's state. It panics if input is
// written to it after output has been read from it.
io.Writer
// Read reads more output from the hash; reading affects the hash's
// state. (ShakeHash.Read is thus very different from Hash.Sum)
// It never returns an error.
io.Reader
// Clone returns a copy of the ShakeHash in its current state.
Clone() ShakeHash
// Reset resets the ShakeHash to its initial state.
Reset()
}
// Constants for configuring the initial SHA-3 state.
const (
dsbyteShake = 0x1f
rate128 = 168
rate256 = 136
)
// Clone returns a copy of the SHAKE context in its current state.
func (d *State) Clone() ShakeHash {
return d.clone()
}
// NewShake128 creates a new SHAKE128 variable-output-length ShakeHash.
// Its generic security strength is 128 bits against all attacks if at
// least 32 bytes of its output are used.
func NewShake128() State {
return State{rate: rate128, dsbyte: dsbyteShake}
}
// NewTurboShake128 creates a new TurboSHAKE128 variable-output-length ShakeHash.
// Its generic security strength is 128 bits against all attacks if at
// least 32 bytes of its output are used.
// D is the domain separation byte and must be between 0x01 and 0x7f inclusive.
func NewTurboShake128(D byte) State {
if D == 0 || D > 0x7f {
panic("turboshake: D out of range")
}
return State{rate: rate128, dsbyte: D, turbo: true}
}
// NewShake256 creates a new SHAKE256 variable-output-length ShakeHash.
// Its generic security strength is 256 bits against all attacks if
// at least 64 bytes of its output are used.
func NewShake256() State {
return State{rate: rate256, dsbyte: dsbyteShake}
}
// NewTurboShake256 creates a new TurboSHAKE256 variable-output-length ShakeHash.
// Its generic security strength is 256 bits against all attacks if
// at least 64 bytes of its output are used.
// D is the domain separation byte and must be between 0x01 and 0x7f inclusive.
func NewTurboShake256(D byte) State {
if D == 0 || D > 0x7f {
panic("turboshake: D out of range")
}
return State{rate: rate256, dsbyte: D, turbo: true}
}
// ShakeSum128 writes an arbitrary-length digest of data into hash.
func ShakeSum128(hash, data []byte) {
h := NewShake128()
_, _ = h.Write(data)
_, _ = h.Read(hash)
}
// ShakeSum256 writes an arbitrary-length digest of data into hash.
func ShakeSum256(hash, data []byte) {
h := NewShake256()
_, _ = h.Write(data)
_, _ = h.Read(hash)
}
// TurboShakeSum128 writes an arbitrary-length digest of data into hash.
func TurboShakeSum128(hash, data []byte, D byte) {
h := NewTurboShake128(D)
_, _ = h.Write(data)
_, _ = h.Read(hash)
}
// TurboShakeSum256 writes an arbitrary-length digest of data into hash.
func TurboShakeSum256(hash, data []byte, D byte) {
h := NewTurboShake256(D)
_, _ = h.Write(data)
_, _ = h.Read(hash)
}
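A hypothetical in-package snippet (the helper name is illustrative) showing the incremental API above: absorbing input in one Write or several gives identical output, and Read may squeeze any length.
package sha3

import "bytes"

// splitWritesAgree illustrates the streaming sponge API: the squeezed
// output does not depend on how the input was chunked across Writes.
func splitWritesAgree() bool {
	a := NewShake128()
	_, _ = a.Write([]byte("part one, part two"))
	one := make([]byte, 64)
	_, _ = a.Read(one)

	b := NewShake128()
	_, _ = b.Write([]byte("part one, "))
	_, _ = b.Write([]byte("part two"))
	two := make([]byte, 64)
	_, _ = b.Read(two)
	return bytes.Equal(one, two) // true
}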
// SwitchDS replaces the domain-separation byte of the state.
func (d *State) SwitchDS(D byte) {
d.dsbyte = D
}
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !386 && !ppc64le) || appengine
// +build !amd64,!386,!ppc64le appengine
package sha3
// A storageBuf is an aligned array of maxRate bytes.
type storageBuf [maxRate]byte
func (b *storageBuf) asBytes() *[maxRate]byte {
return (*[maxRate]byte)(b)
}
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 || appengine) && (!386 || appengine) && (!ppc64le || appengine)
// +build !amd64 appengine
// +build !386 appengine
// +build !ppc64le appengine
package sha3
import "encoding/binary"
// xorIn xors the bytes in buf into the state; it
// makes no non-portable assumptions about memory layout
// or alignment.
func xorIn(d *State, buf []byte) {
n := len(buf) / 8
for i := 0; i < n; i++ {
a := binary.LittleEndian.Uint64(buf)
d.a[i] ^= a
buf = buf[8:]
}
}
// copyOut copies uint64s to a byte buffer.
func copyOut(d *State, b []byte) {
for i := 0; len(b) >= 8; i++ {
binary.LittleEndian.PutUint64(b, d.a[i])
b = b[8:]
}
}
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (amd64 || 386 || ppc64le) && !appengine
// +build amd64 386 ppc64le
// +build !appengine
package sha3
import "unsafe"
// A storageBuf is an aligned array of maxRate bytes.
type storageBuf [maxRate / 8]uint64
func (b *storageBuf) asBytes() *[maxRate]byte {
return (*[maxRate]byte)(unsafe.Pointer(b))
}
// xorIn uses unaligned reads and writes to update d.a to contain d.a
// XOR buf.
func xorIn(d *State, buf []byte) {
n := len(buf)
bw := (*[maxRate / 8]uint64)(unsafe.Pointer(&buf[0]))[: n/8 : n/8]
if n >= 72 {
d.a[0] ^= bw[0]
d.a[1] ^= bw[1]
d.a[2] ^= bw[2]
d.a[3] ^= bw[3]
d.a[4] ^= bw[4]
d.a[5] ^= bw[5]
d.a[6] ^= bw[6]
d.a[7] ^= bw[7]
d.a[8] ^= bw[8]
}
if n >= 104 {
d.a[9] ^= bw[9]
d.a[10] ^= bw[10]
d.a[11] ^= bw[11]
d.a[12] ^= bw[12]
}
if n >= 136 {
d.a[13] ^= bw[13]
d.a[14] ^= bw[14]
d.a[15] ^= bw[15]
d.a[16] ^= bw[16]
}
if n >= 144 {
d.a[17] ^= bw[17]
}
if n >= 168 {
d.a[18] ^= bw[18]
d.a[19] ^= bw[19]
d.a[20] ^= bw[20]
}
}
func copyOut(d *State, buf []byte) {
ab := (*[maxRate]uint8)(unsafe.Pointer(&d.a[0]))
copy(buf, ab[:])
}
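The cut-offs 72, 104, 136, 144, and 168 in xorIn above are exactly the rates of SHA3-512, SHA3-384, SHA3-256 (and SHAKE256), SHA3-224, and SHAKE128, so only as many state words are XORed as the active rate requires. A sketch of where those numbers come from (rate = 200 bytes of state minus the capacity):
package main

import "fmt"

func main() {
	// Keccak-p[1600] has a 200-byte state; rate = 200 - capacity, where the
	// capacity is twice the security level in bytes.
	for _, sec := range []int{64, 48, 32, 28, 16} { // SHA3-512 .. SHAKE128
		fmt.Println(200 - 2*sec) // 72, 104, 136, 144, 168
	}
}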
// Package fp25519 provides prime field arithmetic over GF(2^255-19).
package fp25519
import (
"errors"
"github.com/cloudflare/circl/internal/conv"
)
// Size in bytes of an element.
const Size = 32
// Elt is a prime field element.
type Elt [Size]byte
func (e Elt) String() string { return conv.BytesLe2Hex(e[:]) }
// p is the prime modulus 2^255-19.
var p = Elt{
0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
}
// P returns the prime modulus 2^255-19.
func P() Elt { return p }
// ToBytes stores in b the little-endian byte representation of x.
func ToBytes(b []byte, x *Elt) error {
if len(b) != Size {
return errors.New("wrong size")
}
Modp(x)
copy(b, x[:])
return nil
}
// IsZero returns true if x is equal to 0.
func IsZero(x *Elt) bool { Modp(x); return *x == Elt{} }
// SetOne assigns x=1.
func SetOne(x *Elt) { *x = Elt{}; x[0] = 1 }
// Neg calculates z = -x.
func Neg(z, x *Elt) { Sub(z, &p, x) }
// InvSqrt calculates z = sqrt(x/y) iff x/y is a quadratic residue, which is
// indicated by returning isQR = true. Otherwise, when x/y is a quadratic
// non-residue, z will have an undetermined value and isQR = false.
func InvSqrt(z, x, y *Elt) (isQR bool) {
sqrtMinusOne := &Elt{
0xb0, 0xa0, 0x0e, 0x4a, 0x27, 0x1b, 0xee, 0xc4,
0x78, 0xe4, 0x2f, 0xad, 0x06, 0x18, 0x43, 0x2f,
0xa7, 0xd7, 0xfb, 0x3d, 0x99, 0x00, 0x4d, 0x2b,
0x0b, 0xdf, 0xc1, 0x4f, 0x80, 0x24, 0x83, 0x2b,
}
t0, t1, t2, t3 := &Elt{}, &Elt{}, &Elt{}, &Elt{}
Mul(t0, x, y) // t0 = u*v
Sqr(t1, y) // t1 = v^2
Mul(t2, t0, t1) // t2 = u*v^3
Sqr(t0, t1) // t0 = v^4
Mul(t1, t0, t2) // t1 = u*v^7
var Tab [4]*Elt
Tab[0] = &Elt{}
Tab[1] = &Elt{}
Tab[2] = t3
Tab[3] = t1
*Tab[0] = *t1
Sqr(Tab[0], Tab[0])
Sqr(Tab[1], Tab[0])
Sqr(Tab[1], Tab[1])
Mul(Tab[1], Tab[1], Tab[3])
Mul(Tab[0], Tab[0], Tab[1])
Sqr(Tab[0], Tab[0])
Mul(Tab[0], Tab[0], Tab[1])
Sqr(Tab[1], Tab[0])
for i := 0; i < 4; i++ {
Sqr(Tab[1], Tab[1])
}
Mul(Tab[1], Tab[1], Tab[0])
Sqr(Tab[2], Tab[1])
for i := 0; i < 4; i++ {
Sqr(Tab[2], Tab[2])
}
Mul(Tab[2], Tab[2], Tab[0])
Sqr(Tab[1], Tab[2])
for i := 0; i < 14; i++ {
Sqr(Tab[1], Tab[1])
}
Mul(Tab[1], Tab[1], Tab[2])
Sqr(Tab[2], Tab[1])
for i := 0; i < 29; i++ {
Sqr(Tab[2], Tab[2])
}
Mul(Tab[2], Tab[2], Tab[1])
Sqr(Tab[1], Tab[2])
for i := 0; i < 59; i++ {
Sqr(Tab[1], Tab[1])
}
Mul(Tab[1], Tab[1], Tab[2])
for i := 0; i < 5; i++ {
Sqr(Tab[1], Tab[1])
}
Mul(Tab[1], Tab[1], Tab[0])
Sqr(Tab[2], Tab[1])
for i := 0; i < 124; i++ {
Sqr(Tab[2], Tab[2])
}
Mul(Tab[2], Tab[2], Tab[1])
Sqr(Tab[2], Tab[2])
Sqr(Tab[2], Tab[2])
Mul(Tab[2], Tab[2], Tab[3])
Mul(z, t3, t2) // z = xy^(p+3)/8 = xy^3*(xy^7)^(p-5)/8
// Checking whether y z^2 == x
Sqr(t0, z) // t0 = z^2
Mul(t0, t0, y) // t0 = yz^2
Sub(t1, t0, x) // t1 = t0-u
Add(t2, t0, x) // t2 = t0+u
if IsZero(t1) {
return true
} else if IsZero(t2) {
Mul(z, z, sqrtMinusOne) // z = z*sqrt(-1)
return true
} else {
return false
}
}
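The fixed addition chain above raises x*y^7 to the power (p-5)/8; the underlying identity can be sanity-checked independently with math/big. A minimal sketch, assuming only the standard library:
package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 255), big.NewInt(19))
	x, y := big.NewInt(9), big.NewInt(25) // x/y = (3/5)^2, a quadratic residue
	// Candidate root z = x*y^3 * (x*y^7)^((p-5)/8), as in the comment above.
	e := new(big.Int).Div(new(big.Int).Sub(p, big.NewInt(5)), big.NewInt(8))
	xy3 := new(big.Int).Mul(x, new(big.Int).Exp(y, big.NewInt(3), p))
	xy7 := new(big.Int).Mul(x, new(big.Int).Exp(y, big.NewInt(7), p))
	z := new(big.Int).Mul(xy3, new(big.Int).Exp(xy7, e, p))
	z.Mod(z, p)
	// y*z^2 is congruent to +x or -x; the -x case is repaired by multiplying
	// z by sqrt(-1), exactly as the t1/t2 branches above do.
	yz2 := new(big.Int).Mul(y, new(big.Int).Mul(z, z))
	yz2.Mod(yz2, p)
	plus := yz2.Cmp(new(big.Int).Mod(x, p)) == 0
	minus := yz2.Cmp(new(big.Int).Mod(new(big.Int).Neg(x), p)) == 0
	fmt.Println(plus || minus) // true whenever x/y is a quadratic residue
}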
// Inv calculates z = 1/x mod p.
func Inv(z, x *Elt) {
x0, x1, x2 := &Elt{}, &Elt{}, &Elt{}
Sqr(x1, x)
Sqr(x0, x1)
Sqr(x0, x0)
Mul(x0, x0, x)
Mul(z, x0, x1)
Sqr(x1, z)
Mul(x0, x0, x1)
Sqr(x1, x0)
for i := 0; i < 4; i++ {
Sqr(x1, x1)
}
Mul(x0, x0, x1)
Sqr(x1, x0)
for i := 0; i < 9; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, x0)
Sqr(x2, x1)
for i := 0; i < 19; i++ {
Sqr(x2, x2)
}
Mul(x2, x2, x1)
for i := 0; i < 10; i++ {
Sqr(x2, x2)
}
Mul(x2, x2, x0)
Sqr(x0, x2)
for i := 0; i < 49; i++ {
Sqr(x0, x0)
}
Mul(x0, x0, x2)
Sqr(x1, x0)
for i := 0; i < 99; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, x0)
for i := 0; i < 50; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, x2)
for i := 0; i < 5; i++ {
Sqr(x1, x1)
}
Mul(z, z, x1)
}
// Cmov assigns y to x if n is 1.
func Cmov(x, y *Elt, n uint) { cmov(x, y, n) }
// Cswap interchanges x and y if n is 1.
func Cswap(x, y *Elt, n uint) { cswap(x, y, n) }
// Add calculates z = x+y mod p.
func Add(z, x, y *Elt) { add(z, x, y) }
// Sub calculates z = x-y mod p.
func Sub(z, x, y *Elt) { sub(z, x, y) }
// AddSub calculates (x,y) = (x+y mod p, x-y mod p).
func AddSub(x, y *Elt) { addsub(x, y) }
// Mul calculates z = x*y mod p.
func Mul(z, x, y *Elt) { mul(z, x, y) }
// Sqr calculates z = x^2 mod p.
func Sqr(z, x *Elt) { sqr(z, x) }
// Modp reduces z to the range [0, p-1].
func Modp(z *Elt) { modp(z) }
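To see the API in use, a hedged, self-contained sketch follows; the import path is an assumption based on this repository's layout and may differ:
package main

import (
	"fmt"

	fp "github.com/cloudflare/circl/math/fp25519" // assumed import path
)

func main() {
	var x, y, z fp.Elt
	fp.SetOne(&x)      // x = 1
	fp.Add(&y, &x, &x) // y = 2
	fp.Mul(&z, &y, &y) // z = 4
	fp.Sqr(&z, &z)     // z = 16
	fp.Inv(&z, &z)     // z = 16^-1 mod p
	var out [fp.Size]byte
	if err := fp.ToBytes(out[:], &z); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", out) // little-endian bytes of 16^-1 mod 2^255-19
}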
//go:build amd64 && !purego
// +build amd64,!purego
package fp25519
import (
"golang.org/x/sys/cpu"
)
var hasBmi2Adx = cpu.X86.HasBMI2 && cpu.X86.HasADX
var _ = hasBmi2Adx
func cmov(x, y *Elt, n uint) { cmovAmd64(x, y, n) }
func cswap(x, y *Elt, n uint) { cswapAmd64(x, y, n) }
func add(z, x, y *Elt) { addAmd64(z, x, y) }
func sub(z, x, y *Elt) { subAmd64(z, x, y) }
func addsub(x, y *Elt) { addsubAmd64(x, y) }
func mul(z, x, y *Elt) { mulAmd64(z, x, y) }
func sqr(z, x *Elt) { sqrAmd64(z, x) }
func modp(z *Elt) { modpAmd64(z) }
//go:noescape
func cmovAmd64(x, y *Elt, n uint)
//go:noescape
func cswapAmd64(x, y *Elt, n uint)
//go:noescape
func addAmd64(z, x, y *Elt)
//go:noescape
func subAmd64(z, x, y *Elt)
//go:noescape
func addsubAmd64(x, y *Elt)
//go:noescape
func mulAmd64(z, x, y *Elt)
//go:noescape
func sqrAmd64(z, x *Elt)
//go:noescape
func modpAmd64(z *Elt)
// This code was imported from https://github.com/armfazh/rfc7748_precomputed
// CHECK_BMI2ADX dispatches to the BMI2/ADX path if supported;
// otherwise it falls back to the legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
CMPB ·hasBmi2Adx(SB), $0 \
JE label \
bmi2adx \
RET \
label: \
legacy \
RET
// cselect is a conditional move:
// if b=1: copies y into x;
// if b=0: x keeps its value;
// if b is neither 0 nor 1: behavior is undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
TESTQ b, b \
MOVQ 0+x, AX; MOVQ 0+y, DX; CMOVQNE DX, AX; MOVQ AX, 0+x; \
MOVQ 8+x, AX; MOVQ 8+y, DX; CMOVQNE DX, AX; MOVQ AX, 8+x; \
MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x;
// cswap is a conditional swap:
// if b=1: x,y <- y,x;
// if b=0: x,y keep their values;
// if b is neither 0 nor 1: behavior is undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
TESTQ b, b \
MOVQ 0+x, AX; MOVQ AX, R8; MOVQ 0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 0+x; MOVQ DX, 0+y; \
MOVQ 8+x, AX; MOVQ AX, R8; MOVQ 8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 8+x; MOVQ DX, 8+y; \
MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y;
// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define additionLeg(z,x,y) \
MOVL $38, AX; \
MOVL $0, DX; \
MOVQ 0+x, R8; ADDQ 0+y, R8; \
MOVQ 8+x, R9; ADCQ 8+y, R9; \
MOVQ 16+x, R10; ADCQ 16+y, R10; \
MOVQ 24+x, R11; ADCQ 24+y, R11; \
CMOVQCS AX, DX; \
ADDQ DX, R8; \
ADCQ $0, R9; MOVQ R9, 8+z; \
ADCQ $0, R10; MOVQ R10, 16+z; \
ADCQ $0, R11; MOVQ R11, 24+z; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
ADDQ DX, R8; MOVQ R8, 0+z;
// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov, adx
#define additionAdx(z,x,y) \
MOVL $38, AX; \
XORL DX, DX; \
MOVQ 0+x, R8; ADCXQ 0+y, R8; \
MOVQ 8+x, R9; ADCXQ 8+y, R9; \
MOVQ 16+x, R10; ADCXQ 16+y, R10; \
MOVQ 24+x, R11; ADCXQ 24+y, R11; \
CMOVQCS AX, DX ; \
XORL AX, AX; \
ADCXQ DX, R8; \
ADCXQ AX, R9; MOVQ R9, 8+z; \
ADCXQ AX, R10; MOVQ R10, 16+z; \
ADCXQ AX, R11; MOVQ R11, 24+z; \
MOVL $38, DX; \
CMOVQCS DX, AX; \
ADDQ AX, R8; MOVQ R8, 0+z;
// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define subtraction(z,x,y) \
MOVL $38, AX; \
MOVQ 0+x, R8; SUBQ 0+y, R8; \
MOVQ 8+x, R9; SBBQ 8+y, R9; \
MOVQ 16+x, R10; SBBQ 16+y, R10; \
MOVQ 24+x, R11; SBBQ 24+y, R11; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
SUBQ DX, R8; \
SBBQ $0, R9; MOVQ R9, 8+z; \
SBBQ $0, R10; MOVQ R10, 16+z; \
SBBQ $0, R11; MOVQ R11, 24+z; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
SUBQ DX, R8; MOVQ R8, 0+z;
// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
MOVL $0,R15; \
MOVQ 0+y, DX; XORL AX, AX; \
MULXQ 0+x, AX, R8; MOVQ AX, 0+z; \
MULXQ 8+x, AX, R9; ADCXQ AX, R8; \
MULXQ 16+x, AX, R10; ADCXQ AX, R9; \
MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; \
MOVQ 8+y, DX; XORL AX, AX; \
MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 8+z; \
MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; \
MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; \
MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; \
MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; \
MOVQ 16+y, DX; XORL AX, AX; \
MULXQ 0+x, AX, R8; ADCXQ R12, AX; MOVQ AX, 16+z; \
MULXQ 8+x, AX, R9; ADCXQ R13, R8; ADOXQ AX, R8; \
MULXQ 16+x, AX, R10; ADCXQ R14, R9; ADOXQ AX, R9; \
MULXQ 24+x, AX, R11; ADCXQ R15, R10; ADOXQ AX, R10; \
MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; ADOXQ AX, R11; \
MOVQ 24+y, DX; XORL AX, AX; \
MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 24+z; \
MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; MOVQ R12, 32+z; \
MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; MOVQ R13, 40+z; \
MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; MOVQ R14, 48+z; \
MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; MOVQ R15, 56+z;
// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
MOVQ 0+y, R8; \
MOVQ 0+x, AX; MULQ R8; MOVQ AX, 0+z; MOVQ DX, R15; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R8; \
ADDQ R13, R15; \
ADCQ R14, R10; MOVQ R10, 16+z; \
ADCQ AX, R11; MOVQ R11, 24+z; \
ADCQ $0, DX; MOVQ DX, 32+z; \
MOVQ 8+y, R8; \
MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R8; \
ADDQ R12, R15; MOVQ R15, 8+z; \
ADCQ R13, R9; \
ADCQ R14, R10; \
ADCQ AX, R11; \
ADCQ $0, DX; \
ADCQ 16+z, R9; MOVQ R9, R15; \
ADCQ 24+z, R10; MOVQ R10, 24+z; \
ADCQ 32+z, R11; MOVQ R11, 32+z; \
ADCQ $0, DX; MOVQ DX, 40+z; \
MOVQ 16+y, R8; \
MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R8; \
ADDQ R12, R15; MOVQ R15, 16+z; \
ADCQ R13, R9; \
ADCQ R14, R10; \
ADCQ AX, R11; \
ADCQ $0, DX; \
ADCQ 24+z, R9; MOVQ R9, R15; \
ADCQ 32+z, R10; MOVQ R10, 32+z; \
ADCQ 40+z, R11; MOVQ R11, 40+z; \
ADCQ $0, DX; MOVQ DX, 48+z; \
MOVQ 24+y, R8; \
MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R8; \
ADDQ R12, R15; MOVQ R15, 24+z; \
ADCQ R13, R9; \
ADCQ R14, R10; \
ADCQ AX, R11; \
ADCQ $0, DX; \
ADCQ 32+z, R9; MOVQ R9, 32+z; \
ADCQ 40+z, R10; MOVQ R10, 40+z; \
ADCQ 48+z, R11; MOVQ R11, 48+z; \
ADCQ $0, DX; MOVQ DX, 56+z;
// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
MOVQ 0+x, R8; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, R9; MOVQ DX, R10; /* A[0]*A[1] */ \
MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; /* A[0]*A[2] */ \
MOVQ 24+x, AX; MULQ R8; MOVQ AX, R15; MOVQ DX, R12; /* A[0]*A[3] */ \
MOVQ 24+x, R8; \
MOVQ 8+x, AX; MULQ R8; MOVQ AX, CX; MOVQ DX, R13; /* A[3]*A[1] */ \
MOVQ 16+x, AX; MULQ R8; /* A[3]*A[2] */ \
\
ADDQ R14, R10;\
ADCQ R15, R11; MOVL $0, R15;\
ADCQ CX, R12;\
ADCQ AX, R13;\
ADCQ $0, DX; MOVQ DX, R14;\
MOVQ 8+x, AX; MULQ 16+x;\
\
ADDQ AX, R11;\
ADCQ DX, R12;\
ADCQ $0, R13;\
ADCQ $0, R14;\
ADCQ $0, R15;\
\
SHLQ $1, R14, R15; MOVQ R15, 56+z;\
SHLQ $1, R13, R14; MOVQ R14, 48+z;\
SHLQ $1, R12, R13; MOVQ R13, 40+z;\
SHLQ $1, R11, R12; MOVQ R12, 32+z;\
SHLQ $1, R10, R11; MOVQ R11, 24+z;\
SHLQ $1, R9, R10; MOVQ R10, 16+z;\
SHLQ $1, R9; MOVQ R9, 8+z;\
\
MOVQ 0+x,AX; MULQ AX; MOVQ AX, 0+z; MOVQ DX, R9;\
MOVQ 8+x,AX; MULQ AX; MOVQ AX, R10; MOVQ DX, R11;\
MOVQ 16+x,AX; MULQ AX; MOVQ AX, R12; MOVQ DX, R13;\
MOVQ 24+x,AX; MULQ AX; MOVQ AX, R14; MOVQ DX, R15;\
\
ADDQ 8+z, R9; MOVQ R9, 8+z;\
ADCQ 16+z, R10; MOVQ R10, 16+z;\
ADCQ 24+z, R11; MOVQ R11, 24+z;\
ADCQ 32+z, R12; MOVQ R12, 32+z;\
ADCQ 40+z, R13; MOVQ R13, 40+z;\
ADCQ 48+z, R14; MOVQ R14, 48+z;\
ADCQ 56+z, R15; MOVQ R15, 56+z;
// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
MOVQ 0+x, DX; /* A[0] */ \
MULXQ 8+x, R8, R14; /* A[1]*A[0] */ XORL R15, R15; \
MULXQ 16+x, R9, R10; /* A[2]*A[0] */ ADCXQ R14, R9; \
MULXQ 24+x, AX, CX; /* A[3]*A[0] */ ADCXQ AX, R10; \
MOVQ 24+x, DX; /* A[3] */ \
MULXQ 8+x, R11, R12; /* A[1]*A[3] */ ADCXQ CX, R11; \
MULXQ 16+x, AX, R13; /* A[2]*A[3] */ ADCXQ AX, R12; \
MOVQ 8+x, DX; /* A[1] */ ADCXQ R15, R13; \
MULXQ 16+x, AX, CX; /* A[2]*A[1] */ MOVL $0, R14; \
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADCXQ R15, R14; \
XORL R15, R15; \
ADOXQ AX, R10; ADCXQ R8, R8; \
ADOXQ CX, R11; ADCXQ R9, R9; \
ADOXQ R15, R12; ADCXQ R10, R10; \
ADOXQ R15, R13; ADCXQ R11, R11; \
ADOXQ R15, R14; ADCXQ R12, R12; \
;;;;;;;;;;;;;;; ADCXQ R13, R13; \
;;;;;;;;;;;;;;; ADCXQ R14, R14; \
MOVQ 0+x, DX; MULXQ DX, AX, CX; /* A[0]^2 */ \
;;;;;;;;;;;;;;; MOVQ AX, 0+z; \
ADDQ CX, R8; MOVQ R8, 8+z; \
MOVQ 8+x, DX; MULXQ DX, AX, CX; /* A[1]^2 */ \
ADCQ AX, R9; MOVQ R9, 16+z; \
ADCQ CX, R10; MOVQ R10, 24+z; \
MOVQ 16+x, DX; MULXQ DX, AX, CX; /* A[2]^2 */ \
ADCQ AX, R11; MOVQ R11, 32+z; \
ADCQ CX, R12; MOVQ R12, 40+z; \
MOVQ 24+x, DX; MULXQ DX, AX, CX; /* A[3]^2 */ \
ADCQ AX, R13; MOVQ R13, 48+z; \
ADCQ CX, R14; MOVQ R14, 56+z;
// reduceFromDoubleLeg finds z congruent to x modulo p such that 0 <= z < 2^256
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
/* 38 = 2*19, and 2^256 = 38 mod p */ \
MOVL $38, AX; MULQ 32+x; MOVQ AX, R8; MOVQ DX, R9; /* C*C[4] */ \
MOVL $38, AX; MULQ 40+x; MOVQ AX, R12; MOVQ DX, R10; /* C*C[5] */ \
MOVL $38, AX; MULQ 48+x; MOVQ AX, R13; MOVQ DX, R11; /* C*C[6] */ \
MOVL $38, AX; MULQ 56+x; /* C*C[7] */ \
ADDQ R12, R9; \
ADCQ R13, R10; \
ADCQ AX, R11; \
ADCQ $0, DX; \
ADDQ 0+x, R8; \
ADCQ 8+x, R9; \
ADCQ 16+x, R10; \
ADCQ 24+x, R11; \
ADCQ $0, DX; \
MOVL $38, AX; \
IMULQ AX, DX; /* C*C[4], CF=0, OF=0 */ \
ADDQ DX, R8; \
ADCQ $0, R9; MOVQ R9, 8+z; \
ADCQ $0, R10; MOVQ R10, 16+z; \
ADCQ $0, R11; MOVQ R11, 24+z; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
ADDQ DX, R8; MOVQ R8, 0+z;
// reduceFromDoubleAdx finds z congruent to x modulo p such that 0<z<2^256
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, bmi2, adx
#define reduceFromDoubleAdx(z,x) \
MOVL $38, DX; /* 38 = 2*19, and 2^256 = 38 mod p */ \
MULXQ 32+x, R8, R10; /* C*C[4] */ XORL AX, AX; ADOXQ 0+x, R8; \
MULXQ 40+x, R9, R11; /* C*C[5] */ ADCXQ R10, R9; ADOXQ 8+x, R9; \
MULXQ 48+x, R10, R13; /* C*C[6] */ ADCXQ R11, R10; ADOXQ 16+x, R10; \
MULXQ 56+x, R11, R12; /* C*C[7] */ ADCXQ R13, R11; ADOXQ 24+x, R11; \
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADCXQ AX, R12; ADOXQ AX, R12; \
IMULQ DX, R12; /* C*C[4], CF=0, OF=0 */ \
ADCXQ R12, R8; \
ADCXQ AX, R9; MOVQ R9, 8+z; \
ADCXQ AX, R10; MOVQ R10, 16+z; \
ADCXQ AX, R11; MOVQ R11, 24+z; \
MOVL $0, R12; \
CMOVQCS DX, R12; \
ADDQ R12, R8; MOVQ R8, 0+z;
// addSub calculates two operations: x,y = x+y,x-y
// Uses: AX, DX, R8-R15, FLAGS
#define addSub(x,y) \
MOVL $38, AX; \
XORL DX, DX; \
MOVQ 0+x, R8; MOVQ R8, R12; ADDQ 0+y, R8; \
MOVQ 8+x, R9; MOVQ R9, R13; ADCQ 8+y, R9; \
MOVQ 16+x, R10; MOVQ R10, R14; ADCQ 16+y, R10; \
MOVQ 24+x, R11; MOVQ R11, R15; ADCQ 24+y, R11; \
CMOVQCS AX, DX; \
XORL AX, AX; \
ADDQ DX, R8; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ $0, R11; \
MOVL $38, DX; \
CMOVQCS DX, AX; \
ADDQ AX, R8; \
MOVL $38, AX; \
SUBQ 0+y, R12; \
SBBQ 8+y, R13; \
SBBQ 16+y, R14; \
SBBQ 24+y, R15; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
SUBQ DX, R12; \
SBBQ $0, R13; \
SBBQ $0, R14; \
SBBQ $0, R15; \
MOVL $0, DX; \
CMOVQCS AX, DX; \
SUBQ DX, R12; \
MOVQ R8, 0+x; \
MOVQ R9, 8+x; \
MOVQ R10, 16+x; \
MOVQ R11, 24+x; \
MOVQ R12, 0+y; \
MOVQ R13, 8+y; \
MOVQ R14, 16+y; \
MOVQ R15, 24+y;
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
#include "fp_amd64.h"
// func cmovAmd64(x, y *Elt, n uint)
TEXT ·cmovAmd64(SB),NOSPLIT,$0-24
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
MOVQ n+16(FP), BX
cselect(0(DI),0(SI),BX)
RET
// func cswapAmd64(x, y *Elt, n uint)
TEXT ·cswapAmd64(SB),NOSPLIT,$0-24
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
MOVQ n+16(FP), BX
cswap(0(DI),0(SI),BX)
RET
// func subAmd64(z, x, y *Elt)
TEXT ·subAmd64(SB),NOSPLIT,$0-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
subtraction(0(DI),0(SI),0(BX))
RET
// func addsubAmd64(x, y *Elt)
TEXT ·addsubAmd64(SB),NOSPLIT,$0-16
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
addSub(0(DI),0(SI))
RET
#define addLegacy \
additionLeg(0(DI),0(SI),0(BX))
#define addBmi2Adx \
additionAdx(0(DI),0(SI),0(BX))
#define mulLegacy \
integerMulLeg(0(SP),0(SI),0(BX)) \
reduceFromDoubleLeg(0(DI),0(SP))
#define mulBmi2Adx \
integerMulAdx(0(SP),0(SI),0(BX)) \
reduceFromDoubleAdx(0(DI),0(SP))
#define sqrLegacy \
integerSqrLeg(0(SP),0(SI)) \
reduceFromDoubleLeg(0(DI),0(SP))
#define sqrBmi2Adx \
integerSqrAdx(0(SP),0(SI)) \
reduceFromDoubleAdx(0(DI),0(SP))
// func addAmd64(z, x, y *Elt)
TEXT ·addAmd64(SB),NOSPLIT,$0-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
CHECK_BMI2ADX(LADD, addLegacy, addBmi2Adx)
// func mulAmd64(z, x, y *Elt)
TEXT ·mulAmd64(SB),NOSPLIT,$64-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
CHECK_BMI2ADX(LMUL, mulLegacy, mulBmi2Adx)
// func sqrAmd64(z, x *Elt)
TEXT ·sqrAmd64(SB),NOSPLIT,$64-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
CHECK_BMI2ADX(LSQR, sqrLegacy, sqrBmi2Adx)
// func modpAmd64(z *Elt)
TEXT ·modpAmd64(SB),NOSPLIT,$0-8
MOVQ z+0(FP), DI
MOVQ (DI), R8
MOVQ 8(DI), R9
MOVQ 16(DI), R10
MOVQ 24(DI), R11
MOVL $19, AX
MOVL $38, CX
BTRQ $63, R11 // PUT BIT 255 IN CARRY FLAG AND CLEAR
CMOVLCC AX, CX // C[255] ? 38 : 19
// ADD EITHER 19 OR 38 TO C
ADDQ CX, R8
ADCQ $0, R9
ADCQ $0, R10
ADCQ $0, R11
// TEST FOR BIT 255 AGAIN; ONLY TRIGGERED ON OVERFLOW MODULO 2^255-19
MOVL $0, CX
CMOVLPL AX, CX // C[255] ? 0 : 19
BTRQ $63, R11 // CLEAR BIT 255
// SUBTRACT 19 IF NECESSARY
SUBQ CX, R8
MOVQ R8, (DI)
SBBQ $0, R9
MOVQ R9, 8(DI)
SBBQ $0, R10
MOVQ R10, 16(DI)
SBBQ $0, R11
MOVQ R11, 24(DI)
RET
package fp25519
import (
"encoding/binary"
"math/bits"
)
func cmovGeneric(x, y *Elt, n uint) {
m := -uint64(n & 0x1)
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
x0 = (x0 &^ m) | (y0 & m)
x1 = (x1 &^ m) | (y1 & m)
x2 = (x2 &^ m) | (y2 & m)
x3 = (x3 &^ m) | (y3 & m)
binary.LittleEndian.PutUint64(x[0*8:1*8], x0)
binary.LittleEndian.PutUint64(x[1*8:2*8], x1)
binary.LittleEndian.PutUint64(x[2*8:3*8], x2)
binary.LittleEndian.PutUint64(x[3*8:4*8], x3)
}
func cswapGeneric(x, y *Elt, n uint) {
m := -uint64(n & 0x1)
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
t0 := m & (x0 ^ y0)
t1 := m & (x1 ^ y1)
t2 := m & (x2 ^ y2)
t3 := m & (x3 ^ y3)
x0 ^= t0
x1 ^= t1
x2 ^= t2
x3 ^= t3
y0 ^= t0
y1 ^= t1
y2 ^= t2
y3 ^= t3
binary.LittleEndian.PutUint64(x[0*8:1*8], x0)
binary.LittleEndian.PutUint64(x[1*8:2*8], x1)
binary.LittleEndian.PutUint64(x[2*8:3*8], x2)
binary.LittleEndian.PutUint64(x[3*8:4*8], x3)
binary.LittleEndian.PutUint64(y[0*8:1*8], y0)
binary.LittleEndian.PutUint64(y[1*8:2*8], y1)
binary.LittleEndian.PutUint64(y[2*8:3*8], y2)
binary.LittleEndian.PutUint64(y[3*8:4*8], y3)
}
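Both helpers above hinge on the mask m = -uint64(n & 1), which is all ones when n is odd and zero otherwise. The same pattern in isolation (the helper names below are illustrative, not part of this package):
package main

import "fmt"

// ctSelect64 mirrors cmovGeneric's select: with the mask set, y wins.
func ctSelect64(x, y uint64, n uint) uint64 {
	m := -uint64(n & 1) // all ones if n&1 == 1, zero otherwise
	return (x &^ m) | (y & m)
}

// ctSwap64 mirrors cswapGeneric's swap via a masked XOR; no
// secret-dependent branch is executed in either case.
func ctSwap64(x, y *uint64, n uint) {
	m := -uint64(n & 1)
	t := m & (*x ^ *y)
	*x ^= t
	*y ^= t
}

func main() {
	a, b := uint64(1), uint64(2)
	fmt.Println(ctSelect64(a, b, 1)) // 2
	ctSwap64(&a, &b, 1)
	fmt.Println(a, b) // 2 1
}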
func addGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
z0, c0 := bits.Add64(x0, y0, 0)
z1, c1 := bits.Add64(x1, y1, c0)
z2, c2 := bits.Add64(x2, y2, c1)
z3, c3 := bits.Add64(x3, y3, c2)
z0, c0 = bits.Add64(z0, (-c3)&38, 0)
z1, c1 = bits.Add64(z1, 0, c0)
z2, c2 = bits.Add64(z2, 0, c1)
z3, c3 = bits.Add64(z3, 0, c2)
z0, _ = bits.Add64(z0, (-c3)&38, 0)
binary.LittleEndian.PutUint64(z[0*8:1*8], z0)
binary.LittleEndian.PutUint64(z[1*8:2*8], z1)
binary.LittleEndian.PutUint64(z[2*8:3*8], z2)
binary.LittleEndian.PutUint64(z[3*8:4*8], z3)
}
func subGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
z0, c0 := bits.Sub64(x0, y0, 0)
z1, c1 := bits.Sub64(x1, y1, c0)
z2, c2 := bits.Sub64(x2, y2, c1)
z3, c3 := bits.Sub64(x3, y3, c2)
z0, c0 = bits.Sub64(z0, (-c3)&38, 0)
z1, c1 = bits.Sub64(z1, 0, c0)
z2, c2 = bits.Sub64(z2, 0, c1)
z3, c3 = bits.Sub64(z3, 0, c2)
z0, _ = bits.Sub64(z0, (-c3)&38, 0)
binary.LittleEndian.PutUint64(z[0*8:1*8], z0)
binary.LittleEndian.PutUint64(z[1*8:2*8], z1)
binary.LittleEndian.PutUint64(z[2*8:3*8], z2)
binary.LittleEndian.PutUint64(z[3*8:4*8], z3)
}
func addsubGeneric(x, y *Elt) {
z := &Elt{}
addGeneric(z, x, y)
subGeneric(y, x, y)
*x = *z
}
func mulGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
yi := y0
h0, l0 := bits.Mul64(x0, yi)
h1, l1 := bits.Mul64(x1, yi)
h2, l2 := bits.Mul64(x2, yi)
h3, l3 := bits.Mul64(x3, yi)
z0 := l0
a0, c0 := bits.Add64(h0, l1, 0)
a1, c1 := bits.Add64(h1, l2, c0)
a2, c2 := bits.Add64(h2, l3, c1)
a3, _ := bits.Add64(h3, 0, c2)
yi = y1
h0, l0 = bits.Mul64(x0, yi)
h1, l1 = bits.Mul64(x1, yi)
h2, l2 = bits.Mul64(x2, yi)
h3, l3 = bits.Mul64(x3, yi)
z1, c0 := bits.Add64(a0, l0, 0)
h0, c1 = bits.Add64(h0, l1, c0)
h1, c2 = bits.Add64(h1, l2, c1)
h2, c3 := bits.Add64(h2, l3, c2)
h3, _ = bits.Add64(h3, 0, c3)
a0, c0 = bits.Add64(a1, h0, 0)
a1, c1 = bits.Add64(a2, h1, c0)
a2, c2 = bits.Add64(a3, h2, c1)
a3, _ = bits.Add64(0, h3, c2)
yi = y2
h0, l0 = bits.Mul64(x0, yi)
h1, l1 = bits.Mul64(x1, yi)
h2, l2 = bits.Mul64(x2, yi)
h3, l3 = bits.Mul64(x3, yi)
z2, c0 := bits.Add64(a0, l0, 0)
h0, c1 = bits.Add64(h0, l1, c0)
h1, c2 = bits.Add64(h1, l2, c1)
h2, c3 = bits.Add64(h2, l3, c2)
h3, _ = bits.Add64(h3, 0, c3)
a0, c0 = bits.Add64(a1, h0, 0)
a1, c1 = bits.Add64(a2, h1, c0)
a2, c2 = bits.Add64(a3, h2, c1)
a3, _ = bits.Add64(0, h3, c2)
yi = y3
h0, l0 = bits.Mul64(x0, yi)
h1, l1 = bits.Mul64(x1, yi)
h2, l2 = bits.Mul64(x2, yi)
h3, l3 = bits.Mul64(x3, yi)
z3, c0 := bits.Add64(a0, l0, 0)
h0, c1 = bits.Add64(h0, l1, c0)
h1, c2 = bits.Add64(h1, l2, c1)
h2, c3 = bits.Add64(h2, l3, c2)
h3, _ = bits.Add64(h3, 0, c3)
z4, c0 := bits.Add64(a1, h0, 0)
z5, c1 := bits.Add64(a2, h1, c0)
z6, c2 := bits.Add64(a3, h2, c1)
z7, _ := bits.Add64(0, h3, c2)
red64(z, z0, z1, z2, z3, z4, z5, z6, z7)
}
func sqrGeneric(z, x *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
h0, a0 := bits.Mul64(x0, x1)
h1, l1 := bits.Mul64(x0, x2)
h2, l2 := bits.Mul64(x0, x3)
h3, l3 := bits.Mul64(x3, x1)
h4, l4 := bits.Mul64(x3, x2)
h, l := bits.Mul64(x1, x2)
a1, c0 := bits.Add64(l1, h0, 0)
a2, c1 := bits.Add64(l2, h1, c0)
a3, c2 := bits.Add64(l3, h2, c1)
a4, c3 := bits.Add64(l4, h3, c2)
a5, _ := bits.Add64(h4, 0, c3)
a2, c0 = bits.Add64(a2, l, 0)
a3, c1 = bits.Add64(a3, h, c0)
a4, c2 = bits.Add64(a4, 0, c1)
a5, c3 = bits.Add64(a5, 0, c2)
a6, _ := bits.Add64(0, 0, c3)
a0, c0 = bits.Add64(a0, a0, 0)
a1, c1 = bits.Add64(a1, a1, c0)
a2, c2 = bits.Add64(a2, a2, c1)
a3, c3 = bits.Add64(a3, a3, c2)
a4, c4 := bits.Add64(a4, a4, c3)
a5, c5 := bits.Add64(a5, a5, c4)
a6, _ = bits.Add64(a6, a6, c5)
b1, b0 := bits.Mul64(x0, x0)
b3, b2 := bits.Mul64(x1, x1)
b5, b4 := bits.Mul64(x2, x2)
b7, b6 := bits.Mul64(x3, x3)
b1, c0 = bits.Add64(b1, a0, 0)
b2, c1 = bits.Add64(b2, a1, c0)
b3, c2 = bits.Add64(b3, a2, c1)
b4, c3 = bits.Add64(b4, a3, c2)
b5, c4 = bits.Add64(b5, a4, c3)
b6, c5 = bits.Add64(b6, a5, c4)
b7, _ = bits.Add64(b7, a6, c5)
red64(z, b0, b1, b2, b3, b4, b5, b6, b7)
}
func modpGeneric(x *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
// CX = C[255] ? 38 : 19
cx := uint64(19) << (x3 >> 63)
// PUT BIT 255 IN CARRY FLAG AND CLEAR
x3 &^= 1 << 63
x0, c0 := bits.Add64(x0, cx, 0)
x1, c1 := bits.Add64(x1, 0, c0)
x2, c2 := bits.Add64(x2, 0, c1)
x3, _ = bits.Add64(x3, 0, c2)
// TEST FOR BIT 255 AGAIN; ONLY TRIGGERED ON OVERFLOW MODULO 2^255-19
// cx = C[255] ? 0 : 19
cx = uint64(19) &^ (-(x3 >> 63))
// CLEAR BIT 255
x3 &^= 1 << 63
x0, c0 = bits.Sub64(x0, cx, 0)
x1, c1 = bits.Sub64(x1, 0, c0)
x2, c2 = bits.Sub64(x2, 0, c1)
x3, _ = bits.Sub64(x3, 0, c2)
binary.LittleEndian.PutUint64(x[0*8:1*8], x0)
binary.LittleEndian.PutUint64(x[1*8:2*8], x1)
binary.LittleEndian.PutUint64(x[2*8:3*8], x2)
binary.LittleEndian.PutUint64(x[3*8:4*8], x3)
}
// red64 weakly reduces the 512-bit value (x7,...,x0) modulo p, storing a
// 256-bit representative in z.
func red64(z *Elt, x0, x1, x2, x3, x4, x5, x6, x7 uint64) {
h0, l0 := bits.Mul64(x4, 38)
h1, l1 := bits.Mul64(x5, 38)
h2, l2 := bits.Mul64(x6, 38)
h3, l3 := bits.Mul64(x7, 38)
l1, c0 := bits.Add64(h0, l1, 0)
l2, c1 := bits.Add64(h1, l2, c0)
l3, c2 := bits.Add64(h2, l3, c1)
l4, _ := bits.Add64(h3, 0, c2)
l0, c0 = bits.Add64(l0, x0, 0)
l1, c1 = bits.Add64(l1, x1, c0)
l2, c2 = bits.Add64(l2, x2, c1)
l3, c3 := bits.Add64(l3, x3, c2)
l4, _ = bits.Add64(l4, 0, c3)
_, l4 = bits.Mul64(l4, 38)
l0, c0 = bits.Add64(l0, l4, 0)
z1, c1 := bits.Add64(l1, 0, c0)
z2, c2 := bits.Add64(l2, 0, c1)
z3, c3 := bits.Add64(l3, 0, c2)
z0, _ := bits.Add64(l0, (-c3)&38, 0)
binary.LittleEndian.PutUint64(z[0*8:1*8], z0)
binary.LittleEndian.PutUint64(z[1*8:2*8], z1)
binary.LittleEndian.PutUint64(z[2*8:3*8], z2)
binary.LittleEndian.PutUint64(z[3*8:4*8], z3)
}
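The constants 19 and 38 used by modpGeneric and red64 come from the congruences 2^255 = 19 and 2^256 = 38 (mod p). A quick math/big check, offered as a sketch:
package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 255), big.NewInt(19))
	fmt.Println(new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 255), p)) // 19
	fmt.Println(new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 256), p)) // 38
}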
//go:build !amd64 || purego
// +build !amd64 purego
package fp25519
func cmov(x, y *Elt, n uint) { cmovGeneric(x, y, n) }
func cswap(x, y *Elt, n uint) { cswapGeneric(x, y, n) }
func add(z, x, y *Elt) { addGeneric(z, x, y) }
func sub(z, x, y *Elt) { subGeneric(z, x, y) }
func addsub(x, y *Elt) { addsubGeneric(x, y) }
func mul(z, x, y *Elt) { mulGeneric(z, x, y) }
func sqr(z, x *Elt) { sqrGeneric(z, x) }
func modp(z *Elt) { modpGeneric(z) }
// Package fp448 provides prime field arithmetic over GF(2^448-2^224-1).
package fp448
import (
"errors"
"github.com/cloudflare/circl/internal/conv"
)
// Size in bytes of an element.
const Size = 56
// Elt is a prime field element.
type Elt [Size]byte
func (e Elt) String() string { return conv.BytesLe2Hex(e[:]) }
// p is the prime modulus 2^448-2^224-1.
var p = Elt{
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
}
// P returns the prime modulus 2^448-2^224-1.
func P() Elt { return p }
// ToBytes stores in b the little-endian byte representation of x.
func ToBytes(b []byte, x *Elt) error {
if len(b) != Size {
return errors.New("wrong size")
}
Modp(x)
copy(b, x[:])
return nil
}
// IsZero returns true if x is equal to 0.
func IsZero(x *Elt) bool { Modp(x); return *x == Elt{} }
// IsOne returns true if x is equal to 1.
func IsOne(x *Elt) bool { Modp(x); return *x == Elt{1} }
// SetOne assigns x=1.
func SetOne(x *Elt) { *x = Elt{1} }
// One returns the 1 element.
func One() (x Elt) { x = Elt{1}; return }
// Neg calculates z = -x.
func Neg(z, x *Elt) { Sub(z, &p, x) }
// Modp reduces z to the range [0, p-1].
func Modp(z *Elt) { Sub(z, z, &p) }
// InvSqrt calculates z = sqrt(x/y) if x/y is a quadratic residue, indicated
// by isQR = true. Otherwise, x/y is a quadratic non-residue, isQR = false,
// and z = sqrt(-x/y).
func InvSqrt(z, x, y *Elt) (isQR bool) {
// First note that x^(2(k+1)) = x^(p-1)/2 * x = legendre(x) * x
// so that's x if x is a quadratic residue and -x otherwise.
// Next, y^(6k+3) = y^(4k+2) * y^(2k+1) = y^(p-1) * y^((p-1)/2) = legendre(y).
// So the z we compute satisfies z^2 y = x^(2(k+1)) y^(6k+3) = legendre(x)*legendre(y).
// Thus if x and y are quadratic residues, then z is indeed sqrt(x/y).
t0, t1 := &Elt{}, &Elt{}
Mul(t0, x, y) // x*y
Sqr(t1, y) // y^2
Mul(t1, t0, t1) // x*y^3
powPminus3div4(z, t1) // (x*y^3)^k
Mul(z, z, t0) // z = x*y*(x*y^3)^k = x^(k+1) * y^(3k+1)
// Check if x/y is a quadratic residue
Sqr(t0, z) // z^2
Mul(t0, t0, y) // y*z^2
Sub(t0, t0, x) // y*z^2-x
return IsZero(t0)
}
// Inv calculates z = 1/x mod p.
func Inv(z, x *Elt) {
// Calculates z = x^(4k+1) = x^(p-3+1) = x^(p-2) = x^-1, where k = (p-3)/4.
t := &Elt{}
powPminus3div4(t, x) // t = x^k
Sqr(t, t) // t = x^2k
Sqr(t, t) // t = x^4k
Mul(z, t, x) // z = x^(4k+1)
}
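The exponent identity in the comment (4k+1 = p-2 for k = (p-3)/4, so the result is x^-1 by Fermat's little theorem) can be checked numerically with math/big. A minimal sketch:
package main

import (
	"fmt"
	"math/big"
)

func main() {
	// p = 2^448 - 2^224 - 1 and k = (p-3)/4, as in the comments above.
	p := new(big.Int).Lsh(big.NewInt(1), 448)
	p.Sub(p, new(big.Int).Lsh(big.NewInt(1), 224))
	p.Sub(p, big.NewInt(1))
	k := new(big.Int).Div(new(big.Int).Sub(p, big.NewInt(3)), big.NewInt(4))
	e := new(big.Int).Add(new(big.Int).Lsh(k, 2), big.NewInt(1)) // 4k+1 = p-2
	x := big.NewInt(7)
	inv := new(big.Int).Exp(x, e, p)
	got := new(big.Int).Mod(new(big.Int).Mul(x, inv), p)
	fmt.Println(got.Cmp(big.NewInt(1)) == 0) // true: x * x^(4k+1) = 1 mod p
}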
// powPminus3div4 calculates z = x^k mod p, where k = (p-3)/4.
func powPminus3div4(z, x *Elt) {
x0, x1 := &Elt{}, &Elt{}
Sqr(z, x)
Mul(z, z, x)
Sqr(x0, z)
Mul(x0, x0, x)
Sqr(z, x0)
Sqr(z, z)
Sqr(z, z)
Mul(z, z, x0)
Sqr(x1, z)
for i := 0; i < 5; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, z)
Sqr(z, x1)
for i := 0; i < 11; i++ {
Sqr(z, z)
}
Mul(z, z, x1)
Sqr(z, z)
Sqr(z, z)
Sqr(z, z)
Mul(z, z, x0)
Sqr(x1, z)
for i := 0; i < 26; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, z)
Sqr(z, x1)
for i := 0; i < 53; i++ {
Sqr(z, z)
}
Mul(z, z, x1)
Sqr(z, z)
Sqr(z, z)
Sqr(z, z)
Mul(z, z, x0)
Sqr(x1, z)
for i := 0; i < 110; i++ {
Sqr(x1, x1)
}
Mul(x1, x1, z)
Sqr(z, x1)
Mul(z, z, x)
for i := 0; i < 223; i++ {
Sqr(z, z)
}
Mul(z, z, x1)
}
// Cmov assigns y to x if n is 1.
func Cmov(x, y *Elt, n uint) { cmov(x, y, n) }
// Cswap interchanges x and y if n is 1.
func Cswap(x, y *Elt, n uint) { cswap(x, y, n) }
// Add calculates z = x+y mod p.
func Add(z, x, y *Elt) { add(z, x, y) }
// Sub calculates z = x-y mod p.
func Sub(z, x, y *Elt) { sub(z, x, y) }
// AddSub calculates (x,y) = (x+y mod p, x-y mod p).
func AddSub(x, y *Elt) { addsub(x, y) }
// Mul calculates z = x*y mod p.
func Mul(z, x, y *Elt) { mul(z, x, y) }
// Sqr calculates z = x^2 mod p.
func Sqr(z, x *Elt) { sqr(z, x) }
//go:build amd64 && !purego
// +build amd64,!purego
package fp448
import (
"golang.org/x/sys/cpu"
)
var hasBmi2Adx = cpu.X86.HasBMI2 && cpu.X86.HasADX
var _ = hasBmi2Adx
func cmov(x, y *Elt, n uint) { cmovAmd64(x, y, n) }
func cswap(x, y *Elt, n uint) { cswapAmd64(x, y, n) }
func add(z, x, y *Elt) { addAmd64(z, x, y) }
func sub(z, x, y *Elt) { subAmd64(z, x, y) }
func addsub(x, y *Elt) { addsubAmd64(x, y) }
func mul(z, x, y *Elt) { mulAmd64(z, x, y) }
func sqr(z, x *Elt) { sqrAmd64(z, x) }
/* Functions defined in fp_amd64.s */
//go:noescape
func cmovAmd64(x, y *Elt, n uint)
//go:noescape
func cswapAmd64(x, y *Elt, n uint)
//go:noescape
func addAmd64(z, x, y *Elt)
//go:noescape
func subAmd64(z, x, y *Elt)
//go:noescape
func addsubAmd64(x, y *Elt)
//go:noescape
func mulAmd64(z, x, y *Elt)
//go:noescape
func sqrAmd64(z, x *Elt)
// This code was imported from https://github.com/armfazh/rfc7748_precomputed
// CHECK_BMI2ADX dispatches to the BMI2/ADX path if supported;
// otherwise it falls back to the legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
CMPB ·hasBmi2Adx(SB), $0 \
JE label \
bmi2adx \
RET \
label: \
legacy \
RET
// cselect is a conditional move:
// if b=1: copies y into x;
// if b=0: x keeps its value;
// if b is neither 0 nor 1: behavior is undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
TESTQ b, b \
MOVQ 0+x, AX; MOVQ 0+y, DX; CMOVQNE DX, AX; MOVQ AX, 0+x; \
MOVQ 8+x, AX; MOVQ 8+y, DX; CMOVQNE DX, AX; MOVQ AX, 8+x; \
MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;
// cswap is a conditional swap:
// if b=1: x,y <- y,x;
// if b=0: x,y keep their values;
// if b is neither 0 nor 1: behavior is undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
TESTQ b, b \
MOVQ 0+x, AX; MOVQ AX, R8; MOVQ 0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 0+x; MOVQ DX, 0+y; \
MOVQ 8+x, AX; MOVQ AX, R8; MOVQ 8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 8+x; MOVQ DX, 8+y; \
MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;
// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
MOVQ 0+x, R8; ADDQ 0+y, R8; \
MOVQ 8+x, R9; ADCQ 8+y, R9; \
MOVQ 16+x, R10; ADCQ 16+y, R10; \
MOVQ 24+x, R11; ADCQ 24+y, R11; \
MOVQ 32+x, R12; ADCQ 32+y, R12; \
MOVQ 40+x, R13; ADCQ 40+y, R13; \
MOVQ 48+x, R14; ADCQ 48+y, R14; \
MOVQ $0, AX; ADCQ $0, AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
ADDQ AX, R8; MOVQ $0, AX; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ DX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
ADCQ $0, AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
ADDQ AX, R8; MOVQ R8, 0+z; \
ADCQ $0, R9; MOVQ R9, 8+z; \
ADCQ $0, R10; MOVQ R10, 16+z; \
ADCQ DX, R11; MOVQ R11, 24+z; \
ADCQ $0, R12; MOVQ R12, 32+z; \
ADCQ $0, R13; MOVQ R13, 40+z; \
ADCQ $0, R14; MOVQ R14, 48+z;
// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
MOVL $32, R15; \
XORL DX, DX; \
MOVQ 0+x, R8; ADCXQ 0+y, R8; \
MOVQ 8+x, R9; ADCXQ 8+y, R9; \
MOVQ 16+x, R10; ADCXQ 16+y, R10; \
MOVQ 24+x, R11; ADCXQ 24+y, R11; \
MOVQ 32+x, R12; ADCXQ 32+y, R12; \
MOVQ 40+x, R13; ADCXQ 40+y, R13; \
MOVQ 48+x, R14; ADCXQ 48+y, R14; \
;;;;;;;;;;;;;;; ADCXQ DX, DX; \
XORL AX, AX; \
ADCXQ DX, R8; SHLXQ R15, DX, DX; \
ADCXQ AX, R9; \
ADCXQ AX, R10; \
ADCXQ DX, R11; \
ADCXQ AX, R12; \
ADCXQ AX, R13; \
ADCXQ AX, R14; \
ADCXQ AX, AX; \
XORL DX, DX; \
ADCXQ AX, R8; MOVQ R8, 0+z; SHLXQ R15, AX, AX; \
ADCXQ DX, R9; MOVQ R9, 8+z; \
ADCXQ DX, R10; MOVQ R10, 16+z; \
ADCXQ AX, R11; MOVQ R11, 24+z; \
ADCXQ DX, R12; MOVQ R12, 32+z; \
ADCXQ DX, R13; MOVQ R13, 40+z; \
ADCXQ DX, R14; MOVQ R14, 48+z;
// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
MOVQ 0+x, R8; SUBQ 0+y, R8; \
MOVQ 8+x, R9; SBBQ 8+y, R9; \
MOVQ 16+x, R10; SBBQ 16+y, R10; \
MOVQ 24+x, R11; SBBQ 24+y, R11; \
MOVQ 32+x, R12; SBBQ 32+y, R12; \
MOVQ 40+x, R13; SBBQ 40+y, R13; \
MOVQ 48+x, R14; SBBQ 48+y, R14; \
MOVQ $0, AX; SETCS AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
SUBQ AX, R8; MOVQ $0, AX; \
SBBQ $0, R9; \
SBBQ $0, R10; \
SBBQ DX, R11; \
SBBQ $0, R12; \
SBBQ $0, R13; \
SBBQ $0, R14; \
SETCS AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
SUBQ AX, R8; MOVQ R8, 0+z; \
SBBQ $0, R9; MOVQ R9, 8+z; \
SBBQ $0, R10; MOVQ R10, 16+z; \
SBBQ DX, R11; MOVQ R11, 24+z; \
SBBQ $0, R12; MOVQ R12, 32+z; \
SBBQ $0, R13; MOVQ R13, 40+z; \
SBBQ $0, R14; MOVQ R14, 48+z;
// maddBmi2Adx multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
MOVQ i+y, DX; XORL AX, AX; \
MULXQ 0+x, AX, R8; ADOXQ AX, r0; ADCXQ R8, r1; MOVQ r0,i+z; \
MULXQ 8+x, AX, r0; ADOXQ AX, r1; ADCXQ r0, r2; MOVQ $0, R8; \
MULXQ 16+x, AX, r0; ADOXQ AX, r2; ADCXQ r0, r3; \
MULXQ 24+x, AX, r0; ADOXQ AX, r3; ADCXQ r0, r4; \
MULXQ 32+x, AX, r0; ADOXQ AX, r4; ADCXQ r0, r5; \
MULXQ 40+x, AX, r0; ADOXQ AX, r5; ADCXQ r0, r6; \
MULXQ 48+x, AX, r0; ADOXQ AX, r6; ADCXQ R8, r0; \
;;;;;;;;;;;;;;;;;;; ADOXQ R8, r0;
// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
MOVL $0,R15; \
MOVQ 0+y, DX; XORL AX, AX; MOVQ $0, R8; \
MULXQ 0+x, AX, R9; MOVQ AX, 0+z; \
MULXQ 8+x, AX, R10; ADCXQ AX, R9; \
MULXQ 16+x, AX, R11; ADCXQ AX, R10; \
MULXQ 24+x, AX, R12; ADCXQ AX, R11; \
MULXQ 32+x, AX, R13; ADCXQ AX, R12; \
MULXQ 40+x, AX, R14; ADCXQ AX, R13; \
MULXQ 48+x, AX, R15; ADCXQ AX, R14; \
;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R15; \
maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
MOVQ R15, 56+z; \
MOVQ R9, 64+z; \
MOVQ R10, 72+z; \
MOVQ R11, 80+z; \
MOVQ R12, 88+z; \
MOVQ R13, 96+z; \
MOVQ R14, 104+z;
// maddLegacy multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
MOVQ i+y, R15; \
MOVQ 0+x, AX; MULQ R15; MOVQ AX, R8; ;;;;;;;;;;;; MOVQ DX, R9; \
MOVQ 8+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
ADDQ 0+i+z, R8; MOVQ R8, 0+i+z; \
ADCQ 8+i+z, R9; MOVQ R9, 8+i+z; \
ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
ADCQ $0, DX; MOVQ DX, 56+i+z;
// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
MOVQ 0+y, R15; \
MOVQ 0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX, R8; \
MOVQ 8+x, AX; MULQ R15; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
MOVQ 16+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ R9, 16+z; \
MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
maddLegacy(z,x,y, 8) \
maddLegacy(z,x,y,16) \
maddLegacy(z,x,y,24) \
maddLegacy(z,x,y,32) \
maddLegacy(z,x,y,40) \
maddLegacy(z,x,y,48)
// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
XORL R15, R15; \
MOVQ 0+x, CX; \
MOVQ CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
ADDQ CX, CX; ADCQ $0, R15; \
MOVQ 8+x, AX; MULQ CX; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
MOVQ 16+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
\
MOVQ 8+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
ADDQ 8+x, CX; ADCQ $0, R15; \
MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
\
MOVQ 16+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
ADDQ 16+x, CX; ADCQ $0, R15; \
MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX,R10; \
\
MOVQ 24+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
ADDQ 24+x, CX; ADCQ $0, R15; \
MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
MOVQ 40+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX, R8; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
\
MOVQ 32+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9, 64+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
ADDQ 32+x, CX; ADCQ $0, R15; \
MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
\
XORL R13, R13; \
XORL R14, R14; \
MOVQ 40+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
ADDQ 40+x, CX; ADCQ $0, R15; \
MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
\
XORL R9, R9; \
MOVQ 48+x, CX; \
MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
MOVQ R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;
// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
XORL R15, R15; \
MOVQ 0+x, DX; \
;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
ADDQ DX, DX; ADCQ $0, R15; CLC; \
MULXQ 8+x, AX, R9; ADCXQ AX, R8; MOVQ R8, 8+z; \
MULXQ 16+x, AX, R10; ADCXQ AX, R9; MOVQ $0, R8;\
MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
\
MOVQ 8+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 8+x, R8; \
ADDQ AX, R9; MOVQ R9, 16+z; \
ADCQ CX, R8; \
ADCQ $0, R11; \
ADDQ 8+x, DX; \
ADCQ $0, R15; \
XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ $0, R10; \
MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
\
MOVQ 16+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 16+x, R8; \
ADDQ AX, R11; MOVQ R11, 32+z; \
ADCQ CX, R8; \
ADCQ $0, R13; \
ADDQ 16+x, DX; \
ADCQ $0, R15; \
XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ $0, R12; \
MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
MULXQ 48+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; \
;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
\
MOVQ 24+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 24+x, R8; \
ADDQ AX, R13; MOVQ R13, 48+z; \
ADCQ CX, R8; \
ADCQ $0, R9; \
ADDQ 24+x, DX; \
ADCQ $0, R15; \
XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; MOVQ R14, 56+z; \
MULXQ 40+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; MOVQ $0, R14; \
MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
\
MOVQ 32+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 32+x, R8; \
ADDQ AX, R9; MOVQ R9, 64+z; \
ADCQ CX, R8; \
ADCQ $0, R11; \
ADDQ 32+x, DX; \
ADCQ $0, R15; \
XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
\
MOVQ 40+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 40+x, R8; \
ADDQ AX, R11; MOVQ R11, 80+z; \
ADCQ CX, R8; \
ADCQ $0, R13; \
ADDQ 40+x, DX; \
ADCQ $0, R15; \
XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
\
MOVQ 48+x, DX; \
MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
MULXQ AX, AX, CX; \
MOVQ R15, R8; NEGQ R8; ANDQ 48+x, R8; \
XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;
// reduceFromDoubleLeg computes z congruent to x modulo p such that z < 2^448 and stores the result in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
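// Here p = 2^448 - 2^224 - 1, so 2^448 = 2^224 + 1 (mod p): the reduction
// folds the upper half of the 896-bit input back in by adding it once at
// bit 0 and once at bit 224, repeating until the final carry is absorbed.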
#define reduceFromDoubleLeg(z,x) \
/* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
/* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
MOVQ 80+x,AX; MOVQ AX,R10; \
MOVQ $0xFFFFFFFF00000000, R8; \
ANDQ R8,R10; \
\
MOVQ $0,R14; \
MOVQ 104+x,R13; SHLQ $1,R13,R14; \
MOVQ 96+x,R12; SHLQ $1,R12,R13; \
MOVQ 88+x,R11; SHLQ $1,R11,R12; \
MOVQ 72+x, R9; SHLQ $1,R10,R11; \
MOVQ 64+x, R8; SHLQ $1,R10; \
MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
MOVQ 56+x,R15; \
\
ADDQ 0+x,R15; MOVQ R15, 0+z; MOVQ 56+x,R15; \
ADCQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
ADCQ 16+x, R9; MOVQ R9,16+z; MOVQ 72+x, R9; \
ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ 80+x,R10; \
ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ 88+x,R11; \
ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ 96+x,R12; \
ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
ADCQ $0,R14; \
/* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
/* ( r9, r8, r15, r13, r12, r11, r10) */ \
MOVQ R10, AX; \
SHRQ $32,R11,R10; \
SHRQ $32,R12,R11; \
SHRQ $32,R13,R12; \
SHRQ $32,R15,R13; \
SHRQ $32, R8,R15; \
SHRQ $32, R9, R8; \
SHRQ $32, AX, R9; \
\
ADDQ 0+z,R10; \
ADCQ 8+z,R11; \
ADCQ 16+z,R12; \
ADCQ 24+z,R13; \
ADCQ 32+z,R15; \
ADCQ 40+z, R8; \
ADCQ 48+z, R9; \
ADCQ $0,R14; \
/* ( c7) + (c6,...,c0) */ \
/* (r14) */ \
MOVQ R14, AX; SHLQ $32, AX; \
ADDQ R14,R10; MOVQ $0,R14; \
ADCQ $0,R11; \
ADCQ $0,R12; \
ADCQ AX,R13; \
ADCQ $0,R15; \
ADCQ $0, R8; \
ADCQ $0, R9; \
ADCQ $0,R14; \
/* ( c7) + (c6,...,c0) */ \
/* (r14) */ \
MOVQ R14, AX; SHLQ $32,AX; \
ADDQ R14,R10; MOVQ R10, 0+z; \
ADCQ $0,R11; MOVQ R11, 8+z; \
ADCQ $0,R12; MOVQ R12,16+z; \
ADCQ AX,R13; MOVQ R13,24+z; \
ADCQ $0,R15; MOVQ R15,32+z; \
ADCQ $0, R8; MOVQ R8,40+z; \
ADCQ $0, R9; MOVQ R9,48+z;
// reduceFromDoubleAdx computes z congruent to x modulo p such that z < 2^448 and stores the result in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
/* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
/* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
MOVQ 80+x,AX; MOVQ AX,R10; \
MOVQ $0xFFFFFFFF00000000, R8; \
ANDQ R8,R10; \
\
MOVQ $0,R14; \
MOVQ 104+x,R13; SHLQ $1,R13,R14; \
MOVQ 96+x,R12; SHLQ $1,R12,R13; \
MOVQ 88+x,R11; SHLQ $1,R11,R12; \
MOVQ 72+x, R9; SHLQ $1,R10,R11; \
MOVQ 64+x, R8; SHLQ $1,R10; \
MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
MOVQ 56+x,R15; \
\
XORL AX,AX; \
ADCXQ 0+x,R15; MOVQ R15, 0+z; MOVQ 56+x,R15; \
ADCXQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
ADCXQ 16+x, R9; MOVQ R9,16+z; MOVQ 72+x, R9; \
ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ 80+x,R10; \
ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ 88+x,R11; \
ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ 96+x,R12; \
ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
ADCXQ AX,R14; \
/* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
/* ( r9, r8, r15, r13, r12, r11, r10) */ \
MOVQ R10, AX; \
SHRQ $32,R11,R10; \
SHRQ $32,R12,R11; \
SHRQ $32,R13,R12; \
SHRQ $32,R15,R13; \
SHRQ $32, R8,R15; \
SHRQ $32, R9, R8; \
SHRQ $32, AX, R9; \
\
XORL AX,AX; \
ADCXQ 0+z,R10; \
ADCXQ 8+z,R11; \
ADCXQ 16+z,R12; \
ADCXQ 24+z,R13; \
ADCXQ 32+z,R15; \
ADCXQ 40+z, R8; \
ADCXQ 48+z, R9; \
ADCXQ AX,R14; \
/* ( c7) + (c6,...,c0) */ \
/* (r14) */ \
MOVQ R14, AX; SHLQ $32, AX; \
CLC; \
ADCXQ R14,R10; MOVQ $0,R14; \
ADCXQ R14,R11; \
ADCXQ R14,R12; \
ADCXQ AX,R13; \
ADCXQ R14,R15; \
ADCXQ R14, R8; \
ADCXQ R14, R9; \
ADCXQ R14,R14; \
/* ( c7) + (c6,...,c0) */ \
/* (r14) */ \
MOVQ R14, AX; SHLQ $32, AX; \
CLC; \
ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
ADCXQ R14,R11; MOVQ R11, 8+z; \
ADCXQ R14,R12; MOVQ R12,16+z; \
ADCXQ AX,R13; MOVQ R13,24+z; \
ADCXQ R14,R15; MOVQ R15,32+z; \
ADCXQ R14, R8; MOVQ R8,40+z; \
ADCXQ R14, R9; MOVQ R9,48+z;
// addSub computes the two operations x,y = x+y, x-y in place
// Uses: AX, DX, R8-R15, FLAGS
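// After the addition, the carry is folded back twice via 2^448 = 2^224 + 1
// (mod p), added at bit 0 and bit 224; after the subtraction, the borrow is
// corrected twice the same way, keeping both results below 2^448.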
#define addSub(x,y) \
MOVQ 0+x, R8; ADDQ 0+y, R8; \
MOVQ 8+x, R9; ADCQ 8+y, R9; \
MOVQ 16+x, R10; ADCQ 16+y, R10; \
MOVQ 24+x, R11; ADCQ 24+y, R11; \
MOVQ 32+x, R12; ADCQ 32+y, R12; \
MOVQ 40+x, R13; ADCQ 40+y, R13; \
MOVQ 48+x, R14; ADCQ 48+y, R14; \
MOVQ $0, AX; ADCQ $0, AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
ADDQ AX, R8; MOVQ $0, AX; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ DX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
ADCQ $0, AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
ADDQ AX, R8; MOVQ 0+x,AX; MOVQ R8, 0+x; MOVQ AX, R8; \
ADCQ $0, R9; MOVQ 8+x,AX; MOVQ R9, 8+x; MOVQ AX, R9; \
ADCQ $0, R10; MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
ADCQ DX, R11; MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
ADCQ $0, R12; MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
ADCQ $0, R13; MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
ADCQ $0, R14; MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
SUBQ 0+y, R8; \
SBBQ 8+y, R9; \
SBBQ 16+y, R10; \
SBBQ 24+y, R11; \
SBBQ 32+y, R12; \
SBBQ 40+y, R13; \
SBBQ 48+y, R14; \
MOVQ $0, AX; SETCS AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
SUBQ AX, R8; MOVQ $0, AX; \
SBBQ $0, R9; \
SBBQ $0, R10; \
SBBQ DX, R11; \
SBBQ $0, R12; \
SBBQ $0, R13; \
SBBQ $0, R14; \
SETCS AX; \
MOVQ AX, DX; \
SHLQ $32, DX; \
SUBQ AX, R8; MOVQ R8, 0+y; \
SBBQ $0, R9; MOVQ R9, 8+y; \
SBBQ $0, R10; MOVQ R10, 16+y; \
SBBQ DX, R11; MOVQ R11, 24+y; \
SBBQ $0, R12; MOVQ R12, 32+y; \
SBBQ $0, R13; MOVQ R13, 40+y; \
SBBQ $0, R14; MOVQ R14, 48+y;
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
#include "fp_amd64.h"
// func cmovAmd64(x, y *Elt, n uint)
TEXT ·cmovAmd64(SB),NOSPLIT,$0-24
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
MOVQ n+16(FP), BX
cselect(0(DI),0(SI),BX)
RET
// func cswapAmd64(x, y *Elt, n uint)
TEXT ·cswapAmd64(SB),NOSPLIT,$0-24
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
MOVQ n+16(FP), BX
cswap(0(DI),0(SI),BX)
RET
// func subAmd64(z, x, y *Elt)
TEXT ·subAmd64(SB),NOSPLIT,$0-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
subtraction(0(DI),0(SI),0(BX))
RET
// func addsubAmd64(x, y *Elt)
TEXT ·addsubAmd64(SB),NOSPLIT,$0-16
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
addSub(0(DI),0(SI))
RET
#define addLegacy \
additionLeg(0(DI),0(SI),0(BX))
#define addBmi2Adx \
additionAdx(0(DI),0(SI),0(BX))
#define mulLegacy \
integerMulLeg(0(SP),0(SI),0(BX)) \
reduceFromDoubleLeg(0(DI),0(SP))
#define mulBmi2Adx \
integerMulAdx(0(SP),0(SI),0(BX)) \
reduceFromDoubleAdx(0(DI),0(SP))
#define sqrLegacy \
integerSqrLeg(0(SP),0(SI)) \
reduceFromDoubleLeg(0(DI),0(SP))
#define sqrBmi2Adx \
integerSqrAdx(0(SP),0(SI)) \
reduceFromDoubleAdx(0(DI),0(SP))
// func addAmd64(z, x, y *Elt)
TEXT ·addAmd64(SB),NOSPLIT,$0-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
CHECK_BMI2ADX(LADD, addLegacy, addBmi2Adx)
// func mulAmd64(z, x, y *Elt)
TEXT ·mulAmd64(SB),NOSPLIT,$112-24
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
MOVQ y+16(FP), BX
CHECK_BMI2ADX(LMUL, mulLegacy, mulBmi2Adx)
// func sqrAmd64(z, x *Elt)
TEXT ·sqrAmd64(SB),NOSPLIT,$112-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
CHECK_BMI2ADX(LSQR, sqrLegacy, sqrBmi2Adx)
package fp448
import (
"encoding/binary"
"math/bits"
)
func cmovGeneric(x, y *Elt, n uint) {
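// m is all-ones when the low bit of n is set and zero otherwise, so the
// bitwise selects below choose y or keep x without any branches.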
m := -uint64(n & 0x1)
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
x4 := binary.LittleEndian.Uint64(x[4*8 : 5*8])
x5 := binary.LittleEndian.Uint64(x[5*8 : 6*8])
x6 := binary.LittleEndian.Uint64(x[6*8 : 7*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
y4 := binary.LittleEndian.Uint64(y[4*8 : 5*8])
y5 := binary.LittleEndian.Uint64(y[5*8 : 6*8])
y6 := binary.LittleEndian.Uint64(y[6*8 : 7*8])
x0 = (x0 &^ m) | (y0 & m)
x1 = (x1 &^ m) | (y1 & m)
x2 = (x2 &^ m) | (y2 & m)
x3 = (x3 &^ m) | (y3 & m)
x4 = (x4 &^ m) | (y4 & m)
x5 = (x5 &^ m) | (y5 & m)
x6 = (x6 &^ m) | (y6 & m)
binary.LittleEndian.PutUint64(x[0*8:1*8], x0)
binary.LittleEndian.PutUint64(x[1*8:2*8], x1)
binary.LittleEndian.PutUint64(x[2*8:3*8], x2)
binary.LittleEndian.PutUint64(x[3*8:4*8], x3)
binary.LittleEndian.PutUint64(x[4*8:5*8], x4)
binary.LittleEndian.PutUint64(x[5*8:6*8], x5)
binary.LittleEndian.PutUint64(x[6*8:7*8], x6)
}
func cswapGeneric(x, y *Elt, n uint) {
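// Constant-time swap: t = m & (x ^ y) is the masked XOR difference, so
// XORing t into both operands swaps them when the low bit of n is set and
// leaves them untouched otherwise.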
m := -uint64(n & 0x1)
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
x4 := binary.LittleEndian.Uint64(x[4*8 : 5*8])
x5 := binary.LittleEndian.Uint64(x[5*8 : 6*8])
x6 := binary.LittleEndian.Uint64(x[6*8 : 7*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
y4 := binary.LittleEndian.Uint64(y[4*8 : 5*8])
y5 := binary.LittleEndian.Uint64(y[5*8 : 6*8])
y6 := binary.LittleEndian.Uint64(y[6*8 : 7*8])
t0 := m & (x0 ^ y0)
t1 := m & (x1 ^ y1)
t2 := m & (x2 ^ y2)
t3 := m & (x3 ^ y3)
t4 := m & (x4 ^ y4)
t5 := m & (x5 ^ y5)
t6 := m & (x6 ^ y6)
x0 ^= t0
x1 ^= t1
x2 ^= t2
x3 ^= t3
x4 ^= t4
x5 ^= t5
x6 ^= t6
y0 ^= t0
y1 ^= t1
y2 ^= t2
y3 ^= t3
y4 ^= t4
y5 ^= t5
y6 ^= t6
binary.LittleEndian.PutUint64(x[0*8:1*8], x0)
binary.LittleEndian.PutUint64(x[1*8:2*8], x1)
binary.LittleEndian.PutUint64(x[2*8:3*8], x2)
binary.LittleEndian.PutUint64(x[3*8:4*8], x3)
binary.LittleEndian.PutUint64(x[4*8:5*8], x4)
binary.LittleEndian.PutUint64(x[5*8:6*8], x5)
binary.LittleEndian.PutUint64(x[6*8:7*8], x6)
binary.LittleEndian.PutUint64(y[0*8:1*8], y0)
binary.LittleEndian.PutUint64(y[1*8:2*8], y1)
binary.LittleEndian.PutUint64(y[2*8:3*8], y2)
binary.LittleEndian.PutUint64(y[3*8:4*8], y3)
binary.LittleEndian.PutUint64(y[4*8:5*8], y4)
binary.LittleEndian.PutUint64(y[5*8:6*8], y5)
binary.LittleEndian.PutUint64(y[6*8:7*8], y6)
}
func addGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
x4 := binary.LittleEndian.Uint64(x[4*8 : 5*8])
x5 := binary.LittleEndian.Uint64(x[5*8 : 6*8])
x6 := binary.LittleEndian.Uint64(x[6*8 : 7*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
y4 := binary.LittleEndian.Uint64(y[4*8 : 5*8])
y5 := binary.LittleEndian.Uint64(y[5*8 : 6*8])
y6 := binary.LittleEndian.Uint64(y[6*8 : 7*8])
z0, c0 := bits.Add64(x0, y0, 0)
z1, c1 := bits.Add64(x1, y1, c0)
z2, c2 := bits.Add64(x2, y2, c1)
z3, c3 := bits.Add64(x3, y3, c2)
z4, c4 := bits.Add64(x4, y4, c3)
z5, c5 := bits.Add64(x5, y5, c4)
z6, z7 := bits.Add64(x6, y6, c5)
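// Weak reduction: since 2^448 = 2^224 + 1 (mod p), the carry z7 is folded
// back by adding it at bit 0 and, shifted by 32, into limb 3 (bit 224).
// The second pass absorbs any carry produced by the first.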
z0, c0 = bits.Add64(z0, z7, 0)
z1, c1 = bits.Add64(z1, 0, c0)
z2, c2 = bits.Add64(z2, 0, c1)
z3, c3 = bits.Add64(z3, z7<<32, c2)
z4, c4 = bits.Add64(z4, 0, c3)
z5, c5 = bits.Add64(z5, 0, c4)
z6, z7 = bits.Add64(z6, 0, c5)
z0, c0 = bits.Add64(z0, z7, 0)
z1, c1 = bits.Add64(z1, 0, c0)
z2, c2 = bits.Add64(z2, 0, c1)
z3, c3 = bits.Add64(z3, z7<<32, c2)
z4, c4 = bits.Add64(z4, 0, c3)
z5, c5 = bits.Add64(z5, 0, c4)
z6, _ = bits.Add64(z6, 0, c5)
binary.LittleEndian.PutUint64(z[0*8:1*8], z0)
binary.LittleEndian.PutUint64(z[1*8:2*8], z1)
binary.LittleEndian.PutUint64(z[2*8:3*8], z2)
binary.LittleEndian.PutUint64(z[3*8:4*8], z3)
binary.LittleEndian.PutUint64(z[4*8:5*8], z4)
binary.LittleEndian.PutUint64(z[5*8:6*8], z5)
binary.LittleEndian.PutUint64(z[6*8:7*8], z6)
}
func subGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
x4 := binary.LittleEndian.Uint64(x[4*8 : 5*8])
x5 := binary.LittleEndian.Uint64(x[5*8 : 6*8])
x6 := binary.LittleEndian.Uint64(x[6*8 : 7*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
y4 := binary.LittleEndian.Uint64(y[4*8 : 5*8])
y5 := binary.LittleEndian.Uint64(y[5*8 : 6*8])
y6 := binary.LittleEndian.Uint64(y[6*8 : 7*8])
z0, c0 := bits.Sub64(x0, y0, 0)
z1, c1 := bits.Sub64(x1, y1, c0)
z2, c2 := bits.Sub64(x2, y2, c1)
z3, c3 := bits.Sub64(x3, y3, c2)
z4, c4 := bits.Sub64(x4, y4, c3)
z5, c5 := bits.Sub64(x5, y5, c4)
z6, z7 := bits.Sub64(x6, y6, c5)
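// The borrow z7 is folded back by subtracting it at bit 0 and bit 224,
// mirroring addGeneric; the second pass absorbs any remaining borrow.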
z0, c0 = bits.Sub64(z0, z7, 0)
z1, c1 = bits.Sub64(z1, 0, c0)
z2, c2 = bits.Sub64(z2, 0, c1)
z3, c3 = bits.Sub64(z3, z7<<32, c2)
z4, c4 = bits.Sub64(z4, 0, c3)
z5, c5 = bits.Sub64(z5, 0, c4)
z6, z7 = bits.Sub64(z6, 0, c5)
z0, c0 = bits.Sub64(z0, z7, 0)
z1, c1 = bits.Sub64(z1, 0, c0)
z2, c2 = bits.Sub64(z2, 0, c1)
z3, c3 = bits.Sub64(z3, z7<<32, c2)
z4, c4 = bits.Sub64(z4, 0, c3)
z5, c5 = bits.Sub64(z5, 0, c4)
z6, _ = bits.Sub64(z6, 0, c5)
binary.LittleEndian.PutUint64(z[0*8:1*8], z0)
binary.LittleEndian.PutUint64(z[1*8:2*8], z1)
binary.LittleEndian.PutUint64(z[2*8:3*8], z2)
binary.LittleEndian.PutUint64(z[3*8:4*8], z3)
binary.LittleEndian.PutUint64(z[4*8:5*8], z4)
binary.LittleEndian.PutUint64(z[5*8:6*8], z5)
binary.LittleEndian.PutUint64(z[6*8:7*8], z6)
}
func addsubGeneric(x, y *Elt) {
z := &Elt{}
addGeneric(z, x, y)
subGeneric(y, x, y)
*x = *z
}
func mulGeneric(z, x, y *Elt) {
x0 := binary.LittleEndian.Uint64(x[0*8 : 1*8])
x1 := binary.LittleEndian.Uint64(x[1*8 : 2*8])
x2 := binary.LittleEndian.Uint64(x[2*8 : 3*8])
x3 := binary.LittleEndian.Uint64(x[3*8 : 4*8])
x4 := binary.LittleEndian.Uint64(x[4*8 : 5*8])
x5 := binary.LittleEndian.Uint64(x[5*8 : 6*8])
x6 := binary.LittleEndian.Uint64(x[6*8 : 7*8])
y0 := binary.LittleEndian.Uint64(y[0*8 : 1*8])
y1 := binary.LittleEndian.Uint64(y[1*8 : 2*8])
y2 := binary.LittleEndian.Uint64(y[2*8 : 3*8])
y3 := binary.LittleEndian.Uint64(y[3*8 : 4*8])
y4 := binary.LittleEndian.Uint64(y[4*8 : 5*8])
y5 := binary.LittleEndian.Uint64(y[5*8 : 6*8])
y6 := binary.LittleEndian.Uint64(y[6*8 : 7*8])
yy := [7]uint64{y0, y1, y2, y3, y4, y5, y6}
zz := [7]uint64{}
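// Operand-scanning (schoolbook) multiplication: the first pass computes
// x*yy[0]; each later pass accumulates x*yy[i] one limb higher. zz collects
// the low seven limbs and a0..a6 hold the running high half, which red64
// then reduces modulo p.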
yi := yy[0]
h0, l0 := bits.Mul64(x0, yi)
h1, l1 := bits.Mul64(x1, yi)
h2, l2 := bits.Mul64(x2, yi)
h3, l3 := bits.Mul64(x3, yi)
h4, l4 := bits.Mul64(x4, yi)
h5, l5 := bits.Mul64(x5, yi)
h6, l6 := bits.Mul64(x6, yi)
zz[0] = l0
a0, c0 := bits.Add64(h0, l1, 0)
a1, c1 := bits.Add64(h1, l2, c0)
a2, c2 := bits.Add64(h2, l3, c1)
a3, c3 := bits.Add64(h3, l4, c2)
a4, c4 := bits.Add64(h4, l5, c3)
a5, c5 := bits.Add64(h5, l6, c4)
a6, _ := bits.Add64(h6, 0, c5)
for i := 1; i < 7; i++ {
yi = yy[i]
h0, l0 = bits.Mul64(x0, yi)
h1, l1 = bits.Mul64(x1, yi)
h2, l2 = bits.Mul64(x2, yi)
h3, l3 = bits.Mul64(x3, yi)
h4, l4 = bits.Mul64(x4, yi)
h5, l5 = bits.Mul64(x5, yi)
h6, l6 = bits.Mul64(x6, yi)
zz[i], c0 = bits.Add64(a0, l0, 0)
a0, c1 = bits.Add64(a1, l1, c0)
a1, c2 = bits.Add64(a2, l2, c1)
a2, c3 = bits.Add64(a3, l3, c2)
a3, c4 = bits.Add64(a4, l4, c3)
a4, c5 = bits.Add64(a5, l5, c4)
a5, a6 = bits.Add64(a6, l6, c5)
a0, c0 = bits.Add64(a0, h0, 0)
a1, c1 = bits.Add64(a1, h1, c0)
a2, c2 = bits.Add64(a2, h2, c1)
a3, c3 = bits.Add64(a3, h3, c2)
a4, c4 = bits.Add64(a4, h4, c3)
a5, c5 = bits.Add64(a5, h5, c4)
a6, _ = bits.Add64(a6, h6, c5)
}
red64(z, &zz, &[7]uint64{a0, a1, a2, a3, a4, a5, a6})
}
func sqrGeneric(z, x *Elt) { mulGeneric(z, x, x) }
func red64(z *Elt, l, h *[7]uint64) {
/* (2C13, 2C12, 2C11, 2C10|C10, C9, C8, C7) + (C6,...,C0) */
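// Since p = 2^448 - 2^224 - 1, the high half h satisfies
// h*2^448 = h*(2^224 + 1) (mod p), which is what each folding step below
// adds into the low half l.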
h0 := h[0]
h1 := h[1]
h2 := h[2]
h3 := ((h[3] & (0xFFFFFFFF << 32)) << 1) | (h[3] & 0xFFFFFFFF)
h4 := (h[3] >> 63) | (h[4] << 1)
h5 := (h[4] >> 63) | (h[5] << 1)
h6 := (h[5] >> 63) | (h[6] << 1)
h7 := (h[6] >> 63)
l0, c0 := bits.Add64(h0, l[0], 0)
l1, c1 := bits.Add64(h1, l[1], c0)
l2, c2 := bits.Add64(h2, l[2], c1)
l3, c3 := bits.Add64(h3, l[3], c2)
l4, c4 := bits.Add64(h4, l[4], c3)
l5, c5 := bits.Add64(h5, l[5], c4)
l6, c6 := bits.Add64(h6, l[6], c5)
l7, _ := bits.Add64(h7, 0, c6)
/* (C10C9, C9C8,C8C7,C7C13,C13C12,C12C11,C11C10) + (C6,...,C0) */
h0 = (h[3] >> 32) | (h[4] << 32)
h1 = (h[4] >> 32) | (h[5] << 32)
h2 = (h[5] >> 32) | (h[6] << 32)
h3 = (h[6] >> 32) | (h[0] << 32)
h4 = (h[0] >> 32) | (h[1] << 32)
h5 = (h[1] >> 32) | (h[2] << 32)
h6 = (h[2] >> 32) | (h[3] << 32)
l0, c0 = bits.Add64(l0, h0, 0)
l1, c1 = bits.Add64(l1, h1, c0)
l2, c2 = bits.Add64(l2, h2, c1)
l3, c3 = bits.Add64(l3, h3, c2)
l4, c4 = bits.Add64(l4, h4, c3)
l5, c5 = bits.Add64(l5, h5, c4)
l6, c6 = bits.Add64(l6, h6, c5)
l7, _ = bits.Add64(l7, 0, c6)
/* (C7) + (C6,...,C0) */
l0, c0 = bits.Add64(l0, l7, 0)
l1, c1 = bits.Add64(l1, 0, c0)
l2, c2 = bits.Add64(l2, 0, c1)
l3, c3 = bits.Add64(l3, l7<<32, c2)
l4, c4 = bits.Add64(l4, 0, c3)
l5, c5 = bits.Add64(l5, 0, c4)
l6, l7 = bits.Add64(l6, 0, c5)
/* (C7) + (C6,...,C0) */
l0, c0 = bits.Add64(l0, l7, 0)
l1, c1 = bits.Add64(l1, 0, c0)
l2, c2 = bits.Add64(l2, 0, c1)
l3, c3 = bits.Add64(l3, l7<<32, c2)
l4, c4 = bits.Add64(l4, 0, c3)
l5, c5 = bits.Add64(l5, 0, c4)
l6, _ = bits.Add64(l6, 0, c5)
binary.LittleEndian.PutUint64(z[0*8:1*8], l0)
binary.LittleEndian.PutUint64(z[1*8:2*8], l1)
binary.LittleEndian.PutUint64(z[2*8:3*8], l2)
binary.LittleEndian.PutUint64(z[3*8:4*8], l3)
binary.LittleEndian.PutUint64(z[4*8:5*8], l4)
binary.LittleEndian.PutUint64(z[5*8:6*8], l5)
binary.LittleEndian.PutUint64(z[6*8:7*8], l6)
}
//go:build !amd64 || purego
// +build !amd64 purego
package fp448
func cmov(x, y *Elt, n uint) { cmovGeneric(x, y, n) }
func cswap(x, y *Elt, n uint) { cswapGeneric(x, y, n) }
func add(z, x, y *Elt) { addGeneric(z, x, y) }
func sub(z, x, y *Elt) { subGeneric(z, x, y) }
func addsub(x, y *Elt) { addsubGeneric(x, y) }
func mul(z, x, y *Elt) { mulGeneric(z, x, y) }
func sqr(z, x *Elt) { sqrGeneric(z, x) }
//go:build gofuzz
// +build gofuzz
// How to run the fuzzer:
//
// $ go get -u github.com/dvyukov/go-fuzz/go-fuzz
// $ go get -u github.com/dvyukov/go-fuzz/go-fuzz-build
// $ go-fuzz-build -libfuzzer -func FuzzReduction -o lib.a
// $ clang -fsanitize=fuzzer lib.a -o fu.exe
// $ ./fu.exe
package fp448
import (
"encoding/binary"
"fmt"
"math/big"
"github.com/cloudflare/circl/internal/conv"
)
// FuzzReduction is a fuzzer target for the red64 function, which reduces a
// 112-byte value t to a 56-byte value t' congruent with t modulo p448.
func FuzzReduction(data []byte) int {
if len(data) != 2*Size {
return -1
}
var got, want Elt
var lo, hi [7]uint64
a := data[:Size]
b := data[Size:]
lo[0] = binary.LittleEndian.Uint64(a[0*8 : 1*8])
lo[1] = binary.LittleEndian.Uint64(a[1*8 : 2*8])
lo[2] = binary.LittleEndian.Uint64(a[2*8 : 3*8])
lo[3] = binary.LittleEndian.Uint64(a[3*8 : 4*8])
lo[4] = binary.LittleEndian.Uint64(a[4*8 : 5*8])
lo[5] = binary.LittleEndian.Uint64(a[5*8 : 6*8])
lo[6] = binary.LittleEndian.Uint64(a[6*8 : 7*8])
hi[0] = binary.LittleEndian.Uint64(b[0*8 : 1*8])
hi[1] = binary.LittleEndian.Uint64(b[1*8 : 2*8])
hi[2] = binary.LittleEndian.Uint64(b[2*8 : 3*8])
hi[3] = binary.LittleEndian.Uint64(b[3*8 : 4*8])
hi[4] = binary.LittleEndian.Uint64(b[4*8 : 5*8])
hi[5] = binary.LittleEndian.Uint64(b[5*8 : 6*8])
hi[6] = binary.LittleEndian.Uint64(b[6*8 : 7*8])
red64(&got, &lo, &hi)
t := conv.BytesLe2BigInt(data[:2*Size])
two448 := big.NewInt(1)
two448.Lsh(two448, 448) // 2^448
mask448 := big.NewInt(1)
mask448.Sub(two448, mask448) // 2^448-1
two224plus1 := big.NewInt(1)
two224plus1.Lsh(two224plus1, 224)
two224plus1.Add(two224plus1, big.NewInt(1)) // 2^224+1
var loBig, hiBig big.Int
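// Reference reduction: repeatedly rewrite t = hi*2^448 + lo as
// t = hi*(2^224+1) + lo until t < 2^448.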
for t.Cmp(two448) >= 0 {
loBig.And(t, mask448)
hiBig.Rsh(t, 448)
t.Mul(&hiBig, two224plus1)
t.Add(t, &loBig)
}
conv.BigInt2BytesLe(want[:], t)
if got != want {
fmt.Printf("in: %v\n", conv.BytesLe2BigInt(data[:2*Size]))
fmt.Printf("got: %v\n", got)
fmt.Printf("want: %v\n", want)
panic("error found")
}
return 1
}
// Package mlsbset provides a constant-time exponentiation method with precomputation.
//
// References: "Efficient and secure algorithms for GLV-based scalar
// multiplication and their implementation on GLV–GLS curves" by Faz-Hernandez et al.
// - https://doi.org/10.1007/s13389-014-0085-7
// - https://eprint.iacr.org/2013/158
package mlsbset
import (
"errors"
"fmt"
"math/big"
"github.com/cloudflare/circl/internal/conv"
)
// EltG is a group element.
type EltG interface{}
// EltP is a precomputed group element.
type EltP interface{}
// Group defines the operations required by the MLSBSet exponentiation method.
type Group interface {
Identity() EltG // Returns the identity of the group.
Sqr(x EltG) // Calculates x = x^2.
Mul(x EltG, y EltP) // Calculates x = x*y.
NewEltP() EltP // Returns an arbitrary precomputed element.
ExtendedEltP() EltP // Returns the precomputed element x^(2^(w*d)).
Lookup(a EltP, v uint, s, u int32) // Sets a = s*T[v][u].
}
// Params contains the parameters of the encoding.
type Params struct {
T uint // T is the maximum size (in bits) of exponents.
V uint // V is the number of tables.
W uint // W is the window size.
E uint // E is the number of digits per table.
D uint // D is the number of digits in total.
L uint // L is the length of the code.
}
// Encoder converts integers into valid powers.
type Encoder struct{ p Params }
// New produces an encoder for the MLSBSet algorithm.
func New(t, v, w uint) (Encoder, error) {
if !(t > 1 && v >= 1 && w >= 2) {
return Encoder{}, errors.New("t>1, v>=1, w>=2")
}
e := (t + w*v - 1) / (w * v)
d := e * v
l := d * w
return Encoder{Params{t, v, w, e, d, l}}, nil
}
// Encode converts an odd integer k into a valid power for exponentiation.
func (m Encoder) Encode(k []byte) (*Power, error) {
if len(k) == 0 {
return nil, errors.New("empty slice")
}
if !(len(k) <= int(m.p.L+7)>>3) {
return nil, errors.New("k too big")
}
if k[0]%2 == 0 {
return nil, errors.New("k must be odd")
}
ap := int((m.p.L+7)/8) - len(k)
k = append(k, make([]byte, ap)...)
s := m.signs(k)
b := make([]int32, m.p.L-m.p.D)
c := conv.BytesLe2BigInt(k)
c.Rsh(c, m.p.D)
var bi big.Int
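// Each digit b[i-D] is the sign s[i mod D] times the current low bit of c;
// c is then shifted and adjusted so the recoding remains exact.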
for i := m.p.D; i < m.p.L; i++ {
c0 := int32(c.Bit(0))
b[i-m.p.D] = s[i%m.p.D] * c0
bi.SetInt64(int64(b[i-m.p.D] >> 1))
c.Rsh(c, 1)
c.Sub(c, &bi)
}
carry := int(c.Int64())
return &Power{m, s, b, carry}, nil
}
// signs calculates the set of signs.
func (m Encoder) signs(k []byte) []int32 {
s := make([]int32, m.p.D)
s[m.p.D-1] = 1
for i := uint(1); i < m.p.D; i++ {
ki := int32((k[i>>3] >> (i & 0x7)) & 0x1)
s[i-1] = 2*ki - 1
}
return s
}
// GetParams returns the complementary parameters of the encoding.
func (m Encoder) GetParams() Params { return m.p }
// tableSize returns the size of each table.
func (m Encoder) tableSize() uint { return 1 << (m.p.W - 1) }
// Elts returns the total number of elements that must be precomputed.
func (m Encoder) Elts() uint { return m.p.V * m.tableSize() }
// IsExtended returns true if the element x^(2^(wd)) must be calculated.
func (m Encoder) IsExtended() bool { q := m.p.T / (m.p.V * m.p.W); return m.p.T == q*m.p.V*m.p.W }
// Ops returns the number of squarings and multiplications executed during an exponentiation.
func (m Encoder) Ops() (S uint, M uint) {
S = m.p.E
M = m.p.E * m.p.V
if m.IsExtended() {
M++
}
return
}
func (m Encoder) String() string {
return fmt.Sprintf("T: %v W: %v V: %v e: %v d: %v l: %v wv|t: %v",
m.p.T, m.p.W, m.p.V, m.p.E, m.p.D, m.p.L, m.IsExtended())
}
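// ExampleEncoder is a minimal usage sketch of the API above; the parameter
// choice (t=448, v=2, w=5) is arbitrary and yields e=45, d=90, l=450.
func ExampleEncoder() {
enc, err := New(448, 2, 5)
if err != nil {
panic(err)
}
k := make([]byte, 56) // little-endian, 448-bit exponent
k[0] = 0x05           // Encode requires an odd exponent
pw, err := enc.Encode(k)
if err != nil {
panic(err)
}
S, M := enc.Ops() // squarings and multiplications per exponentiation
_, _, _ = pw, S, M
}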