From 0151e10b627fbfb82b7d633f3fc9703678ed4eaf Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Thu, 6 Jan 2022 14:50:12 +0100 Subject: update buildah to latest and use new network stack Make sure buildah uses the new network stack. Signed-off-by: Paul Holzinger --- vendor/golang.org/x/crypto/poly1305/bits_compat.go | 40 -- vendor/golang.org/x/crypto/poly1305/bits_go1.13.go | 22 - vendor/golang.org/x/crypto/poly1305/mac_noasm.go | 10 - vendor/golang.org/x/crypto/poly1305/poly1305.go | 99 ---- vendor/golang.org/x/crypto/poly1305/sum_amd64.go | 48 -- vendor/golang.org/x/crypto/poly1305/sum_amd64.s | 109 ----- vendor/golang.org/x/crypto/poly1305/sum_generic.go | 310 ------------- vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go | 48 -- vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s | 182 -------- vendor/golang.org/x/crypto/poly1305/sum_s390x.go | 76 ---- vendor/golang.org/x/crypto/poly1305/sum_s390x.s | 504 --------------------- 11 files changed, 1448 deletions(-) delete mode 100644 vendor/golang.org/x/crypto/poly1305/bits_compat.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/bits_go1.13.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/mac_noasm.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/poly1305.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.s delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_generic.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_s390x.go delete mode 100644 vendor/golang.org/x/crypto/poly1305/sum_s390x.s (limited to 'vendor/golang.org/x/crypto/poly1305') diff --git a/vendor/golang.org/x/crypto/poly1305/bits_compat.go b/vendor/golang.org/x/crypto/poly1305/bits_compat.go deleted file mode 100644 index 45b5c966b..000000000 --- a/vendor/golang.org/x/crypto/poly1305/bits_compat.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.13 -// +build !go1.13 - -package poly1305 - -// Generic fallbacks for the math/bits intrinsics, copied from -// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had -// variable time fallbacks until Go 1.13. - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - sum = x + y + carry - carryOut = ((x & y) | ((x | y) &^ sum)) >> 63 - return -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - diff = x - y - borrow - borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63 - return -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - const mask32 = 1<<32 - 1 - x0 := x & mask32 - x1 := x >> 32 - y0 := y & mask32 - y1 := y >> 32 - w0 := x0 * y0 - t := x1*y0 + w0>>32 - w1 := t & mask32 - w2 := t >> 32 - w1 += x0 * y1 - hi = x1*y1 + w2 + w1>>32 - lo = x * y - return -} diff --git a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go b/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go deleted file mode 100644 index ed52b3418..000000000 --- a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.13 -// +build go1.13 - -package poly1305 - -import "math/bits" - -func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { - return bits.Add64(x, y, carry) -} - -func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { - return bits.Sub64(x, y, borrow) -} - -func bitsMul64(x, y uint64) (hi, lo uint64) { - return bits.Mul64(x, y) -} diff --git a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go deleted file mode 100644 index f184b67d9..000000000 --- a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego -// +build !amd64,!ppc64le,!s390x !gc purego - -package poly1305 - -type mac struct{ macGeneric } diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go deleted file mode 100644 index 9d7a6af09..000000000 --- a/vendor/golang.org/x/crypto/poly1305/poly1305.go +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package poly1305 implements Poly1305 one-time message authentication code as -// specified in https://cr.yp.to/mac/poly1305-20050329.pdf. -// -// Poly1305 is a fast, one-time authentication function. It is infeasible for an -// attacker to generate an authenticator for a message without the key. However, a -// key must only be used for a single message. Authenticating two different -// messages with the same key allows an attacker to forge authenticators for other -// messages with the same key. -// -// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was -// used with a fixed key in order to generate one-time keys from an nonce. -// However, in this package AES isn't used and the one-time key is specified -// directly. -package poly1305 // import "golang.org/x/crypto/poly1305" - -import "crypto/subtle" - -// TagSize is the size, in bytes, of a poly1305 authenticator. -const TagSize = 16 - -// Sum generates an authenticator for msg using a one-time key and puts the -// 16-byte result into out. Authenticating two different messages with the same -// key allows an attacker to forge messages at will. -func Sum(out *[16]byte, m []byte, key *[32]byte) { - h := New(key) - h.Write(m) - h.Sum(out[:0]) -} - -// Verify returns true if mac is a valid authenticator for m with the given key. -func Verify(mac *[16]byte, m []byte, key *[32]byte) bool { - var tmp [16]byte - Sum(&tmp, m, key) - return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1 -} - -// New returns a new MAC computing an authentication -// tag of all data written to it with the given key. -// This allows writing the message progressively instead -// of passing it as a single slice. Common users should use -// the Sum function instead. -// -// The key must be unique for each message, as authenticating -// two different messages with the same key allows an attacker -// to forge messages at will. -func New(key *[32]byte) *MAC { - m := &MAC{} - initialize(key, &m.macState) - return m -} - -// MAC is an io.Writer computing an authentication tag -// of the data written to it. -// -// MAC cannot be used like common hash.Hash implementations, -// because using a poly1305 key twice breaks its security. -// Therefore writing data to a running MAC after calling -// Sum or Verify causes it to panic. -type MAC struct { - mac // platform-dependent implementation - - finalized bool -} - -// Size returns the number of bytes Sum will return. -func (h *MAC) Size() int { return TagSize } - -// Write adds more data to the running message authentication code. -// It never returns an error. -// -// It must not be called after the first call of Sum or Verify. -func (h *MAC) Write(p []byte) (n int, err error) { - if h.finalized { - panic("poly1305: write to MAC after Sum or Verify") - } - return h.mac.Write(p) -} - -// Sum computes the authenticator of all data written to the -// message authentication code. -func (h *MAC) Sum(b []byte) []byte { - var mac [TagSize]byte - h.mac.Sum(&mac) - h.finalized = true - return append(b, mac[:]...) -} - -// Verify returns whether the authenticator of all data written to -// the message authentication code matches the expected value. -func (h *MAC) Verify(expected []byte) bool { - var mac [TagSize]byte - h.mac.Sum(&mac) - h.finalized = true - return subtle.ConstantTimeCompare(expected, mac[:]) == 1 -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go deleted file mode 100644 index 6d522333f..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -//go:noescape -func update(state *macState, msg []byte) - -// mac is a wrapper for macGeneric that redirects calls that would have gone to -// updateGeneric to update. -// -// Its Write and Sum methods are otherwise identical to the macGeneric ones, but -// using function pointers would carry a major performance cost. -type mac struct{ macGeneric } - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - update(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - update(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -func (h *mac) Sum(out *[16]byte) { - state := h.macState - if h.offset > 0 { - update(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s deleted file mode 100644 index 1d74f0f88..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) -TEXT ·update(SB), $0-32 - MOVQ state+0(FP), DI - MOVQ msg_base+8(FP), SI - MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 - JB bytes_between_0_and_15 - -loop: - POLY1305_ADD(SI, R8, R9, R10) - -multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop - -bytes_between_0_and_15: - TESTQ R15, R15 - JZ done - MOVQ $1, BX - XORQ CX, CX - XORQ R13, R13 - ADDQ R15, SI - -flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX - MOVB -1(SI), R13 - XORQ R13, BX - DECQ SI - DECQ R15 - JNZ flush_buffer - - ADDQ BX, R8 - ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 - JMP multiply - -done: - MOVQ R8, 0(DI) - MOVQ R9, 8(DI) - MOVQ R10, 16(DI) - RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_generic.go b/vendor/golang.org/x/crypto/poly1305/sum_generic.go deleted file mode 100644 index c942a6590..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_generic.go +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file provides the generic implementation of Sum and MAC. Other files -// might provide optimized assembly implementations of some of this code. - -package poly1305 - -import "encoding/binary" - -// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag -// for a 64 bytes message is approximately -// -// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5 -// -// for some secret r and s. It can be computed sequentially like -// -// for len(msg) > 0: -// h += read(msg, 16) -// h *= r -// h %= 2¹³⁰ - 5 -// return h + s -// -// All the complexity is about doing performant constant-time math on numbers -// larger than any available numeric type. - -func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) { - h := newMACGeneric(key) - h.Write(msg) - h.Sum(out) -} - -func newMACGeneric(key *[32]byte) macGeneric { - m := macGeneric{} - initialize(key, &m.macState) - return m -} - -// macState holds numbers in saturated 64-bit little-endian limbs. That is, -// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸. -type macState struct { - // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but - // can grow larger during and after rounds. It must, however, remain below - // 2 * (2¹³⁰ - 5). - h [3]uint64 - // r and s are the private key components. - r [2]uint64 - s [2]uint64 -} - -type macGeneric struct { - macState - - buffer [TagSize]byte - offset int -} - -// Write splits the incoming message into TagSize chunks, and passes them to -// update. It buffers incomplete chunks. -func (h *macGeneric) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - updateGeneric(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - updateGeneric(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -// Sum flushes the last incomplete chunk from the buffer, if any, and generates -// the MAC output. It does not modify its state, in order to allow for multiple -// calls to Sum, even if no Write is allowed after Sum. -func (h *macGeneric) Sum(out *[TagSize]byte) { - state := h.macState - if h.offset > 0 { - updateGeneric(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} - -// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It -// clears some bits of the secret coefficient to make it possible to implement -// multiplication more efficiently. -const ( - rMask0 = 0x0FFFFFFC0FFFFFFF - rMask1 = 0x0FFFFFFC0FFFFFFC -) - -// initialize loads the 256-bit key into the two 128-bit secret values r and s. -func initialize(key *[32]byte, m *macState) { - m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0 - m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1 - m.s[0] = binary.LittleEndian.Uint64(key[16:24]) - m.s[1] = binary.LittleEndian.Uint64(key[24:32]) -} - -// uint128 holds a 128-bit number as two 64-bit limbs, for use with the -// bits.Mul64 and bits.Add64 intrinsics. -type uint128 struct { - lo, hi uint64 -} - -func mul64(a, b uint64) uint128 { - hi, lo := bitsMul64(a, b) - return uint128{lo, hi} -} - -func add128(a, b uint128) uint128 { - lo, c := bitsAdd64(a.lo, b.lo, 0) - hi, c := bitsAdd64(a.hi, b.hi, c) - if c != 0 { - panic("poly1305: unexpected overflow") - } - return uint128{lo, hi} -} - -func shiftRightBy2(a uint128) uint128 { - a.lo = a.lo>>2 | (a.hi&3)<<62 - a.hi = a.hi >> 2 - return a -} - -// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of -// 128 bits of message, it computes -// -// h₊ = (h + m) * r mod 2¹³⁰ - 5 -// -// If the msg length is not a multiple of TagSize, it assumes the last -// incomplete chunk is the final one. -func updateGeneric(state *macState, msg []byte) { - h0, h1, h2 := state.h[0], state.h[1], state.h[2] - r0, r1 := state.r[0], state.r[1] - - for len(msg) > 0 { - var c uint64 - - // For the first step, h + m, we use a chain of bits.Add64 intrinsics. - // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially - // reduced at the end of the multiplication below. - // - // The spec requires us to set a bit just above the message size, not to - // hide leading zeroes. For full chunks, that's 1 << 128, so we can just - // add 1 to the most significant (2¹²⁸) limb, h2. - if len(msg) >= TagSize { - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c) - h2 += c + 1 - - msg = msg[TagSize:] - } else { - var buf [TagSize]byte - copy(buf[:], msg) - buf[len(msg)] = 1 - - h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0) - h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c) - h2 += c - - msg = nil - } - - // Multiplication of big number limbs is similar to elementary school - // columnar multiplication. Instead of digits, there are 64-bit limbs. - // - // We are multiplying a 3 limbs number, h, by a 2 limbs number, r. - // - // h2 h1 h0 x - // r1 r0 = - // ---------------- - // h2r0 h1r0 h0r0 <-- individual 128-bit products - // + h2r1 h1r1 h0r1 - // ------------------------ - // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs - // ------------------------ - // m3.hi m2.hi m1.hi m0.hi <-- carry propagation - // + m3.lo m2.lo m1.lo m0.lo - // ------------------------------- - // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs - // - // The main difference from pen-and-paper multiplication is that we do - // carry propagation in a separate step, as if we wrote two digit sums - // at first (the 128-bit limbs), and then carried the tens all at once. - - h0r0 := mul64(h0, r0) - h1r0 := mul64(h1, r0) - h2r0 := mul64(h2, r0) - h0r1 := mul64(h0, r1) - h1r1 := mul64(h1, r1) - h2r1 := mul64(h2, r1) - - // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their - // top 4 bits cleared by rMask{0,1}, we know that their product is not going - // to overflow 64 bits, so we can ignore the high part of the products. - // - // This also means that the product doesn't have a fifth limb (t4). - if h2r0.hi != 0 { - panic("poly1305: unexpected overflow") - } - if h2r1.hi != 0 { - panic("poly1305: unexpected overflow") - } - - m0 := h0r0 - m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again - m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1. - m3 := h2r1 - - t0 := m0.lo - t1, c := bitsAdd64(m1.lo, m0.hi, 0) - t2, c := bitsAdd64(m2.lo, m1.hi, c) - t3, _ := bitsAdd64(m3.lo, m2.hi, c) - - // Now we have the result as 4 64-bit limbs, and we need to reduce it - // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do - // a cheap partial reduction according to the reduction identity - // - // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5 - // - // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is - // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the - // assumptions we make about h in the rest of the code. - // - // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23 - - // We split the final result at the 2¹³⁰ mark into h and cc, the carry. - // Note that the carry bits are effectively shifted left by 2, in other - // words, cc = c * 4 for the c in the reduction identity. - h0, h1, h2 = t0, t1, t2&maskLow2Bits - cc := uint128{t2 & maskNotLow2Bits, t3} - - // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c. - - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) - h2 += c - - cc = shiftRightBy2(cc) - - h0, c = bitsAdd64(h0, cc.lo, 0) - h1, c = bitsAdd64(h1, cc.hi, c) - h2 += c - - // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most - // - // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1 - } - - state.h[0], state.h[1], state.h[2] = h0, h1, h2 -} - -const ( - maskLow2Bits uint64 = 0x0000000000000003 - maskNotLow2Bits uint64 = ^maskLow2Bits -) - -// select64 returns x if v == 1 and y if v == 0, in constant time. -func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y } - -// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order. -const ( - p0 = 0xFFFFFFFFFFFFFFFB - p1 = 0xFFFFFFFFFFFFFFFF - p2 = 0x0000000000000003 -) - -// finalize completes the modular reduction of h and computes -// -// out = h + s mod 2¹²⁸ -// -func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) { - h0, h1, h2 := h[0], h[1], h[2] - - // After the partial reduction in updateGeneric, h might be more than - // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction - // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the - // result if the subtraction underflows, and t otherwise. - - hMinusP0, b := bitsSub64(h0, p0, 0) - hMinusP1, b := bitsSub64(h1, p1, b) - _, b = bitsSub64(h2, p2, b) - - // h = h if h < p else h - p - h0 = select64(b, h0, hMinusP0) - h1 = select64(b, h1, hMinusP1) - - // Finally, we compute the last Poly1305 step - // - // tag = h + s mod 2¹²⁸ - // - // by just doing a wide addition with the 128 low bits of h and discarding - // the overflow. - h0, c := bitsAdd64(h0, s[0], 0) - h1, _ = bitsAdd64(h1, s[1], c) - - binary.LittleEndian.PutUint64(out[0:8], h0) - binary.LittleEndian.PutUint64(out[8:16], h1) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go deleted file mode 100644 index 4a069941a..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -//go:noescape -func update(state *macState, msg []byte) - -// mac is a wrapper for macGeneric that redirects calls that would have gone to -// updateGeneric to update. -// -// Its Write and Sum methods are otherwise identical to the macGeneric ones, but -// using function pointers would carry a major performance cost. -type mac struct{ macGeneric } - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < TagSize { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - update(&h.macState, h.buffer[:]) - } - if n := len(p) - (len(p) % TagSize); n > 0 { - update(&h.macState, p[:n]) - p = p[n:] - } - if len(p) > 0 { - h.offset += copy(h.buffer[h.offset:], p) - } - return nn, nil -} - -func (h *mac) Sum(out *[16]byte) { - state := h.macState - if h.offset > 0 { - update(&state, h.buffer[:h.offset]) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s deleted file mode 100644 index 58422aad2..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -// This was ported from the amd64 implementation. - -#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \ - MOVD (msg), t0; \ - MOVD 8(msg), t1; \ - MOVD $1, t2; \ - ADDC t0, h0, h0; \ - ADDE t1, h1, h1; \ - ADDE t2, h2; \ - ADD $16, msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \ - MULLD r0, h0, t0; \ - MULLD r0, h1, t4; \ - MULHDU r0, h0, t1; \ - MULHDU r0, h1, t5; \ - ADDC t4, t1, t1; \ - MULLD r0, h2, t2; \ - ADDZE t5; \ - MULHDU r1, h0, t4; \ - MULLD r1, h0, h0; \ - ADD t5, t2, t2; \ - ADDC h0, t1, t1; \ - MULLD h2, r1, t3; \ - ADDZE t4, h0; \ - MULHDU r1, h1, t5; \ - MULLD r1, h1, t4; \ - ADDC t4, t2, t2; \ - ADDE t5, t3, t3; \ - ADDC h0, t2, t2; \ - MOVD $-4, t4; \ - MOVD t0, h0; \ - MOVD t1, h1; \ - ADDZE t3; \ - ANDCC $3, t2, h2; \ - AND t2, t4, t0; \ - ADDC t0, h0, h0; \ - ADDE t3, h1, h1; \ - SLD $62, t3, t4; \ - SRD $2, t2; \ - ADDZE h2; \ - OR t4, t2, t2; \ - SRD $2, t3; \ - ADDC t2, h0, h0; \ - ADDE t3, h1, h1; \ - ADDZE h2 - -DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -GLOBL ·poly1305Mask<>(SB), RODATA, $16 - -// func update(state *[7]uint64, msg []byte) -TEXT ·update(SB), $0-32 - MOVD state+0(FP), R3 - MOVD msg_base+8(FP), R4 - MOVD msg_len+16(FP), R5 - - MOVD 0(R3), R8 // h0 - MOVD 8(R3), R9 // h1 - MOVD 16(R3), R10 // h2 - MOVD 24(R3), R11 // r0 - MOVD 32(R3), R12 // r1 - - CMP R5, $16 - BLT bytes_between_0_and_15 - -loop: - POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22) - -multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21) - ADD $-16, R5 - CMP R5, $16 - BGE loop - -bytes_between_0_and_15: - CMP R5, $0 - BEQ done - MOVD $0, R16 // h0 - MOVD $0, R17 // h1 - -flush_buffer: - CMP R5, $8 - BLE just1 - - MOVD $8, R21 - SUB R21, R5, R21 - - // Greater than 8 -- load the rightmost remaining bytes in msg - // and put into R17 (h1) - MOVD (R4)(R21), R17 - MOVD $16, R22 - - // Find the offset to those bytes - SUB R5, R22, R22 - SLD $3, R22 - - // Shift to get only the bytes in msg - SRD R22, R17, R17 - - // Put 1 at high end - MOVD $1, R23 - SLD $3, R21 - SLD R21, R23, R23 - OR R23, R17, R17 - - // Remainder is 8 - MOVD $8, R5 - -just1: - CMP R5, $8 - BLT less8 - - // Exactly 8 - MOVD (R4), R16 - - CMP R17, $0 - - // Check if we've already set R17; if not - // set 1 to indicate end of msg. - BNE carry - MOVD $1, R17 - BR carry - -less8: - MOVD $0, R16 // h0 - MOVD $0, R22 // shift count - CMP R5, $4 - BLT less4 - MOVWZ (R4), R16 - ADD $4, R4 - ADD $-4, R5 - MOVD $32, R22 - -less4: - CMP R5, $2 - BLT less2 - MOVHZ (R4), R21 - SLD R22, R21, R21 - OR R16, R21, R16 - ADD $16, R22 - ADD $-2, R5 - ADD $2, R4 - -less2: - CMP R5, $0 - BEQ insert1 - MOVBZ (R4), R21 - SLD R22, R21, R21 - OR R16, R21, R16 - ADD $8, R22 - -insert1: - // Insert 1 at end of msg - MOVD $1, R21 - SLD R22, R21, R21 - OR R16, R21, R16 - -carry: - // Add new values to h0, h1, h2 - ADDC R16, R8 - ADDE R17, R9 - ADDZE R10, R10 - MOVD $16, R5 - ADD R5, R4 - BR multiply - -done: - // Save h0, h1, h2 in state - MOVD R8, 0(R3) - MOVD R9, 8(R3) - MOVD R10, 16(R3) - RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go deleted file mode 100644 index 62cc9f847..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -package poly1305 - -import ( - "golang.org/x/sys/cpu" -) - -// updateVX is an assembly implementation of Poly1305 that uses vector -// instructions. It must only be called if the vector facility (vx) is -// available. -//go:noescape -func updateVX(state *macState, msg []byte) - -// mac is a replacement for macGeneric that uses a larger buffer and redirects -// calls that would have gone to updateGeneric to updateVX if the vector -// facility is installed. -// -// A larger buffer is required for good performance because the vector -// implementation has a higher fixed cost per call than the generic -// implementation. -type mac struct { - macState - - buffer [16 * TagSize]byte // size must be a multiple of block size (16) - offset int -} - -func (h *mac) Write(p []byte) (int, error) { - nn := len(p) - if h.offset > 0 { - n := copy(h.buffer[h.offset:], p) - if h.offset+n < len(h.buffer) { - h.offset += n - return nn, nil - } - p = p[n:] - h.offset = 0 - if cpu.S390X.HasVX { - updateVX(&h.macState, h.buffer[:]) - } else { - updateGeneric(&h.macState, h.buffer[:]) - } - } - - tail := len(p) % len(h.buffer) // number of bytes to copy into buffer - body := len(p) - tail // number of bytes to process now - if body > 0 { - if cpu.S390X.HasVX { - updateVX(&h.macState, p[:body]) - } else { - updateGeneric(&h.macState, p[:body]) - } - } - h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0 - return nn, nil -} - -func (h *mac) Sum(out *[TagSize]byte) { - state := h.macState - remainder := h.buffer[:h.offset] - - // Use the generic implementation if we have 2 or fewer blocks left - // to sum. The vector implementation has a higher startup time. - if cpu.S390X.HasVX && len(remainder) > 2*TagSize { - updateVX(&state, remainder) - } else if len(remainder) > 0 { - updateGeneric(&state, remainder) - } - finalize(out, &state.h, &state.s) -} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s deleted file mode 100644 index 69c64f842..000000000 --- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s +++ /dev/null @@ -1,504 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build gc && !purego -// +build gc,!purego - -#include "textflag.h" - -// This implementation of Poly1305 uses the vector facility (vx) -// to process up to 2 blocks (32 bytes) per iteration using an -// algorithm based on the one described in: -// -// NEON crypto, Daniel J. Bernstein & Peter Schwabe -// https://cryptojedi.org/papers/neoncrypto-20120320.pdf -// -// This algorithm uses 5 26-bit limbs to represent a 130-bit -// value. These limbs are, for the most part, zero extended and -// placed into 64-bit vector register elements. Each vector -// register is 128-bits wide and so holds 2 of these elements. -// Using 26-bit limbs allows us plenty of headroom to accomodate -// accumulations before and after multiplication without -// overflowing either 32-bits (before multiplication) or 64-bits -// (after multiplication). -// -// In order to parallelise the operations required to calculate -// the sum we use two separate accumulators and then sum those -// in an extra final step. For compatibility with the generic -// implementation we perform this summation at the end of every -// updateVX call. -// -// To use two accumulators we must multiply the message blocks -// by r² rather than r. Only the final message block should be -// multiplied by r. -// -// Example: -// -// We want to calculate the sum (h) for a 64 byte message (m): -// -// h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r -// -// To do this we split the calculation into the even indices -// and odd indices of the message. These form our SIMD 'lanes': -// -// h = m[ 0:16]r⁴ + m[32:48]r² + <- lane 0 -// m[16:32]r³ + m[48:64]r <- lane 1 -// -// To calculate this iteratively we refactor so that both lanes -// are written in terms of r² and r: -// -// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0 -// (m[16:32]r² + m[48:64])r <- lane 1 -// ^ ^ -// | coefficients for second iteration -// coefficients for first iteration -// -// So in this case we would have two iterations. In the first -// both lanes are multiplied by r². In the second only the -// first lane is multiplied by r² and the second lane is -// instead multiplied by r. This gives use the odd and even -// powers of r that we need from the original equation. -// -// Notation: -// -// h - accumulator -// r - key -// m - message -// -// [a, b] - SIMD register holding two 64-bit values -// [a, b, c, d] - SIMD register holding four 32-bit values -// xᵢ[n] - limb n of variable x with bit width i -// -// Limbs are expressed in little endian order, so for 26-bit -// limbs x₂₆[4] will be the most significant limb and x₂₆[0] -// will be the least significant limb. - -// masking constants -#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits -#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits - -// expansion constants (see EXPAND macro) -#define EX0 V2 -#define EX1 V3 -#define EX2 V4 - -// key (r², r or 1 depending on context) -#define R_0 V5 -#define R_1 V6 -#define R_2 V7 -#define R_3 V8 -#define R_4 V9 - -// precalculated coefficients (5r², 5r or 0 depending on context) -#define R5_1 V10 -#define R5_2 V11 -#define R5_3 V12 -#define R5_4 V13 - -// message block (m) -#define M_0 V14 -#define M_1 V15 -#define M_2 V16 -#define M_3 V17 -#define M_4 V18 - -// accumulator (h) -#define H_0 V19 -#define H_1 V20 -#define H_2 V21 -#define H_3 V22 -#define H_4 V23 - -// temporary registers (for short-lived values) -#define T_0 V24 -#define T_1 V25 -#define T_2 V26 -#define T_3 V27 -#define T_4 V28 - -GLOBL ·constants<>(SB), RODATA, $0x30 -// EX0 -DATA ·constants<>+0x00(SB)/8, $0x0006050403020100 -DATA ·constants<>+0x08(SB)/8, $0x1016151413121110 -// EX1 -DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706 -DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716 -// EX2 -DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d -DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d - -// MULTIPLY multiplies each lane of f and g, partially reduced -// modulo 2¹³⁰ - 5. The result, h, consists of partial products -// in each lane that need to be reduced further to produce the -// final result. -// -// h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰ -// -// Note that the multiplication by 5 of the high bits is -// achieved by precalculating the multiplication of four of the -// g coefficients by 5. These are g51-g54. -#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \ - VMLOF f0, g0, h0 \ - VMLOF f0, g3, h3 \ - VMLOF f0, g1, h1 \ - VMLOF f0, g4, h4 \ - VMLOF f0, g2, h2 \ - VMLOF f1, g54, T_0 \ - VMLOF f1, g2, T_3 \ - VMLOF f1, g0, T_1 \ - VMLOF f1, g3, T_4 \ - VMLOF f1, g1, T_2 \ - VMALOF f2, g53, h0, h0 \ - VMALOF f2, g1, h3, h3 \ - VMALOF f2, g54, h1, h1 \ - VMALOF f2, g2, h4, h4 \ - VMALOF f2, g0, h2, h2 \ - VMALOF f3, g52, T_0, T_0 \ - VMALOF f3, g0, T_3, T_3 \ - VMALOF f3, g53, T_1, T_1 \ - VMALOF f3, g1, T_4, T_4 \ - VMALOF f3, g54, T_2, T_2 \ - VMALOF f4, g51, h0, h0 \ - VMALOF f4, g54, h3, h3 \ - VMALOF f4, g52, h1, h1 \ - VMALOF f4, g0, h4, h4 \ - VMALOF f4, g53, h2, h2 \ - VAG T_0, h0, h0 \ - VAG T_3, h3, h3 \ - VAG T_1, h1, h1 \ - VAG T_4, h4, h4 \ - VAG T_2, h2, h2 - -// REDUCE performs the following carry operations in four -// stages, as specified in Bernstein & Schwabe: -// -// 1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4] -// 2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0] -// 3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3] -// 4: h₂₆[3]->h₂₆[4] -// -// The result is that all of the limbs are limited to 26-bits -// except for h₂₆[1] and h₂₆[4] which are limited to 27-bits. -// -// Note that although each limb is aligned at 26-bit intervals -// they may contain values that exceed 2²⁶ - 1, hence the need -// to carry the excess bits in each limb. -#define REDUCE(h0, h1, h2, h3, h4) \ - VESRLG $26, h0, T_0 \ - VESRLG $26, h3, T_1 \ - VN MOD26, h0, h0 \ - VN MOD26, h3, h3 \ - VAG T_0, h1, h1 \ - VAG T_1, h4, h4 \ - VESRLG $26, h1, T_2 \ - VESRLG $26, h4, T_3 \ - VN MOD26, h1, h1 \ - VN MOD26, h4, h4 \ - VESLG $2, T_3, T_4 \ - VAG T_3, T_4, T_4 \ - VAG T_2, h2, h2 \ - VAG T_4, h0, h0 \ - VESRLG $26, h2, T_0 \ - VESRLG $26, h0, T_1 \ - VN MOD26, h2, h2 \ - VN MOD26, h0, h0 \ - VAG T_0, h3, h3 \ - VAG T_1, h1, h1 \ - VESRLG $26, h3, T_2 \ - VN MOD26, h3, h3 \ - VAG T_2, h4, h4 - -// EXPAND splits the 128-bit little-endian values in0 and in1 -// into 26-bit big-endian limbs and places the results into -// the first and second lane of d₂₆[0:4] respectively. -// -// The EX0, EX1 and EX2 constants are arrays of byte indices -// for permutation. The permutation both reverses the bytes -// in the input and ensures the bytes are copied into the -// destination limb ready to be shifted into their final -// position. -#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \ - VPERM in0, in1, EX0, d0 \ - VPERM in0, in1, EX1, d2 \ - VPERM in0, in1, EX2, d4 \ - VESRLG $26, d0, d1 \ - VESRLG $30, d2, d3 \ - VESRLG $4, d2, d2 \ - VN MOD26, d0, d0 \ // [in0₂₆[0], in1₂₆[0]] - VN MOD26, d3, d3 \ // [in0₂₆[3], in1₂₆[3]] - VN MOD26, d1, d1 \ // [in0₂₆[1], in1₂₆[1]] - VN MOD24, d4, d4 \ // [in0₂₆[4], in1₂₆[4]] - VN MOD26, d2, d2 // [in0₂₆[2], in1₂₆[2]] - -// func updateVX(state *macState, msg []byte) -TEXT ·updateVX(SB), NOSPLIT, $0 - MOVD state+0(FP), R1 - LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len - - // load EX0, EX1 and EX2 - MOVD $·constants<>(SB), R5 - VLM (R5), EX0, EX2 - - // generate masks - VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff] - VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff] - - // load h (accumulator) and r (key) from state - VZERO T_1 // [0, 0] - VL 0(R1), T_0 // [h₆₄[0], h₆₄[1]] - VLEG $0, 16(R1), T_1 // [h₆₄[2], 0] - VL 24(R1), T_2 // [r₆₄[0], r₆₄[1]] - VPDI $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]] - VPDI $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]] - - // unpack h and r into 26-bit limbs - // note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value - VN MOD26, T_3, H_0 // [h₂₆[0], r₂₆[0]] - VZERO H_1 // [0, 0] - VZERO H_3 // [0, 0] - VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out - VESLG $24, T_1, T_1 // [h₆₄[2]<<24, 0] - VERIMG $-26&63, T_3, MOD26, H_1 // [h₂₆[1], r₂₆[1]] - VESRLG $+52&63, T_3, H_2 // [h₂₆[2], r₂₆[2]] - low 12 bits only - VERIMG $-14&63, T_4, MOD26, H_3 // [h₂₆[1], r₂₆[1]] - VESRLG $40, T_4, H_4 // [h₂₆[4], r₂₆[4]] - low 24 bits only - VERIMG $+12&63, T_4, T_0, H_2 // [h₂₆[2], r₂₆[2]] - complete - VO T_1, H_4, H_4 // [h₂₆[4], r₂₆[4]] - complete - - // replicate r across all 4 vector elements - VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]] - VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]] - VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]] - VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]] - VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]] - - // zero out lane 1 of h - VLEIG $1, $0, H_0 // [h₂₆[0], 0] - VLEIG $1, $0, H_1 // [h₂₆[1], 0] - VLEIG $1, $0, H_2 // [h₂₆[2], 0] - VLEIG $1, $0, H_3 // [h₂₆[3], 0] - VLEIG $1, $0, H_4 // [h₂₆[4], 0] - - // calculate 5r (ignore least significant limb) - VREPIF $5, T_0 - VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]] - VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]] - VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]] - VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]] - - // skip r² calculation if we are only calculating one block - CMPBLE R3, $16, skip - - // calculate r² - MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4) - REDUCE(M_0, M_1, M_2, M_3, M_4) - VGBM $0x0f0f, T_0 - VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]] - VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]] - VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]] - VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]] - VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]] - - // calculate 5r² (ignore least significant limb) - VREPIF $5, T_0 - VMLF T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]] - VMLF T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]] - VMLF T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]] - VMLF T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]] - -loop: - CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients - - // load next 2 blocks from message - VLM (R2), T_0, T_1 - - // update message slice - SUB $32, R3 - MOVD $32(R2), R2 - - // unpack message blocks into 26-bit big-endian limbs - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // add 2¹²⁸ to each message block value - VLEIB $4, $1, M_4 - VLEIB $12, $1, M_4 - -multiply: - // accumulate the incoming message - VAG H_0, M_0, M_0 - VAG H_3, M_3, M_3 - VAG H_1, M_1, M_1 - VAG H_4, M_4, M_4 - VAG H_2, M_2, M_2 - - // multiply the accumulator by the key coefficient - MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4) - - // carry and partially reduce the partial products - REDUCE(H_0, H_1, H_2, H_3, H_4) - - CMPBNE R3, $0, loop - -finish: - // sum lane 0 and lane 1 and put the result in lane 1 - VZERO T_0 - VSUMQG H_0, T_0, H_0 - VSUMQG H_3, T_0, H_3 - VSUMQG H_1, T_0, H_1 - VSUMQG H_4, T_0, H_4 - VSUMQG H_2, T_0, H_2 - - // reduce again after summation - // TODO(mundaym): there might be a more efficient way to do this - // now that we only have 1 active lane. For example, we could - // simultaneously pack the values as we reduce them. - REDUCE(H_0, H_1, H_2, H_3, H_4) - - // carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1 - // TODO(mundaym): in testing this final carry was unnecessary. - // Needs a proof before it can be removed though. - VESRLG $26, H_1, T_1 - VN MOD26, H_1, H_1 - VAQ T_1, H_2, H_2 - VESRLG $26, H_2, T_2 - VN MOD26, H_2, H_2 - VAQ T_2, H_3, H_3 - VESRLG $26, H_3, T_3 - VN MOD26, H_3, H_3 - VAQ T_3, H_4, H_4 - - // h is now < 2(2¹³⁰ - 5) - // Pack each lane in h₂₆[0:4] into h₁₂₈[0:1]. - VESLG $26, H_1, H_1 - VESLG $26, H_3, H_3 - VO H_0, H_1, H_0 - VO H_2, H_3, H_2 - VESLG $4, H_2, H_2 - VLEIB $7, $48, H_1 - VSLB H_1, H_2, H_2 - VO H_0, H_2, H_0 - VLEIB $7, $104, H_1 - VSLB H_1, H_4, H_3 - VO H_3, H_0, H_0 - VLEIB $7, $24, H_1 - VSRLB H_1, H_4, H_1 - - // update state - VSTEG $1, H_0, 0(R1) - VSTEG $0, H_0, 8(R1) - VSTEG $1, H_1, 16(R1) - RET - -b2: // 2 or fewer blocks remaining - CMPBLE R3, $16, b1 - - // Load the 2 remaining blocks (17-32 bytes remaining). - MOVD $-17(R3), R0 // index of final byte to load modulo 16 - VL (R2), T_0 // load full 16 byte block - VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes - - // The Poly1305 algorithm requires that a 1 bit be appended to - // each message block. If the final block is less than 16 bytes - // long then it is easiest to insert the 1 before the message - // block is split into 26-bit limbs. If, on the other hand, the - // final message block is 16 bytes long then we append the 1 bit - // after expansion as normal. - MOVBZ $1, R0 - MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16) - CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long - VLVGB R3, R0, T_1 // insert 1 into the byte at index R3 - - // Split both blocks into 26-bit limbs in the appropriate lanes. - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // Append a 1 byte to the end of the second to last block. - VLEIB $4, $1, M_4 - - // Append a 1 byte to the end of the last block only if it is a - // full 16 byte block. - CMPBNE R3, $16, 2(PC) - VLEIB $12, $1, M_4 - - // Finally, set up the coefficients for the final multiplication. - // We have previously saved r and 5r in the 32-bit even indexes - // of the R_[0-4] and R5_[1-4] coefficient registers. - // - // We want lane 0 to be multiplied by r² so that can be kept the - // same. We want lane 1 to be multiplied by r so we need to move - // the saved r value into the 32-bit odd index in lane 1 by - // rotating the 64-bit lane by 32. - VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only - VERIMG $32, R_0, T_0, R_0 // [_, r²₂₆[0], _, r₂₆[0]] - VERIMG $32, R_1, T_0, R_1 // [_, r²₂₆[1], _, r₂₆[1]] - VERIMG $32, R_2, T_0, R_2 // [_, r²₂₆[2], _, r₂₆[2]] - VERIMG $32, R_3, T_0, R_3 // [_, r²₂₆[3], _, r₂₆[3]] - VERIMG $32, R_4, T_0, R_4 // [_, r²₂₆[4], _, r₂₆[4]] - VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]] - VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]] - VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]] - VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]] - - MOVD $0, R3 - BR multiply - -skip: - CMPBEQ R3, $0, finish - -b1: // 1 block remaining - - // Load the final block (1-16 bytes). This will be placed into - // lane 0. - MOVD $-1(R3), R0 - VLL R0, (R2), T_0 // pad to 16 bytes with zeros - - // The Poly1305 algorithm requires that a 1 bit be appended to - // each message block. If the final block is less than 16 bytes - // long then it is easiest to insert the 1 before the message - // block is split into 26-bit limbs. If, on the other hand, the - // final message block is 16 bytes long then we append the 1 bit - // after expansion as normal. - MOVBZ $1, R0 - CMPBEQ R3, $16, 2(PC) - VLVGB R3, R0, T_0 - - // Set the message block in lane 1 to the value 0 so that it - // can be accumulated without affecting the final result. - VZERO T_1 - - // Split the final message block into 26-bit limbs in lane 0. - // Lane 1 will be contain 0. - EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4) - - // Append a 1 byte to the end of the last block only if it is a - // full 16 byte block. - CMPBNE R3, $16, 2(PC) - VLEIB $4, $1, M_4 - - // We have previously saved r and 5r in the 32-bit even indexes - // of the R_[0-4] and R5_[1-4] coefficient registers. - // - // We want lane 0 to be multiplied by r so we need to move the - // saved r value into the 32-bit odd index in lane 0. We want - // lane 1 to be set to the value 1. This makes multiplication - // a no-op. We do this by setting lane 1 in every register to 0 - // and then just setting the 32-bit index 3 in R_0 to 1. - VZERO T_0 - MOVD $0, R0 - MOVD $0x10111213, R12 - VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000] - VPERM T_0, R_0, T_1, R_0 // [_, r₂₆[0], _, 0] - VPERM T_0, R_1, T_1, R_1 // [_, r₂₆[1], _, 0] - VPERM T_0, R_2, T_1, R_2 // [_, r₂₆[2], _, 0] - VPERM T_0, R_3, T_1, R_3 // [_, r₂₆[3], _, 0] - VPERM T_0, R_4, T_1, R_4 // [_, r₂₆[4], _, 0] - VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0] - VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0] - VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0] - VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0] - - // Set the value of lane 1 to be 1. - VLEIF $3, $1, R_0 // [_, r₂₆[0], _, 1] - - MOVD $0, R3 - BR multiply -- cgit v1.2.3-54-g00ecf