summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
authorDaniel J Walsh <dwalsh@redhat.com>2022-07-11 10:03:44 -0400
committerDaniel J Walsh <dwalsh@redhat.com>2022-07-18 10:42:04 -0400
commitf67ab1eb20ae357fd004815ec25c5350e5813a46 (patch)
treee25b2cf83e53263f9f7967e5ba5d3a20de4da7e0 /vendor/github.com/klauspost
parent5f848d89edef76adff6d203859803be9b791d258 (diff)
downloadpodman-f67ab1eb20ae357fd004815ec25c5350e5813a46.tar.gz
podman-f67ab1eb20ae357fd004815ec25c5350e5813a46.tar.bz2
podman-f67ab1eb20ae357fd004815ec25c5350e5813a46.zip
Vendor in containers/(storage,image, common, buildah)
Signed-off-by: Daniel J Walsh <dwalsh@redhat.com>
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/README.md21
-rw-r--r--vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go68
-rw-r--r--vendor/github.com/klauspost/compress/flate/huffman_code.go74
-rw-r--r--vendor/github.com/klauspost/compress/flate/stateless.go4
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.go10
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s686
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder.go63
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go64
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s127
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go72
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s1999
11 files changed, 1867 insertions, 1321 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 5c3c2a258..c7cf1a20c 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,27 @@ This package provides various compression algorithms.
# changelog
+* June 29, 2022 (v1.15.7)
+
+ * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
+ * zip: Merge upstream https://github.com/klauspost/compress/pull/631
+ * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
+ * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
+ * flate: Faster histograms https://github.com/klauspost/compress/pull/620
+ * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622
+
+* June 3, 2022 (v1.15.6)
+ * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
+ * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
+ * zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
+ * zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
+ * zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
+ * gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
+ * s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
+ * s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
+ * snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
+ * s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
+
* May 25, 2022 (v1.15.5)
* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 25f6d1108..40ef45c2f 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -169,7 +169,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
b := w.offsetEncoding.codes
b = b[:len(a)]
for i, v := range a {
- if v != 0 && b[i].len == 0 {
+ if v != 0 && b[i].zero() {
return false
}
}
@@ -178,7 +178,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
b = w.literalEncoding.codes[256:literalCount]
b = b[:len(a)]
for i, v := range a {
- if v != 0 && b[i].len == 0 {
+ if v != 0 && b[i].zero() {
return false
}
}
@@ -186,7 +186,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
a = t.litHist[:256]
b = w.literalEncoding.codes[:len(a)]
for i, v := range a {
- if v != 0 && b[i].len == 0 {
+ if v != 0 && b[i].zero() {
return false
}
}
@@ -280,12 +280,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
// Copy the concatenated code sizes to codegen. Put a marker at the end.
cgnl := codegen[:numLiterals]
for i := range cgnl {
- cgnl[i] = uint8(litEnc.codes[i].len)
+ cgnl[i] = litEnc.codes[i].len()
}
cgnl = codegen[numLiterals : numLiterals+numOffsets]
for i := range cgnl {
- cgnl[i] = uint8(offEnc.codes[i].len)
+ cgnl[i] = offEnc.codes[i].len()
}
codegen[numLiterals+numOffsets] = badCode
@@ -428,8 +428,8 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
func (w *huffmanBitWriter) writeCode(c hcode) {
// The function does not get inlined if we "& 63" the shift.
- w.bits |= uint64(c.code) << (w.nbits & 63)
- w.nbits += c.len
+ w.bits |= c.code64() << (w.nbits & 63)
+ w.nbits += c.len()
if w.nbits >= 48 {
w.writeOutBits()
}
@@ -477,7 +477,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n
w.writeBits(int32(numCodegens-4), 4)
for i := 0; i < numCodegens; i++ {
- value := uint(w.codegenEncoding.codes[codegenOrder[i]].len)
+ value := uint(w.codegenEncoding.codes[codegenOrder[i]].len())
w.writeBits(int32(value), 3)
}
@@ -670,7 +670,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
// Estimate size for using a new table.
// Use the previous header size as the best estimate.
newSize := w.lastHeader + tokens.EstimatedBits()
- newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
+ newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty
// The estimated size is calculated as an optimal table.
// We add a penalty to make it more realistic and re-use a bit more.
@@ -854,8 +854,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
if t < 256 {
//w.writeCode(lits[t.literal()])
c := lits[t]
- bits |= uint64(c.code) << (nbits & 63)
- nbits += c.len
+ bits |= c.code64() << (nbits & 63)
+ nbits += c.len()
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -882,8 +882,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
} else {
// inlined
c := lengths[lengthCode]
- bits |= uint64(c.code) << (nbits & 63)
- nbits += c.len
+ bits |= c.code64() << (nbits & 63)
+ nbits += c.len()
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -931,8 +931,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
} else {
// inlined
c := offs[offsetCode]
- bits |= uint64(c.code) << (nbits & 63)
- nbits += c.len
+ bits |= c.code64() << (nbits & 63)
+ nbits += c.len()
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -1009,8 +1009,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
}
}
- // Fill is rarely better...
- const fill = false
const numLiterals = endBlockMarker + 1
const numOffsets = 1
@@ -1019,7 +1017,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
// Assume header is around 70 bytes:
// https://stackoverflow.com/a/25454430
const guessHeaderSizeBits = 70 * 8
- histogram(input, w.literalFreq[:numLiterals], fill)
+ histogram(input, w.literalFreq[:numLiterals])
ssize, storable := w.storedSize(input)
if storable && len(input) > 1024 {
// Quick check for incompressible content.
@@ -1045,19 +1043,14 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
}
w.literalFreq[endBlockMarker] = 1
w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
- if fill {
- // Clear fill...
- for i := range w.literalFreq[:numLiterals] {
- w.literalFreq[i] = 0
- }
- histogram(input, w.literalFreq[:numLiterals], false)
- }
estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
- estBits += w.lastHeader
- if w.lastHeader == 0 {
- estBits += guessHeaderSizeBits
+ if estBits < math.MaxInt32 {
+ estBits += w.lastHeader
+ if w.lastHeader == 0 {
+ estBits += guessHeaderSizeBits
+ }
+ estBits += estBits >> w.logNewTablePenalty
}
- estBits += estBits >> w.logNewTablePenalty
// Store bytes, if we don't get a reasonable improvement.
if storable && ssize <= estBits {
@@ -1134,12 +1127,12 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
nbytes = 0
}
a, b := encoding[input[0]], encoding[input[1]]
- bits |= uint64(a.code) << (nbits & 63)
- bits |= uint64(b.code) << ((nbits + a.len) & 63)
+ bits |= a.code64() << (nbits & 63)
+ bits |= b.code64() << ((nbits + a.len()) & 63)
c := encoding[input[2]]
- nbits += b.len + a.len
- bits |= uint64(c.code) << (nbits & 63)
- nbits += c.len
+ nbits += b.len() + a.len()
+ bits |= c.code64() << (nbits & 63)
+ nbits += c.len()
input = input[3:]
}
@@ -1165,10 +1158,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
}
// Bitwriting inlined, ~30% speedup
c := encoding[t]
- bits |= uint64(c.code) << (nbits & 63)
- nbits += c.len
+ bits |= c.code64() << (nbits & 63)
+
+ nbits += c.len()
if debugDeflate {
- count += int(c.len)
+ count += int(c.len())
}
}
// Restore...
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
index 9ab497c27..5ac144f28 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -16,9 +16,18 @@ const (
)
// hcode is a huffman code with a bit code and bit length.
-type hcode struct {
- code uint16
- len uint8
+type hcode uint32
+
+func (h hcode) len() uint8 {
+ return uint8(h)
+}
+
+func (h hcode) code64() uint64 {
+ return uint64(h >> 8)
+}
+
+func (h hcode) zero() bool {
+ return h == 0
}
type huffmanEncoder struct {
@@ -58,8 +67,11 @@ type levelInfo struct {
// set sets the code and length of an hcode.
func (h *hcode) set(code uint16, length uint8) {
- h.len = length
- h.code = code
+ *h = hcode(length) | (hcode(code) << 8)
+}
+
+func newhcode(code uint16, length uint8) hcode {
+ return hcode(length) | (hcode(code) << 8)
}
func reverseBits(number uint16, bitLength byte) uint16 {
@@ -100,7 +112,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
bits = ch + 192 - 280
size = 8
}
- codes[ch] = hcode{code: reverseBits(bits, size), len: size}
+ codes[ch] = newhcode(reverseBits(bits, size), size)
}
return h
}
@@ -109,7 +121,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
h := newHuffmanEncoder(30)
codes := h.codes
for ch := range codes {
- codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
+ codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5)
}
return h
}
@@ -121,7 +133,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
var total int
for i, f := range freq {
if f != 0 {
- total += int(f) * int(h.codes[i].len)
+ total += int(f) * int(h.codes[i].len())
}
}
return total
@@ -130,7 +142,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
var total int
for _, f := range b {
- total += int(h.codes[f].len)
+ total += int(h.codes[f].len())
}
return total
}
@@ -141,10 +153,10 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
for i, f := range freq {
if f != 0 {
code := h.codes[i]
- if code.len == 0 {
+ if code.zero() {
return math.MaxInt32
}
- total += int(f) * int(code.len)
+ total += int(f) * int(code.len())
}
}
return total
@@ -308,7 +320,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
sortByLiteral(chunk)
for _, node := range chunk {
- h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
+ h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n))
code++
}
list = list[0 : len(list)-int(bits)]
@@ -330,7 +342,7 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
list[count] = literalNode{uint16(i), f}
count++
} else {
- codes[i].len = 0
+ codes[i] = 0
}
}
list[count] = literalNode{}
@@ -364,21 +376,37 @@ func atLeastOne(v float32) float32 {
return v
}
-// Unassigned values are assigned '1' in the histogram.
-func fillHist(b []uint16) {
- for i, v := range b {
- if v == 0 {
- b[i] = 1
+func histogram(b []byte, h []uint16) {
+ if true && len(b) >= 8<<10 {
+ // Split for bigger inputs
+ histogramSplit(b, h)
+ } else {
+ h = h[:256]
+ for _, t := range b {
+ h[t]++
}
}
}
-func histogram(b []byte, h []uint16, fill bool) {
+func histogramSplit(b []byte, h []uint16) {
+ // Tested, and slightly faster than 2-way.
+ // Writing to separate arrays and combining is also slightly slower.
h = h[:256]
- for _, t := range b {
- h[t]++
+ for len(b)&3 != 0 {
+ h[b[0]]++
+ b = b[1:]
}
- if fill {
- fillHist(h)
+ n := len(b) / 4
+ x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:]
+ y, z, w = y[:len(x)], z[:len(x)], w[:len(x)]
+ for i, t := range x {
+ v0 := &h[t]
+ v1 := &h[y[i]]
+ v3 := &h[w[i]]
+ v2 := &h[z[i]]
+ *v0++
+ *v1++
+ *v2++
+ *v3++
}
}
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
index 544162a43..93a1d1503 100644
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -59,9 +59,9 @@ var bitWriterPool = sync.Pool{
},
}
-// StatelessDeflate allows to compress directly to a Writer without retaining state.
+// StatelessDeflate allows compressing directly to a Writer without retaining state.
// When returning everything will be flushed.
-// Up to 8KB of an optional dictionary can be given which is presumed to presumed to precede the block.
+// Up to 8KB of an optional dictionary can be given which is presumed to precede the block.
// Longer dictionaries will be truncated and will still produce valid output.
// Sending nil dictionary is perfectly fine.
func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index 671e630a8..9f3e9f79e 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
const fallback8BitSize = 800
type decompress4xContext struct {
- pbr0 *bitReaderShifted
- pbr1 *bitReaderShifted
- pbr2 *bitReaderShifted
- pbr3 *bitReaderShifted
+ pbr *[4]bitReaderShifted
peekBits uint8
out *byte
dstEvery int
@@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
ctx := decompress4xContext{
- pbr0: &br[0],
- pbr1: &br[1],
- pbr2: &br[2],
- pbr3: &br[3],
+ pbr: &br,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
out: &out[0],
dstEvery: dstEvery,
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 6c65c6e2b..dd1a5aecd 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -4,45 +4,40 @@
// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), AX
- MOVBQZX 32(AX), SI
- MOVQ 40(AX), DI
- MOVQ DI, BX
- MOVQ 72(AX), CX
- MOVQ CX, (SP)
- MOVQ 48(AX), R8
- MOVQ 56(AX), R9
- MOVQ (AX), R10
- MOVQ 8(AX), R11
- MOVQ 16(AX), R12
- MOVQ 24(AX), R13
+ MOVBQZX 8(AX), DI
+ MOVQ 16(AX), SI
+ MOVQ 48(AX), BX
+ MOVQ 24(AX), R9
+ MOVQ 32(AX), R10
+ MOVQ (AX), R11
// Main loop
main_loop:
- MOVQ BX, DI
- CMPQ DI, (SP)
+ MOVQ SI, R8
+ CMPQ R8, BX
SETGE DL
// br0.fillFast32()
- MOVQ 32(R10), R14
- MOVBQZX 40(R10), R15
- CMPQ R15, $0x20
+ MOVQ 32(R11), R12
+ MOVBQZX 40(R11), R13
+ CMPQ R13, $0x20
JBE skip_fill0
- MOVQ 24(R10), AX
- SUBQ $0x20, R15
+ MOVQ 24(R11), AX
+ SUBQ $0x20, R13
SUBQ $0x04, AX
- MOVQ (R10), BP
+ MOVQ (R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(BP*1), BP
- MOVQ R15, CX
- SHLQ CL, BP
- MOVQ AX, 24(R10)
- ORQ BP, R14
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 24(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br0.off < 4)
CMPQ AX, $0x04
@@ -51,57 +46,57 @@ main_loop:
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R14, BP
- MOVQ SI, CX
- SHRQ CL, BP
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
- MOVQ SI, CX
- MOVQ R14, BP
- SHRQ CL, BP
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (DI)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVQ R14, 32(R10)
- MOVB R15, 40(R10)
- ADDQ R8, DI
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8
// br1.fillFast32()
- MOVQ 32(R11), R14
- MOVBQZX 40(R11), R15
- CMPQ R15, $0x20
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
JBE skip_fill1
- MOVQ 24(R11), AX
- SUBQ $0x20, R15
+ MOVQ 72(R11), AX
+ SUBQ $0x20, R13
SUBQ $0x04, AX
- MOVQ (R11), BP
+ MOVQ 48(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(BP*1), BP
- MOVQ R15, CX
- SHLQ CL, BP
- MOVQ AX, 24(R11)
- ORQ BP, R14
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 72(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br1.off < 4)
CMPQ AX, $0x04
@@ -110,57 +105,57 @@ skip_fill0:
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R14, BP
- MOVQ SI, CX
- SHRQ CL, BP
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
- MOVQ SI, CX
- MOVQ R14, BP
- SHRQ CL, BP
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (DI)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVQ R14, 32(R11)
- MOVB R15, 40(R11)
- ADDQ R8, DI
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8
// br2.fillFast32()
- MOVQ 32(R12), R14
- MOVBQZX 40(R12), R15
- CMPQ R15, $0x20
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
JBE skip_fill2
- MOVQ 24(R12), AX
- SUBQ $0x20, R15
+ MOVQ 120(R11), AX
+ SUBQ $0x20, R13
SUBQ $0x04, AX
- MOVQ (R12), BP
+ MOVQ 96(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(BP*1), BP
- MOVQ R15, CX
- SHLQ CL, BP
- MOVQ AX, 24(R12)
- ORQ BP, R14
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 120(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br2.off < 4)
CMPQ AX, $0x04
@@ -169,57 +164,57 @@ skip_fill1:
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R14, BP
- MOVQ SI, CX
- SHRQ CL, BP
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
- MOVQ SI, CX
- MOVQ R14, BP
- SHRQ CL, BP
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (DI)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVQ R14, 32(R12)
- MOVB R15, 40(R12)
- ADDQ R8, DI
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8
// br3.fillFast32()
- MOVQ 32(R13), R14
- MOVBQZX 40(R13), R15
- CMPQ R15, $0x20
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
JBE skip_fill3
- MOVQ 24(R13), AX
- SUBQ $0x20, R15
+ MOVQ 168(R11), AX
+ SUBQ $0x20, R13
SUBQ $0x04, AX
- MOVQ (R13), BP
+ MOVQ 144(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (AX)(BP*1), BP
- MOVQ R15, CX
- SHLQ CL, BP
- MOVQ AX, 24(R13)
- ORQ BP, R14
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 168(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br3.off < 4)
CMPQ AX, $0x04
@@ -228,149 +223,142 @@ skip_fill2:
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R14, BP
- MOVQ SI, CX
- SHRQ CL, BP
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
- MOVQ SI, CX
- MOVQ R14, BP
- SHRQ CL, BP
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW (R9)(BP*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
- SHLQ CL, R14
- ADDB CL, R15
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
- MOVW AX, (DI)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVQ R14, 32(R13)
- MOVB R15, 40(R13)
- ADDQ $0x02, BX
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x02, SI
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
- MOVQ 40(AX), CX
- MOVQ BX, DX
- SUBQ CX, DX
- SHLQ $0x02, DX
- MOVQ DX, 64(AX)
+ SUBQ 16(AX), SI
+ SHLQ $0x02, SI
+ MOVQ SI, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), CX
- MOVBQZX 32(CX), BX
- MOVQ 40(CX), SI
- MOVQ SI, (SP)
- MOVQ 72(CX), DX
- MOVQ DX, 8(SP)
- MOVQ 48(CX), DI
- MOVQ 56(CX), R8
- MOVQ (CX), R9
- MOVQ 8(CX), R10
- MOVQ 16(CX), R11
- MOVQ 24(CX), R12
+ MOVBQZX 8(CX), DI
+ MOVQ 16(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 24(CX), R9
+ MOVQ 32(CX), R10
+ MOVQ (CX), R11
// Main loop
main_loop:
- MOVQ (SP), SI
- CMPQ SI, 8(SP)
+ MOVQ BX, R8
+ CMPQ R8, SI
SETGE DL
- // br1000.fillFast32()
- MOVQ 32(R9), R13
- MOVBQZX 40(R9), R14
- CMPQ R14, $0x20
- JBE skip_fill1000
- MOVQ 24(R9), R15
- SUBQ $0x20, R14
- SUBQ $0x04, R15
- MOVQ (R9), BP
+ // br0.fillFast32()
+ MOVQ 32(R11), R12
+ MOVBQZX 40(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill0
+ MOVQ 24(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ (R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R15)(BP*1), BP
- MOVQ R14, CX
- SHLQ CL, BP
- MOVQ R15, 24(R9)
- ORQ BP, R13
-
- // exhausted = exhausted || (br1000.off < 4)
- CMPQ R15, $0x04
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 24(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ R14, $0x04
SETLT AL
ORB AL, DL
-skip_fill1000:
+skip_fill0:
// val0 := br0.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v1 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v2 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val3 := br0.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v3 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@@ -378,88 +366,88 @@ skip_fill1000:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (SI)
-
- // update the bitreader reader structure
- MOVQ R13, 32(R9)
- MOVB R14, 40(R9)
- ADDQ DI, SI
-
- // br1001.fillFast32()
- MOVQ 32(R10), R13
- MOVBQZX 40(R10), R14
- CMPQ R14, $0x20
- JBE skip_fill1001
- MOVQ 24(R10), R15
- SUBQ $0x20, R14
- SUBQ $0x04, R15
- MOVQ (R10), BP
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8
+
+ // br1.fillFast32()
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill1
+ MOVQ 72(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 48(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R15)(BP*1), BP
- MOVQ R14, CX
- SHLQ CL, BP
- MOVQ R15, 24(R10)
- ORQ BP, R13
-
- // exhausted = exhausted || (br1001.off < 4)
- CMPQ R15, $0x04
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 72(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ R14, $0x04
SETLT AL
ORB AL, DL
-skip_fill1001:
+skip_fill1:
// val0 := br1.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v1 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v2 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val3 := br1.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v3 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@@ -467,88 +455,88 @@ skip_fill1001:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (SI)
-
- // update the bitreader reader structure
- MOVQ R13, 32(R10)
- MOVB R14, 40(R10)
- ADDQ DI, SI
-
- // br1002.fillFast32()
- MOVQ 32(R11), R13
- MOVBQZX 40(R11), R14
- CMPQ R14, $0x20
- JBE skip_fill1002
- MOVQ 24(R11), R15
- SUBQ $0x20, R14
- SUBQ $0x04, R15
- MOVQ (R11), BP
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8
+
+ // br2.fillFast32()
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill2
+ MOVQ 120(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 96(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R15)(BP*1), BP
- MOVQ R14, CX
- SHLQ CL, BP
- MOVQ R15, 24(R11)
- ORQ BP, R13
-
- // exhausted = exhausted || (br1002.off < 4)
- CMPQ R15, $0x04
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 120(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ R14, $0x04
SETLT AL
ORB AL, DL
-skip_fill1002:
+skip_fill2:
// val0 := br2.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v1 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v2 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val3 := br2.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v3 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@@ -556,88 +544,88 @@ skip_fill1002:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (SI)
-
- // update the bitreader reader structure
- MOVQ R13, 32(R11)
- MOVB R14, 40(R11)
- ADDQ DI, SI
-
- // br1003.fillFast32()
- MOVQ 32(R12), R13
- MOVBQZX 40(R12), R14
- CMPQ R14, $0x20
- JBE skip_fill1003
- MOVQ 24(R12), R15
- SUBQ $0x20, R14
- SUBQ $0x04, R15
- MOVQ (R12), BP
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8
+
+ // br3.fillFast32()
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill3
+ MOVQ 168(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 144(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
- MOVL (R15)(BP*1), BP
- MOVQ R14, CX
- SHLQ CL, BP
- MOVQ R15, 24(R12)
- ORQ BP, R13
-
- // exhausted = exhausted || (br1003.off < 4)
- CMPQ R15, $0x04
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 168(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ R14, $0x04
SETLT AL
ORB AL, DL
-skip_fill1003:
+skip_fill3:
// val0 := br3.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v1 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v2 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
// val3 := br3.peekTopBits(peekBits)
- MOVQ R13, R15
- MOVQ BX, CX
- SHRQ CL, R15
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v3 := table[val0&mask]
- MOVW (R8)(R15*2), CX
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
- SHLQ CL, R13
- ADDB CL, R14
+ SHLQ CL, R12
+ ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@@ -645,20 +633,18 @@ skip_fill1003:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
- MOVL AX, (SI)
+ MOVL AX, (R8)
- // update the bitreader reader structure
- MOVQ R13, 32(R12)
- MOVB R14, 40(R12)
- ADDQ $0x04, (SP)
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
- MOVQ 40(AX), CX
- MOVQ (SP), DX
- SUBQ CX, DX
- SHLQ $0x02, DX
- MOVQ DX, 64(AX)
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
RET
// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
@@ -750,10 +736,8 @@ loop_condition:
// Update ctx structure
MOVQ ctx+0(FP), AX
- MOVQ DX, CX
- MOVQ 16(AX), DX
- SUBQ DX, CX
- MOVQ CX, 40(AX)
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)
@@ -847,10 +831,8 @@ loop_condition:
// Update ctx structure
MOVQ ctx+0(FP), AX
- MOVQ DX, CX
- MOVQ 16(AX), DX
- SUBQ DX, CX
- MOVQ CX, 40(AX)
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index 23333b969..2f8860a72 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -180,7 +180,6 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
}
b.advance((bitCount + 7) >> 3)
- // println(s.norm[:s.symbolLen], s.symbolLen)
return s.buildDtable()
}
@@ -269,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
s.dt[0] = symbol
}
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
- tableSize := uint32(1 << s.actualTableLog)
- highThreshold := tableSize - 1
- symbolNext := s.stateTable[:256]
-
- // Init, lay down lowprob symbols
- {
- for i, v := range s.norm[:s.symbolLen] {
- if v == -1 {
- s.dt[highThreshold].setAddBits(uint8(i))
- highThreshold--
- symbolNext[i] = 1
- } else {
- symbolNext[i] = uint16(v)
- }
- }
- }
- // Spread symbols
- {
- tableMask := tableSize - 1
- step := tableStep(tableSize)
- position := uint32(0)
- for ss, v := range s.norm[:s.symbolLen] {
- for i := 0; i < int(v); i++ {
- s.dt[position].setAddBits(uint8(ss))
- position = (position + step) & tableMask
- for position > highThreshold {
- // lowprob area
- position = (position + step) & tableMask
- }
- }
- }
- if position != 0 {
- // position must reach all cells once, otherwise normalizedCounter is incorrect
- return errors.New("corrupted input (position != 0)")
- }
- }
-
- // Build Decoding table
- {
- tableSize := uint16(1 << s.actualTableLog)
- for u, v := range s.dt[:tableSize] {
- symbol := v.addBits()
- nextState := symbolNext[symbol]
- symbolNext[symbol] = nextState + 1
- nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
- s.dt[u&maxTableMask].setNBits(nBits)
- newState := (nextState << nBits) - tableSize
- if newState > tableSize {
- return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
- }
- if newState == uint16(u) && nBits == 0 {
- // Seems weird that this is possible with nbits > 0.
- return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
- }
- s.dt[u&maxTableMask].setNewState(newState)
- }
- }
- return nil
-}
-
// transform will transform the decoder table into a table usable for
// decoding without having to apply the transformation while decoding.
// The state will contain the base value and the number of bits to read.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
new file mode 100644
index 000000000..e74df436c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -0,0 +1,64 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+ "fmt"
+)
+
+type buildDtableAsmContext struct {
+ // inputs
+ stateTable *uint16
+ norm *int16
+ dt *uint64
+
+ // outputs --- set by the procedure in the case of error;
+ // for interpretation please see the error handling part below
+ errParam1 uint64
+ errParam2 uint64
+}
+
+// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
+// Function returns non-zero exit code on error.
+// go:noescape
+func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+
+// please keep in sync with _generate/gen_fse.go
+const (
+ errorCorruptedNormalizedCounter = 1
+ errorNewStateTooBig = 2
+ errorNewStateNoBits = 3
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ ctx := buildDtableAsmContext{
+ stateTable: (*uint16)(&s.stateTable[0]),
+ norm: (*int16)(&s.norm[0]),
+ dt: (*uint64)(&s.dt[0]),
+ }
+ code := buildDtable_asm(s, &ctx)
+
+ if code != 0 {
+ switch code {
+ case errorCorruptedNormalizedCounter:
+ position := ctx.errParam1
+ return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
+
+ case errorNewStateTooBig:
+ newState := decSymbol(ctx.errParam1)
+ size := ctx.errParam2
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
+
+ case errorNewStateNoBits:
+ newState := decSymbol(ctx.errParam1)
+ oldState := decSymbol(ctx.errParam2)
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
+
+ default:
+ return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
new file mode 100644
index 000000000..da32b4420
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@@ -0,0 +1,127 @@
+// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+TEXT ·buildDtable_asm(SB), $0-24
+ MOVQ ctx+8(FP), CX
+ MOVQ s+0(FP), DI
+
+ // Load values
+ MOVBQZX 4098(DI), DX
+ XORQ AX, AX
+ BTSQ DX, AX
+ MOVQ (CX), BX
+ MOVQ 16(CX), SI
+ LEAQ -1(AX), R8
+ MOVQ 8(CX), CX
+ MOVWQZX 4096(DI), DI
+
+ // End load values
+ // Init, lay down lowprob symbols
+ XORQ R9, R9
+ JMP init_main_loop_condition
+
+init_main_loop:
+ MOVWQSX (CX)(R9*2), R10
+ CMPW R10, $-1
+ JNE do_not_update_high_threshold
+ MOVB R9, 1(SI)(R8*8)
+ DECQ R8
+ MOVQ $0x0000000000000001, R10
+
+do_not_update_high_threshold:
+ MOVW R10, (BX)(R9*2)
+ INCQ R9
+
+init_main_loop_condition:
+ CMPQ R9, DI
+ JL init_main_loop
+
+ // Spread symbols
+ // Calculate table step
+ MOVQ AX, R9
+ SHRQ $0x01, R9
+ MOVQ AX, R10
+ SHRQ $0x03, R10
+ LEAQ 3(R9)(R10*1), R9
+
+ // Fill add bits values
+ LEAQ -1(AX), R10
+ XORQ R11, R11
+ XORQ R12, R12
+ JMP spread_main_loop_condition
+
+spread_main_loop:
+ XORQ R13, R13
+ MOVWQSX (CX)(R12*2), R14
+ JMP spread_inner_loop_condition
+
+spread_inner_loop:
+ MOVB R12, 1(SI)(R11*8)
+
+adjust_position:
+ ADDQ R9, R11
+ ANDQ R10, R11
+ CMPQ R11, R8
+ JG adjust_position
+ INCQ R13
+
+spread_inner_loop_condition:
+ CMPQ R13, R14
+ JL spread_inner_loop
+ INCQ R12
+
+spread_main_loop_condition:
+ CMPQ R12, DI
+ JL spread_main_loop
+ TESTQ R11, R11
+ JZ spread_check_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R11, 24(AX)
+ MOVQ $+1, ret+16(FP)
+ RET
+
+spread_check_ok:
+ // Build Decoding table
+ XORQ DI, DI
+
+build_table_main_table:
+ MOVBQZX 1(SI)(DI*8), CX
+ MOVWQZX (BX)(CX*2), R8
+ LEAQ 1(R8), R9
+ MOVW R9, (BX)(CX*2)
+ MOVQ R8, R9
+ BSRQ R9, R9
+ MOVQ DX, CX
+ SUBQ R9, CX
+ SHLQ CL, R8
+ SUBQ AX, R8
+ MOVB CL, (SI)(DI*8)
+ MOVW R8, 2(SI)(DI*8)
+ CMPQ R8, AX
+ JLE build_table_check1_ok
+ MOVQ ctx+8(FP), CX
+ MOVQ R8, 24(CX)
+ MOVQ AX, 32(CX)
+ MOVQ $+2, ret+16(FP)
+ RET
+
+build_table_check1_ok:
+ TESTB CL, CL
+ JNZ build_table_check2_ok
+ CMPW R8, DI
+ JNE build_table_check2_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R8, 24(AX)
+ MOVQ DI, 32(AX)
+ MOVQ $+3, ret+16(FP)
+ RET
+
+build_table_check2_ok:
+ INCQ DI
+ CMPQ DI, AX
+ JL build_table_main_table
+ MOVQ $+0, ret+16(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
new file mode 100644
index 000000000..332e51fe4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
@@ -0,0 +1,72 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+ "errors"
+ "fmt"
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ tableSize := uint32(1 << s.actualTableLog)
+ highThreshold := tableSize - 1
+ symbolNext := s.stateTable[:256]
+
+ // Init, lay down lowprob symbols
+ {
+ for i, v := range s.norm[:s.symbolLen] {
+ if v == -1 {
+ s.dt[highThreshold].setAddBits(uint8(i))
+ highThreshold--
+ symbolNext[i] = 1
+ } else {
+ symbolNext[i] = uint16(v)
+ }
+ }
+ }
+
+ // Spread symbols
+ {
+ tableMask := tableSize - 1
+ step := tableStep(tableSize)
+ position := uint32(0)
+ for ss, v := range s.norm[:s.symbolLen] {
+ for i := 0; i < int(v); i++ {
+ s.dt[position].setAddBits(uint8(ss))
+ position = (position + step) & tableMask
+ for position > highThreshold {
+ // lowprob area
+ position = (position + step) & tableMask
+ }
+ }
+ }
+ if position != 0 {
+ // position must reach all cells once, otherwise normalizedCounter is incorrect
+ return errors.New("corrupted input (position != 0)")
+ }
+ }
+
+ // Build Decoding table
+ {
+ tableSize := uint16(1 << s.actualTableLog)
+ for u, v := range s.dt[:tableSize] {
+ symbol := v.addBits()
+ nextState := symbolNext[symbol]
+ symbolNext[symbol] = nextState + 1
+ nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+ s.dt[u&maxTableMask].setNBits(nBits)
+ newState := (nextState << nBits) - tableSize
+ if newState > tableSize {
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+ }
+ if newState == uint16(u) && nBits == 0 {
+ // Seems weird that this is possible with nbits > 0.
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+ }
+ s.dt[u&maxTableMask].setNewState(newState)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 212c6cac3..71e64e061 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -134,18 +134,17 @@ sequenceDecs_decode_amd64_fill_2_end:
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, DI
-sequenceDecs_decode_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -155,18 +154,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R8
-sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -176,18 +174,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R9
-sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -416,18 +413,17 @@ sequenceDecs_decode_56_amd64_fill_end:
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, DI
-sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -437,18 +433,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R8
-sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -458,18 +453,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R9
-sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -1181,52 +1175,65 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11
- JLS copy_match
- MOVQ R9, R14
- SUBQ R11, R14
- CMPQ R13, R11
- JGE copy_all_from_history
- XORQ R11, R11
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(R11*1), R12
- MOVB R12, (BX)(R11*1)
- ADDQ $0x01, R11
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(R11*1), R12
- MOVW R12, (BX)(R11*1)
- ADDQ $0x02, R11
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(R11*1), R12
- MOVL R12, (BX)(R11*1)
- ADDQ $0x04, R11
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(R11*1), R12
- MOVQ R12, (BX)(R11*1)
- ADDQ $0x08, R11
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(R11*1), X0
- MOVUPS X0, (BX)(R11*1)
- ADDQ $0x10, R11
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
-copy_4_test:
- CMPQ R11, R13
- JB copy_4
+copy_4_end:
ADDQ R13, DI
- ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
@@ -1234,53 +1241,74 @@ copy_4_test:
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, R11
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (BX)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, R11
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (BX)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, R11
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (BX)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, R11
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (BX)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (BX)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, R11
- JB copy_5
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ BX, R11
- SUBQ R12, R11
+ MOVQ BX, R11
+ SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
@@ -1382,45 +1410,67 @@ main_loop:
// Copy literals
TESTQ R11, R11
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, R11
- JZ copy_1_word
- MOVB (SI)(R14*1), R15
- MOVB R15, (BX)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, R11
- JZ copy_1_dword
- MOVW (SI)(R14*1), R15
- MOVW R15, (BX)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, R11
- JZ copy_1_qword
- MOVL (SI)(R14*1), R15
- MOVL R15, (BX)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, R11
- JZ copy_1_test
- MOVQ (SI)(R14*1), R15
- MOVQ R15, (BX)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ R11, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (SI), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, SI
+ ADDQ $0x10, BX
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(SI)(R14*1), SI
+ LEAQ 16(BX)(R14*1), BX
+ MOVUPS -16(SI), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ R11, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ R11, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (SI), R14
+ MOVB -1(SI)(R11*1), R15
+ MOVB R14, (BX)
+ MOVB R15, -1(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
-copy_1:
- MOVUPS (SI)(R14*1), X0
- MOVUPS X0, (BX)(R14*1)
- ADDQ $0x10, R14
+copy_1_move_3:
+ MOVW (SI), R14
+ MOVB 2(SI), R15
+ MOVW R14, (BX)
+ MOVB R15, 2(BX)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, R11
- JB copy_1
+copy_1_move_4through7:
+ MOVL (SI), R14
+ MOVL -4(SI)(R11*1), R15
+ MOVL R14, (BX)
+ MOVL R15, -4(BX)(R11*1)
ADDQ R11, SI
ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (SI), R14
+ MOVQ -8(SI)(R11*1), R15
+ MOVQ R14, (BX)
+ MOVQ R15, -8(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+
+copy_1_end:
ADDQ R11, DI
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -1432,52 +1482,65 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11
- JLS copy_match
- MOVQ R9, R14
- SUBQ R11, R14
- CMPQ R13, R11
- JGE copy_all_from_history
- XORQ R11, R11
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(R11*1), R12
- MOVB R12, (BX)(R11*1)
- ADDQ $0x01, R11
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(R11*1), R12
- MOVW R12, (BX)(R11*1)
- ADDQ $0x02, R11
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(R11*1), R12
- MOVL R12, (BX)(R11*1)
- ADDQ $0x04, R11
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(R11*1), R12
- MOVQ R12, (BX)(R11*1)
- ADDQ $0x08, R11
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(R11*1), X0
- MOVUPS X0, (BX)(R11*1)
- ADDQ $0x10, R11
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
-copy_4_test:
- CMPQ R11, R13
- JB copy_4
+copy_4_end:
ADDQ R13, DI
- ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
@@ -1485,99 +1548,143 @@ copy_4_test:
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, R11
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (BX)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, R11
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (BX)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, R11
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (BX)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, R11
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (BX)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (BX)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, R11
- JB copy_5
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
ADDQ R11, BX
+
+copy_5_end:
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ BX, R11
- SUBQ R12, R11
+ MOVQ BX, R11
+ SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, DI
- XORQ R12, R12
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (R11)(R12*1), R14
- MOVB R14, (BX)(R12*1)
- ADDQ $0x01, R12
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (R11)(R12*1), R14
- MOVW R14, (BX)(R12*1)
- ADDQ $0x02, R12
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (R11)(R12*1), R14
- MOVL R14, (BX)(R12*1)
- ADDQ $0x04, R12
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (R11)(R12*1), R14
- MOVQ R14, (BX)(R12*1)
- ADDQ $0x08, R12
- JMP copy_2_test
+ ADDQ R13, DI
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
-copy_2:
- MOVUPS (R11)(R12*1), X0
- MOVUPS X0, (BX)(R12*1)
- ADDQ $0x10, R12
+copy_2_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R11
+ ADDQ $0x10, BX
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(R11)(R12*1), R11
+ LEAQ 16(BX)(R12*1), BX
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (R11), R12
+ MOVB -1(R11)(R13*1), R14
+ MOVB R12, (BX)
+ MOVB R14, -1(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
-copy_2_test:
- CMPQ R12, R13
- JB copy_2
+copy_2_move_3:
+ MOVW (R11), R12
+ MOVB 2(R11), R14
+ MOVW R12, (BX)
+ MOVB R14, 2(BX)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (R11), R12
+ MOVL -4(R11)(R13*1), R14
+ MOVL R12, (BX)
+ MOVL R14, -4(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (R11), R12
+ MOVQ -8(R11)(R13*1), R14
+ MOVQ R12, (BX)
+ MOVQ R14, -8(BX)(R13*1)
+ ADDQ R13, R11
ADDQ R13, BX
- JMP handle_loop
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
@@ -1773,18 +1880,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end:
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, DI
-sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -1794,18 +1900,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R8
-sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -1815,18 +1920,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R9
-sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -1934,103 +2038,137 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JGE copy_all_from_history
- XORQ AX, AX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(AX*1), CL
- MOVB CL, (R10)(AX*1)
- ADDQ $0x01, AX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(AX*1), CX
- MOVW CX, (R10)(AX*1)
- ADDQ $0x02, AX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(AX*1), CX
- MOVL CX, (R10)(AX*1)
- ADDQ $0x04, AX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(AX*1), CX
- MOVQ CX, (R10)(AX*1)
- ADDQ $0x08, AX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(AX*1), X0
- MOVUPS X0, (R10)(AX*1)
- ADDQ $0x10, AX
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
-copy_4_test:
- CMPQ AX, R13
- JB copy_4
- ADDQ R13, R12
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, AX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R10)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, AX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R10)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, AX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R10)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, AX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R10)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R10)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, AX
- JB copy_5
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
ADDQ AX, R10
+
+copy_5_end:
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R10, AX
- SUBQ CX, AX
+ MOVQ R10, AX
+ SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
@@ -2407,103 +2545,137 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ CX, R14
- CMPQ R13, CX
- JGE copy_all_from_history
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(CX*1), R12
- MOVB R12, (R9)(CX*1)
- ADDQ $0x01, CX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(CX*1), R12
- MOVW R12, (R9)(CX*1)
- ADDQ $0x02, CX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(CX*1), R12
- MOVL R12, (R9)(CX*1)
- ADDQ $0x04, CX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(CX*1), R12
- MOVQ R12, (R9)(CX*1)
- ADDQ $0x08, CX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(CX*1), X0
- MOVUPS X0, (R9)(CX*1)
- ADDQ $0x10, CX
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
-copy_4_test:
- CMPQ CX, R13
- JB copy_4
+copy_4_end:
ADDQ R13, R11
- ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, CX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R9)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, CX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R9)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, CX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R9)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, CX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R9)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R9)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, CX
- JB copy_5
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R9, CX
- SUBQ R12, CX
+ MOVQ R9, CX
+ SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
@@ -2746,18 +2918,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end:
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, DI
-sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -2767,18 +2938,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R8
-sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -2788,18 +2958,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R9
-sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -2885,45 +3054,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
// Copy literals
TESTQ AX, AX
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, AX
- JZ copy_1_word
- MOVB (R11)(R14*1), R15
- MOVB R15, (R10)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, AX
- JZ copy_1_dword
- MOVW (R11)(R14*1), R15
- MOVW R15, (R10)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, AX
- JZ copy_1_qword
- MOVL (R11)(R14*1), R15
- MOVL R15, (R10)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, AX
- JZ copy_1_test
- MOVQ (R11)(R14*1), R15
- MOVQ R15, (R10)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ AX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
-copy_1:
- MOVUPS (R11)(R14*1), X0
- MOVUPS X0, (R10)(R14*1)
- ADDQ $0x10, R14
+copy_1_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R10
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R11)(R14*1), R11
+ LEAQ 16(R10)(R14*1), R10
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ AX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ AX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R11), R14
+ MOVB -1(R11)(AX*1), R15
+ MOVB R14, (R10)
+ MOVB R15, -1(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R11), R14
+ MOVB 2(R11), R15
+ MOVW R14, (R10)
+ MOVB R15, 2(R10)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R11), R14
+ MOVL -4(R11)(AX*1), R15
+ MOVL R14, (R10)
+ MOVL R15, -4(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, AX
- JB copy_1
+copy_1_move_8through16:
+ MOVQ (R11), R14
+ MOVQ -8(R11)(AX*1), R15
+ MOVQ R14, (R10)
+ MOVQ R15, -8(R10)(AX*1)
ADDQ AX, R11
ADDQ AX, R10
+
+copy_1_end:
ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -2936,149 +3127,206 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JGE copy_all_from_history
- XORQ AX, AX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(AX*1), CL
- MOVB CL, (R10)(AX*1)
- ADDQ $0x01, AX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(AX*1), CX
- MOVW CX, (R10)(AX*1)
- ADDQ $0x02, AX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(AX*1), CX
- MOVL CX, (R10)(AX*1)
- ADDQ $0x04, AX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(AX*1), CX
- MOVQ CX, (R10)(AX*1)
- ADDQ $0x08, AX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(AX*1), X0
- MOVUPS X0, (R10)(AX*1)
- ADDQ $0x10, AX
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
-copy_4_test:
- CMPQ AX, R13
- JB copy_4
- ADDQ R13, R12
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, AX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R10)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, AX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R10)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, AX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R10)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, AX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R10)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R10)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, AX
- JB copy_5
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R10, AX
- SUBQ CX, AX
+ MOVQ R10, AX
+ SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, R12
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (AX)(CX*1), R14
- MOVB R14, (R10)(CX*1)
- ADDQ $0x01, CX
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (AX)(CX*1), R14
- MOVW R14, (R10)(CX*1)
- ADDQ $0x02, CX
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (AX)(CX*1), R14
- MOVL R14, (R10)(CX*1)
- ADDQ $0x04, CX
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (AX)(CX*1), R14
- MOVQ R14, (R10)(CX*1)
- ADDQ $0x08, CX
- JMP copy_2_test
-
-copy_2:
- MOVUPS (AX)(CX*1), X0
- MOVUPS X0, (R10)(CX*1)
- ADDQ $0x10, CX
+ ADDQ R13, R12
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_2_small
-copy_2_test:
- CMPQ CX, R13
- JB copy_2
+copy_2_loop:
+ MOVUPS (AX), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, AX
+ ADDQ $0x10, R10
+ SUBQ $0x10, CX
+ JAE copy_2_loop
+ LEAQ 16(AX)(CX*1), AX
+ LEAQ 16(R10)(CX*1), R10
+ MOVUPS -16(AX), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (AX), CL
+ MOVB -1(AX)(R13*1), R14
+ MOVB CL, (R10)
+ MOVB R14, -1(R10)(R13*1)
+ ADDQ R13, AX
ADDQ R13, R10
- JMP handle_loop
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (AX), CX
+ MOVB 2(AX), R14
+ MOVW CX, (R10)
+ MOVB R14, 2(R10)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (AX), CX
+ MOVL -4(AX)(R13*1), R14
+ MOVL CX, (R10)
+ MOVL R14, -4(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (AX), CX
+ MOVQ -8(AX)(R13*1), R14
+ MOVQ CX, (R10)
+ MOVQ R14, -8(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
@@ -3415,45 +3663,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
// Copy literals
TESTQ CX, CX
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, CX
- JZ copy_1_word
- MOVB (R10)(R14*1), R15
- MOVB R15, (R9)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, CX
- JZ copy_1_dword
- MOVW (R10)(R14*1), R15
- MOVW R15, (R9)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, CX
- JZ copy_1_qword
- MOVL (R10)(R14*1), R15
- MOVL R15, (R9)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, CX
- JZ copy_1_test
- MOVQ (R10)(R14*1), R15
- MOVQ R15, (R9)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ CX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R10), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R10
+ ADDQ $0x10, R9
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R10)(R14*1), R10
+ LEAQ 16(R9)(R14*1), R9
+ MOVUPS -16(R10), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ CX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ CX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R10), R14
+ MOVB -1(R10)(CX*1), R15
+ MOVB R14, (R9)
+ MOVB R15, -1(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
-copy_1:
- MOVUPS (R10)(R14*1), X0
- MOVUPS X0, (R9)(R14*1)
- ADDQ $0x10, R14
+copy_1_move_3:
+ MOVW (R10), R14
+ MOVB 2(R10), R15
+ MOVW R14, (R9)
+ MOVB R15, 2(R9)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R10), R14
+ MOVL -4(R10)(CX*1), R15
+ MOVL R14, (R9)
+ MOVL R15, -4(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, CX
- JB copy_1
+copy_1_move_8through16:
+ MOVQ (R10), R14
+ MOVQ -8(R10)(CX*1), R15
+ MOVQ R14, (R9)
+ MOVQ R15, -8(R9)(CX*1)
ADDQ CX, R10
ADDQ CX, R9
+
+copy_1_end:
ADDQ CX, R11
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -3466,149 +3736,206 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ CX, R14
- CMPQ R13, CX
- JGE copy_all_from_history
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(CX*1), R12
- MOVB R12, (R9)(CX*1)
- ADDQ $0x01, CX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(CX*1), R12
- MOVW R12, (R9)(CX*1)
- ADDQ $0x02, CX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(CX*1), R12
- MOVL R12, (R9)(CX*1)
- ADDQ $0x04, CX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(CX*1), R12
- MOVQ R12, (R9)(CX*1)
- ADDQ $0x08, CX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(CX*1), X0
- MOVUPS X0, (R9)(CX*1)
- ADDQ $0x10, CX
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
-copy_4_test:
- CMPQ CX, R13
- JB copy_4
+copy_4_end:
ADDQ R13, R11
- ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, CX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R9)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, CX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R9)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, CX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R9)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, CX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R9)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R9)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, CX
- JB copy_5
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R9, CX
- SUBQ R12, CX
+ MOVQ R9, CX
+ SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, R11
- XORQ R12, R12
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (CX)(R12*1), R14
- MOVB R14, (R9)(R12*1)
- ADDQ $0x01, R12
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (CX)(R12*1), R14
- MOVW R14, (R9)(R12*1)
- ADDQ $0x02, R12
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (CX)(R12*1), R14
- MOVL R14, (R9)(R12*1)
- ADDQ $0x04, R12
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (CX)(R12*1), R14
- MOVQ R14, (R9)(R12*1)
- ADDQ $0x08, R12
- JMP copy_2_test
-
-copy_2:
- MOVUPS (CX)(R12*1), X0
- MOVUPS X0, (R9)(R12*1)
- ADDQ $0x10, R12
+ ADDQ R13, R11
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
-copy_2_test:
- CMPQ R12, R13
- JB copy_2
+copy_2_loop:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R9
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(CX)(R12*1), CX
+ LEAQ 16(R9)(R12*1), R9
+ MOVUPS -16(CX), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (CX), R12
+ MOVB -1(CX)(R13*1), R14
+ MOVB R12, (R9)
+ MOVB R14, -1(R9)(R13*1)
+ ADDQ R13, CX
ADDQ R13, R9
- JMP handle_loop
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (CX), R12
+ MOVB 2(CX), R14
+ MOVW R12, (R9)
+ MOVB R14, 2(R9)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (CX), R12
+ MOVL -4(CX)(R13*1), R14
+ MOVL R12, (R9)
+ MOVL R14, -4(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (CX), R12
+ MOVQ -8(CX)(R13*1), R14
+ MOVQ R12, (R9)
+ MOVQ R14, -8(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match: