diff options
Diffstat (limited to 'vendor/github.com/klauspost')
11 files changed, 1867 insertions, 1321 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index 5c3c2a258..c7cf1a20c 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,6 +17,27 @@ This package provides various compression algorithms. # changelog
+* June 29, 2022 (v1.15.7)
+
+ * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
+ * zip: Merge upstream https://github.com/klauspost/compress/pull/631
+ * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
+ * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
+ * flate: Faster histograms https://github.com/klauspost/compress/pull/620
+ * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622
+
+* June 3, 2022 (v1.15.6)
+ * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
+ * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
+ * zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
+ * zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
+ * zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
+ * gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
+ * s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
+ * s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
+ * snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
+ * s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
+
* May 25, 2022 (v1.15.5)
* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index 25f6d1108..40ef45c2f 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -169,7 +169,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { b := w.offsetEncoding.codes b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -178,7 +178,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { b = w.literalEncoding.codes[256:literalCount] b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -186,7 +186,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { a = t.litHist[:256] b = w.literalEncoding.codes[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -280,12 +280,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // Copy the concatenated code sizes to codegen. Put a marker at the end. cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -428,8 +428,8 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << (w.nbits & 63) - w.nbits += c.len + w.bits |= c.code64() << (w.nbits & 63) + w.nbits += c.len() if w.nbits >= 48 { w.writeOutBits() } @@ -477,7 +477,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numCodegens-4), 4) for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } @@ -670,7 +670,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // Estimate size for using a new table. // Use the previous header size as the best estimate. newSize := w.lastHeader + tokens.EstimatedBits() - newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty // The estimated size is calculated as an optimal table. // We add a penalty to make it more realistic and re-use a bit more. @@ -854,8 +854,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if t < 256 { //w.writeCode(lits[t.literal()]) c := lits[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -882,8 +882,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := lengths[lengthCode] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -931,8 +931,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := offs[offsetCode] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -1009,8 +1009,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } } - // Fill is rarely better... - const fill = false const numLiterals = endBlockMarker + 1 const numOffsets = 1 @@ -1019,7 +1017,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Assume header is around 70 bytes: // https://stackoverflow.com/a/25454430 const guessHeaderSizeBits = 70 * 8 - histogram(input, w.literalFreq[:numLiterals], fill) + histogram(input, w.literalFreq[:numLiterals]) ssize, storable := w.storedSize(input) if storable && len(input) > 1024 { // Quick check for incompressible content. @@ -1045,19 +1043,14 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } w.literalFreq[endBlockMarker] = 1 w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) - if fill { - // Clear fill... - for i := range w.literalFreq[:numLiterals] { - w.literalFreq[i] = 0 - } - histogram(input, w.literalFreq[:numLiterals], false) - } estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) - estBits += w.lastHeader - if w.lastHeader == 0 { - estBits += guessHeaderSizeBits + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty } - estBits += estBits >> w.logNewTablePenalty // Store bytes, if we don't get a reasonable improvement. if storable && ssize <= estBits { @@ -1134,12 +1127,12 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { nbytes = 0 } a, b := encoding[input[0]], encoding[input[1]] - bits |= uint64(a.code) << (nbits & 63) - bits |= uint64(b.code) << ((nbits + a.len) & 63) + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) c := encoding[input[2]] - nbits += b.len + a.len - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() input = input[3:] } @@ -1165,10 +1158,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } // Bitwriting inlined, ~30% speedup c := encoding[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= c.code64() << (nbits & 63) + + nbits += c.len() if debugDeflate { - count += int(c.len) + count += int(c.len()) } } // Restore... diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 9ab497c27..5ac144f28 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -16,9 +16,18 @@ const ( ) // hcode is a huffman code with a bit code and bit length. -type hcode struct { - code uint16 - len uint8 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { @@ -58,8 +67,11 @@ type levelInfo struct { // set sets the code and length of an hcode. func (h *hcode) set(code uint16, length uint8) { - h.len = length - h.code = code + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) } func reverseBits(number uint16, bitLength byte) uint16 { @@ -100,7 +112,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { bits = ch + 192 - 280 size = 8 } - codes[ch] = hcode{code: reverseBits(bits, size), len: size} + codes[ch] = newhcode(reverseBits(bits, size), size) } return h } @@ -109,7 +121,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder { h := newHuffmanEncoder(30) codes := h.codes for ch := range codes { - codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} + codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5) } return h } @@ -121,7 +133,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int for i, f := range freq { if f != 0 { - total += int(f) * int(h.codes[i].len) + total += int(f) * int(h.codes[i].len()) } } return total @@ -130,7 +142,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { func (h *huffmanEncoder) bitLengthRaw(b []byte) int { var total int for _, f := range b { - total += int(h.codes[f].len) + total += int(h.codes[f].len()) } return total } @@ -141,10 +153,10 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int { for i, f := range freq { if f != 0 { code := h.codes[i] - if code.len == 0 { + if code.zero() { return math.MaxInt32 } - total += int(f) * int(code.len) + total += int(f) * int(code.len()) } } return total @@ -308,7 +320,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -330,7 +342,7 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - codes[i].len = 0 + codes[i] = 0 } } list[count] = literalNode{} @@ -364,21 +376,37 @@ func atLeastOne(v float32) float32 { return v } -// Unassigned values are assigned '1' in the histogram. -func fillHist(b []uint16) { - for i, v := range b { - if v == 0 { - b[i] = 1 +func histogram(b []byte, h []uint16) { + if true && len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ } } } -func histogram(b []byte, h []uint16, fill bool) { +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. h = h[:256] - for _, t := range b { - h[t]++ + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] } - if fill { - fillHist(h) + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ } } diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index 544162a43..93a1d1503 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -59,9 +59,9 @@ var bitWriterPool = sync.Pool{ }, } -// StatelessDeflate allows to compress directly to a Writer without retaining state. +// StatelessDeflate allows compressing directly to a Writer without retaining state. // When returning everything will be flushed. -// Up to 8KB of an optional dictionary can be given which is presumed to presumed to precede the block. +// Up to 8KB of an optional dictionary can be given which is presumed to precede the block. // Longer dictionaries will be truncated and will still produce valid output. // Sending nil dictionary is perfectly fine. func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go index 671e630a8..9f3e9f79e 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) const fallback8BitSize = 800 type decompress4xContext struct { - pbr0 *bitReaderShifted - pbr1 *bitReaderShifted - pbr2 *bitReaderShifted - pbr3 *bitReaderShifted + pbr *[4]bitReaderShifted peekBits uint8 out *byte dstEvery int @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ - pbr0: &br[0], - pbr1: &br[1], - pbr2: &br[2], - pbr3: &br[3], + pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index 6c65c6e2b..dd1a5aecd 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -4,45 +4,40 @@ // +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_main_loop_amd64(SB), $8-8 +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), AX - MOVBQZX 32(AX), SI - MOVQ 40(AX), DI - MOVQ DI, BX - MOVQ 72(AX), CX - MOVQ CX, (SP) - MOVQ 48(AX), R8 - MOVQ 56(AX), R9 - MOVQ (AX), R10 - MOVQ 8(AX), R11 - MOVQ 16(AX), R12 - MOVQ 24(AX), R13 + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 // Main loop main_loop: - MOVQ BX, DI - CMPQ DI, (SP) + MOVQ SI, R8 + CMPQ R8, BX SETGE DL // br0.fillFast32() - MOVQ 32(R10), R14 - MOVBQZX 40(R10), R15 - CMPQ R15, $0x20 + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 JBE skip_fill0 - MOVQ 24(R10), AX - SUBQ $0x20, R15 + MOVQ 24(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R10), BP + MOVQ (R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R10) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 // exhausted = exhausted || (br0.off < 4) CMPQ AX, $0x04 @@ -51,57 +46,57 @@ main_loop: skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R10) - MOVB R15, 40(R10) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 // br1.fillFast32() - MOVQ 32(R11), R14 - MOVBQZX 40(R11), R15 - CMPQ R15, $0x20 + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 JBE skip_fill1 - MOVQ 24(R11), AX - SUBQ $0x20, R15 + MOVQ 72(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R11), BP + MOVQ 48(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R11) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 // exhausted = exhausted || (br1.off < 4) CMPQ AX, $0x04 @@ -110,57 +105,57 @@ skip_fill0: skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R11) - MOVB R15, 40(R11) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 // br2.fillFast32() - MOVQ 32(R12), R14 - MOVBQZX 40(R12), R15 - CMPQ R15, $0x20 + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 JBE skip_fill2 - MOVQ 24(R12), AX - SUBQ $0x20, R15 + MOVQ 120(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R12), BP + MOVQ 96(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R12) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 // exhausted = exhausted || (br2.off < 4) CMPQ AX, $0x04 @@ -169,57 +164,57 @@ skip_fill1: skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R12) - MOVB R15, 40(R12) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 // br3.fillFast32() - MOVQ 32(R13), R14 - MOVBQZX 40(R13), R15 - CMPQ R15, $0x20 + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 JBE skip_fill3 - MOVQ 24(R13), AX - SUBQ $0x20, R15 + MOVQ 168(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R13), BP + MOVQ 144(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R13) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 // exhausted = exhausted || (br3.off < 4) CMPQ AX, $0x04 @@ -228,149 +223,142 @@ skip_fill2: skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R13) - MOVB R15, 40(R13) - ADDQ $0x02, BX + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ BX, DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), CX - MOVBQZX 32(CX), BX - MOVQ 40(CX), SI - MOVQ SI, (SP) - MOVQ 72(CX), DX - MOVQ DX, 8(SP) - MOVQ 48(CX), DI - MOVQ 56(CX), R8 - MOVQ (CX), R9 - MOVQ 8(CX), R10 - MOVQ 16(CX), R11 - MOVQ 24(CX), R12 + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 // Main loop main_loop: - MOVQ (SP), SI - CMPQ SI, 8(SP) + MOVQ BX, R8 + CMPQ R8, SI SETGE DL - // br1000.fillFast32() - MOVQ 32(R9), R13 - MOVBQZX 40(R9), R14 - CMPQ R14, $0x20 - JBE skip_fill1000 - MOVQ 24(R9), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R9), BP + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R9) - ORQ BP, R13 - - // exhausted = exhausted || (br1000.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1000: +skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -378,88 +366,88 @@ skip_fill1000: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R9) - MOVB R14, 40(R9) - ADDQ DI, SI - - // br1001.fillFast32() - MOVQ 32(R10), R13 - MOVBQZX 40(R10), R14 - CMPQ R14, $0x20 - JBE skip_fill1001 - MOVQ 24(R10), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R10), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R10) - ORQ BP, R13 - - // exhausted = exhausted || (br1001.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1001: +skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -467,88 +455,88 @@ skip_fill1001: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R10) - MOVB R14, 40(R10) - ADDQ DI, SI - - // br1002.fillFast32() - MOVQ 32(R11), R13 - MOVBQZX 40(R11), R14 - CMPQ R14, $0x20 - JBE skip_fill1002 - MOVQ 24(R11), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R11), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R11) - ORQ BP, R13 - - // exhausted = exhausted || (br1002.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1002: +skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -556,88 +544,88 @@ skip_fill1002: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R11) - MOVB R14, 40(R11) - ADDQ DI, SI - - // br1003.fillFast32() - MOVQ 32(R12), R13 - MOVBQZX 40(R12), R14 - CMPQ R14, $0x20 - JBE skip_fill1003 - MOVQ 24(R12), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R12), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R12) - ORQ BP, R13 - - // exhausted = exhausted || (br1003.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1003: +skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -645,20 +633,18 @@ skip_fill1003: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) + MOVL AX, (R8) - // update the bitreader reader structure - MOVQ R13, 32(R12) - MOVB R14, 40(R12) - ADDQ $0x04, (SP) + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ (SP), DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress1x_main_loop_amd64(ctx *decompress1xContext) @@ -750,10 +736,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) @@ -847,10 +831,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go index 23333b969..2f8860a72 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go @@ -180,7 +180,6 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error { return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog) } b.advance((bitCount + 7) >> 3) - // println(s.norm[:s.symbolLen], s.symbolLen) return s.buildDtable() } @@ -269,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) { s.dt[0] = symbol } -// buildDtable will build the decoding table. -func (s *fseDecoder) buildDtable() error { - tableSize := uint32(1 << s.actualTableLog) - highThreshold := tableSize - 1 - symbolNext := s.stateTable[:256] - - // Init, lay down lowprob symbols - { - for i, v := range s.norm[:s.symbolLen] { - if v == -1 { - s.dt[highThreshold].setAddBits(uint8(i)) - highThreshold-- - symbolNext[i] = 1 - } else { - symbolNext[i] = uint16(v) - } - } - } - // Spread symbols - { - tableMask := tableSize - 1 - step := tableStep(tableSize) - position := uint32(0) - for ss, v := range s.norm[:s.symbolLen] { - for i := 0; i < int(v); i++ { - s.dt[position].setAddBits(uint8(ss)) - position = (position + step) & tableMask - for position > highThreshold { - // lowprob area - position = (position + step) & tableMask - } - } - } - if position != 0 { - // position must reach all cells once, otherwise normalizedCounter is incorrect - return errors.New("corrupted input (position != 0)") - } - } - - // Build Decoding table - { - tableSize := uint16(1 << s.actualTableLog) - for u, v := range s.dt[:tableSize] { - symbol := v.addBits() - nextState := symbolNext[symbol] - symbolNext[symbol] = nextState + 1 - nBits := s.actualTableLog - byte(highBits(uint32(nextState))) - s.dt[u&maxTableMask].setNBits(nBits) - newState := (nextState << nBits) - tableSize - if newState > tableSize { - return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) - } - if newState == uint16(u) && nBits == 0 { - // Seems weird that this is possible with nbits > 0. - return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) - } - s.dt[u&maxTableMask].setNewState(newState) - } - } - return nil -} - // transform will transform the decoder table into a table usable for // decoding without having to apply the transformation while decoding. // The state will contain the base value and the number of bits to read. diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go new file mode 100644 index 000000000..e74df436c --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -0,0 +1,64 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" +) + +type buildDtableAsmContext struct { + // inputs + stateTable *uint16 + norm *int16 + dt *uint64 + + // outputs --- set by the procedure in the case of error; + // for interpretation please see the error handling part below + errParam1 uint64 + errParam2 uint64 +} + +// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable. +// Function returns non-zero exit code on error. +// go:noescape +func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int + +// please keep in sync with _generate/gen_fse.go +const ( + errorCorruptedNormalizedCounter = 1 + errorNewStateTooBig = 2 + errorNewStateNoBits = 3 +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + ctx := buildDtableAsmContext{ + stateTable: (*uint16)(&s.stateTable[0]), + norm: (*int16)(&s.norm[0]), + dt: (*uint64)(&s.dt[0]), + } + code := buildDtable_asm(s, &ctx) + + if code != 0 { + switch code { + case errorCorruptedNormalizedCounter: + position := ctx.errParam1 + return fmt.Errorf("corrupted input (position=%d, expected 0)", position) + + case errorNewStateTooBig: + newState := decSymbol(ctx.errParam1) + size := ctx.errParam2 + return fmt.Errorf("newState (%d) outside table size (%d)", newState, size) + + case errorNewStateNoBits: + newState := decSymbol(ctx.errParam1) + oldState := decSymbol(ctx.errParam2) + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState) + + default: + return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s new file mode 100644 index 000000000..da32b4420 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s @@ -0,0 +1,127 @@ +// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm + +// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int +TEXT ·buildDtable_asm(SB), $0-24 + MOVQ ctx+8(FP), CX + MOVQ s+0(FP), DI + + // Load values + MOVBQZX 4098(DI), DX + XORQ AX, AX + BTSQ DX, AX + MOVQ (CX), BX + MOVQ 16(CX), SI + LEAQ -1(AX), R8 + MOVQ 8(CX), CX + MOVWQZX 4096(DI), DI + + // End load values + // Init, lay down lowprob symbols + XORQ R9, R9 + JMP init_main_loop_condition + +init_main_loop: + MOVWQSX (CX)(R9*2), R10 + CMPW R10, $-1 + JNE do_not_update_high_threshold + MOVB R9, 1(SI)(R8*8) + DECQ R8 + MOVQ $0x0000000000000001, R10 + +do_not_update_high_threshold: + MOVW R10, (BX)(R9*2) + INCQ R9 + +init_main_loop_condition: + CMPQ R9, DI + JL init_main_loop + + // Spread symbols + // Calculate table step + MOVQ AX, R9 + SHRQ $0x01, R9 + MOVQ AX, R10 + SHRQ $0x03, R10 + LEAQ 3(R9)(R10*1), R9 + + // Fill add bits values + LEAQ -1(AX), R10 + XORQ R11, R11 + XORQ R12, R12 + JMP spread_main_loop_condition + +spread_main_loop: + XORQ R13, R13 + MOVWQSX (CX)(R12*2), R14 + JMP spread_inner_loop_condition + +spread_inner_loop: + MOVB R12, 1(SI)(R11*8) + +adjust_position: + ADDQ R9, R11 + ANDQ R10, R11 + CMPQ R11, R8 + JG adjust_position + INCQ R13 + +spread_inner_loop_condition: + CMPQ R13, R14 + JL spread_inner_loop + INCQ R12 + +spread_main_loop_condition: + CMPQ R12, DI + JL spread_main_loop + TESTQ R11, R11 + JZ spread_check_ok + MOVQ ctx+8(FP), AX + MOVQ R11, 24(AX) + MOVQ $+1, ret+16(FP) + RET + +spread_check_ok: + // Build Decoding table + XORQ DI, DI + +build_table_main_table: + MOVBQZX 1(SI)(DI*8), CX + MOVWQZX (BX)(CX*2), R8 + LEAQ 1(R8), R9 + MOVW R9, (BX)(CX*2) + MOVQ R8, R9 + BSRQ R9, R9 + MOVQ DX, CX + SUBQ R9, CX + SHLQ CL, R8 + SUBQ AX, R8 + MOVB CL, (SI)(DI*8) + MOVW R8, 2(SI)(DI*8) + CMPQ R8, AX + JLE build_table_check1_ok + MOVQ ctx+8(FP), CX + MOVQ R8, 24(CX) + MOVQ AX, 32(CX) + MOVQ $+2, ret+16(FP) + RET + +build_table_check1_ok: + TESTB CL, CL + JNZ build_table_check2_ok + CMPW R8, DI + JNE build_table_check2_ok + MOVQ ctx+8(FP), AX + MOVQ R8, 24(AX) + MOVQ DI, 32(AX) + MOVQ $+3, ret+16(FP) + RET + +build_table_check2_ok: + INCQ DI + CMPQ DI, AX + JL build_table_main_table + MOVQ $+0, ret+16(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go new file mode 100644 index 000000000..332e51fe4 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go @@ -0,0 +1,72 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "errors" + "fmt" +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + tableSize := uint32(1 << s.actualTableLog) + highThreshold := tableSize - 1 + symbolNext := s.stateTable[:256] + + // Init, lay down lowprob symbols + { + for i, v := range s.norm[:s.symbolLen] { + if v == -1 { + s.dt[highThreshold].setAddBits(uint8(i)) + highThreshold-- + symbolNext[i] = 1 + } else { + symbolNext[i] = uint16(v) + } + } + } + + // Spread symbols + { + tableMask := tableSize - 1 + step := tableStep(tableSize) + position := uint32(0) + for ss, v := range s.norm[:s.symbolLen] { + for i := 0; i < int(v); i++ { + s.dt[position].setAddBits(uint8(ss)) + position = (position + step) & tableMask + for position > highThreshold { + // lowprob area + position = (position + step) & tableMask + } + } + } + if position != 0 { + // position must reach all cells once, otherwise normalizedCounter is incorrect + return errors.New("corrupted input (position != 0)") + } + } + + // Build Decoding table + { + tableSize := uint16(1 << s.actualTableLog) + for u, v := range s.dt[:tableSize] { + symbol := v.addBits() + nextState := symbolNext[symbol] + symbolNext[symbol] = nextState + 1 + nBits := s.actualTableLog - byte(highBits(uint32(nextState))) + s.dt[u&maxTableMask].setNBits(nBits) + newState := (nextState << nBits) - tableSize + if newState > tableSize { + return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) + } + if newState == uint16(u) && nBits == 0 { + // Seems weird that this is possible with nbits > 0. + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) + } + s.dt[u&maxTableMask].setNewState(newState) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 212c6cac3..71e64e061 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -134,18 +134,17 @@ sequenceDecs_decode_amd64_fill_2_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -155,18 +154,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -176,18 +174,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -416,18 +413,17 @@ sequenceDecs_decode_56_amd64_fill_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -437,18 +433,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -458,18 +453,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1181,52 +1175,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,53 +1241,74 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1382,45 +1410,67 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_test - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1_test: - CMPQ R14, R11 - JB copy_1 +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,52 +1482,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,99 +1548,143 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, (BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 ADDQ R13, BX - JMP handle_loop + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1773,18 +1880,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -1794,18 +1900,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -1815,18 +1920,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1934,103 +2038,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2407,103 +2545,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -2746,18 +2918,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -2767,18 +2938,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -2788,18 +2958,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -2885,45 +3054,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end -copy_1_test: - CMPQ R14, AX - JB copy_1 +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,149 +3127,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - JMP copy_2_test - -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3415,45 +3663,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1_test: - CMPQ R14, CX - JB copy_1 +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) ADDQ CX, R10 ADDQ CX, R9 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,149 +3736,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: |