summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/README.md10
-rw-r--r--vendor/github.com/klauspost/compress/flate/inflate_gen.go100
-rw-r--r--vendor/github.com/klauspost/compress/huff0/autogen.go5
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s488
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in197
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.go95
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s982
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in195
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder.go14
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder_options.go9
-rw-r--r--vendor/github.com/klauspost/compress/zstd/framedec.go52
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fuzz.go11
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fuzz_none.go11
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s124
14 files changed, 777 insertions, 1516 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 5b7cf781a..c3ec9d8a7 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,16 @@ This package provides various compression algorithms.
# changelog
+* May 5, 2022 (v1.15.3)
+ * zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+ * s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+ * zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster. [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+ * zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+ * s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+ * Minimum version is Go 1.16, added CI test on 1.18.
+
* Mar 11, 2022 (v1.15.1)
* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index 8d632cea0..61342b6b8 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -24,7 +24,7 @@ func (f *decompressor) huffmanBytesBuffer() {
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
- fnb, fb := f.nb, f.b
+ fnb, fb, dict := f.nb, f.b, &f.dict
switch f.stepState {
case stateInit:
@@ -82,9 +82,9 @@ readLiteral:
var length int
switch {
case v < 256:
- f.dict.writeByte(byte(v))
- if f.dict.availWrite() == 0 {
- f.toRead = f.dict.readFlush()
+ dict.writeByte(byte(v))
+ if dict.availWrite() == 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBytesBuffer
f.stepState = stateInit
f.b, f.nb = fb, fnb
@@ -227,10 +227,10 @@ readLiteral:
}
// No check on length; encoding can be prescient.
- if dist > uint32(f.dict.histSize()) {
+ if dist > uint32(dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
- fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+ fmt.Println("dist > dict.histSize():", dist, dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
@@ -243,14 +243,14 @@ readLiteral:
copyHistory:
// Perform a backwards copy according to RFC section 3.2.3.
{
- cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
if cnt == 0 {
- cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ cnt = dict.writeCopy(f.copyDist, f.copyLen)
}
f.copyLen -= cnt
- if f.dict.availWrite() == 0 || f.copyLen > 0 {
- f.toRead = f.dict.readFlush()
+ if dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
@@ -275,7 +275,7 @@ func (f *decompressor) huffmanBytesReader() {
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
- fnb, fb := f.nb, f.b
+ fnb, fb, dict := f.nb, f.b, &f.dict
switch f.stepState {
case stateInit:
@@ -333,9 +333,9 @@ readLiteral:
var length int
switch {
case v < 256:
- f.dict.writeByte(byte(v))
- if f.dict.availWrite() == 0 {
- f.toRead = f.dict.readFlush()
+ dict.writeByte(byte(v))
+ if dict.availWrite() == 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBytesReader
f.stepState = stateInit
f.b, f.nb = fb, fnb
@@ -478,10 +478,10 @@ readLiteral:
}
// No check on length; encoding can be prescient.
- if dist > uint32(f.dict.histSize()) {
+ if dist > uint32(dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
- fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+ fmt.Println("dist > dict.histSize():", dist, dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
@@ -494,14 +494,14 @@ readLiteral:
copyHistory:
// Perform a backwards copy according to RFC section 3.2.3.
{
- cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
if cnt == 0 {
- cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ cnt = dict.writeCopy(f.copyDist, f.copyLen)
}
f.copyLen -= cnt
- if f.dict.availWrite() == 0 || f.copyLen > 0 {
- f.toRead = f.dict.readFlush()
+ if dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBytesReader // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
@@ -526,7 +526,7 @@ func (f *decompressor) huffmanBufioReader() {
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
- fnb, fb := f.nb, f.b
+ fnb, fb, dict := f.nb, f.b, &f.dict
switch f.stepState {
case stateInit:
@@ -584,9 +584,9 @@ readLiteral:
var length int
switch {
case v < 256:
- f.dict.writeByte(byte(v))
- if f.dict.availWrite() == 0 {
- f.toRead = f.dict.readFlush()
+ dict.writeByte(byte(v))
+ if dict.availWrite() == 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBufioReader
f.stepState = stateInit
f.b, f.nb = fb, fnb
@@ -729,10 +729,10 @@ readLiteral:
}
// No check on length; encoding can be prescient.
- if dist > uint32(f.dict.histSize()) {
+ if dist > uint32(dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
- fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+ fmt.Println("dist > dict.histSize():", dist, dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
@@ -745,14 +745,14 @@ readLiteral:
copyHistory:
// Perform a backwards copy according to RFC section 3.2.3.
{
- cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
if cnt == 0 {
- cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ cnt = dict.writeCopy(f.copyDist, f.copyLen)
}
f.copyLen -= cnt
- if f.dict.availWrite() == 0 || f.copyLen > 0 {
- f.toRead = f.dict.readFlush()
+ if dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanBufioReader // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
@@ -777,7 +777,7 @@ func (f *decompressor) huffmanStringsReader() {
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
- fnb, fb := f.nb, f.b
+ fnb, fb, dict := f.nb, f.b, &f.dict
switch f.stepState {
case stateInit:
@@ -835,9 +835,9 @@ readLiteral:
var length int
switch {
case v < 256:
- f.dict.writeByte(byte(v))
- if f.dict.availWrite() == 0 {
- f.toRead = f.dict.readFlush()
+ dict.writeByte(byte(v))
+ if dict.availWrite() == 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanStringsReader
f.stepState = stateInit
f.b, f.nb = fb, fnb
@@ -980,10 +980,10 @@ readLiteral:
}
// No check on length; encoding can be prescient.
- if dist > uint32(f.dict.histSize()) {
+ if dist > uint32(dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
- fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+ fmt.Println("dist > dict.histSize():", dist, dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
@@ -996,14 +996,14 @@ readLiteral:
copyHistory:
// Perform a backwards copy according to RFC section 3.2.3.
{
- cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
if cnt == 0 {
- cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ cnt = dict.writeCopy(f.copyDist, f.copyLen)
}
f.copyLen -= cnt
- if f.dict.availWrite() == 0 || f.copyLen > 0 {
- f.toRead = f.dict.readFlush()
+ if dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanStringsReader // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
@@ -1028,7 +1028,7 @@ func (f *decompressor) huffmanGenericReader() {
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
- fnb, fb := f.nb, f.b
+ fnb, fb, dict := f.nb, f.b, &f.dict
switch f.stepState {
case stateInit:
@@ -1086,9 +1086,9 @@ readLiteral:
var length int
switch {
case v < 256:
- f.dict.writeByte(byte(v))
- if f.dict.availWrite() == 0 {
- f.toRead = f.dict.readFlush()
+ dict.writeByte(byte(v))
+ if dict.availWrite() == 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanGenericReader
f.stepState = stateInit
f.b, f.nb = fb, fnb
@@ -1231,10 +1231,10 @@ readLiteral:
}
// No check on length; encoding can be prescient.
- if dist > uint32(f.dict.histSize()) {
+ if dist > uint32(dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
- fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+ fmt.Println("dist > dict.histSize():", dist, dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
@@ -1247,14 +1247,14 @@ readLiteral:
copyHistory:
// Perform a backwards copy according to RFC section 3.2.3.
{
- cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
if cnt == 0 {
- cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ cnt = dict.writeCopy(f.copyDist, f.copyLen)
}
f.copyLen -= cnt
- if f.dict.availWrite() == 0 || f.copyLen > 0 {
- f.toRead = f.dict.readFlush()
+ if dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = dict.readFlush()
f.step = (*decompressor).huffmanGenericReader // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
deleted file mode 100644
index ff2c69d60..000000000
--- a/vendor/github.com/klauspost/compress/huff0/autogen.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
deleted file mode 100644
index 0d6cb1a96..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
+++ /dev/null
@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill0:
-
- // val0 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 0+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill1:
-
- // val0 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 256+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill2:
-
- // val0 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 512+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill3:
-
- // val0 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 768+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
deleted file mode 100644
index 6d477a2c1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
+++ /dev/null
@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index ce8e93bcd..3415e5da2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -13,19 +13,30 @@ import (
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
//go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
//go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
+type decompress4xContext struct {
+ pbr0 *bitReaderShifted
+ pbr1 *bitReaderShifted
+ pbr2 *bitReaderShifted
+ pbr3 *bitReaderShifted
+ peekBits uint8
+ out *byte
+ dstEvery int
+ tbl *dEntrySingle
+ decoded int
+ limit *byte
+}
+
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
@@ -42,6 +53,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
+
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
@@ -71,70 +83,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
- // Use temp table to avoid bound checks/append penalty.
- buf := d.buffer()
- var off uint8
var decoded int
- const debug = false
-
- // see: bitReaderShifted.peekBitsFast()
- peekBits := uint8((64 - d.actualTableLog) & 63)
-
- // Decode 2 values from each decoder/loop.
- const bufoff = 256
- for {
- if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
- break
+ if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+ ctx := decompress4xContext{
+ pbr0: &br[0],
+ pbr1: &br[1],
+ pbr2: &br[2],
+ pbr3: &br[3],
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ out: &out[0],
+ dstEvery: dstEvery,
+ tbl: &single[0],
+ limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
}
-
if use8BitTables {
- off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+ decompress4x_8b_main_loop_amd64(&ctx)
} else {
- off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
- }
- if debug {
- fmt.Print("DEBUG: ")
- fmt.Printf("off=%d,", off)
- for i := 0; i < 4; i++ {
- fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
- i, br[i].bitsRead, br[i].value, br[i].off)
- }
- fmt.Println("")
- }
-
- if off != 0 {
- break
+ decompress4x_main_loop_amd64(&ctx)
}
- if bufoff > dstEvery {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 1")
- }
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
- // There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 2")
- }
- }
- if off > 0 {
- ioff := int(off)
- if len(out) < dstEvery*3+ioff {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 3")
- }
- copy(out, buf[0][:off])
- copy(out[dstEvery:], buf[1][:off])
- copy(out[dstEvery*2:], buf[2][:off])
- copy(out[dstEvery*3:], buf[3][:off])
- decoded += int(off) * 4
- out = out[off:]
+ decoded = ctx.decoded
+ out = out[decoded/4:]
}
// Decode remaining.
@@ -150,7 +120,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
- d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@@ -164,7 +133,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
offset++
}
if offset != endsAt {
- d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
@@ -173,7 +141,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
return nil, err
}
}
- d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 2edad3ea5..06287f568 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,506 +1,662 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 32(AX), SI
+ MOVQ 40(AX), DI
+ MOVQ DI, BX
+ MOVQ 72(AX), CX
+ MOVQ CX, (SP)
+ MOVQ 48(AX), R8
+ MOVQ 56(AX), R9
+ MOVQ (AX), R10
+ MOVQ 8(AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 24(AX), R13
+
+ // Main loop
main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
+ MOVQ BX, DI
+ CMPQ DI, (SP)
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R10), R14
+ MOVBQZX 40(R10), R15
+ CMPQ R15, $0x20
+ JBE skip_fill0
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R10), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R10)
+ ORQ BP, R14
// exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill0:
-
// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
+ MOVQ R14, 32(R10)
+ MOVB R15, 40(R10)
+ ADDQ R8, DI
+
+ // br1.fillFast32()
+ MOVQ 32(R11), R14
+ MOVBQZX 40(R11), R15
+ CMPQ R15, $0x20
+ JBE skip_fill1
+ MOVQ 24(R11), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R11), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R11)
+ ORQ BP, R14
// exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill1:
-
// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ MOVW (R9)(BP*2), CX
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
+ MOVQ R14, 32(R11)
+ MOVB R15, 40(R11)
+ ADDQ R8, DI
+
+ // br2.fillFast32()
+ MOVQ 32(R12), R14
+ MOVBQZX 40(R12), R15
+ CMPQ R15, $0x20
+ JBE skip_fill2
+ MOVQ 24(R12), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R12), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R12)
+ ORQ BP, R14
// exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill2:
-
// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R9)(BP*2), CX
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R9)(BP*2), CX
// br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
// update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
+ MOVQ R14, 32(R12)
+ MOVB R15, 40(R12)
+ ADDQ R8, DI
+
+ // br3.fillFast32()
+ MOVQ 32(R13), R14
+ MOVBQZX 40(R13), R15
+ CMPQ R15, $0x20
+ JBE skip_fill3
+ MOVQ 24(R13), AX
+ SUBQ $0x20, R15
+ SUBQ $0x04, AX
+ MOVQ (R13), BP
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(BP*1), BP
+ MOVQ R15, CX
+ SHLQ CL, BP
+ MOVQ AX, 24(R13)
+ ORQ BP, R14
// exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill3:
-
// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R14, BP
+ MOVQ SI, CX
+ SHRQ CL, BP
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R9)(BP*2), CX
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R14
+ ADDB CL, R15
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ SI, CX
+ MOVQ R14, BP
+ SHRQ CL, BP
-#endif
+ // v1 := table[val1&mask]
+ MOVW (R9)(BP*2), CX
- ADDQ CX, br_bits_read // bits_read += n
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R14
+ ADDB CL, R15
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (DI)
-#else
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
+ // update the bitrader reader structure
+ MOVQ R14, 32(R13)
+ MOVB R15, 40(R13)
+ ADDQ $0x02, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ BX, DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
+ RET
-#endif
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+ XORQ DX, DX
+
+ // Preload values
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 32(CX), BX
+ MOVQ 40(CX), SI
+ MOVQ SI, (SP)
+ MOVQ 72(CX), DX
+ MOVQ DX, 8(SP)
+ MOVQ 48(CX), DI
+ MOVQ 56(CX), R8
+ MOVQ (CX), R9
+ MOVQ 8(CX), R10
+ MOVQ 16(CX), R11
+ MOVQ 24(CX), R12
+
+ // Main loop
+main_loop:
+ MOVQ (SP), SI
+ CMPQ SI, 8(SP)
+ SETGE DL
+
+ // br1000.fillFast32()
+ MOVQ 32(R9), R13
+ MOVBQZX 40(R9), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1000
+ MOVQ 24(R9), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R9), BP
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R9)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1000.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1000:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br0.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R9)
+ MOVB R14, 40(R9)
+ ADDQ DI, SI
+
+ // br1001.fillFast32()
+ MOVQ 32(R10), R13
+ MOVBQZX 40(R10), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1001
+ MOVQ 24(R10), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R10), BP
-#endif
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R10)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1001.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1001:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- ADDQ CX, br_bits_read // bits_read += n
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br1.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R10)
+ MOVB R14, 40(R10)
+ ADDQ DI, SI
+
+ // br1002.fillFast32()
+ MOVQ 32(R11), R13
+ MOVBQZX 40(R11), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1002
+ MOVQ 24(R11), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R11), BP
- ADDQ $2, off // off += 2
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R11)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1002.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1002:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
- CMPQ off, $bufoff
- JL main_loop
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
-end:
- MOVQ 0(SP), BP
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br2.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R11)
+ MOVB R14, 40(R11)
+ ADDQ DI, SI
+
+ // br1003.fillFast32()
+ MOVQ 32(R12), R13
+ MOVBQZX 40(R12), R14
+ CMPQ R14, $0x20
+ JBE skip_fill1003
+ MOVQ 24(R12), R15
+ SUBQ $0x20, R14
+ SUBQ $0x04, R15
+ MOVQ (R12), BP
- MOVB off, ret+56(FP)
- RET
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R15)(BP*1), BP
+ MOVQ R14, CX
+ SHLQ CL, BP
+ MOVQ R15, 24(R12)
+ ORQ BP, R13
+
+ // exhausted = exhausted || (br1003.off < 4)
+ CMPQ R15, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1003:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
-#undef off
-#undef buffer
-#undef table
+ // v0 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v1 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v2 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R13
+ ADDB CL, R14
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R13, R15
+ MOVQ BX, CX
+ SHRQ CL, R15
+
+ // v3 := table[val0&mask]
+ MOVW (R8)(R15*2), CX
+
+ // br3.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R13
+ ADDB CL, R14
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (SI)
+
+ // update the bitreader reader structure
+ MOVQ R13, 32(R12)
+ MOVB R14, 40(R12)
+ ADDQ $0x04, (SP)
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ MOVQ 40(AX), CX
+ MOVQ (SP), DX
+ SUBQ CX, DX
+ SHLQ $0x02, DX
+ MOVQ DX, 64(AX)
+ RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
deleted file mode 100644
index 330d86ae1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
+++ /dev/null
@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-#endif
-
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $2, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index c65ea9795..36119f385 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -439,7 +439,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
- if len(next.b) > 0 {
+ if !d.o.ignoreChecksum && len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
@@ -451,7 +451,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
got := d.current.crc.Sum64()
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
- if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+ if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
}
@@ -535,9 +535,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
// Update/Check CRC
if d.frame.HasCheckSum {
- d.frame.crc.Write(d.current.b)
+ if !d.o.ignoreChecksum {
+ d.frame.crc.Write(d.current.b)
+ }
if d.current.d.Last {
- d.current.err = d.frame.checkCRC()
+ if !d.o.ignoreChecksum {
+ d.current.err = d.frame.checkCRC()
+ } else {
+ d.current.err = d.frame.consumeCRC()
+ }
if d.current.err != nil {
println("CRC error:", d.current.err)
return false
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index fc52ebc40..c70e6fa0f 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -19,6 +19,7 @@ type decoderOptions struct {
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
+ ignoreChecksum bool
}
func (o *decoderOptions) setDefault() {
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
return nil
}
}
+
+// IgnoreChecksum allows to forcibly ignore checksum checking.
+func IgnoreChecksum(b bool) DOption {
+ return func(o *decoderOptions) error {
+ o.ignoreChecksum = b
+ return nil
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 509d5cece..3ff109cce 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -290,13 +290,6 @@ func (d *frameDec) checkCRC() error {
if !d.HasCheckSum {
return nil
}
- var tmp [4]byte
- got := d.crc.Sum64()
- // Flip to match file order.
- tmp[0] = byte(got >> 0)
- tmp[1] = byte(got >> 8)
- tmp[2] = byte(got >> 16)
- tmp[3] = byte(got >> 24)
// We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4)
@@ -305,7 +298,19 @@ func (d *frameDec) checkCRC() error {
return err
}
- if !bytes.Equal(tmp[:], want) && !ignoreCRC {
+ if d.o.ignoreChecksum {
+ return nil
+ }
+
+ var tmp [4]byte
+ got := d.crc.Sum64()
+ // Flip to match file order.
+ tmp[0] = byte(got >> 0)
+ tmp[1] = byte(got >> 8)
+ tmp[2] = byte(got >> 16)
+ tmp[3] = byte(got >> 24)
+
+ if !bytes.Equal(tmp[:], want) {
if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
}
@@ -317,6 +322,19 @@ func (d *frameDec) checkCRC() error {
return nil
}
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+ if d.HasCheckSum {
+ _, err := d.rawInput.readSmall(4)
+ if err != nil {
+ println("CRC missing?", err)
+ return err
+ }
+ }
+
+ return nil
+}
+
// runDecoder will create a sync decoder that will decode a block of data.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@@ -373,13 +391,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
err = ErrFrameSizeMismatch
} else if d.HasCheckSum {
- var n int
- n, err = d.crc.Write(dst[crcStart:])
- if err == nil {
- if n != len(dst)-crcStart {
- err = io.ErrShortWrite
- } else {
- err = d.checkCRC()
+ if d.o.ignoreChecksum {
+ err = d.consumeCRC()
+ } else {
+ var n int
+ n, err = d.crc.Write(dst[crcStart:])
+ if err == nil {
+ if n != len(dst)-crcStart {
+ err = io.ErrShortWrite
+ } else {
+ err = d.checkCRC()
+ }
}
}
}
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz.go b/vendor/github.com/klauspost/compress/zstd/fuzz.go
deleted file mode 100644
index 7f2210e05..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build ignorecrc
-// +build ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = true
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
deleted file mode 100644
index 6811c68a8..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build !ignorecrc
-// +build !ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = false
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 01cc23fa8..2585b2e98 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -1326,30 +1326,30 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
- XORQ R12, R12
+ ADDQ R13, DI
+ MOVQ BX, R12
+ ADDQ R13, BX
copy_2:
- MOVUPS (R11)(R12*1), X0
- MOVUPS X0, (BX)(R12*1)
+ MOVUPS (R11), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, R11
ADDQ $0x10, R12
- CMPQ R12, R13
- JB copy_2
- ADDQ R13, BX
- ADDQ R13, DI
+ SUBQ $0x10, R13
+ JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
- XORQ R12, R12
+ ADDQ R13, DI
copy_slow_3:
- MOVB (R11)(R12*1), R14
- MOVB R14, (BX)(R12*1)
- INCQ R12
- CMPQ R12, R13
- JB copy_slow_3
- ADDQ R13, BX
- ADDQ R13, DI
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
handle_loop:
ADDQ $0x18, AX
@@ -1826,30 +1826,30 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
- XORQ CX, CX
+ ADDQ R13, R12
+ MOVQ R10, CX
+ ADDQ R13, R10
copy_2:
- MOVUPS (AX)(CX*1), X0
- MOVUPS X0, (R10)(CX*1)
+ MOVUPS (AX), X0
+ MOVUPS X0, (CX)
+ ADDQ $0x10, AX
ADDQ $0x10, CX
- CMPQ CX, R13
- JB copy_2
- ADDQ R13, R10
- ADDQ R13, R12
+ SUBQ $0x10, R13
+ JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
- XORQ CX, CX
+ ADDQ R13, R12
copy_slow_3:
- MOVB (AX)(CX*1), R14
- MOVB R14, (R10)(CX*1)
- INCQ CX
- CMPQ CX, R13
- JB copy_slow_3
- ADDQ R13, R10
- ADDQ R13, R12
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), AX
@@ -2333,30 +2333,30 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
- XORQ R12, R12
+ ADDQ R13, R11
+ MOVQ R9, R12
+ ADDQ R13, R9
copy_2:
- MOVUPS (CX)(R12*1), X0
- MOVUPS X0, (R9)(R12*1)
+ MOVUPS (CX), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, CX
ADDQ $0x10, R12
- CMPQ R12, R13
- JB copy_2
- ADDQ R13, R9
- ADDQ R13, R11
+ SUBQ $0x10, R13
+ JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
- XORQ R12, R12
+ ADDQ R13, R11
copy_slow_3:
- MOVB (CX)(R12*1), R14
- MOVB R14, (R9)(R12*1)
- INCQ R12
- CMPQ R12, R13
- JB copy_slow_3
- ADDQ R13, R9
- ADDQ R13, R11
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), CX
@@ -2862,6 +2862,7 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
+ ADDQ R13, R12
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_2_word
@@ -2900,21 +2901,19 @@ copy_2_test:
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
- ADDQ R13, R12
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
- XORQ CX, CX
+ ADDQ R13, R12
copy_slow_3:
- MOVB (AX)(CX*1), R14
- MOVB R14, (R10)(CX*1)
- INCQ CX
- CMPQ CX, R13
- JB copy_slow_3
- ADDQ R13, R10
- ADDQ R13, R12
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), AX
@@ -3398,6 +3397,7 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
+ ADDQ R13, R11
XORQ R12, R12
TESTQ $0x00000001, R13
JZ copy_2_word
@@ -3436,21 +3436,19 @@ copy_2_test:
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
- ADDQ R13, R11
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
- XORQ R12, R12
+ ADDQ R13, R11
copy_slow_3:
- MOVB (CX)(R12*1), R14
- MOVB R14, (R9)(R12*1)
- INCQ R12
- CMPQ R12, R13
- JB copy_slow_3
- ADDQ R13, R9
- ADDQ R13, R11
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), CX