summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/README.md23
-rw-r--r--vendor/github.com/klauspost/compress/flate/deflate.go36
-rw-r--r--vendor/github.com/klauspost/compress/flate/fast_encoder.go2
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bitreader.go10
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bitwriter.go115
-rw-r--r--vendor/github.com/klauspost/compress/huff0/bytereader.go10
-rw-r--r--vendor/github.com/klauspost/compress/huff0/compress.go1
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress.go113
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.go82
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_amd64.s203
-rw-r--r--vendor/github.com/klauspost/compress/huff0/decompress_generic.go102
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bitreader.go7
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bitwriter.go76
-rw-r--r--vendor/github.com/klauspost/compress/zstd/blockdec.go31
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bytebuf.go4
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bytereader.go6
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder.go93
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_better.go8
-rw-r--r--vendor/github.com/klauspost/compress/zstd/enc_dfast.go10
-rw-r--r--vendor/github.com/klauspost/compress/zstd/encoder.go2
-rw-r--r--vendor/github.com/klauspost/compress/zstd/framedec.go5
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder.go40
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_encoder.go23
-rw-r--r--vendor/github.com/klauspost/compress/zstd/hash.go6
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec.go102
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go16
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s622
-rw-r--r--vendor/github.com/klauspost/compress/zstd/zip.go9
-rw-r--r--vendor/github.com/klauspost/compress/zstd/zstd.go11
29 files changed, 917 insertions, 851 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index c3ec9d8a7..5c3c2a258 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,24 @@ This package provides various compression algorithms.
# changelog
+* May 25, 2022 (v1.15.5)
+ * s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
+ * s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
+ * huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
+ * zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
+ * zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
+ * zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
+ * zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
+ * huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
+ * flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
+
+
+* May 11, 2022 (v1.15.4)
+ * huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
+ * inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
+ * zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
+ * zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
+
* May 5, 2022 (v1.15.3)
* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
@@ -77,6 +95,9 @@ While the release has been extensively tested, it is recommended to testing when
* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+<details>
+ <summary>See changes to v1.13.x</summary>
+
* Aug 30, 2021 (v1.13.5)
* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -105,6 +126,8 @@ While the release has been extensively tested, it is recommended to testing when
* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
+</details>
+
<details>
<summary>See changes to v1.12.x</summary>
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index bffa2f332..f8435998e 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -84,24 +84,23 @@ type advancedState struct {
length int
offset int
maxInsertIndex int
+ chainHead int
+ hashOffset int
- // Input hash chains
- // hashHead[hashValue] contains the largest inputIndex with the specified hash value
- // If hashHead[hashValue] is within the current window, then
- // hashPrev[hashHead[hashValue] & windowMask] contains the previous index
- // with the same hash value.
- chainHead int
- hashHead [hashSize]uint32
- hashPrev [windowSize]uint32
- hashOffset int
+ ii uint16 // position of last match, intended to overflow to reset.
// input window: unprocessed data is window[index:windowEnd]
index int
estBitsPerByte int
hashMatch [maxMatchLength + minMatchLength]uint32
- hash uint32
- ii uint16 // position of last match, intended to overflow to reset.
+ // Input hash chains
+ // hashHead[hashValue] contains the largest inputIndex with the specified hash value
+ // If hashHead[hashValue] is within the current window, then
+ // hashPrev[hashHead[hashValue] & windowMask] contains the previous index
+ // with the same hash value.
+ hashHead [hashSize]uint32
+ hashPrev [windowSize]uint32
}
type compressor struct {
@@ -259,7 +258,6 @@ func (d *compressor) fillWindow(b []byte) {
// Set the head of the hash chain to us.
s.hashHead[newH] = uint32(di + s.hashOffset)
}
- s.hash = newH
}
// Update window information.
d.windowEnd += n
@@ -403,7 +401,6 @@ func (d *compressor) initDeflate() {
s.hashOffset = 1
s.length = minMatchLength - 1
s.offset = 0
- s.hash = 0
s.chainHead = -1
}
@@ -432,9 +429,6 @@ func (d *compressor) deflateLazy() {
}
s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
- if s.index < s.maxInsertIndex {
- s.hash = hash4(d.window[s.index:])
- }
for {
if sanity && s.index > d.windowEnd {
@@ -466,11 +460,11 @@ func (d *compressor) deflateLazy() {
}
if s.index < s.maxInsertIndex {
// Update the hash
- s.hash = hash4(d.window[s.index:])
- ch := s.hashHead[s.hash&hashMask]
+ hash := hash4(d.window[s.index:])
+ ch := s.hashHead[hash]
s.chainHead = int(ch)
s.hashPrev[s.index&windowMask] = ch
- s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset)
+ s.hashHead[hash] = uint32(s.index + s.hashOffset)
}
prevLength := s.length
prevOffset := s.offset
@@ -503,7 +497,7 @@ func (d *compressor) deflateLazy() {
end += prevIndex
idx := prevIndex + prevLength - (4 - checkOff)
h := hash4(d.window[idx:])
- ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff)
+ ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff)
if ch2 > minIndex {
length := matchLen(d.window[prevIndex:end], d.window[ch2:])
// It seems like a pure length metric is best.
@@ -547,7 +541,6 @@ func (d *compressor) deflateLazy() {
// Set the head of the hash chain to us.
s.hashHead[newH] = uint32(di + s.hashOffset)
}
- s.hash = newH
}
s.index = newIndex
@@ -793,7 +786,6 @@ func (d *compressor) reset(w io.Writer) {
d.tokens.Reset()
s.length = minMatchLength - 1
s.offset = 0
- s.hash = 0
s.ii = 0
s.maxInsertIndex = 0
}
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index d55ea2a77..f781aaa62 100644
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -117,7 +117,7 @@ func (e *fastGen) addBlock(src []byte) int32 {
// hash4 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4u(u uint32, h uint8) uint32 {
- return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32)
+ return (u * prime4bytes) >> (32 - h)
}
type tableEntryPrev struct {
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 451160edd..504a7be9d 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
-// peekTopBits(n) is equvialent to peekBitFast(64 - n)
-func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
- return uint16(b.value >> n)
-}
-
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
}
}
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
-}
-
func (b *bitReaderShifted) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index 6bce4e87d..ec71f7a34 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -5,8 +5,6 @@
package huff0
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
- b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
- b.nBits += bits
-}
-
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
- if bits == 0 {
- return
- }
- value <<= (16 - bits) & 15
- value >>= (16 - bits) & 15
- b.bitContainer |= uint64(value) << (b.nBits & 63)
- b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- return
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- b.bitContainer >>= 1 << 3
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- b.bitContainer >>= 2 << 3
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- b.bitContainer >>= 3 << 3
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- b.bitContainer >>= 4 << 3
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- b.bitContainer >>= 5 << 3
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- b.bitContainer >>= 6 << 3
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- b.bitContainer >>= 7 << 3
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- b.bitContainer = 0
- b.nBits = 0
- return
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
b.flushAlign()
return nil
}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
- b.bitContainer = 0
- b.nBits = 0
- b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
index 50bcdf6ea..4dcab8d23 100644
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
b.off = 0
}
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
- b.off += int(n)
-}
-
// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
- return b.b[b.off:]
-}
-
// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index bc95ac623..4d14542fa 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
return true
}
+//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 04f652995..c0c48bd70 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -11,7 +11,6 @@ import (
type dTable struct {
single []dEntrySingle
- double []dEntryDouble
}
// single-symbols decoding
@@ -19,13 +18,6 @@ type dEntrySingle struct {
entry uint16
}
-// double-symbols decoding
-type dEntryDouble struct {
- seq [4]byte
- nBits uint8
- len uint8
-}
-
// Uses special code for all tables that are < 8 bits.
const use8BitTables = true
@@ -35,7 +27,7 @@ const use8BitTables = true
// If no Scratch is provided a new one is allocated.
// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
- s, err = s.prepare(in)
+ s, err = s.prepare(nil)
if err != nil {
return s, nil, err
}
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
return &[4][256]byte{}
}
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress1X8Bit(dst, src)
- }
- var br bitReaderShifted
- err := br.init(src)
- if err != nil {
- return dst, err
- }
- maxDecodedSize := cap(dst)
- dst = dst[:0]
-
- // Avoid bounds check by always having full sized table.
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- dt := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- bufs := d.buffer()
- buf := &bufs[0]
- var off uint8
-
- for br.off >= 8 {
- br.fillFast()
- v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+0] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+1] = uint8(v.entry >> 8)
-
- // Refill
- br.fillFast()
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+2] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+3] = uint8(v.entry >> 8)
-
- off += 4
- if off == 0 {
- if len(dst)+256 > maxDecodedSize {
- br.close()
- d.bufs.Put(bufs)
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:]...)
- }
- }
-
- if len(dst)+int(off) > maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:off]...)
-
- // br < 8, so uint8 is fine
- bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- if len(dst) >= maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
- nBits := uint8(v.entry)
- br.advance(nBits)
- bitsLeft -= nBits
- dst = append(dst, uint8(v.entry>>8))
- }
- d.bufs.Put(bufs)
- return dst, br.close()
-}
-
// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
const shift = 56
const tlSize = 1 << 8
- const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index 3415e5da2..671e630a8 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -2,12 +2,14 @@
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
-// that uses an asm implementation of its main loop.
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
package huff0
import (
"errors"
"fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
@@ -146,3 +148,81 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
return dst, nil
}
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+ pbr *bitReaderShifted
+ peekBits uint8
+ out *byte
+ outCap int
+ tbl *dEntrySingle
+ decoded int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:maxDecodedSize]
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+
+ if maxDecodedSize >= 4 {
+ ctx := decompress1xContext{
+ pbr: &br,
+ out: &dst[0],
+ outCap: maxDecodedSize,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ tbl: &d.dt.single[0],
+ }
+
+ if cpuinfo.HasBMI2() {
+ decompress1x_main_loop_bmi2(&ctx)
+ } else {
+ decompress1x_main_loop_amd64(&ctx)
+ }
+ if ctx.decoded == error_max_decoded_size_exeeded {
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+
+ dst = dst[:ctx.decoded]
+ }
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 06287f568..6c65c6e2b 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -660,3 +660,206 @@ skip_fill1003:
SHLQ $0x02, DX
MOVQ DX, 64(AX)
RET
+
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, CX
+ MOVQ 16(AX), DX
+ SUBQ DX, CX
+ MOVQ CX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
index 126b4d68a..4f6f37cb2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
return dst, nil
}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress1X8Bit(dst, src)
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ dt := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ bufs := d.buffer()
+ buf := &bufs[0]
+ var off uint8
+
+ for br.off >= 8 {
+ br.fillFast()
+ v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ // Refill
+ br.fillFast()
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ d.bufs.Put(bufs)
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index d7cd15ba2..97299d499 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
return v
}
-func (b *bitReader) get16BitsFast(n uint8) uint16 {
- const regMask = 64 - 1
- v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
- b.bitsRead += n
- return v
-}
-
// fillFast() will make sure at least 32 bits are available.
// There must be at least 4 bytes available.
func (b *bitReader) fillFast() {
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index b36618285..78b3c61be 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -5,8 +5,6 @@
package zstd
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
b.nBits += bits
}
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.bitContainer >>= v << 3
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index b2bca3301..7eed729be 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -49,11 +49,8 @@ const (
// Maximum possible block size (all Raw+Uncompressed).
maxBlockSize = (1 << 21) - 1
- // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
- maxCompressedLiteralSize = 1 << 18
- maxRLELiteralSize = 1 << 20
- maxMatchLen = 131074
- maxSequences = 0x7f00 + 0xffff
+ maxMatchLen = 131074
+ maxSequences = 0x7f00 + 0xffff
// We support slightly less than the reference decoder to be able to
// use ints on 32 bit archs.
@@ -105,7 +102,6 @@ type blockDec struct {
// Block is RLE, this is the size.
RLESize uint32
- tmp [4]byte
Type blockType
@@ -368,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, litRegenSize)
+ b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
} else {
- if litRegenSize > maxCompressedLiteralSize {
- // Exceptional
- b.literalBuf = make([]byte, litRegenSize)
- } else {
- b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
- }
+ b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
literals = b.literalBuf[:litRegenSize]
@@ -405,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
- b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
var err error
// Use our out buffer.
- huff.MaxDecodedSize = maxCompressedBlockSize
+ huff.MaxDecodedSize = litRegenSize
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
} else {
@@ -437,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
- b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
huff := hist.huffTree
@@ -456,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
return in, err
}
hist.huffTree = huff
- huff.MaxDecodedSize = maxCompressedBlockSize
+ huff.MaxDecodedSize = litRegenSize
// Use our out buffer.
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -471,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
if len(literals) != litRegenSize {
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
}
+ // Re-cap to get extra size.
+ literals = b.literalBuf[:len(literals)]
if debugDecoder {
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
}
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index b80191e4b..4493baa75 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
return r, nil
}
-func (b *byteBuf) remain() []byte {
- return *b
-}
-
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index 2c4fca17f..0e59a242d 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -13,12 +13,6 @@ type byteReader struct {
off int
}
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
- b.b = in
- b.off = 0
-}
-
// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 36119f385..286c8f9d7 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -637,60 +637,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
// Create Decoder:
// ASYNC:
-// Spawn 4 go routines.
-// 0: Read frames and decode blocks.
-// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
-// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
-// 3: Wait for stream history, execute sequences, send stream history.
+// Spawn 3 go routines.
+// 0: Read frames and decode block literals.
+// 1: Decode sequences.
+// 2: Execute sequences, send to output.
func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
defer d.streamWg.Done()
br := readerWrapper{r: r}
- var seqPrepare = make(chan *blockDec, d.o.concurrent)
var seqDecode = make(chan *blockDec, d.o.concurrent)
var seqExecute = make(chan *blockDec, d.o.concurrent)
- // Async 1: Prepare blocks...
- go func() {
- var hist history
- var hasErr bool
- for block := range seqPrepare {
- if hasErr {
- if block != nil {
- seqDecode <- block
- }
- continue
- }
- if block.async.newHist != nil {
- if debugDecoder {
- println("Async 1: new history")
- }
- hist.reset()
- if block.async.newHist.dict != nil {
- hist.setDict(block.async.newHist.dict)
- }
- }
- if block.err != nil || block.Type != blockTypeCompressed {
- hasErr = block.err != nil
- seqDecode <- block
- continue
- }
-
- remain, err := block.decodeLiterals(block.data, &hist)
- block.err = err
- hasErr = block.err != nil
- if err == nil {
- block.async.literals = hist.decoders.literals
- block.async.seqData = remain
- } else if debugDecoder {
- println("decodeLiterals error:", err)
- }
- seqDecode <- block
- }
- close(seqDecode)
- }()
-
- // Async 2: Decode sequences...
+ // Async 1: Decode sequences...
go func() {
var hist history
var hasErr bool
@@ -704,7 +662,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
}
if block.async.newHist != nil {
if debugDecoder {
- println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+ println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
@@ -758,7 +716,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
}
if block.async.newHist != nil {
if debugDecoder {
- println("Async 3: new history")
+ println("Async 2: new history")
}
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
@@ -845,6 +803,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
decodeStream:
for {
+ var hist history
+ var hasErr bool
+
+ decodeBlock := func(block *blockDec) {
+ if hasErr {
+ if block != nil {
+ seqDecode <- block
+ }
+ return
+ }
+ if block.err != nil || block.Type != blockTypeCompressed {
+ hasErr = block.err != nil
+ seqDecode <- block
+ return
+ }
+
+ remain, err := block.decodeLiterals(block.data, &hist)
+ block.err = err
+ hasErr = block.err != nil
+ if err == nil {
+ block.async.literals = hist.decoders.literals
+ block.async.seqData = remain
+ } else if debugDecoder {
+ println("decodeLiterals error:", err)
+ }
+ seqDecode <- block
+ }
frame := d.frame
if debugDecoder {
println("New frame...")
@@ -871,7 +856,7 @@ decodeStream:
case <-ctx.Done():
case dec := <-d.decoders:
dec.sendErr(err)
- seqPrepare <- dec
+ decodeBlock(dec)
}
break decodeStream
}
@@ -891,6 +876,10 @@ decodeStream:
if debugDecoder {
println("Alloc History:", h.allocFrameBuffer)
}
+ hist.reset()
+ if h.dict != nil {
+ hist.setDict(h.dict)
+ }
dec.async.newHist = &h
dec.async.fcs = frame.FrameContentSize
historySent = true
@@ -917,7 +906,7 @@ decodeStream:
}
err = dec.err
last := dec.Last
- seqPrepare <- dec
+ decodeBlock(dec)
if err != nil {
break decodeStream
}
@@ -926,7 +915,7 @@ decodeStream:
}
}
}
- close(seqPrepare)
+ close(seqDecode)
wg.Wait()
d.frame.history.b = frameHistCache
}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index 602c05ee0..c769f6941 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -156,8 +156,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -518,8 +518,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -674,8 +674,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -1047,8 +1047,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index d6b310424..7ff0c64fa 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -127,8 +127,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -439,8 +439,8 @@ encodeLoop:
var t int32
for {
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -785,8 +785,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -969,7 +969,7 @@ encodeLoop:
te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
- longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+ longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
e.longTable[longHash1] = te0
e.longTable[longHash2] = te1
e.markLongShardDirty(longHash1)
@@ -1002,8 +1002,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index dcc987a7c..e6b1d01cf 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
// If we can do everything in one block, prefer that.
- if len(src) <= maxCompressedBlockSize {
+ if len(src) <= e.o.blockSize {
enc.Reset(e.o.dict, true)
// Slightly faster with no history and everything in one block.
if e.o.crc {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 3ff109cce..fa0a633f3 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -253,10 +253,11 @@ func (d *frameDec) reset(br byteBuffer) error {
return ErrWindowSizeTooSmall
}
d.history.windowSize = int(d.WindowSize)
- if d.o.lowMem && d.history.windowSize < maxBlockSize {
+ if !d.o.lowMem || d.history.windowSize < maxBlockSize {
+ // Alloc 2x window size if not low-mem, or very small window size.
d.history.allocFrameBuffer = d.history.windowSize * 2
- // TODO: Maybe use FrameContent size
} else {
+ // Alloc with one additional block
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index fde4e6b60..23333b969 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -229,18 +229,10 @@ func (d decSymbol) newState() uint16 {
return uint16(d >> 16)
}
-func (d decSymbol) baseline() uint32 {
- return uint32(d >> 32)
-}
-
func (d decSymbol) baselineInt() int {
return int(d >> 32)
}
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
- *d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
func (d *decSymbol) setNBits(nBits uint8) {
const mask = 0xffffffffffffff00
*d = (*d & mask) | decSymbol(nBits)
@@ -256,11 +248,6 @@ func (d *decSymbol) setNewState(state uint16) {
*d = (*d & mask) | decSymbol(state)<<16
}
-func (d *decSymbol) setBaseline(baseline uint32) {
- const mask = 0xffffffff
- *d = (*d & mask) | decSymbol(baseline)<<32
-}
-
func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
const mask = 0xffff00ff
*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -377,34 +364,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
s.state = dt[br.getBits(tableLog)]
}
-// next returns the current symbol and sets the next state.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) next(br *bitReader) {
- lowBits := uint16(br.getBits(s.state.nbBits()))
- s.state = s.dt[s.state.newState()+lowBits]
-}
-
-// finished returns true if all bits have been read from the bitstream
-// and the next state would require reading bits from the input.
-func (s *fseState) finished(br *bitReader) bool {
- return br.finished() && s.state.nbBits() > 0
-}
-
-// final returns the current state symbol without decoding the next.
-func (s *fseState) final() (int, uint8) {
- return s.state.baselineInt(), s.state.addBits()
-}
-
// final returns the current state symbol without decoding the next.
func (s decSymbol) final() (int, uint8) {
return s.baselineInt(), s.addBits()
}
-
-// nextFast returns the next symbol and sets the next state.
-// This can only be used if no symbols are 0 bits.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
- lowBits := br.get16BitsFast(s.state.nbBits())
- s.state = s.dt[s.state.newState()+lowBits]
- return s.state.baseline(), s.state.addBits()
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index 5442061b1..ab26326a8 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
s.clearCount = maxCount != 0
}
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
-func (s *fseEncoder) prepare() (*fseEncoder, error) {
- if s == nil {
- s = &fseEncoder{}
- }
- s.useRLE = false
- if s.clearCount && s.maxCount == 0 {
- for i := range s.count {
- s.count[i] = 0
- }
- s.clearCount = false
- }
- return s, nil
-}
-
// allocCtable will allocate tables needed for compression.
// If existing tables a re big enough, they are simply re-used.
func (s *fseEncoder) allocCtable() {
@@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
c.state = c.stateTable[lu]
}
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encode(symbolTT symbolTransform) {
- nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
- dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
- c.bw.addBits16NC(c.state, uint8(nbBitsOut))
- c.state = c.stateTable[dstState]
-}
-
// flush will write the tablelog to the output and flush the remaining full bytes.
func (c *cState) flush(tableLog uint8) {
c.bw.flush32()
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
index cf33f29a1..5d73c21eb 100644
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 {
return (uint32(u) * prime4bytes) >> (32 - length)
}
}
-
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash3(u uint32, h uint8) uint32 {
- return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index e80139dd9..df0447203 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -188,6 +188,7 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
}
}
}
+
// Add final literals
copy(out[t:], s.literals)
if debugDecoder {
@@ -203,12 +204,11 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
// decode sequences from the stream with the provided history.
func (s *sequenceDecs) decodeSync(hist []byte) error {
- if true {
- supported, err := s.decodeSyncSimple(hist)
- if supported {
- return err
- }
+ supported, err := s.decodeSyncSimple(hist)
+ if supported {
+ return err
}
+
br := s.br
seqs := s.nSeqs
startSize := len(s.out)
@@ -396,6 +396,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
ofState = ofTable[ofState.newState()&maxTableMask]
} else {
bits := br.get32BitsFast(nBits)
+
lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
llState = llTable[(llState.newState()+lowBits)&maxTableMask]
@@ -418,16 +419,6 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
return br.close()
}
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) update(br *bitReader) {
- // Max 8 bits
- s.litLengths.state.next(br)
- // Max 9 bits
- s.matchLengths.state.next(br)
- // Max 8 bits
- s.offsets.state.next(br)
-}
-
var bitMask [16]uint16
func init() {
@@ -436,87 +427,6 @@ func init() {
}
}
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) updateAlt(br *bitReader) {
- // Update all 3 states at once. Approx 20% faster.
- a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-
- nBits := a.nbBits() + b.nbBits() + c.nbBits()
- if nBits == 0 {
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
- s.offsets.state.state = s.offsets.state.dt[c.newState()]
- return
- }
- bits := br.get32BitsFast(nBits)
- lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
-
- lowBits = uint16(bits >> (c.nbBits() & 31))
- lowBits &= bitMask[b.nbBits()&15]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
-
- lowBits = uint16(bits) & bitMask[c.nbBits()&15]
- s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
-}
-
-// nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
-func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
- // Final will not read from stream.
- ll, llB := llState.final()
- ml, mlB := mlState.final()
- mo, moB := ofState.final()
-
- // extra bits are stored in reverse order.
- br.fillFast()
- mo += br.getBits(moB)
- if s.maxBits > 32 {
- br.fillFast()
- }
- ml += br.getBits(mlB)
- ll += br.getBits(llB)
-
- if moB > 1 {
- s.prevOffset[2] = s.prevOffset[1]
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = mo
- return
- }
- // mo = s.adjustOffset(mo, ll, moB)
- // Inlined for rather big speedup
- if ll == 0 {
- // There is an exception though, when current sequence's literals_length = 0.
- // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
- // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
- mo++
- }
-
- if mo == 0 {
- mo = s.prevOffset[0]
- return
- }
- var temp int
- if mo == 3 {
- temp = s.prevOffset[0] - 1
- } else {
- temp = s.prevOffset[mo]
- }
-
- if temp == 0 {
- // 0 is not valid; input is corrupted; force offset to 1
- println("temp was 0")
- temp = 1
- }
-
- if mo != 1 {
- s.prevOffset[2] = s.prevOffset[1]
- }
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = temp
- mo = temp
- return
-}
-
func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
// Final will not read from stream.
ll, llB := llState.final()
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
index 4676b09cc..847b322ae 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -62,6 +62,10 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ useSafe = true
+ }
+
br := s.br
maxBlockSize := maxCompressedBlockSize
@@ -301,6 +305,10 @@ type executeAsmContext struct {
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Same as above, but with safe memcopies
+//go:noescape
+func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+
// executeSimple handles cases when dictionary is not used.
func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
// Ensure we have enough output size...
@@ -327,8 +335,12 @@ func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
literals: s.literals,
windowSize: s.windowSize,
}
-
- ok := sequenceDecs_executeSimple_amd64(&ctx)
+ var ok bool
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
+ } else {
+ ok = sequenceDecs_executeSimple_amd64(&ctx)
+ }
if !ok {
return fmt.Errorf("match offset (%d) bigger than current history (%d)",
seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 2585b2e98..212c6cac3 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -705,60 +705,55 @@ sequenceDecs_decode_bmi2_fill_2_end:
MOVQ CX, (R9)
// Fill bitreader for state updates
- MOVQ R13, (SP)
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R13
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_bmi2_skip_update
-
- // Update Literal Length State
- MOVBQZX SI, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, SI, SI
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
LEAQ (DX)(R14*1), CX
MOVQ AX, R15
MOVQ CX, DX
ROLQ CL, R15
BZHIQ R14, R15, R15
- ADDQ R15, SI
- // Load ctx.llTable
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
// Update Match Length State
- MOVBQZX DI, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, DI, DI
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- ADDQ R15, DI
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
- // Update Offset State
- MOVBQZX R8, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, R8, R8
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- ADDQ R15, R8
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
- // Load ctx.ofTable
+ // Load ctx.llTable
MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
sequenceDecs_decode_bmi2_skip_update:
// Adjust offset
@@ -965,60 +960,55 @@ sequenceDecs_decode_56_bmi2_fill_end:
MOVQ CX, (R9)
// Fill bitreader for state updates
- MOVQ R13, (SP)
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R13
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_56_bmi2_skip_update
-
- // Update Literal Length State
- MOVBQZX SI, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, SI, SI
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_56_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
LEAQ (DX)(R14*1), CX
MOVQ AX, R15
MOVQ CX, DX
ROLQ CL, R15
BZHIQ R14, R15, R15
- ADDQ R15, SI
- // Load ctx.llTable
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
// Update Match Length State
- MOVBQZX DI, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, DI, DI
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- ADDQ R15, DI
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
- // Update Offset State
- MOVBQZX R8, R14
- MOVQ $0x00001010, CX
- BEXTRQ CX, R8, R8
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- ADDQ R15, R8
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
- // Load ctx.ofTable
+ // Load ctx.llTable
MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
sequenceDecs_decode_56_bmi2_skip_update:
// Adjust offset
@@ -1171,6 +1161,228 @@ main_loop:
TESTQ R11, R11
JZ check_offset
XORQ R14, R14
+
+copy_1:
+ MOVUPS (SI)(R14*1), X0
+ MOVUPS X0, (BX)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, R11
+ JB copy_1
+ ADDQ R11, SI
+ ADDQ R11, BX
+ ADDQ R11, DI
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ LEAQ (DI)(R10*1), R11
+ CMPQ R12, R11
+ JG error_match_off_too_big
+ CMPQ R12, R8
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JGE copy_all_from_history
+ XORQ R11, R11
+ TESTQ $0x00000001, R13
+ JZ copy_4_word
+ MOVB (R14)(R11*1), R12
+ MOVB R12, (BX)(R11*1)
+ ADDQ $0x01, R11
+
+copy_4_word:
+ TESTQ $0x00000002, R13
+ JZ copy_4_dword
+ MOVW (R14)(R11*1), R12
+ MOVW R12, (BX)(R11*1)
+ ADDQ $0x02, R11
+
+copy_4_dword:
+ TESTQ $0x00000004, R13
+ JZ copy_4_qword
+ MOVL (R14)(R11*1), R12
+ MOVL R12, (BX)(R11*1)
+ ADDQ $0x04, R11
+
+copy_4_qword:
+ TESTQ $0x00000008, R13
+ JZ copy_4_test
+ MOVQ (R14)(R11*1), R12
+ MOVQ R12, (BX)(R11*1)
+ ADDQ $0x08, R11
+ JMP copy_4_test
+
+copy_4:
+ MOVUPS (R14)(R11*1), X0
+ MOVUPS X0, (BX)(R11*1)
+ ADDQ $0x10, R11
+
+copy_4_test:
+ CMPQ R11, R13
+ JB copy_4
+ ADDQ R13, DI
+ ADDQ R13, BX
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ XORQ R15, R15
+ TESTQ $0x00000001, R11
+ JZ copy_5_word
+ MOVB (R14)(R15*1), BP
+ MOVB BP, (BX)(R15*1)
+ ADDQ $0x01, R15
+
+copy_5_word:
+ TESTQ $0x00000002, R11
+ JZ copy_5_dword
+ MOVW (R14)(R15*1), BP
+ MOVW BP, (BX)(R15*1)
+ ADDQ $0x02, R15
+
+copy_5_dword:
+ TESTQ $0x00000004, R11
+ JZ copy_5_qword
+ MOVL (R14)(R15*1), BP
+ MOVL BP, (BX)(R15*1)
+ ADDQ $0x04, R15
+
+copy_5_qword:
+ TESTQ $0x00000008, R11
+ JZ copy_5_test
+ MOVQ (R14)(R15*1), BP
+ MOVQ BP, (BX)(R15*1)
+ ADDQ $0x08, R15
+ JMP copy_5_test
+
+copy_5:
+ MOVUPS (R14)(R15*1), X0
+ MOVUPS X0, (BX)(R15*1)
+ ADDQ $0x10, R15
+
+copy_5_test:
+ CMPQ R15, R11
+ JB copy_5
+ ADDQ R11, BX
+ ADDQ R11, DI
+ SUBQ R11, R13
+
+ // Copy match from the current buffer
+copy_match:
+ TESTQ R13, R13
+ JZ handle_loop
+ MOVQ BX, R11
+ SUBQ R12, R11
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, DI
+ MOVQ BX, R12
+ ADDQ R13, BX
+
+copy_2:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R12
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, DI
+
+copy_slow_3:
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+
+loop_finished:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+error_match_off_too_big:
+ // Return value
+ MOVB $0x00, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+empty_seqs:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+ RET
+
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
+ MOVQ ctx+0(FP), R10
+ MOVQ 8(R10), CX
+ TESTQ CX, CX
+ JZ empty_seqs
+ MOVQ (R10), AX
+ MOVQ 24(R10), DX
+ MOVQ 32(R10), BX
+ MOVQ 80(R10), SI
+ MOVQ 104(R10), DI
+ MOVQ 120(R10), R8
+ MOVQ 56(R10), R9
+ MOVQ 64(R10), R10
+ ADDQ R10, R9
+
+ // seqsBase += 24 * seqIndex
+ LEAQ (DX)(DX*2), R11
+ SHLQ $0x03, R11
+ ADDQ R11, AX
+
+ // outBase += outPosition
+ ADDQ DI, BX
+
+main_loop:
+ MOVQ (AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 8(AX), R13
+
+ // Copy literals
+ TESTQ R11, R11
+ JZ check_offset
+ XORQ R14, R14
TESTQ $0x00000001, R11
JZ copy_1_word
MOVB (SI)(R14*1), R15
@@ -1326,18 +1538,46 @@ copy_match:
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, DI
- MOVQ BX, R12
- ADDQ R13, BX
+ ADDQ R13, DI
+ XORQ R12, R12
+ TESTQ $0x00000001, R13
+ JZ copy_2_word
+ MOVB (R11)(R12*1), R14
+ MOVB R14, (BX)(R12*1)
+ ADDQ $0x01, R12
+
+copy_2_word:
+ TESTQ $0x00000002, R13
+ JZ copy_2_dword
+ MOVW (R11)(R12*1), R14
+ MOVW R14, (BX)(R12*1)
+ ADDQ $0x02, R12
+
+copy_2_dword:
+ TESTQ $0x00000004, R13
+ JZ copy_2_qword
+ MOVL (R11)(R12*1), R14
+ MOVL R14, (BX)(R12*1)
+ ADDQ $0x04, R12
+
+copy_2_qword:
+ TESTQ $0x00000008, R13
+ JZ copy_2_test
+ MOVQ (R11)(R12*1), R14
+ MOVQ R14, (BX)(R12*1)
+ ADDQ $0x08, R12
+ JMP copy_2_test
copy_2:
- MOVUPS (R11), X0
- MOVUPS X0, (R12)
- ADDQ $0x10, R11
+ MOVUPS (R11)(R12*1), X0
+ MOVUPS X0, (BX)(R12*1)
ADDQ $0x10, R12
- SUBQ $0x10, R13
- JHI copy_2
- JMP handle_loop
+
+copy_2_test:
+ CMPQ R12, R13
+ JB copy_2
+ ADDQ R13, BX
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
@@ -1673,45 +1913,16 @@ sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
TESTQ AX, AX
JZ check_offset
XORQ R14, R14
- TESTQ $0x00000001, AX
- JZ copy_1_word
- MOVB (R11)(R14*1), R15
- MOVB R15, (R10)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, AX
- JZ copy_1_dword
- MOVW (R11)(R14*1), R15
- MOVW R15, (R10)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, AX
- JZ copy_1_qword
- MOVL (R11)(R14*1), R15
- MOVL R15, (R10)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, AX
- JZ copy_1_test
- MOVQ (R11)(R14*1), R15
- MOVQ R15, (R10)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
copy_1:
MOVUPS (R11)(R14*1), X0
MOVUPS X0, (R10)(R14*1)
ADDQ $0x10, R14
-
-copy_1_test:
- CMPQ R14, AX
- JB copy_1
- ADDQ AX, R11
- ADDQ AX, R10
- ADDQ AX, R12
+ CMPQ R14, AX
+ JB copy_1
+ ADDQ AX, R11
+ ADDQ AX, R10
+ ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
@@ -2044,60 +2255,55 @@ sequenceDecs_decodeSync_bmi2_fill_2_end:
MOVQ CX, 24(SP)
// Fill bitreader for state updates
- MOVQ R12, (SP)
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R12
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decodeSync_bmi2_skip_update
-
- // Update Literal Length State
- MOVBQZX SI, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, SI, SI
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
LEAQ (DX)(R13*1), CX
MOVQ AX, R14
MOVQ CX, DX
ROLQ CL, R14
BZHIQ R13, R14, R14
- ADDQ R14, SI
- // Load ctx.llTable
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
// Update Match Length State
- MOVBQZX DI, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, DI, DI
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- ADDQ R14, DI
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
- // Update Offset State
- MOVBQZX R8, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, R8, R8
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- ADDQ R14, R8
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
- // Load ctx.ofTable
+ // Load ctx.llTable
MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
sequenceDecs_decodeSync_bmi2_skip_update:
// Adjust offset
@@ -2180,45 +2386,16 @@ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
TESTQ CX, CX
JZ check_offset
XORQ R14, R14
- TESTQ $0x00000001, CX
- JZ copy_1_word
- MOVB (R10)(R14*1), R15
- MOVB R15, (R9)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, CX
- JZ copy_1_dword
- MOVW (R10)(R14*1), R15
- MOVW R15, (R9)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, CX
- JZ copy_1_qword
- MOVL (R10)(R14*1), R15
- MOVL R15, (R9)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, CX
- JZ copy_1_test
- MOVQ (R10)(R14*1), R15
- MOVQ R15, (R9)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
copy_1:
MOVUPS (R10)(R14*1), X0
MOVUPS X0, (R9)(R14*1)
ADDQ $0x10, R14
-
-copy_1_test:
- CMPQ R14, CX
- JB copy_1
- ADDQ CX, R10
- ADDQ CX, R9
- ADDQ CX, R11
+ CMPQ R14, CX
+ JB copy_1
+ ADDQ CX, R10
+ ADDQ CX, R9
+ ADDQ CX, R11
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
@@ -3108,60 +3285,55 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
MOVQ CX, 24(SP)
// Fill bitreader for state updates
- MOVQ R12, (SP)
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R12
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
-
- // Update Literal Length State
- MOVBQZX SI, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, SI, SI
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
LEAQ (DX)(R13*1), CX
MOVQ AX, R14
MOVQ CX, DX
ROLQ CL, R14
BZHIQ R13, R14, R14
- ADDQ R14, SI
- // Load ctx.llTable
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
// Update Match Length State
- MOVBQZX DI, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, DI, DI
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- ADDQ R14, DI
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
- // Update Offset State
- MOVBQZX R8, R13
- MOVQ $0x00001010, CX
- BEXTRQ CX, R8, R8
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- ADDQ R14, R8
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
- // Load ctx.ofTable
+ // Load ctx.llTable
MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
sequenceDecs_decodeSync_safe_bmi2_skip_update:
// Adjust offset
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index b53f606a1..29c15c8c4 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -18,7 +18,14 @@ const ZipMethodWinZip = 93
// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
const ZipMethodPKWare = 20
-var zipReaderPool sync.Pool
+// zipReaderPool is the default reader pool.
+var zipReaderPool = sync.Pool{New: func() interface{} {
+ z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
+ if err != nil {
+ panic(err)
+ }
+ return z
+}}
// newZipReader creates a pooled zip decompressor.
func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index c1c90b4a0..3eb3f1c82 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -110,17 +110,6 @@ func printf(format string, a ...interface{}) {
}
}
-// matchLenFast does matching, but will not match the last up to 7 bytes.
-func matchLenFast(a, b []byte) int {
- endI := len(a) & (math.MaxInt32 - 7)
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- return i + bits.TrailingZeros64(diff)>>3
- }
- }
- return endI
-}
-
// matchLen returns the maximum length.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.