diff options
Diffstat (limited to 'vendor/github.com/klauspost')
12 files changed, 918 insertions, 654 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index e8ff994f8..f7f74c153 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,6 +17,19 @@ This package provides various compression algorithms. # changelog
+* Feb 17, 2022 (v1.14.3)
+ * flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494) [#478](https://github.com/klauspost/compress/pull/478)
+ * flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
+ * s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
+
+* Jan 25, 2022 (v1.14.2)
+ * zstd: improve header decoder by @dsnet [#476](https://github.com/klauspost/compress/pull/476)
+ * zstd: Add bigger default blocks [#469](https://github.com/klauspost/compress/pull/469)
+ * zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
+ * zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
+ * flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
+ * zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
+
* Jan 11, 2022 (v1.14.1)
* s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
* flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
@@ -53,6 +66,9 @@ This package provides various compression algorithms. * zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
+<details>
+ <summary>See changes to v1.12.x</summary>
+
* May 25, 2021 (v1.12.3)
* deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
* deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
@@ -74,9 +90,10 @@ This package provides various compression algorithms. * s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
* zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
* s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
+</details>
<details>
- <summary>See changes prior to v1.12.1</summary>
+ <summary>See changes to v1.11.x</summary>
* Mar 26, 2021 (v1.11.13)
* zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345)
@@ -135,7 +152,7 @@ This package provides various compression algorithms. </details>
<details>
- <summary>See changes prior to v1.11.0</summary>
+ <summary>See changes to v1.10.x</summary>
* July 8, 2020 (v1.10.11)
* zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278)
@@ -297,11 +314,6 @@ This package provides various compression algorithms. # deflate usage
-* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/).
-* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/).
-* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
-* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/)
-
The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
| old import | new import | Documentation
@@ -323,6 +335,8 @@ Memory usage is typically 1MB for a Writer. stdlib is in the same range. If you expect to have a lot of concurrently allocated Writers consider using
the stateless compress described below.
+For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
+
# Stateless compression
This package offers stateless compression as a special option for gzip/deflate.
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index 0b2e54972..d55ea2a77 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { // matchlenLong will return the match length between offsets and t in src. // It is assumed that s > t, that t >=0 and s < len(src). func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { - if debugDecode { + if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index fd49efd75..25f6d1108 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -8,6 +8,7 @@ import ( "encoding/binary" "fmt" "io" + "math" ) const ( @@ -24,6 +25,10 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since @@ -36,8 +41,11 @@ const ( bufferSize = bufferFlushSize + 8 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = [32]int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -51,6 +59,9 @@ var lengthBase = [32]uint8{ 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, @@ -78,10 +89,10 @@ func init() { for i := range offsetCombined[:] { // Don't use extended window values... - if offsetBase[i] > 0x006000 { + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { continue } - offsetCombined[i] = uint32(offsetExtraBits[i])<<16 | (offsetBase[i]) + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) } } @@ -97,7 +108,7 @@ type huffmanBitWriter struct { // Data waiting to be written is bytes[0:nbytes] // and then the low nbits of bits. bits uint64 - nbits uint16 + nbits uint8 nbytes uint8 lastHuffMan bool literalEncoding *huffmanEncoder @@ -215,7 +226,7 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { @@ -571,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -672,19 +686,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b size = reuseSize } - if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { - // Check if we get a reasonable size decrease. - if storable && ssize <= size { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) return } - w.writeFixedHeader(eof) - if !sync { - tokens.AddEOB() - } - w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) - return } // Check if we get a reasonable size decrease. if storable && ssize <= size { @@ -717,19 +733,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) // Store predefined, if we don't get a reasonable improvement. - if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { - // Store bytes, if we don't get an improvement. - if storable && ssize <= preSize { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) return } - w.writeFixedHeader(eof) - if !sync { - tokens.AddEOB() - } - w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) - return } if storable && ssize <= size { @@ -833,9 +851,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits, nbits, nbytes := w.bits, w.nbits, w.nbytes for _, t := range tokens { - if t < matchType { + if t < 256 { //w.writeCode(lits[t.literal()]) - c := lits[t.literal()] + c := lits[t] bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { @@ -858,12 +876,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the length length := t.length() - lengthCode := lengthCode(length) + lengthCode := lengthCode(length) & 31 if false { - w.writeCode(lengths[lengthCode&31]) + w.writeCode(lengths[lengthCode]) } else { // inlined - c := lengths[lengthCode&31] + c := lengths[lengthCode] bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { @@ -883,10 +901,10 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } } - extraLengthBits := uint16(lengthExtraBits[lengthCode&31]) - if extraLengthBits > 0 { + if lengthCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lengthCode] //w.writeBits(extraLength, extraLengthBits) - extraLength := int32(length - lengthBase[lengthCode&31]) + extraLength := int32(length - lengthBase[lengthCode]) bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { @@ -907,10 +925,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } // Write the offset offset := t.offset() - offsetCode := offset >> 16 - offset &= matchOffsetOnlyMask + offsetCode := (offset >> 16) & 31 if false { - w.writeCode(offs[offsetCode&31]) + w.writeCode(offs[offsetCode]) } else { // inlined c := offs[offsetCode] @@ -932,11 +949,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } } } - offsetComb := offsetCombined[offsetCode] - if offsetComb > 1<<16 { + + if offsetCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offsetCode] //w.writeBits(extraOffset, extraOffsetBits) - bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63) - nbits += uint16(offsetComb >> 16) + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -1002,6 +1020,29 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // https://stackoverflow.com/a/25454430 const guessHeaderSizeBits = 70 * 8 histogram(input, w.literalFreq[:numLiterals], fill) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + if debugDeflate { + fmt.Println("stored", abs, "<", max) + } + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } w.literalFreq[endBlockMarker] = 1 w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) if fill { @@ -1019,8 +1060,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { estBits += estBits >> w.logNewTablePenalty // Store bytes, if we don't get a reasonable improvement. - ssize, storable := w.storedSize(input) if storable && ssize <= estBits { + if debugDeflate { + fmt.Println("stored,", ssize, "<=", estBits) + } w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -1031,7 +1074,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if estBits < reuseSize { if debugDeflate { - //fmt.Println("not reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8) + fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes") } // We owe an EOB w.writeCode(w.literalEncoding.codes[endBlockMarker]) @@ -1065,6 +1108,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + if debugDeflate { + count -= int(nbytes)*8 + int(nbits) + } // Unroll, write 3 codes/loop. // Fastest number of unrolls. for len(input) > 3 { @@ -1074,13 +1120,16 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) bits >>= (n * 8) & 63 nbits -= n * 8 - nbytes += uint8(n) + nbytes += n } if nbytes >= bufferFlushSize { if w.err != nil { nbytes = 0 return } + if debugDeflate { + count += int(nbytes) * 8 + } _, w.err = w.writer.Write(w.bytes[:nbytes]) nbytes = 0 } @@ -1096,13 +1145,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Remaining... for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len - if debugDeflate { - count += int(c.len) - } if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -1114,17 +1156,33 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { nbytes = 0 return } + if debugDeflate { + count += int(nbytes) * 8 + } _, w.err = w.writer.Write(w.bytes[:nbytes]) nbytes = 0 } } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= uint64(c.code) << (nbits & 63) + nbits += c.len + if debugDeflate { + count += int(c.len) + } } // Restore... w.bits, w.nbits, w.nbytes = bits, nbits, nbytes if debugDeflate { - fmt.Println("wrote", count/8, "bytes") + nb := count + int(nbytes)*8 + int(nbits) + fmt.Println("wrote", nb, "bits,", nb/8, "bytes.") + } + // Flush if needed to have space. + if w.nbits >= 48 { + w.writeOutBits() } + if eof || sync { w.writeCode(w.literalEncoding.codes[endBlockMarker]) w.lastHeader = 0 diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index f35e00261..9ab497c27 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -17,7 +17,8 @@ const ( // hcode is a huffman code with a bit code and bit length. type hcode struct { - code, len uint16 + code uint16 + len uint8 } type huffmanEncoder struct { @@ -56,7 +57,7 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { +func (h *hcode) set(code uint16, length uint8) { h.len = length h.code = code } @@ -80,7 +81,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { var ch uint16 for ch = 0; ch < literalCount; ch++ { var bits uint16 - var size uint16 + var size uint8 switch { case ch < 144: // size 8, 000110000 .. 10111111 @@ -99,7 +100,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { bits = ch + 192 - 280 size = 8 } - codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size} + codes[ch] = hcode{code: reverseBits(bits, size), len: size} } return h } @@ -187,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: int32(list[1].freq), - nextCharFreq: int32(list[2].freq), - nextPairFreq: int32(list[0].freq) + int32(list[1].freq), + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -205,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { // We've run out of both leafs and pairs. @@ -238,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + if true { + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save + } else { + copy(leafCounts[level][:level], leafCounts[level-1][:level]) + } levels[l.level-1].needed = 2 } @@ -296,7 +308,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)} code++ } list = list[0 : len(list)-int(bits)] @@ -309,6 +321,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // maxBits The maximum number of bits to use for any literal. func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -317,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - list[count] = literalNode{} - h.codes[i].len = 0 + codes[i].len = 0 } } - list[len(freq)] = literalNode{} + list[count] = literalNode{} list = list[:count] if count <= 2 { diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index d5f62f6a2..414c0bea9 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -36,6 +36,13 @@ type lengthExtra struct { var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}} +var bitMask32 = [32]uint32{ + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, + 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, + 0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF, + 0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF, +} // up to 32 bits + // Initialize the fixedHuffmanDecoder only once upon first use. var fixedOnce sync.Once var fixedHuffmanDecoder huffmanDecoder @@ -559,221 +566,6 @@ func (f *decompressor) readHuffman() error { return nil } -// Decode a single Huffman block from f. -// hl and hd are the Huffman states for the lit/length values -// and the distance values, respectively. If hd == nil, using the -// fixed distance encoding associated with fixed Huffman blocks. -func (f *decompressor) huffmanBlockGeneric() { - const ( - stateInit = iota // Zero value must be stateInit - stateDict - ) - - switch f.stepState { - case stateInit: - goto readLiteral - case stateDict: - goto copyHistory - } - -readLiteral: - // Read literal and/or (length, distance) according to RFC section 3.2.3. - { - var v int - { - // Inlined v, err := f.huffSym(f.hl) - // Since a huffmanDecoder can be empty or be composed of a degenerate tree - // with single element, huffSym must error on these two edge cases. In both - // cases, the chunks slice will be 0 for the invalid sequence, leading it - // satisfy the n == 0 check below. - n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b - for { - for nb < n { - c, err := f.r.ReadByte() - if err != nil { - f.b = b - f.nb = nb - f.err = noEOF(err) - return - } - f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 - } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] - n = uint(chunk & huffmanCountMask) - if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] - n = uint(chunk & huffmanCountMask) - } - if n <= nb { - if n == 0 { - f.b = b - f.nb = nb - if debugDecode { - fmt.Println("huffsym: n==0") - } - f.err = CorruptInputError(f.roffset) - return - } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n - v = int(chunk >> huffmanValueShift) - break - } - } - } - - var n uint // number of bits extra - var length int - var err error - switch { - case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric - f.stepState = stateInit - return - } - goto readLiteral - case v == 256: - f.finishBlock() - return - // otherwise, reference to older data - case v < 265: - length = v - (257 - 3) - n = 0 - case v < 269: - length = v*2 - (265*2 - 11) - n = 1 - case v < 273: - length = v*4 - (269*4 - 19) - n = 2 - case v < 277: - length = v*8 - (273*8 - 35) - n = 3 - case v < 281: - length = v*16 - (277*16 - 67) - n = 4 - case v < 285: - length = v*32 - (281*32 - 131) - n = 5 - case v < maxNumLit: - length = 258 - n = 0 - default: - if debugDecode { - fmt.Println(v, ">= maxNumLit") - } - f.err = CorruptInputError(f.roffset) - return - } - if n > 0 { - for f.nb < n { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits n>0:", err) - } - f.err = err - return - } - } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n - } - - var dist uint32 - if f.hd == nil { - for f.nb < 5 { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb<5:", err) - } - f.err = err - return - } - } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 - } else { - sym, err := f.huffSym(f.hd) - if err != nil { - if debugDecode { - fmt.Println("huffsym:", err) - } - f.err = err - return - } - dist = uint32(sym) - } - - switch { - case dist < 4: - dist++ - case dist < maxNumDist: - nb := uint(dist-2) >> 1 - // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb<nb:", err) - } - f.err = err - return - } - } - extra |= f.b & uint32(1<<(nb®SizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb - dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra - default: - if debugDecode { - fmt.Println("dist too big:", dist, maxNumDist) - } - f.err = CorruptInputError(f.roffset) - return - } - - // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { - if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) - } - f.err = CorruptInputError(f.roffset) - return - } - - f.copyLen, f.copyDist = length, int(dist) - goto copyHistory - } - -copyHistory: - // Perform a backwards copy according to RFC section 3.2.3. - { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) - if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) - } - f.copyLen -= cnt - - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work - f.stepState = stateDict - return - } - goto readLiteral - } -} - // Copy a single uncompressed data block from input to output. func (f *decompressor) dataBlock() { // Uncompressed. diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go index cc6db2792..8d632cea0 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go +++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go @@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() { ) fr := f.r.(*bytes.Buffer) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb := f.nb, f.b + switch f.stepState { case stateInit: goto readLiteral @@ -39,41 +44,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -88,10 +87,12 @@ readLiteral: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBytesBuffer f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -101,9 +102,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -111,25 +113,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -137,12 +141,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -152,38 +156,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -197,9 +198,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -207,14 +209,16 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb®SizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -224,6 +228,7 @@ readLiteral: // No check on length; encoding can be prescient. if dist > uint32(f.dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -248,10 +253,12 @@ copyHistory: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() { ) fr := f.r.(*bytes.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb := f.nb, f.b + switch f.stepState { case stateInit: goto readLiteral @@ -283,41 +295,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -332,10 +338,12 @@ readLiteral: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBytesReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -345,9 +353,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -355,25 +364,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -381,12 +392,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -396,38 +407,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -441,9 +449,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -451,14 +460,16 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb®SizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -468,6 +479,7 @@ readLiteral: // No check on length; encoding can be prescient. if dist > uint32(f.dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -492,10 +504,12 @@ copyHistory: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBytesReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() { ) fr := f.r.(*bufio.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb := f.nb, f.b + switch f.stepState { case stateInit: goto readLiteral @@ -527,41 +546,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -576,10 +589,12 @@ readLiteral: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBufioReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -589,9 +604,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -599,25 +615,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -625,12 +643,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -640,38 +658,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -685,9 +700,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -695,14 +711,16 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb®SizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -712,6 +730,7 @@ readLiteral: // No check on length; encoding can be prescient. if dist > uint32(f.dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -736,10 +755,12 @@ copyHistory: f.toRead = f.dict.readFlush() f.step = (*decompressor).huffmanBufioReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() { ) fr := f.r.(*strings.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb := f.nb, f.b + switch f.stepState { case stateInit: goto readLiteral @@ -771,41 +797,286 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanStringsReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << (nb & regSizeMaskUint32) + for fnb < nb { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb + dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra + default: + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > uint32(f.dict.histSize()) { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, int(dist) + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanStringsReader // We need to continue this work + f.stepState = stateDict + f.b, f.nb = fb, fnb + return + } + goto readLiteral + } + // Not reached +} + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanGenericReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(Reader) + + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb := f.nb, f.b + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + for { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -818,12 +1089,14 @@ readLiteral: f.dict.writeByte(byte(v)) if f.dict.availWrite() == 0 { f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader + f.step = (*decompressor).huffmanGenericReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -833,9 +1106,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -843,25 +1117,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -869,12 +1145,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -884,38 +1160,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -929,9 +1202,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -939,14 +1213,16 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb®SizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -956,6 +1232,7 @@ readLiteral: // No check on length; encoding can be prescient. if dist > uint32(f.dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -978,12 +1255,14 @@ copyHistory: if f.dict.availWrite() == 0 || f.copyLen > 0 { f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader // We need to continue this work + f.step = (*decompressor).huffmanGenericReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } func (f *decompressor) huffmanBlockDecoder() func() { @@ -996,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() { return f.huffmanBufioReader case *strings.Reader: return f.huffmanStringsReader + case Reader: + return f.huffmanGenericReader default: - return f.huffmanBlockGeneric + return f.huffmanGenericReader } } diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 1e5eea396..0022c8bb6 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -1,6 +1,10 @@ package flate -import "fmt" +import ( + "encoding/binary" + "fmt" + "math/bits" +) // fastGen maintains the table for matches, // and the previous byte block for level 2. @@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + var l = int32(4) + if false { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else { + // inlined: + a := src[s+4:] + b := src[t+4:] + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + l += int32(bits.TrailingZeros64(diff) >> 3) + break + } + l += 8 + a = a[8:] + b = b[8:] + } + if len(a) < 8 { + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + break + } + l++ + } + } + } // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { @@ -129,7 +158,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } // Save the match found - dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + if false { + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + } else { + // Inlined... + xoffset := uint32(s - t - baseMatchOffset) + xlength := l + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + xl = 258 - baseMatchLength + } + xlength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) + dst.n++ + } + } s += l nextEmit = s if nextS >= s { diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index c22b4244a..465b5e0ba 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -5,7 +5,7 @@ import "fmt" // fastEncL3 type fastEncL3 struct { fastGen - table [tableSize]tableEntryPrev + table [1 << 16]tableEntryPrev } // Encode uses a similar algorithm to level 2, will check up to two candidates. @@ -13,6 +13,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { const ( inputMargin = 8 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + tableSize = 1 << tableBits ) if debugDeflate && e.cur < 0 { @@ -73,7 +75,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hash4u(cv, tableBits) s = nextS nextS = s + 1 + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -156,7 +158,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // Index first pair after match end. if int(t+4) < len(src) && t > 0 { cv := load3232(src, t) - nextHash := hash(cv) + nextHash := hash4u(cv, tableBits) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, Cur: tableEntry{offset: e.cur + t}, @@ -165,30 +167,31 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { goto emitRemainder } - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash] = tableEntryPrev{ - Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 3}, + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 5 { + nextHash := hash4u(load3232(src, i), tableBits) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} } - x >>= 8 - prevHash = hash(uint32(x)) + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. + x := load6432(src, s-2) + prevHash := hash4u(uint32(x), tableBits) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 2}, } x >>= 8 - prevHash = hash(uint32(x)) + prevHash = hash4u(uint32(x), tableBits) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 1}, } x >>= 8 - currHash := hash(uint32(x)) + currHash := hash4u(uint32(x), tableBits) candidates := e.table[currHash] cv = uint32(x) e.table[currHash] = tableEntryPrev{ @@ -200,15 +203,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { candidate = candidates.Cur minOffset := e.cur + s - (maxMatchOffset - 4) - if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) { - // We only check if value mismatches. - // Offset will always be invalid in other cases. + if candidate.offset > minOffset { + if cv == load3232(src, candidate.offset-e.cur) { + // Found a match... + continue + } candidate = candidates.Prev if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } + // Match at prev... + continue } } cv = uint32(x >> 8) diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index 3a9618ee1..8005a5ca8 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -13,11 +13,10 @@ import ( ) const ( - // From top - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 5 bits offsetcode - // 16 bits xoffset = offset - MIN_OFFSET_SIZE, or literal + // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits + // bits 16-22 offsetcode - 5 bits + // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits + // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits lengthShift = 22 offsetMask = 1<<lengthShift - 1 typeMask = 3 << 30 @@ -276,7 +275,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { xoffset |= oCode << 16 t.extraHist[lengthCodes1[uint8(xlength)]]++ - t.offHist[oCode]++ + t.offHist[oCode&31]++ t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) t.n++ } @@ -300,7 +299,7 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { xlength -= xl xl -= baseMatchLength t.extraHist[lengthCodes1[uint8(xl)]]++ - t.offHist[oc]++ + t.offHist[oc&31]++ t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) t.n++ } @@ -356,8 +355,8 @@ func (t token) offset() uint32 { return uint32(t) & offsetMask } func (t token) length() uint8 { return uint8(t >> lengthShift) } -// The code is never more than 8 bits, but is returned as uint32 for convenience. -func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } // Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go index 2a06bd1a7..2668b64d3 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "io" + "sync" "github.com/klauspost/compress/fse" ) @@ -216,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder { return &Decoder{ dt: s.dt, actualTableLog: s.actualTableLog, + bufs: &s.decPool, } } @@ -223,6 +225,15 @@ func (s *Scratch) Decoder() *Decoder { type Decoder struct { dt dTable actualTableLog uint8 + bufs *sync.Pool +} + +func (d *Decoder) buffer() *[4][256]byte { + buf, ok := d.bufs.Get().(*[4][256]byte) + if ok { + return buf + } + return &[4][256]byte{} } // Decompress1X will decompress a 1X encoded stream. @@ -249,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { dt := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 for br.off >= 8 { @@ -277,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -284,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -310,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -319,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { bitsLeft -= nBits dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -341,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 switch d.actualTableLog { @@ -369,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -398,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -426,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -455,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -484,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -513,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -542,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -571,6 +595,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -578,10 +603,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } } default: + d.bufs.Put(bufs) return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog) } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -601,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } if len(dst) >= maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } v := dt[br.peekByteFast()>>shift] @@ -609,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -628,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 const shift = 56 @@ -655,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -663,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -679,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -688,6 +721,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -735,12 +769,12 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 2 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -758,8 +792,8 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { v2 := single[val2&tlMask] br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) val = br[stream].peekBitsFast(d.actualTableLog) val2 = br[stream2].peekBitsFast(d.actualTableLog) @@ -767,8 +801,8 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { v2 = single[val2&tlMask] br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) } { @@ -783,8 +817,8 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { v2 := single[val2&tlMask] br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) val = br[stream].peekBitsFast(d.actualTableLog) val2 = br[stream2].peekBitsFast(d.actualTableLog) @@ -792,25 +826,26 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { v2 = single[val2&tlMask] br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) } off += 2 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 + copy(out, buf[0][:]) + copy(out[dstEvery:], buf[1][:]) + copy(out[dstEvery*2:], buf[2][:]) + copy(out[dstEvery*3:], buf[3][:]) out = out[bufoff:] - decoded += 256 + decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -818,12 +853,13 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } @@ -853,6 +889,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -871,6 +908,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -916,12 +954,12 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -942,8 +980,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -951,8 +989,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -960,8 +998,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -969,8 +1007,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - buf[off+bufoff*stream+3] = uint8(v >> 8) + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { @@ -987,8 +1025,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -996,8 +1034,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -1005,8 +1043,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -1014,25 +1052,26 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - buf[off+bufoff*stream+3] = uint8(v >> 8) + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 + copy(out, buf[0][:]) + copy(out[dstEvery:], buf[1][:]) + copy(out[dstEvery*2:], buf[2][:]) + copy(out[dstEvery*3:], buf[3][:]) out = out[bufoff:] - decoded += 256 + decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -1040,12 +1079,13 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } @@ -1057,6 +1097,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { bitsLeft := int(br.off*8) + int(64-br.bitsRead) for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1077,6 +1118,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1091,9 +1133,11 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -1135,12 +1179,12 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -1150,104 +1194,109 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { // Interleave 2 decodes. const stream = 0 const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) - v := single[uint8(br[stream].value>>shift)].entry - v2 := single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+3] = uint8(v >> 8) - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { const stream = 2 const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) - v := single[uint8(br[stream].value>>shift)].entry - v2 := single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+3] = uint8(v >> 8) - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 + copy(out, buf[0][:]) + copy(out[dstEvery:], buf[1][:]) + copy(out[dstEvery*2:], buf[2][:]) + copy(out[dstEvery*3:], buf[3][:]) out = out[bufoff:] - decoded += 256 + decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -1257,10 +1306,10 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { if len(out) < dstEvery*3+ioff { return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } @@ -1272,6 +1321,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { bitsLeft := int(br.off*8) + int(64-br.bitsRead) for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1292,6 +1342,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1306,9 +1357,11 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go index 3ee00ecb4..e8ad17ad0 100644 --- a/vendor/github.com/klauspost/compress/huff0/huff0.go +++ b/vendor/github.com/klauspost/compress/huff0/huff0.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "math/bits" + "sync" "github.com/klauspost/compress/fse" ) @@ -116,6 +117,7 @@ type Scratch struct { nodes []nodeElt tmpOut [4][]byte fse *fse.Scratch + decPool sync.Pool // *[4][256]byte buffers. huffWeight [maxSymbolValue + 1]byte } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index 5f08a2830..f51ab529a 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -85,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 7 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s @@ -334,7 +334,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 8 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s |