diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress')
18 files changed, 704 insertions, 400 deletions
diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml index c9014ce1d..0af08e65e 100644 --- a/vendor/github.com/klauspost/compress/.goreleaser.yml +++ b/vendor/github.com/klauspost/compress/.goreleaser.yml @@ -3,6 +3,7 @@ before: hooks: - ./gen.sh + - go install mvdan.cc/garble@latest builds: - @@ -31,6 +32,7 @@ builds: - mips64le goarm: - 7 + gobinary: garble - id: "s2d" binary: s2d @@ -57,6 +59,7 @@ builds: - mips64le goarm: - 7 + gobinary: garble - id: "s2sx" binary: s2sx @@ -84,6 +87,7 @@ builds: - mips64le goarm: - 7 + gobinary: garble archives: - diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index 3429879eb..e8ff994f8 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,6 +17,13 @@ This package provides various compression algorithms. # changelog
+* Jan 11, 2022 (v1.14.1)
+ * s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
+ * flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
+ * zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
+ * zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
+ * Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+
* Aug 30, 2021 (v1.13.5)
* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -432,6 +439,13 @@ For more information see my blog post on [Fast Linear Time Compression](http://b This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip.
+# Other packages
+
+Here are other packages of good quality and pure Go (no cgo wrappers or autoconverted code):
+
+* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
+* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
+* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
# license
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index 5283ac5a5..b27f5a93b 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -6,9 +6,13 @@ package flate import ( + "encoding/binary" "fmt" "io" "math" + "math/bits" + + comp "github.com/klauspost/compress" ) const ( @@ -37,15 +41,17 @@ const ( maxMatchLength = 258 // The longest match for the compressor minOffsetSize = 1 // The shortest offset that makes any sense - // The maximum number of tokens we put into a single flat block, just too - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. + maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 hashShift = (hashBits + minMatchLength - 1) / minMatchLength - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 @@ -70,9 +76,9 @@ var levels = []compressionLevel{ {0, 0, 0, 0, 0, 6}, // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {8, 8, 24, 16, skipNever, 7}, - {10, 16, 24, 64, skipNever, 8}, - {32, 258, 258, 4096, skipNever, 9}, + {6, 10, 12, 16, skipNever, 7}, + {10, 24, 32, 64, skipNever, 8}, + {32, 258, 258, 1024, skipNever, 9}, } // advancedState contains state for the advanced levels, with bigger hash tables, etc. @@ -93,8 +99,9 @@ type advancedState struct { hashOffset int // input window: unprocessed data is window[index:windowEnd] - index int - hashMatch [maxMatchLength + minMatchLength]uint32 + index int + estBitsPerByte int + hashMatch [maxMatchLength + minMatchLength]uint32 hash uint32 ii uint16 // position of last match, intended to overflow to reset. @@ -170,7 +177,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tok, eof, window) + //d.w.writeBlock(tok, eof, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) return d.w.err } return nil @@ -263,7 +271,7 @@ func (d *compressor) fillWindow(b []byte) { // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { +func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (length, offset int, ok bool) { minMatchLook := maxMatchLength if lookahead < minMatchLook { minMatchLook = lookahead @@ -279,36 +287,43 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead // If we've got a match that's good enough, only look in 1/4 the chain. tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } + length = minMatchLength - 1 wEnd := win[pos+length] wPos := win[pos:] minIndex := pos - windowSize + if minIndex < 0 { + minIndex = 0 + } + offset = 0 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. + cGain := minMatchLength*bpb - 12 for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { n := matchLen(win[i:i+minMatchLook], wPos) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break + if n > length { + newGain := n*bpb - bits.Len32(uint32(pos-i)) + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] } - wEnd = win[pos+n] } } - if i == minIndex { + if i <= minIndex { // hashPrev[i & windowMask] has already been overwritten, so stop now. break } i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset - if i < minIndex || i < 0 { + if i < minIndex { break } } @@ -327,8 +342,7 @@ func (d *compressor) writeStoredBlock(buf []byte) error { // of the supplied slice. // The caller must ensure that len(b) >= 4. func hash4(b []byte) uint32 { - b = b[:4] - return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits) + return hash4u(binary.LittleEndian.Uint32(b), hashBits) } // bulkHash4 will compute hashes using the same @@ -337,11 +351,12 @@ func bulkHash4(b []byte, dst []uint32) { if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 + hb := binary.LittleEndian.Uint32(b) + dst[0] = hash4u(hb, hashBits) end := len(b) - 4 + 1 for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) + hb = (hb >> 8) | uint32(b[i+3])<<24 dst[i] = hash4u(hb, hashBits) } } @@ -374,10 +389,15 @@ func (d *compressor) deflateLazy() { if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + s.estBitsPerByte = 8 + if !d.sync { + s.estBitsPerByte = comp.ShannonEntropyBits(d.window[s.index:d.windowEnd]) + s.estBitsPerByte = int(1 + float64(s.estBitsPerByte)/float64(d.windowEnd-s.index)) + } s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) if s.index < s.maxInsertIndex { - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) + s.hash = hash4(d.window[s.index:]) } for { @@ -410,7 +430,7 @@ func (d *compressor) deflateLazy() { } if s.index < s.maxInsertIndex { // Update the hash - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) + s.hash = hash4(d.window[s.index:]) ch := s.hashHead[s.hash&hashMask] s.chainHead = int(ch) s.hashPrev[s.index&windowMask] = ch @@ -426,12 +446,37 @@ func (d *compressor) deflateLazy() { } if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead, s.estBitsPerByte); ok { s.length = newLength s.offset = newOffset } } + if prevLength >= minMatchLength && s.length <= prevLength { + // Check for better match at end... + // + // checkOff must be >=2 since we otherwise risk checking s.index + // Offset of 2 seems to yield best results. + const checkOff = 2 + prevIndex := s.index - 1 + if prevIndex+prevLength+checkOff < s.maxInsertIndex { + end := lookahead + if lookahead > maxMatchLength { + end = maxMatchLength + } + end += prevIndex + idx := prevIndex + prevLength - (4 - checkOff) + h := hash4(d.window[idx:]) + ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff) + if ch2 > minIndex { + length := matchLen(d.window[prevIndex:end], d.window[ch2:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + } + } + } // There was a match at the previous step, and the current match is // not better. Output the previous match. d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) @@ -479,6 +524,7 @@ func (d *compressor) deflateLazy() { } d.tokens.Reset() } + s.ii = 0 } else { // Reset, if we got a match this run. if s.length >= minMatchLength { @@ -498,13 +544,12 @@ func (d *compressor) deflateLazy() { // If we have a long run of no matches, skip additional bytes // Resets when s.ii overflows after 64KB. - if s.ii > 31 { - n := int(s.ii >> 5) + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) for j := 0; j < n; j++ { if s.index >= d.windowEnd-1 { break } - d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { @@ -512,6 +557,14 @@ func (d *compressor) deflateLazy() { } d.tokens.Reset() } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } s.index++ } // Flush last byte @@ -611,7 +664,9 @@ func (d *compressor) write(b []byte) (n int, err error) { } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err @@ -652,13 +707,13 @@ func (d *compressor) init(w io.Writer, level int) (err error) { level = 5 fallthrough case level >= 1 && level <= 6: - d.w.logNewTablePenalty = 8 + d.w.logNewTablePenalty = 7 d.fast = newFastEnc(level) d.window = make([]byte, maxStoreBlockSize) d.fill = (*compressor).fillBlock d.step = (*compressor).storeFast case 7 <= level && level <= 9: - d.w.logNewTablePenalty = 10 + d.w.logNewTablePenalty = 8 d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index a746eb733..0b2e54972 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -213,11 +213,9 @@ func (e *fastGen) Reset() { // matchLen returns the maximum length. // 'a' must be the shortest of the two. func matchLen(a, b []byte) int { - b = b[:len(a)] var checked int for len(a) >= 8 { - b = b[:len(a)] if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { return checked + (bits.TrailingZeros64(diff) >> 3) } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index 3ad5e9807..fb1701eec 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -155,37 +155,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) { w.lastHuffMan = false } -func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) { - offsets, lits = true, true +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { a := t.offHist[:offsetCodeCount] - b := w.offsetFreq[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - offsets = false - break + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].len == 0 { + return false } } a = t.extraHist[:literalCount-256] - b = w.literalFreq[256:literalCount] + b = w.literalEncoding.codes[256:literalCount] b = b[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - lits = false - break + for i, v := range a { + if v != 0 && b[i].len == 0 { + return false } } - if lits { - a = t.litHist[:] - b = w.literalFreq[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - lits = false - break - } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].len == 0 { + return false } } - return + return true } func (w *huffmanBitWriter) flush() { @@ -222,7 +218,7 @@ func (w *huffmanBitWriter) write(b []byte) { } func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { - w.bits |= uint64(b) << w.nbits + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { w.writeOutBits() @@ -423,7 +419,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << w.nbits + w.bits |= uint64(c.code) << (w.nbits & 63) w.nbits += c.len if w.nbits >= 48 { w.writeOutBits() @@ -566,7 +562,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { w.lastHeader = 0 } numLiterals, numOffsets := w.indexTokens(tokens, false) - w.generate(tokens) + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { @@ -595,7 +591,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -634,22 +630,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b w.lastHeader = 0 w.lastHuffMan = false } - if !sync { - tokens.Fill() + + // fillReuse enables filling of empty values. + // This will make encodings always reusable without testing. + // However, this does not appear to benefit on most cases. + const fillReuse = false + + // Check if we can reuse... + if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } + numLiterals, numOffsets := w.indexTokens(tokens, !sync) + extraBits := 0 + ssize, storable := w.storedSize(input) + + const usePrefs = true + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } var size int + // Check if we should reuse. if w.lastHeader > 0 { // Estimate size for using a new table. // Use the previous header size as the best estimate. newSize := w.lastHeader + tokens.EstimatedBits() - newSize += newSize >> w.logNewTablePenalty + newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty // The estimated size is calculated as an optimal table. // We add a penalty to make it more realistic and re-use a bit more. - reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize() + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits // Check if a new table is better. if newSize < reuseSize { @@ -660,35 +673,79 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b } else { size = reuseSize } + + if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } // Check if we get a reasonable size decrease. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) - w.lastHeader = 0 return } } // We want a new block/table if w.lastHeader == 0 { - w.generate(tokens) + if fillReuse && !sync { + w.fillTokens() + numLiterals, numOffsets = maxNumLit, maxNumDist + } else { + w.literalFreq[endBlockMarker] = 1 + } + + w.generate() // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) + var numCodegens int - size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize()) - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if fillReuse && !sync { + // Reindex for accurate size... + w.indexTokens(tokens, true) + } + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined, if we don't get a reasonable improvement. + if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. w.writeStoredHeader(len(input), eof) w.writeBytes(input) - w.lastHeader = 0 return } // Write Huffman table. w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - w.lastHeader, _ = w.headerSize() + if !sync { + w.lastHeader, _ = w.headerSize() + } w.lastHuffMan = false } @@ -699,6 +756,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } +func (w *huffmanBitWriter) fillTokens() { + for i, v := range w.literalFreq[:literalCount] { + if v == 0 { + w.literalFreq[i] = 1 + } + } + for i, v := range w.offsetFreq[:offsetCodeCount] { + if v == 0 { + w.offsetFreq[i] = 1 + } + } +} + // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. @@ -733,7 +803,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num return } -func (w *huffmanBitWriter) generate(t *tokens) { +func (w *huffmanBitWriter) generate() { w.literalEncoding.generate(w.literalFreq[:literalCount], 15) w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) } @@ -768,7 +838,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if t < matchType { //w.writeCode(lits[t.literal()]) c := lits[t.literal()] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -796,7 +866,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := lengths[lengthCode&31] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -819,7 +889,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if extraLengthBits > 0 { //w.writeBits(extraLength, extraLengthBits) extraLength := int32(length - lengthBase[lengthCode&31]) - bits |= uint64(extraLength) << nbits + bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -846,7 +916,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := offs[offsetCode] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -867,7 +937,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) offsetComb := offsetCombined[offsetCode] if offsetComb > 1<<16 { //w.writeBits(extraOffset, extraOffsetBits) - bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << nbits + bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63) nbits += uint16(offsetComb >> 16) if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -996,10 +1066,41 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { encoding := w.literalEncoding.codes[:256] // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += uint8(n) + } + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + a, b := encoding[input[0]], encoding[input[1]] + bits |= uint64(a.code) << (nbits & 63) + bits |= uint64(b.code) << ((nbits + a.len) & 63) + c := encoding[input[2]] + nbits += b.len + a.len + bits |= uint64(c.code) << (nbits & 63) + nbits += c.len + input = input[3:] + } + + // Remaining... for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if debugDeflate { count += int(c.len) diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index d1edb356c..d5f62f6a2 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -328,11 +328,17 @@ func (f *decompressor) nextBlock() { switch typ { case 0: f.dataBlock() + if debugDecode { + fmt.Println("stored block") + } case 1: // compressed, fixed Huffman tables f.hl = &fixedHuffmanDecoder f.hd = nil f.huffmanBlockDecoder()() + if debugDecode { + fmt.Println("predefinied huffman block") + } case 2: // compressed, dynamic Huffman tables if f.err = f.readHuffman(); f.err != nil { @@ -341,6 +347,9 @@ func (f *decompressor) nextBlock() { f.hl = &f.h1 f.hd = &f.h2 f.huffmanBlockDecoder()() + if debugDecode { + fmt.Println("dynamic huffman block") + } default: // 3 is reserved. if debugDecode { diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go index 9b7cc8e97..2a06bd1a7 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress.go @@ -20,7 +20,7 @@ type dEntrySingle struct { // double-symbols decoding type dEntryDouble struct { - seq uint16 + seq [4]byte nBits uint8 len uint8 } @@ -753,23 +753,21 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { br[stream2].fillFast() val := br[stream].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) + buf[off+bufoff*stream] = uint8(v.entry >> 8) buf[off+bufoff*stream2] = uint8(v2.entry >> 8) val = br[stream].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) + buf[off+bufoff*stream+1] = uint8(v.entry >> 8) buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) } @@ -780,23 +778,21 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { br[stream2].fillFast() val := br[stream].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) + buf[off+bufoff*stream] = uint8(v.entry >> 8) buf[off+bufoff*stream2] = uint8(v2.entry >> 8) val = br[stream].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) br[stream2].advance(uint8(v2.entry)) + buf[off+bufoff*stream+1] = uint8(v.entry >> 8) buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) } @@ -914,7 +910,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { out := dst dstEvery := (dstSize + 3) / 4 - shift := (8 - d.actualTableLog) & 7 + shift := (56 + (8 - d.actualTableLog)) & 63 const tlSize = 1 << 8 single := d.dt.single[:tlSize] @@ -935,79 +931,91 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { // Interleave 2 decodes. const stream = 0 const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + buf[off+bufoff*stream+3] = uint8(v >> 8) } { const stream = 2 const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - v2 = single[br[stream2].peekByteFast()>>shift].entry + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + buf[off+bufoff*stream+3] = uint8(v >> 8) } off += 4 @@ -1073,7 +1081,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { } // Read value and increment offset. - v := single[br.peekByteFast()>>shift].entry + v := single[uint8(br.value>>shift)].entry nBits := uint8(v) br.advance(nBits) bitsLeft -= int(nBits) @@ -1121,7 +1129,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { out := dst dstEvery := (dstSize + 3) / 4 - const shift = 0 + const shift = 56 const tlSize = 1 << 8 const tlMask = tlSize - 1 single := d.dt.single[:tlSize] @@ -1145,37 +1153,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { br[stream].fillFast() br[stream2].fillFast() - v := single[br[stream].peekByteFast()>>shift].entry + v := single[uint8(br[stream].value>>shift)].entry + v2 := single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) } { @@ -1184,37 +1196,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { br[stream].fillFast() br[stream2].fillFast() - v := single[br[stream].peekByteFast()>>shift].entry + v := single[uint8(br[stream].value>>shift)].entry + v2 := single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - v = single[br[stream].peekByteFast()>>shift].entry + v = single[uint8(br[stream].value>>shift)].entry + v2 = single[uint8(br[stream2].value>>shift)].entry + br[stream].bitsRead += uint8(v) + br[stream].value <<= v & 63 + br[stream2].bitsRead += uint8(v2) + br[stream2].value <<= v2 & 63 buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) } off += 4 @@ -1280,7 +1296,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { } // Read value and increment offset. - v := single[br.peekByteFast()>>shift].entry + v := single[br.peekByteFast()].entry nBits := uint8(v) br.advance(nBits) bitsLeft -= int(nBits) diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go index 854458537..753d17df6 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitreader.go +++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go @@ -50,16 +50,23 @@ func (b *bitReader) getBits(n uint8) int { if n == 0 /*|| b.bitsRead >= 64 */ { return 0 } - return b.getBitsFast(n) + return int(b.get32BitsFast(n)) } -// getBitsFast requires that at least one bit is requested every time. +// get32BitsFast requires that at least one bit is requested every time. // There are no checks if the buffer is filled. -func (b *bitReader) getBitsFast(n uint8) int { +func (b *bitReader) get32BitsFast(n uint8) uint32 { const regMask = 64 - 1 v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) b.bitsRead += n - return int(v) + return v +} + +func (b *bitReader) get16BitsFast(n uint8) uint16 { + const regMask = 64 - 1 + v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) + b.bitsRead += n + return v } // fillFast() will make sure at least 32 bits are available. diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go index 303ae90f9..b36618285 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go +++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go @@ -38,7 +38,7 @@ func (b *bitWriter) addBits16NC(value uint16, bits uint8) { b.nBits += bits } -// addBits32NC will add up to 32 bits. +// addBits32NC will add up to 31 bits. // It will not check if there is space for them, // so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits32NC(value uint32, bits uint8) { @@ -46,6 +46,26 @@ func (b *bitWriter) addBits32NC(value uint32, bits uint8) { b.nBits += bits } +// addBits64NC will add up to 64 bits. +// There must be space for 32 bits. +func (b *bitWriter) addBits64NC(value uint64, bits uint8) { + if bits <= 31 { + b.addBits32Clean(uint32(value), bits) + return + } + b.addBits32Clean(uint32(value), 32) + b.flush32() + b.addBits32Clean(uint32(value>>32), bits-32) +} + +// addBits32Clean will add up to 32 bits. +// It will not check if there is space for them. +// The input must not contain more bits than specified. +func (b *bitWriter) addBits32Clean(value uint32, bits uint8) { + b.bitContainer |= uint64(value) << (b.nBits & 63) + b.nBits += bits +} + // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated. // It will not check if there is space for them, so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go index 3df185ee4..12e8f6f0b 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockenc.go +++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go @@ -51,7 +51,7 @@ func (b *blockEnc) init() { if cap(b.literals) < maxCompressedBlockSize { b.literals = make([]byte, 0, maxCompressedBlockSize) } - const defSeqs = 200 + const defSeqs = 2000 if cap(b.sequences) < defSeqs { b.sequences = make([]seq, 0, defSeqs) } @@ -426,7 +426,7 @@ func fuzzFseEncoder(data []byte) int { return 0 } enc := fseEncoder{} - hist := enc.Histogram()[:256] + hist := enc.Histogram() maxSym := uint8(0) for i, v := range data { v = v & 63 @@ -722,52 +722,53 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB) } seq-- - if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 { - // No need to flush (common) - for seq >= 0 { - s = b.sequences[seq] - wr.flush32() - llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode] - // tabelog max is 8 for all. - of.encode(ofB) - ml.encode(mlB) - ll.encode(llB) - wr.flush32() - - // We checked that all can stay within 32 bits - wr.addBits32NC(s.litLen, llB.outBits) - wr.addBits32NC(s.matchLen, mlB.outBits) - wr.addBits32NC(s.offset, ofB.outBits) - - if debugSequences { - println("Encoded seq", seq, s) - } - - seq-- - } - } else { - for seq >= 0 { - s = b.sequences[seq] - wr.flush32() - llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode] - // tabelog max is below 8 for each. - of.encode(ofB) - ml.encode(mlB) - ll.encode(llB) - wr.flush32() - - // ml+ll = max 32 bits total - wr.addBits32NC(s.litLen, llB.outBits) - wr.addBits32NC(s.matchLen, mlB.outBits) - wr.flush32() - wr.addBits32NC(s.offset, ofB.outBits) - - if debugSequences { - println("Encoded seq", seq, s) - } - - seq-- - } + // Store sequences in reverse... + for seq >= 0 { + s = b.sequences[seq] + + ofB := ofTT[s.ofCode] + wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits. + //of.encode(ofB) + nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16 + dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState) + wr.addBits16NC(of.state, uint8(nbBitsOut)) + of.state = of.stateTable[dstState] + + // Accumulate extra bits. + outBits := ofB.outBits & 31 + extraBits := uint64(s.offset & bitMask32[outBits]) + extraBitsN := outBits + + mlB := mlTT[s.mlCode] + //ml.encode(mlB) + nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16 + dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState) + wr.addBits16NC(ml.state, uint8(nbBitsOut)) + ml.state = ml.stateTable[dstState] + + outBits = mlB.outBits & 31 + extraBits = extraBits<<outBits | uint64(s.matchLen&bitMask32[outBits]) + extraBitsN += outBits + + llB := llTT[s.llCode] + //ll.encode(llB) + nbBitsOut = (uint32(ll.state) + llB.deltaNbBits) >> 16 + dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState) + wr.addBits16NC(ll.state, uint8(nbBitsOut)) + ll.state = ll.stateTable[dstState] + + outBits = llB.outBits & 31 + extraBits = extraBits<<outBits | uint64(s.litLen&bitMask32[outBits]) + extraBitsN += outBits + + wr.flush32() + wr.addBits64NC(extraBits, extraBitsN) + + if debugSequences { + println("Encoded seq", seq, s) + } + + seq-- } ml.flush(mlEnc.actualTableLog) of.flush(ofEnc.actualTableLog) @@ -801,14 +802,13 @@ func (b *blockEnc) genCodes() { // nothing to do return } - if len(b.sequences) > math.MaxUint16 { panic("can only encode up to 64K sequences") } // No bounds checks after here: - llH := b.coders.llEnc.Histogram()[:256] - ofH := b.coders.ofEnc.Histogram()[:256] - mlH := b.coders.mlEnc.Histogram()[:256] + llH := b.coders.llEnc.Histogram() + ofH := b.coders.ofEnc.Histogram() + mlH := b.coders.mlEnc.Histogram() for i := range llH { llH[i] = 0 } @@ -820,7 +820,8 @@ func (b *blockEnc) genCodes() { } var llMax, ofMax, mlMax uint8 - for i, seq := range b.sequences { + for i := range b.sequences { + seq := &b.sequences[i] v := llCode(seq.litLen) seq.llCode = v llH[v]++ @@ -844,7 +845,6 @@ func (b *blockEnc) genCodes() { panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen)) } } - b.sequences[i] = seq } maxCount := func(a []uint32) int { var max uint32 diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go index 295cd602a..15ae8ee80 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_base.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go @@ -108,11 +108,6 @@ func (e *fastBase) UseBlock(enc *blockEnc) { e.blk = enc } -func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 { - // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) -} - func (e *fastBase) matchlen(s, t int32, src []byte) int32 { if debugAsserts { if s < 0 { @@ -131,9 +126,24 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize)) } } + a := src[s:] + b := src[t:] + b = b[:len(a)] + end := int32((len(a) >> 3) << 3) + for i := int32(0); i < end; i += 8 { + if diff := load6432(a, i) ^ load6432(b, i); diff != 0 { + return i + int32(bits.TrailingZeros64(diff)>>3) + } + } - // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) + a = a[end:] + b = b[end:] + for i := range a { + if a[i] != b[i] { + return int32(i) + end + } + } + return int32(len(a)) + end } // Reset the encoding table. diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index f2502629b..5f08a2830 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -6,8 +6,6 @@ package zstd import ( "fmt" - "math" - "math/bits" ) const ( @@ -136,20 +134,7 @@ encodeLoop: // Consider history as well. var seq seq var length int32 - // length = 4 + e.matchlen(s+6, repIndex+4, src) - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } - + length = 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. @@ -236,20 +221,7 @@ encodeLoop: } // Extend the 4-byte match as long as possible. - //l := e.matchlen(s+4, t+4, src) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -286,20 +258,7 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlen(s+4, o2+4, src) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) @@ -418,21 +377,7 @@ encodeLoop: if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - // length := 4 + e.matchlen(s+6, repIndex+4, src) - // length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) - var length int32 - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + length := 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) @@ -522,21 +467,7 @@ encodeLoop: panic(fmt.Sprintf("t (%d) < 0 ", t)) } // Extend the 4-byte match as long as possible. - //l := e.matchlenNoHist(s+4, t+4, src) + 4 - // l := int32(matchLen(src[s+4:], src[t+4:])) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -573,21 +504,7 @@ encodeLoop: if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlenNoHist(s+4, o2+4, src) - // l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) @@ -731,19 +648,7 @@ encodeLoop: // Consider history as well. var seq seq var length int32 - // length = 4 + e.matchlen(s+6, repIndex+4, src) - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + length = 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) @@ -831,20 +736,7 @@ encodeLoop: } // Extend the 4-byte match as long as possible. - //l := e.matchlen(s+4, t+4, src) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -881,20 +773,7 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlen(s+4, o2+4, src) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go index e6d3d49b3..bb3d4fd6c 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go @@ -379,7 +379,7 @@ func (s decSymbol) final() (int, uint8) { // This can only be used if no symbols are 0 bits. // At least tablelog bits must be available in the bit reader. func (s *fseState) nextFast(br *bitReader) (uint32, uint8) { - lowBits := uint16(br.getBitsFast(s.state.nbBits())) + lowBits := br.get16BitsFast(s.state.nbBits()) s.state = s.dt[s.state.newState()+lowBits] return s.state.baseline(), s.state.addBits() } diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go index b4757ee3f..5442061b1 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go @@ -62,9 +62,8 @@ func (s symbolTransform) String() string { // To indicate that you have populated the histogram call HistogramFinished // with the value of the highest populated symbol, as well as the number of entries // in the most populated entry. These are accepted at face value. -// The returned slice will always be length 256. -func (s *fseEncoder) Histogram() []uint32 { - return s.count[:] +func (s *fseEncoder) Histogram() *[256]uint32 { + return &s.count } // HistogramFinished can be called to indicate that the histogram has been populated. diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s new file mode 100644 index 000000000..662609589 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s @@ -0,0 +1,189 @@ +// +build gc,!purego + +#include "textflag.h" + +// Register allocation. +#define digest R1 +#define h R2 // Return value. +#define p R3 // Input pointer. +#define len R4 +#define nblocks R5 // len / 32. +#define prime1 R7 +#define prime2 R8 +#define prime3 R9 +#define prime4 R10 +#define prime5 R11 +#define v1 R12 +#define v2 R13 +#define v3 R14 +#define v4 R15 +#define x1 R20 +#define x2 R21 +#define x3 R22 +#define x4 R23 + +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc \ + +// x = round(0, x). +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x \ + +#define mergeRound(x) \ + round0(x) \ + EOR x, h \ + MADD h, prime4, prime1, h \ + +// Update v[1-4] with 32-byte blocks. Assumes len >= 32. +#define blocksLoop() \ + LSR $5, len, nblocks \ + PCALIGN $16 \ +loop: \ + LDP.P 32(p), (x1, x2) \ + round(v1, x1) \ + LDP -16(p), (x3, x4) \ + round(v2, x2) \ + SUB $1, nblocks \ + round(v3, x3) \ + round(v4, x4) \ + CBNZ nblocks, loop \ + + +// The primes are repeated here to ensure that they're stored +// in a contiguous array, so we can load them with LDP. +DATA primes<> +0(SB)/8, $11400714785074694791 +DATA primes<> +8(SB)/8, $14029467366897019727 +DATA primes<>+16(SB)/8, $1609587929392839161 +DATA primes<>+24(SB)/8, $9650029242287828579 +DATA primes<>+32(SB)/8, $2870177450012600261 +GLOBL primes<>(SB), NOPTR+RODATA, $40 + + +// func Sum64(b []byte) uint64 +TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 + LDP b_base+0(FP), (p, len) + + LDP primes<> +0(SB), (prime1, prime2) + LDP primes<>+16(SB), (prime3, prime4) + MOVD primes<>+32(SB), prime5 + + CMP $32, len + CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } + BLO afterLoop + + ADD prime1, prime2, v1 + MOVD prime2, v2 + MOVD $0, v3 + NEG prime1, v4 + + blocksLoop() + + ROR $64-1, v1, x1 + ROR $64-7, v2, x2 + ADD x1, x2 + ROR $64-12, v3, x3 + ROR $64-18, v4, x4 + ADD x3, x4 + ADD x2, x4, h + + mergeRound(v1) + mergeRound(v2) + mergeRound(v3) + mergeRound(v4) + +afterLoop: + ADD len, h + + TBZ $4, len, try8 + LDP.P 16(p), (x1, x2) + + round0(x1) + ROR $64-27, h + EOR x1 @> 64-27, h, h + MADD h, prime4, prime1, h + + round0(x2) + ROR $64-27, h + EOR x2 @> 64-27, h + MADD h, prime4, prime1, h + +try8: + TBZ $3, len, try4 + MOVD.P 8(p), x1 + + round0(x1) + ROR $64-27, h + EOR x1 @> 64-27, h + MADD h, prime4, prime1, h + +try4: + TBZ $2, len, try2 + MOVWU.P 4(p), x2 + + MUL prime1, x2 + ROR $64-23, h + EOR x2 @> 64-23, h + MADD h, prime3, prime2, h + +try2: + TBZ $1, len, try1 + MOVHU.P 2(p), x3 + AND $255, x3, x1 + LSR $8, x3, x2 + + MUL prime5, x1 + ROR $64-11, h + EOR x1 @> 64-11, h + MUL prime1, h + + MUL prime5, x2 + ROR $64-11, h + EOR x2 @> 64-11, h + MUL prime1, h + +try1: + TBZ $0, len, end + MOVBU (p), x4 + + MUL prime5, x4 + ROR $64-11, h + EOR x4 @> 64-11, h + MUL prime1, h + +end: + EOR h >> 33, h + MUL prime2, h + EOR h >> 29, h + MUL prime3, h + EOR h >> 32, h + + MOVD h, ret+24(FP) + RET + + +// func writeBlocks(d *Digest, b []byte) int +// +// Assumes len(b) >= 32. +TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 + LDP primes<>(SB), (prime1, prime2) + + // Load state. Assume v[1-4] are stored contiguously. + MOVD d+0(FP), digest + LDP 0(digest), (v1, v2) + LDP 16(digest), (v3, v4) + + LDP b_base+8(FP), (p, len) + + blocksLoop() + + // Store updated state. + STP (v1, v2), 0(digest) + STP (v3, v4), 16(digest) + + BIC $31, len + MOVD len, ret+32(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go index 0ae847f75..9216e0a40 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go @@ -1,5 +1,8 @@ -//go:build !appengine && gc && !purego -// +build !appengine,gc,!purego +//go:build (amd64 || arm64) && !appengine && gc && !purego +// +build amd64 arm64 +// +build !appengine +// +build gc +// +build !purego package xxhash diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go index 1f52f296e..2deb1ca75 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go @@ -1,5 +1,5 @@ -//go:build !amd64 || appengine || !gc || purego -// +build !amd64 appengine !gc purego +//go:build (!amd64 && !arm64) || appengine || !gc || purego +// +build !amd64,!arm64 appengine !gc purego package xxhash diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go index 1dd39e63b..bc731e4cb 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go @@ -278,7 +278,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { mlState = mlTable[mlState.newState()&maxTableMask] ofState = ofTable[ofState.newState()&maxTableMask] } else { - bits := br.getBitsFast(nBits) + bits := br.get32BitsFast(nBits) lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31)) llState = llTable[(llState.newState()+lowBits)&maxTableMask] @@ -326,7 +326,7 @@ func (s *sequenceDecs) updateAlt(br *bitReader) { s.offsets.state.state = s.offsets.state.dt[c.newState()] return } - bits := br.getBitsFast(nBits) + bits := br.get32BitsFast(nBits) lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31)) s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits] |