diff options
Diffstat (limited to 'vendor/github.com/klauspost')
45 files changed, 2934 insertions, 3614 deletions
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go deleted file mode 100644 index 8298d309a..000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go +++ /dev/null @@ -1,42 +0,0 @@ -//+build !noasm -//+build !appengine -//+build !gccgo - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -import ( - "github.com/klauspost/cpuid" -) - -// crc32sse returns a hash for the first 4 bytes of the slice -// len(a) must be >= 4. -//go:noescape -func crc32sse(a []byte) uint32 - -// crc32sseAll calculates hashes for each 4-byte set in a. -// dst must be east len(a) - 4 in size. -// The size is not checked by the assembly. -//go:noescape -func crc32sseAll(a []byte, dst []uint32) - -// matchLenSSE4 returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -// -// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions. -// -//go:noescape -func matchLenSSE4(a, b []byte, max int) int - -// histogram accumulates a histogram of b in h. -// h must be at least 256 entries in length, -// and must be cleared before calling this function. -//go:noescape -func histogram(b []byte, h []int32) - -// Detect SSE 4.2 feature. -func init() { - useSSE42 = cpuid.CPU.SSE42() -} diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s deleted file mode 100644 index a79943727..000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s +++ /dev/null @@ -1,214 +0,0 @@ -//+build !noasm -//+build !appengine -//+build !gccgo - -// Copyright 2015, Klaus Post, see LICENSE for details. - -// func crc32sse(a []byte) uint32 -TEXT ·crc32sse(SB), 4, $0 - MOVQ a+0(FP), R10 - XORQ BX, BX - - // CRC32 dword (R10), EBX - BYTE $0xF2; BYTE $0x41; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0x1a - - MOVL BX, ret+24(FP) - RET - -// func crc32sseAll(a []byte, dst []uint32) -TEXT ·crc32sseAll(SB), 4, $0 - MOVQ a+0(FP), R8 // R8: src - MOVQ a_len+8(FP), R10 // input length - MOVQ dst+24(FP), R9 // R9: dst - SUBQ $4, R10 - JS end - JZ one_crc - MOVQ R10, R13 - SHRQ $2, R10 // len/4 - ANDQ $3, R13 // len&3 - XORQ BX, BX - ADDQ $1, R13 - TESTQ R10, R10 - JZ rem_loop - -crc_loop: - MOVQ (R8), R11 - XORQ BX, BX - XORQ DX, DX - XORQ DI, DI - MOVQ R11, R12 - SHRQ $8, R11 - MOVQ R12, AX - MOVQ R11, CX - SHRQ $16, R12 - SHRQ $16, R11 - MOVQ R12, SI - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - // CRC32 ECX, EDX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd1 - - // CRC32 ESI, EDI - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xfe - MOVL BX, (R9) - MOVL DX, 4(R9) - MOVL DI, 8(R9) - - XORQ BX, BX - MOVL R11, AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - MOVL BX, 12(R9) - - ADDQ $16, R9 - ADDQ $4, R8 - XORQ BX, BX - SUBQ $1, R10 - JNZ crc_loop - -rem_loop: - MOVL (R8), AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - MOVL BX, (R9) - ADDQ $4, R9 - ADDQ $1, R8 - XORQ BX, BX - SUBQ $1, R13 - JNZ rem_loop - -end: - RET - -one_crc: - MOVQ $1, R13 - XORQ BX, BX - JMP rem_loop - -// func matchLenSSE4(a, b []byte, max int) int -TEXT ·matchLenSSE4(SB), 4, $0 - MOVQ a_base+0(FP), SI - MOVQ b_base+24(FP), DI - MOVQ DI, DX - MOVQ max+48(FP), CX - -cmp8: - // As long as we are 8 or more bytes before the end of max, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ CX, $8 - JLT cmp1 - MOVQ (SI), AX - MOVQ (DI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, SI - ADDQ $8, DI - SUBQ $8, CX - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, DI - - // Subtract off &b[0] to convert from &b[ret] to ret, and return. - SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -cmp1: - // In the slices' tail, compare 1 byte at a time. - CMPQ CX, $0 - JEQ matchLenEnd - MOVB (SI), AX - MOVB (DI), BX - CMPB AX, BX - JNE matchLenEnd - ADDQ $1, SI - ADDQ $1, DI - SUBQ $1, CX - JMP cmp1 - -matchLenEnd: - // Subtract off &b[0] to convert from &b[ret] to ret, and return. - SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -// func histogram(b []byte, h []int32) -TEXT ·histogram(SB), 4, $0 - MOVQ b+0(FP), SI // SI: &b - MOVQ b_len+8(FP), R9 // R9: len(b) - MOVQ h+24(FP), DI // DI: Histogram - MOVQ R9, R8 - SHRQ $3, R8 - JZ hist1 - XORQ R11, R11 - -loop_hist8: - MOVQ (SI), R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - INCL (DI)(R10*4) - - ADDQ $8, SI - DECQ R8 - JNZ loop_hist8 - -hist1: - ANDQ $7, R9 - JZ end_hist - XORQ R10, R10 - -loop_hist1: - MOVB (SI), R10 - INCL (DI)(R10*4) - INCQ SI - DECQ R9 - JNZ loop_hist1 - -end_hist: - RET diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go deleted file mode 100644 index dcf43bd50..000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go +++ /dev/null @@ -1,35 +0,0 @@ -//+build !amd64 noasm appengine gccgo - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -func init() { - useSSE42 = false -} - -// crc32sse should never be called. -func crc32sse(a []byte) uint32 { - panic("no assembler") -} - -// crc32sseAll should never be called. -func crc32sseAll(a []byte, dst []uint32) { - panic("no assembler") -} - -// matchLenSSE4 should never be called. -func matchLenSSE4(a, b []byte, max int) int { - panic("no assembler") - return 0 -} - -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ - } -} diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index 628795120..20c94f596 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -50,8 +50,6 @@ const ( skipNever = math.MaxInt32 ) -var useSSE42 bool - type compressionLevel struct { good, lazy, nice, chain, fastSkipHashing, level int } @@ -97,9 +95,8 @@ type advancedState struct { hashOffset int // input window: unprocessed data is window[index:windowEnd] - index int - bulkHasher func([]byte, []uint32) - hashMatch [maxMatchLength + minMatchLength]uint32 + index int + hashMatch [maxMatchLength + minMatchLength]uint32 } type compressor struct { @@ -120,7 +117,7 @@ type compressor struct { // queued output tokens tokens tokens - snap fastEnc + fast fastEnc state *advancedState } @@ -164,14 +161,14 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tok tokens, index int, eof bool) error { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tok.tokens[:tok.n], eof, window) + d.w.writeBlock(tok, eof, window) return d.w.err } return nil @@ -180,20 +177,20 @@ func (d *compressor) writeBlock(tok tokens, index int, eof bool) error { // writeBlockSkip writes the current block and uses the number of tokens // to determine if the block should be stored on no matches, or // only huffman encoded. -func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error { +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { if index > 0 || eof { if d.blockStart <= index { window := d.window[d.blockStart:index] // If we removed less than a 64th of all literals // we huffman compress the block. if int(tok.n) > len(window)-int(tok.n>>6) { - d.w.writeBlockHuff(eof, window) + d.w.writeBlockHuff(eof, window, d.sync) } else { // Write a dynamic huffman block. - d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) } } else { - d.w.writeBlock(tok.tokens[:tok.n], eof, nil) + d.w.writeBlock(tok, eof, nil) } d.blockStart = index return d.w.err @@ -208,8 +205,16 @@ func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error { func (d *compressor) fillWindow(b []byte) { // Do not fill window if we are in store-only mode, // use constant or Snappy compression. - switch d.compressionLevel.level { - case 0, 1, 2: + if d.level == 0 { + return + } + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() return } s := d.state @@ -236,7 +241,7 @@ func (d *compressor) fillWindow(b []byte) { } dst := s.hashMatch[:dstSize] - s.bulkHasher(tocheck, dst) + bulkHash4(tocheck, dst) var newH uint32 for i, val := range dst { di := i + startindex @@ -284,62 +289,7 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break - } - wEnd = win[pos+n] - } - } - if i == minIndex { - // hashPrev[i & windowMask] has already been overwritten, so stop now. - break - } - i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset - if i < minIndex || i < 0 { - break - } - } - return -} - -// Try to find a match starting at index whose length is greater than prevSize. -// We only look at chainCount possibilities before giving up. -// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } - - win := d.window[0 : pos+minMatchLook] - - // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } - - // If we've got a match that's good enough, only look in 1/4 the chain. - tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } - - wEnd := win[pos+length] - wPos := win[pos:] - minIndex := pos - windowSize - - for i := prevHead; tries > 0; tries-- { - if wEnd == win[i+length] { - n := matchLenSSE4(win[i:], wPos, minMatchLook) + n := matchLen(win[i:i+minMatchLook], wPos) if n > length && (n > minMatchLength || pos-i <= 4096) { length = n @@ -372,42 +322,27 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + b = b[:4] + return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits) } // bulkHash4 will compute hashes using the same // algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 for i := 1; i < end; i++ { hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } + dst[i] = hash4u(hb, hashBits) } - return max } func (d *compressor) initDeflate() { @@ -424,149 +359,6 @@ func (d *compressor) initDeflate() { s.offset = 0 s.hash = 0 s.chainHead = -1 - s.bulkHasher = bulkHash4 - if useSSE42 { - s.bulkHasher = crc32sseAll - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazy -func (d *compressor) deflate() { - s := d.state - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if s.index < s.maxInsertIndex { - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) - } - - for { - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - s.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if s.index < s.maxInsertIndex { - // Update the hash - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) - ch := s.hashHead[s.hash&hashMask] - s.chainHead = int(ch) - s.hashPrev[s.index&windowMask] = ch - s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset) - } - s.length = minMatchLength - 1 - s.offset = 0 - minIndex := s.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if s.chainHead-s.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { - s.length = newLength - s.offset = newOffset - } - } - if s.length >= minMatchLength { - s.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "s.length-3" should NOT be "s.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(s.length-3), uint32(s.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - if s.length <= d.fastSkipHashing { - var newIndex int - newIndex = s.index + s.length - // Calculate missing hashes - end := newIndex - if end > s.maxInsertIndex { - end = s.maxInsertIndex - } - end += minMatchLength - 1 - startindex := s.index + 1 - if startindex > s.maxInsertIndex { - startindex = s.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := s.hashMatch[:dstSize] - bulkHash4(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - s.hashPrev[di&windowMask] = s.hashHead[newH] - // Set the head of the hash chain to us. - s.hashHead[newH] = uint32(di + s.hashOffset) - } - s.hash = newH - } - s.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - s.index += s.length - if s.index < s.maxInsertIndex { - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - s.ii++ - end := s.index + int(s.ii>>uint(d.fastSkipHashing)) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := s.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - s.index = end - } - } } // deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, @@ -603,15 +395,14 @@ func (d *compressor) deflateLazy() { // Flush current output block if any. if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } return } @@ -642,8 +433,7 @@ func (d *compressor) deflateLazy() { if prevLength >= minMatchLength && s.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough @@ -684,10 +474,10 @@ func (d *compressor) deflateLazy() { s.length = minMatchLength - 1 if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } } else { // Reset, if we got a match this run. @@ -697,13 +487,12 @@ func (d *compressor) deflateLazy() { // We have a byte waiting. Emit it. if d.byteAvailable { s.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } s.index++ @@ -716,343 +505,24 @@ func (d *compressor) deflateLazy() { break } - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } s.index++ } // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 - } - } - } else { - s.index++ - d.byteAvailable = true - } - } - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazySSE -func (d *compressor) deflateSSE() { - s := d.state - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if s.index < s.maxInsertIndex { - s.hash = crc32sse(d.window[s.index:s.index+minMatchLength]) & hashMask - } - - for { - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - s.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if s.index < s.maxInsertIndex { - // Update the hash - s.hash = crc32sse(d.window[s.index:s.index+minMatchLength]) & hashMask - ch := s.hashHead[s.hash] - s.chainHead = int(ch) - s.hashPrev[s.index&windowMask] = ch - s.hashHead[s.hash] = uint32(s.index + s.hashOffset) - } - s.length = minMatchLength - 1 - s.offset = 0 - minIndex := s.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if s.chainHead-s.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatchSSE(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { - s.length = newLength - s.offset = newOffset - } - } - if s.length >= minMatchLength { - s.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "s.length-3" should NOT be "s.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(s.length-3), uint32(s.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - if s.length <= d.fastSkipHashing { - var newIndex int - newIndex = s.index + s.length - // Calculate missing hashes - end := newIndex - if end > s.maxInsertIndex { - end = s.maxInsertIndex - } - end += minMatchLength - 1 - startindex := s.index + 1 - if startindex > s.maxInsertIndex { - startindex = s.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := s.hashMatch[:dstSize] - - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - s.hashPrev[di&windowMask] = s.hashHead[newH] - // Set the head of the hash chain to us. - s.hashHead[newH] = uint32(di + s.hashOffset) - } - s.hash = newH - } - s.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - s.index += s.length - if s.index < s.maxInsertIndex { - s.hash = crc32sse(d.window[s.index:s.index+minMatchLength]) & hashMask - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - s.ii++ - end := s.index + int(s.ii>>5) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := s.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - s.index = end - } - } -} - -// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, -// meaning it always has lazy matching on. -func (d *compressor) deflateLazySSE() { - s := d.state - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if s.index < s.maxInsertIndex { - s.hash = crc32sse(d.window[s.index:s.index+minMatchLength]) & hashMask - } - - for { - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - s.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && s.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - // Flush current output block if any. - if d.byteAvailable { - // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ - d.byteAvailable = false - } - if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if s.index < s.maxInsertIndex { - // Update the hash - s.hash = crc32sse(d.window[s.index:s.index+minMatchLength]) & hashMask - ch := s.hashHead[s.hash] - s.chainHead = int(ch) - s.hashPrev[s.index&windowMask] = ch - s.hashHead[s.hash] = uint32(s.index + s.hashOffset) - } - prevLength := s.length - prevOffset := s.offset - s.length = minMatchLength - 1 - s.offset = 0 - minIndex := s.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatchSSE(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { - s.length = newLength - s.offset = newOffset - } - } - if prevLength >= minMatchLength && s.length <= prevLength { - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ - - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - var newIndex int - newIndex = s.index + prevLength - 1 - // Calculate missing hashes - end := newIndex - if end > s.maxInsertIndex { - end = s.maxInsertIndex - } - end += minMatchLength - 1 - startindex := s.index + 1 - if startindex > s.maxInsertIndex { - startindex = s.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := s.hashMatch[:dstSize] - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - s.hashPrev[di&windowMask] = s.hashHead[newH] - // Set the head of the hash chain to us. - s.hashHead[newH] = uint32(di + s.hashOffset) - } - s.hash = newH - } - - s.index = newIndex - d.byteAvailable = false - s.length = minMatchLength - 1 - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - // Reset, if we got a match this run. - if s.length >= minMatchLength { - s.ii = 0 - } - // We have a byte waiting. Emit it. - if d.byteAvailable { - s.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - s.index++ - - // If we have a long run of no matches, skip additional bytes - // Resets when s.ii overflows after 64KB. - if s.ii > 31 { - n := int(s.ii >> 6) - for j := 0; j < n; j++ { - if s.index >= d.windowEnd-1 { - break - } - - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - s.index++ - } - // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[s.index-1])) - d.tokens.n++ - d.byteAvailable = false - // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, s.index, false); d.err != nil { - return - } - d.tokens.n = 0 + d.tokens.Reset() } } } else { @@ -1085,17 +555,17 @@ func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } -// storeHuff will compress and store the currently added data, +// storeFast will compress and store the currently added data, // if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err -func (d *compressor) storeSnappy() { +func (d *compressor) storeFast() { // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { + if d.windowEnd < len(d.window) { if !d.sync { return } @@ -1106,32 +576,30 @@ func (d *compressor) storeSnappy() { } if d.windowEnd <= 32 { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - d.tokens.n = 0 - d.windowEnd = 0 } else { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) d.err = d.w.err } - d.tokens.n = 0 + d.tokens.Reset() d.windowEnd = 0 - d.snap.Reset() + d.fast.Reset() return } } - d.snap.Encode(&d.tokens, d.window[:d.windowEnd]) + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) // If we made zero matches, store the block as is. - if int(d.tokens.n) == d.windowEnd { + if d.tokens.n == 0 { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) // If we removed less than 1/16th, huffman compress the block. } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err } else { - d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd]) + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) d.err = d.w.err } - d.tokens.n = 0 + d.tokens.Reset() d.windowEnd = 0 } @@ -1176,36 +644,26 @@ func (d *compressor) init(w io.Writer, level int) (err error) { d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == ConstantCompression: + d.w.logReusePenalty = uint(4) d.window = make([]byte, maxStoreBlockSize) d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level >= 1 && level <= 4: - d.snap = newFastEnc(level) - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillBlock - d.step = (*compressor).storeSnappy case level == DefaultCompression: level = 5 fallthrough - case 5 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logReusePenalty = uint(level + 1) + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logReusePenalty = uint(level) d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - if d.fastSkipHashing == skipNever { - if useSSE42 { - d.step = (*compressor).deflateLazySSE - } else { - d.step = (*compressor).deflateLazy - } - } else { - if useSSE42 { - d.step = (*compressor).deflateSSE - } else { - d.step = (*compressor).deflate - - } - } + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } @@ -1218,10 +676,10 @@ func (d *compressor) reset(w io.Writer) { d.sync = false d.err = nil // We only need to reset a few things for Snappy. - if d.snap != nil { - d.snap.Reset() + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - d.tokens.n = 0 + d.tokens.Reset() return } switch d.compressionLevel.chain { @@ -1240,7 +698,7 @@ func (d *compressor) reset(w io.Writer) { s.hashOffset = 1 s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens.n = 0 + d.tokens.Reset() s.length = minMatchLength - 1 s.offset = 0 s.hash = 0 diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go new file mode 100644 index 000000000..b0a470f92 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -0,0 +1,257 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Modified for deflate by Klaus Post (c) 2015. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "fmt" + "math/bits" +) + +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() +} + +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } +} + +const ( + tableBits = 16 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 18 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxMatchOffset * 10 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this. +) + +const ( + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 +) + +func load32(b []byte, i int) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load64(b []byte, i int) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func load3232(b []byte, i int32) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load6432(b []byte, i int32) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func hash(u uint32) uint32 { + return (u * 0x1e35a7bd) >> tableShift +} + +type tableEntry struct { + val uint32 + offset int32 +} + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 +} + +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") + } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] + } + } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> ((32 - h) & 31) +} + +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry +} + +// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4x64(u uint64, h uint8) uint32 { + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + return uint32((u * prime8bytes) >> ((64 - h) & 63)) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +// matchlen will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlen(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + s1 := int(s) + maxMatchLength - 4 + if s1 > len(src) { + s1 = len(src) + } + + // Extend the match to be as long as possible. + return int32(matchLen(src[s:s1], src[t:])) +} + +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + // Extend the match to be as long as possible. + return int32(matchLen(src[s:], src[t:])) +} + +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < int(maxMatchOffset*8) { + l := maxMatchOffset * 8 + // Make it at least 1MB. + if l < 1<<20 { + l = 1 << 20 + } + e.hist = make([]byte, 0, l) + } + // We offset current position so everything will be out of reach + e.cur += maxMatchOffset + int32(len(e.hist)) + e.hist = e.hist[:0] +} + +// matchLen returns the maximum length. +// 'a' must be the shortest of the two. +func matchLen(a, b []byte) int { + b = b[:len(a)] + var checked int + if len(a) > 4 { + // Try 4 bytes first + if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { + return bits.TrailingZeros32(diff) >> 3 + } + // Switch to 8 byte matching. + checked = 4 + a = a[4:] + b = b[4:] + for len(a) >= 8 { + b = b[:len(a)] + if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index f46c65418..5ed476aa0 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -85,26 +85,48 @@ type huffmanBitWriter struct { // Data waiting to be written is bytes[0:nbytes] // and then the low nbits of bits. bits uint64 - nbits uint - bytes [256]byte - codegenFreq [codegenCodeCount]int32 + nbits uint16 nbytes uint8 - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 literalEncoding *huffmanEncoder offsetEncoding *huffmanEncoder codegenEncoding *huffmanEncoder err error + lastHeader int + // Set between 0 (reused block can be up to 2x the size) + logReusePenalty uint + lastHuffMan bool + bytes [256]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// Huffman reuse. +// +// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections. +// +// This is controlled by several variables: +// +// If lastHeader is non-zero the Huffman table can be reused. +// This also indicates that a Huffman table has been generated that can output all +// possible symbols. +// It also indicates that an EOB has not yet been emitted, so if a new tabel is generated +// an EOB with the previous table must be written. +// +// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a 'fresh' by calculating the +// optimal size and adding a penalty in 'logReusePenalty'. +// A Huffman table is not optimal, which is why we add a penalty, and generating a new table +// is slower both for compression and decompression. + func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, lengthCodesStart+32), - offsetFreq: make([]int32, 32), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -114,6 +136,41 @@ func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil w.bytes = [256]byte{} + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) { + offsets, lits = true, true + a := t.offHist[:offsetCodeCount] + b := w.offsetFreq[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + offsets = false + break + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalFreq[256:literalCount] + b = b[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + lits = false + break + } + } + if lits { + a = t.litHist[:] + b = w.literalFreq[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + lits = false + break + } + } + } + return } func (w *huffmanBitWriter) flush() { @@ -144,30 +201,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - w.bytes[n] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - if w.err != nil { - n = 0 - return - } - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -213,7 +251,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. cgnl := codegen[:numLiterals] for i := range cgnl { @@ -292,30 +330,54 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -333,30 +395,36 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { } func (w *huffmanBitWriter) writeCode(c hcode) { + // The function does not get inlined if we "& 63" the shift. w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + w.nbits += c.len if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - w.bytes[n] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - if w.err != nil { - n = 0 - return - } - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + w.bytes[n] = byte(bits) + w.bytes[n+1] = byte(bits >> 8) + w.bytes[n+2] = byte(bits >> 16) + w.bytes[n+3] = byte(bits >> 24) + w.bytes[n+4] = byte(bits >> 32) + w.bytes[n+5] = byte(bits >> 40) + n += 6 + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -412,6 +480,11 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } var flag int32 if isEof { flag = 1 @@ -426,6 +499,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -439,29 +518,23 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. -func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) - + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens, false) + w.generate(tokens) var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode&63]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. @@ -500,7 +573,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -508,72 +581,103 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) + sync = sync || eof + if sync { + tokens.AddEOB() + } - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + // We cannot reuse pure huffman table. + if w.lastHuffMan && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + if !sync { + tokens.Fill() + } + numLiterals, numOffsets := w.indexTokens(tokens, !sync) - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += newSize >> (w.logReusePenalty & 31) + extra := w.extraBitSize() + reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra) + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + // Check if we get a reasonable size decrease. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + w.lastHeader = 0 + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.generate(tokens) + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize()) + // Store bytes, if we don't get a reasonable improvement. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + w.lastHeader = 0 + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHeader, _ = w.headerSize() + w.lastHuffMan = false + } + if sync { + w.lastHeader = 0 + } // Write the tokens. - w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - for i := range w.literalFreq { - w.literalFreq[i] = 0 - } - for i := range w.offsetFreq { - w.offsetFreq[i] = 0 - } +func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { + copy(w.literalFreq[:], t.litHist[:]) + copy(w.literalFreq[256:], t.extraHist[:]) + copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) - if len(tokens) == 0 { + if t.n == 0 { return } - - // Only last token should be endBlockMarker. - if tokens[len(tokens)-1] == endBlockMarker { - w.literalFreq[endBlockMarker]++ - tokens = tokens[:len(tokens)-1] + if filled { + return maxNumLit, maxNumDist } - - // Create slices up to the next power of two to avoid bounds checks. - lits := w.literalFreq[:256] - offs := w.offsetFreq[:32] - lengths := w.literalFreq[lengthCodesStart:] - lengths = lengths[:32] - for _, t := range tokens { - if t < endBlockMarker { - lits[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - lengths[lengthCode(length)&31]++ - offs[offsetCode(offset)&31]++ - } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -590,11 +694,14 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq[:maxNumLit], 15) - w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) return } +func (w *huffmanBitWriter) generate(t *tokens) { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { @@ -626,8 +733,19 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the length length := t.length() lengthCode := lengthCode(length) - w.writeCode(lengths[lengthCode&31]) - extraLengthBits := uint(lengthExtraBits[lengthCode&31]) + if false { + w.writeCode(lengths[lengthCode&31]) + } else { + // inlined + c := lengths[lengthCode&31] + w.bits |= uint64(c.code) << (w.nbits & 63) + w.nbits += c.len + if w.nbits >= 48 { + w.writeOutBits() + } + } + + extraLengthBits := uint16(lengthExtraBits[lengthCode&31]) if extraLengthBits > 0 { extraLength := int32(length - lengthBase[lengthCode&31]) w.writeBits(extraLength, extraLengthBits) @@ -635,8 +753,18 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the offset offset := t.offset() offsetCode := offsetCode(offset) - w.writeCode(offs[offsetCode&31]) - extraOffsetBits := uint(offsetExtraBits[offsetCode&63]) + if false { + w.writeCode(offs[offsetCode&31]) + } else { + // inlined + c := offs[offsetCode&31] + w.bits |= uint64(c.code) << (w.nbits & 63) + w.nbits += c.len + if w.nbits >= 48 { + w.writeOutBits() + } + } + extraOffsetBits := uint16(offsetExtraBits[offsetCode&63]) if extraOffsetBits > 0 { extraOffset := int32(offset - offsetBase[offsetCode&63]) w.writeBits(extraOffset, extraOffsetBits) @@ -661,75 +789,93 @@ func init() { // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - for i := range w.literalFreq { + for i := range w.literalFreq[:] { w.literalFreq[i] = 0 } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } // Add everything as literals - histogram(input, w.literalFreq) + estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15 - w.literalFreq[endBlockMarker] = 1 + // Store bytes, if we don't get a reasonable improvement. + ssize, storable := w.storedSize(input) + if storable && ssize < (estBits+estBits>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } - const numLiterals = endBlockMarker + 1 - const numOffsets = 1 + if w.lastHeader > 0 { + size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader) + estBits += estBits >> (w.logReusePenalty) - w.literalEncoding.generate(w.literalFreq[:maxNumLit], 15) + if estBits < size { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + } - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int + const numLiterals = endBlockMarker + 1 + const numOffsets = 1 + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + w.literalEncoding.generate(w.literalFreq[:numLiterals], 15) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() } - // Huffman. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) encoding := w.literalEncoding.codes[:257] - n := w.nbytes for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue - } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - w.bytes[n] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue - } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + w.bits |= uint64(c.code) << ((w.nbits) & 63) + w.nbits += c.len + if w.nbits >= 48 { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + w.bytes[n] = byte(bits) + w.bytes[n+1] = byte(bits >> 8) + w.bytes[n+2] = byte(bits >> 16) + w.bytes[n+3] = byte(bits >> 24) + w.bytes[n+4] = byte(bits >> 32) + w.bytes[n+5] = byte(bits >> 40) + n += 6 + if n >= bufferFlushSize { + if w.err != nil { + n = 0 + return + } + w.write(w.bytes[:n]) + n = 0 + } + w.nbytes = n } - n = 0 } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) + if eof || sync { + w.writeCode(encoding[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index f65f79336..d0099599c 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -10,6 +10,12 @@ import ( "sort" ) +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 +) + // hcode is a huffman code with a bit code and bit length. type hcode struct { code, len uint16 @@ -25,7 +31,7 @@ type huffmanEncoder struct { type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -54,7 +60,11 @@ func (h *hcode) set(code uint16, length uint16) { h.code = code } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) +} + +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { // Make capacity to next power of two. @@ -64,10 +74,10 @@ func newHuffmanEncoder(size int) *huffmanEncoder { // Generates a HuffmanCode corresponding to the fixed literal table func generateFixedLiteralEncoding() *huffmanEncoder { - h := newHuffmanEncoder(maxNumLit) + h := newHuffmanEncoder(literalCount) codes := h.codes var ch uint16 - for ch = 0; ch < maxNumLit; ch++ { + for ch = 0; ch < literalCount; ch++ { var bits uint16 var size uint16 switch { @@ -108,7 +118,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder { var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() -func (h *huffmanEncoder) bitLength(freq []int32) int { +func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int for i, f := range freq { if f != 0 { @@ -118,8 +128,6 @@ func (h *huffmanEncoder) bitLength(freq []int32) int { return total } -const maxBitsLimit = 16 - // Return the number of literals assigned to each bit size in the Huffman encoding // // This method is only called when list.length >= 3 @@ -163,9 +171,9 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: int32(list[1].freq), + nextCharFreq: int32(list[2].freq), + nextPairFreq: int32(list[0].freq) + int32(list[1].freq), } leafCounts[level][level] = 2 if level == 1 { @@ -197,7 +205,12 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two @@ -273,12 +286,12 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // // freq An array of frequencies, in which frequency[i] gives the frequency of literal i. // maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { if h.freqcache == nil { // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. - h.freqcache = make([]literalNode, maxNumLit+1) + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + h.freqcache = make([]literalNode, literalCount+1) } list := h.freqcache[:len(freq)+1] // Number of non-zero literals @@ -345,3 +358,27 @@ func (s byFreq) Less(i, j int) bool { } func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } + +// histogramSize accumulates a histogram of b in h. +// An estimated size in bits is returned. +// Unassigned values are assigned '1' in the histogram. +// len(h) must be >= 256, and h's elements must be all zeroes. +func histogramSize(b []byte, h []uint16, fill bool) int { + h = h[:256] + for _, t := range b { + h[t]++ + } + invTotal := 1.0 / float64(len(b)) + shannon := 0.0 + single := math.Ceil(-math.Log2(invTotal)) + for i, v := range h[:] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + } else if fill { + shannon += single + h[i] = 1 + } + } + return int(shannon + 0.99) +} diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index 800d0ce9e..6dc5b5d06 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -9,6 +9,7 @@ package flate import ( "bufio" + "fmt" "io" "math/bits" "strconv" @@ -24,6 +25,8 @@ const ( maxNumLit = 286 maxNumDist = 30 numCodes = 19 // number of codes in Huffman meta-code + + debugDecode = false ) // Initialize the fixedHuffmanDecoder only once upon first use. @@ -104,8 +107,8 @@ const ( type huffmanDecoder struct { min int // the minimum code length - chunks *[huffmanNumChunks]uint32 // chunks as described above - links [][]uint32 // overflow links + chunks *[huffmanNumChunks]uint16 // chunks as described above + links [][]uint16 // overflow links linkMask uint32 // mask the width of the link table } @@ -121,7 +124,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { const sanity = false if h.chunks == nil { - h.chunks = &[huffmanNumChunks]uint32{} + h.chunks = &[huffmanNumChunks]uint16{} } if h.min != 0 { *h = huffmanDecoder{chunks: h.chunks, links: h.links} @@ -169,6 +172,9 @@ func (h *huffmanDecoder) init(lengths []int) bool { // accept degenerate single-code codings. See also // TestDegenerateHuffmanCoding. if code != 1<<uint(max) && !(code == 1 && max == 1) { + if debugDecode { + fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)") + } return false } @@ -185,7 +191,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { // create link tables link := nextcode[huffmanChunkBits+1] >> 1 if cap(h.links) < huffmanNumChunks-link { - h.links = make([][]uint32, huffmanNumChunks-link) + h.links = make([][]uint16, huffmanNumChunks-link) } else { h.links = h.links[:huffmanNumChunks-link] } @@ -196,9 +202,9 @@ func (h *huffmanDecoder) init(lengths []int) bool { if sanity && h.chunks[reverse] != 0 { panic("impossible: overwriting existing chunk") } - h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) + h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1)) if cap(h.links[off]) < numLinks { - h.links[off] = make([]uint32, numLinks) + h.links[off] = make([]uint16, numLinks) } else { links := h.links[off][:0] h.links[off] = links[:numLinks] @@ -214,7 +220,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { } code := nextcode[n] nextcode[n]++ - chunk := uint32(i<<huffmanValueShift | n) + chunk := uint16(i<<huffmanValueShift | n) reverse := int(bits.Reverse16(uint16(code))) reverse >>= uint(16 - n) if n <= huffmanChunkBits { @@ -347,6 +353,9 @@ func (f *decompressor) nextBlock() { f.huffmanBlock() default: // 3 is reserved. + if debugDecode { + fmt.Println("reserved data block encountered") + } f.err = CorruptInputError(f.roffset) } } @@ -425,11 +434,17 @@ func (f *decompressor) readHuffman() error { } nlit := int(f.b&0x1F) + 257 if nlit > maxNumLit { + if debugDecode { + fmt.Println("nlit > maxNumLit", nlit) + } return CorruptInputError(f.roffset) } f.b >>= 5 ndist := int(f.b&0x1F) + 1 if ndist > maxNumDist { + if debugDecode { + fmt.Println("ndist > maxNumDist", ndist) + } return CorruptInputError(f.roffset) } f.b >>= 5 @@ -453,6 +468,9 @@ func (f *decompressor) readHuffman() error { f.codebits[codeOrder[i]] = 0 } if !f.h1.init(f.codebits[0:]) { + if debugDecode { + fmt.Println("init codebits failed") + } return CorruptInputError(f.roffset) } @@ -480,6 +498,9 @@ func (f *decompressor) readHuffman() error { rep = 3 nb = 2 if i == 0 { + if debugDecode { + fmt.Println("i==0") + } return CorruptInputError(f.roffset) } b = f.bits[i-1] @@ -494,6 +515,9 @@ func (f *decompressor) readHuffman() error { } for f.nb < nb { if err := f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits:", err) + } return err } } @@ -501,6 +525,9 @@ func (f *decompressor) readHuffman() error { f.b >>= nb f.nb -= nb if i+rep > n { + if debugDecode { + fmt.Println("i+rep > n", i, rep, n) + } return CorruptInputError(f.roffset) } for j := 0; j < rep; j++ { @@ -510,6 +537,9 @@ func (f *decompressor) readHuffman() error { } if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) { + if debugDecode { + fmt.Println("init2 failed") + } return CorruptInputError(f.roffset) } @@ -587,12 +617,18 @@ readLiteral: length = 258 n = 0 default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } f.err = CorruptInputError(f.roffset) return } if n > 0 { for f.nb < n { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } f.err = err return } @@ -606,6 +642,9 @@ readLiteral: if f.hd == nil { for f.nb < 5 { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } f.err = err return } @@ -615,6 +654,9 @@ readLiteral: f.nb -= 5 } else { if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } f.err = err return } @@ -629,6 +671,9 @@ readLiteral: extra := (dist & 1) << nb for f.nb < nb { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } f.err = err return } @@ -638,12 +683,18 @@ readLiteral: f.nb -= nb dist = 1<<(nb+1) + 1 + extra default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } f.err = CorruptInputError(f.roffset) return } // No check on length; encoding can be prescient. if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } f.err = CorruptInputError(f.roffset) return } @@ -688,6 +739,9 @@ func (f *decompressor) dataBlock() { n := int(f.buf[0]) | int(f.buf[1])<<8 nn := int(f.buf[2]) | int(f.buf[3])<<8 if uint16(nn) != uint16(^n) { + if debugDecode { + fmt.Println("uint16(nn) != uint16(^n)", nn, ^n) + } f.err = CorruptInputError(f.roffset) return } @@ -789,6 +843,9 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { if n == 0 { f.b = b f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } f.err = CorruptInputError(f.roffset) return 0, f.err } diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go new file mode 100644 index 000000000..20de8f11f --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -0,0 +1,174 @@ +package flate + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +// EncodeL1 uses a similar algorithm to level 1 +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash(cv) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + nextHash = hash(uint32(now)) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + // Save the match found + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash(uint32(x)) + e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} + x >>= 16 + currHash := hash(uint32(x)) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x) != candidate.val { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go new file mode 100644 index 000000000..7c824431e --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -0,0 +1,199 @@ +package flate + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +// EncodeL2 uses a similar algorithm to level 1, but is capable +// of matching across blocks giving better compression at a small slowdown. +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash4u(cv, bTableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + nextHash = hash4u(uint32(now), bTableBits) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + break + } + cv = uint32(now) + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := load6432(src, int32(i)) + nextHash := hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash4u(uint32(x), bTableBits) + prevHash2 := hash4u(uint32(x>>8), bTableBits) + e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} + e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)} + currHash := hash4u(uint32(x>>16), bTableBits) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != candidate.val { + cv = uint32(x >> 24) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go new file mode 100644 index 000000000..4153d24c9 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -0,0 +1,225 @@ +package flate + +// fastEncL3 +type fastEncL3 struct { + fastGen + table [tableSize]tableEntryPrev +} + +// Encode uses a similar algorithm to level 2, will check up to two candidates. +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 8 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + const skipLog = 6 + nextS := s + var candidate tableEntry + for { + nextHash := hash(cv) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := load3232(src, nextS) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} + + // Check both candidates + candidate = candidates.Cur + offset := s - (candidate.offset - e.cur) + if cv == candidate.val { + if offset > maxMatchOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + o2 := s - (candidates.Prev.offset - e.cur) + if cv != candidates.Prev.val || o2 > maxMatchOffset { + break + } + // Both match and are valid, pick longest. + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + break + } + } + } + cv = now + } + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+4) < len(src) && t > 0 { + cv := load3232(src, t) + nextHash := hash(cv) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t, val: cv}, + } + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-3 to s. + x := load6432(src, s-3) + prevHash := hash(uint32(x)) + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, + } + x >>= 8 + prevHash = hash(uint32(x)) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, + } + x >>= 8 + prevHash = hash(uint32(x)) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, + } + x >>= 8 + currHash := hash(uint32(x)) + candidates := e.table[currHash] + cv = uint32(x) + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur, val: cv}, + } + + // Check both candidates + candidate = candidates.Cur + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + continue + } + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + continue + } + } + } + cv = uint32(x >> 8) + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go new file mode 100644 index 000000000..c689ac771 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -0,0 +1,210 @@ +package flate + +import "fmt" + +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.val { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + lCandidate = e.bTable[hash7(next, tableBits)] + + // If the next long is a candidate, check if we should use that instead... + lOff := nextS - (lCandidate.offset - e.cur) + if lOff < maxMatchOffset && lCandidate.val == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic("s-t") + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := load6432(src, s) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + if true { + i := nextS + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(t2.val, tableBits)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(t2.val, tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} + e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go new file mode 100644 index 000000000..14a235612 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -0,0 +1,276 @@ +package flate + +import "fmt" + +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == lCandidate.Cur.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if lCandidate.Cur.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + if true { + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hash4u(t2.val, tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go new file mode 100644 index 000000000..cad0c7df7 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -0,0 +1,279 @@ +package flate + +import "fmt" + +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == lCandidate.Cur.val { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if load3232(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchlen(s+4+repOff, t2+4, src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if lCandidate.Cur.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := load6432(src, i) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. + if true { + for i := nextS + 1; i < s-1; i += 2 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + e.table[hash4x64(cv, tableBits)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + cv = load6432(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go deleted file mode 100644 index c1a02720d..000000000 --- a/vendor/github.com/klauspost/compress/flate/reverse_bits.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -var reverseByte = [256]byte{ - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, -} - -func reverseUint16(v uint16) uint16 { - return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8 -} - -func reverseBits(number uint16, bitLength byte) uint16 { - return reverseUint16(number << uint8(16-bitLength)) -} diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go deleted file mode 100644 index aebebd524..000000000 --- a/vendor/github.com/klauspost/compress/flate/snappy.go +++ /dev/null @@ -1,900 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Modified for deflate by Klaus Post (c) 2015. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// emitLiteral writes a literal chunk and returns the number of bytes written. -func emitLiteral(dst *tokens, lit []byte) { - ol := int(dst.n) - for i, v := range lit { - dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) - } - dst.n += uint16(len(lit)) -} - -// emitCopy writes a copy chunk and returns the number of bytes written. -func emitCopy(dst *tokens, offset, length int) { - dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize)) - dst.n++ -} - -type fastEnc interface { - Encode(dst *tokens, src []byte) - Reset() -} - -func newFastEnc(level int) fastEnc { - switch level { - case 1: - return &snappyL1{} - case 2: - return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 3: - return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 4: - return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}} - default: - panic("invalid level specified") - } -} - -const ( - tableBits = 14 // Bits used in the table - tableSize = 1 << tableBits // Size of the table - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - baseMatchOffset = 1 // The smallest match offset - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - maxMatchOffset = 1 << 15 // The largest match offset -) - -func load32(b []byte, i int) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} - -// snappyL1 encapsulates level 1 compression -type snappyL1 struct{} - -func (e *snappyL1) Reset() {} - -func (e *snappyL1) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - return - } - - // Initialize the hash table. - // - // The table element type is uint16, as s < sLimit and sLimit < len(src) - // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535. - var table [tableSize]uint16 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - nextHash := hash(load32(src, s)) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := 32 - - nextS := s - candidate := 0 - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = int(table[nextHash&tableMask]) - table[nextHash&tableMask] = uint16(s) - nextHash = hash(load32(src, nextS)) - if s-candidate <= maxMatchOffset && load32(src, s) == load32(src, candidate) { - break - } - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - - // Extend the 4-byte match as long as possible. - // - // This is an inlined version of Snappy's: - // s = extendMatch(src, candidate+4, s+4) - s += 4 - s1 := base + maxMatchLength - if s1 > len(src) { - s1 = len(src) - } - a := src[s:s1] - b := src[candidate+4:] - b = b[:len(a)] - l := len(a) - for i := range a { - if a[i] != b[i] { - l = i - break - } - } - s += l - - // matchToken is flate's equivalent of Snappy's emitCopy. - dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)) - dst.n++ - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x >> 0)) - table[prevHash&tableMask] = uint16(s - 1) - currHash := hash(uint32(x >> 8)) - candidate = int(table[currHash&tableMask]) - table[currHash&tableMask] = uint16(s) - if s-candidate > maxMatchOffset || uint32(x>>8) != load32(src, candidate) { - nextHash = hash(uint32(x >> 16)) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - emitLiteral(dst, src[nextEmit:]) - } -} - -type tableEntry struct { - val uint32 - offset int32 -} - -func load3232(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load6432(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -// snappyGen maintains the table for matches, -// and the previous byte block for level 2. -// This is the generic implementation. -type snappyGen struct { - prev []byte - cur int32 -} - -// snappyGen maintains the table for matches, -// and the previous byte block for level 2. -// This is the generic implementation. -type snappyL2 struct { - snappyGen - table [tableSize]tableEntry -} - -// EncodeL2 uses a similar algorithm to level 1, but is capable -// of matching across blocks giving better compression at a small slowdown. -func (e *snappyL2) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table[:] { - e.table[i] = tableEntry{} - } - e.cur = maxStoreBlockSize - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - e.table[hash(cv)&tableMask] = tableEntry{offset: t + e.cur, val: cv} - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -type tableEntryPrev struct { - Cur tableEntry - Prev tableEntry -} - -// snappyL3 -type snappyL3 struct { - snappyGen - table [tableSize]tableEntryPrev -} - -// Encode uses a similar algorithm to level 2, will check up to two candidates. -func (e *snappyL3) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table[:] { - e.table[i] = tableEntryPrev{} - } - e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]} - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash = hash(cv) - e.table[nextHash&tableMask] = tableEntryPrev{ - Prev: e.table[nextHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + t, val: cv}, - } - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -// snappyL4 -type snappyL4 struct { - snappyL3 -} - -// Encode uses a similar algorithm to level 3, -// but will check up to two candidates if first isn't long enough. -func (e *snappyL4) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 8 - 3 - minNonLiteralBlockSize = 1 + 1 + inputMargin - matchLenGood = 12 - ) - - // Protect against e.cur wraparound. - if e.cur > 1<<30 { - for i := range e.table[:] { - e.table[i] = tableEntryPrev{} - } - e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]} - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - var candidateAlt tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset < maxMatchOffset { - candidateAlt = candidates.Prev - } - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - // Try alternative candidate if match length < matchLenGood. - if l < matchLenGood-4 && candidateAlt.offset != 0 { - t2 := candidateAlt.offset - e.cur + 4 - l2 := e.matchlen(s, t2, src) - if l2 > l { - l = l2 - t = t2 - } - } - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - t += l - // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash = hash(cv) - e.table[nextHash&tableMask] = tableEntryPrev{ - Prev: e.table[nextHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + t, val: cv}, - } - } - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - candidateAlt = tableEntry{} - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset <= maxMatchOffset { - candidateAlt = candidates.Prev - } - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -func (e *snappyGen) matchlen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) - } - - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } - - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } - - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] - } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } - } - - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n - } - - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] - for i := range a { - if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset the encoding table. -func (e *snappyGen) Reset() { - e.prev = e.prev[:0] - e.cur += maxMatchOffset -} diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go new file mode 100644 index 000000000..524ee0ae3 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -0,0 +1,252 @@ +package flate + +import ( + "io" + "math" +) + +const ( + maxStatelessBlock = math.MaxInt16 + + slTableBits = 13 + slTableSize = 1 << slTableBits + slTableShift = 32 - slTableBits +) + +type statelessWriter struct { + dst io.Writer + closed bool +} + +func (s *statelessWriter) Close() error { + if s.closed { + return nil + } + s.closed = true + // Emit EOF block + return StatelessDeflate(s.dst, nil, true) +} + +func (s *statelessWriter) Write(p []byte) (n int, err error) { + err = StatelessDeflate(s.dst, p, false) + if err != nil { + return 0, err + } + return len(p), nil +} + +func (s *statelessWriter) Reset(w io.Writer) { + s.dst = w + s.closed = false +} + +// NewStatelessWriter will do compression but without maintaining any state +// between Write calls. +// There will be no memory kept between Write calls, +// but compression and speed will be suboptimal. +// Because of this, the size of actual Write calls will affect output size. +func NewStatelessWriter(dst io.Writer) io.WriteCloser { + return &statelessWriter{dst: dst} +} + +// StatelessDeflate allows to compress directly to a Writer without retaining state. +// When returning everything will be flushed. +func StatelessDeflate(out io.Writer, in []byte, eof bool) error { + var dst tokens + bw := newHuffmanBitWriter(out) + if eof && len(in) == 0 { + // Just write an EOF block. + // Could be faster... + bw.writeStoredHeader(0, true) + bw.flush() + return bw.err + } + + for len(in) > 0 { + todo := in + if len(todo) > maxStatelessBlock { + todo = todo[:maxStatelessBlock] + } + in = in[len(todo):] + // Compress + statelessEnc(&dst, todo) + isEof := eof && len(in) == 0 + + if dst.n == 0 { + bw.writeStoredHeader(len(todo), isEof) + if bw.err != nil { + return bw.err + } + bw.writeBytes(todo) + } else if int(dst.n) > len(todo)-len(todo)>>4 { + // If we removed less than 1/16th, huffman compress the block. + bw.writeBlockHuff(isEof, todo, false) + } else { + bw.writeBlockDynamic(&dst, isEof, todo, false) + } + if bw.err != nil { + return bw.err + } + dst.Reset() + } + if !eof { + // Align. + bw.writeStoredHeader(0, false) + } + bw.flush() + return bw.err +} + +func hashSL(u uint32) uint32 { + return (u * 0x1e35a7bd) >> slTableShift +} + +func load3216(b []byte, i int16) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load6416(b []byte, i int16) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func statelessEnc(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + type tableEntry struct { + offset int16 + } + + var table [slTableSize]tableEntry + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + s := int16(1) + nextEmit := int16(0) + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int16(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3216(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashSL(cv) + candidate = table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit || nextS <= 0 { + goto emitRemainder + } + + now := load6416(src, nextS) + table[nextHash] = tableEntry{offset: s} + nextHash = hashSL(uint32(now)) + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = table[nextHash] + now >>= 8 + table[nextHash] = tableEntry{offset: s} + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset + l := int16(matchLen(src[s+4:], src[t+4:]) + 4) + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + // Save the match found + dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + if s >= sLimit { + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6416(src, s-2) + o := s - 2 + prevHash := hashSL(uint32(x)) + table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashSL(uint32(x)) + candidate = table[currHash] + table[currHash] = tableEntry{offset: o + 2} + + if uint32(x) != load3216(src, candidate.offset) { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index 141299b97..b3df0d894 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -4,6 +4,14 @@ package flate +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" +) + const ( // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused // 8 bits: xlength = length - MIN_MATCH_LENGTH @@ -46,6 +54,36 @@ var lengthCodes = [256]uint8{ 27, 27, 27, 27, 27, 28, } +// lengthCodes1 is length codes, but starting at 1. +var lengthCodes1 = [256]uint8{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, + 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 29, +} + var offsetCodes = [256]uint32{ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, @@ -65,19 +103,236 @@ var offsetCodes = [256]uint32{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, } +// offsetCodes14 are offsetCodes, but with 14 added. +var offsetCodes14 = [256]uint32{ + 14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, +} + type token uint32 type tokens struct { - tokens [maxStoreBlockSize + 1]token - n uint16 // Must be able to contain maxStoreBlockSize + nLits int + extraHist [32]uint16 // codes 256->maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + n uint16 // Must be able to contain maxStoreBlockSize + tokens [maxStoreBlockSize + 1]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nLits = 0 + for i := range t.litHist[:] { + t.litHist[i] = 0 + } + for i := range t.extraHist[:] { + t.extraHist[i] = 0 + } + for i := range t.offHist[:] { + t.offHist[i] = 0 + } +} + +func (t *tokens) Fill() { + if t.n == 0 { + return + } + for i, v := range t.litHist[:] { + if v == 0 { + t.litHist[i] = 1 + t.nLits++ + } + } + for i, v := range t.extraHist[:literalCount-256] { + if v == 0 { + t.nLits++ + t.extraHist[i] = 1 + } + } + for i, v := range t.offHist[:offsetCodeCount] { + if v == 0 { + t.offHist[i] = 1 + } + } +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.tokens[t.n] = tok + t.litHist[tok]++ + t.n++ + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()) + } } -// Convert a literal into a literal token. -func literalToken(literal uint32) token { return token(literalType + literal) } +// emitLiteral writes a literal chunk and returns the number of bytes written. +func emitLiteral(dst *tokens, lit []byte) { + ol := int(dst.n) + for i, v := range lit { + dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) + dst.litHist[v]++ + } + dst.n += uint16(len(lit)) + dst.nLits += len(lit) +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength<<lengthShift + xoffset) +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ + t.nLits++ +} + +// EstimatedBits will return an minimum size estimated by an *optimal* +// compression of the block. +// The size of the block +func (t *tokens) EstimatedBits() int { + shannon := float64(0) + bits := int(0) + nMatches := 0 + if t.nLits > 0 { + invTotal := 1.0 / float64(t.nLits) + for _, v := range t.litHist[:] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + } + } + // Just add 15 for EOB + shannon += 15 + for _, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + bits += int(lengthExtraBits[v&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float64(nMatches) + for _, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + bits += int(offsetExtraBits[v&31]) * int(n) + } + } + } + + return int(shannon) + bits +} + +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + if debugDecode { + if xlength >= maxMatchLength+baseMatchLength { + panic(fmt.Errorf("invalid length: %v", xlength)) + } + if xoffset >= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + t.nLits++ + lengthCode := lengthCodes1[uint8(xlength)] & 31 + t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) + t.extraHist[lengthCode]++ + t.offHist[offsetCode(xoffset)&31]++ + t.n++ +} + +// AddMatchLong adds a match to the tokens, potentially longer than max match length. +// Length should NOT have the base subtracted, only offset should. +func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { + if debugDecode { + if xoffset >= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + oc := offsetCode(xoffset) & 31 + for xlength > 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + xl = 258 - baseMatchLength + } + xlength -= xl + xl -= 3 + t.nLits++ + lengthCode := lengthCodes1[uint8(xl)] & 31 + t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) + t.extraHist[lengthCode]++ + t.offHist[oc]++ + t.n++ + } +} + +func (t *tokens) AddEOB() { + t.tokens[t.n] = token(endBlockMarker) + t.extraHist[0]++ + t.n++ +} + +func (t *tokens) Slice() []token { + return t.tokens[:t.n] +} + +// VarInt returns the tokens as varint encoded bytes. +func (t *tokens) VarInt() []byte { + var b = make([]byte, binary.MaxVarintLen32*int(t.n)) + var off int + for _, v := range t.tokens[:t.n] { + off += binary.PutUvarint(b[off:], uint64(v)) + } + return b[:off] +} + +// FromVarInt restores t to the varint encoded tokens provided. +// Any data in t is removed. +func (t *tokens) FromVarInt(b []byte) error { + var buf = bytes.NewReader(b) + var toks []token + for { + r, err := binary.ReadUvarint(buf) + if err == io.EOF { + break + } + if err != nil { + return err + } + toks = append(toks, token(r)) + } + t.indexTokens(toks) + return nil } // Returns the type of a token @@ -96,11 +351,17 @@ func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) } // Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { + if false { + if off < uint32(len(offsetCodes)) { + return offsetCodes[off&255] + } else if off>>7 < uint32(len(offsetCodes)) { + return offsetCodes[(off>>7)&255] + 14 + } else { + return offsetCodes[(off>>14)&255] + 28 + } + } if off < uint32(len(offsetCodes)) { - return offsetCodes[off&255] - } else if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[(off>>7)&255] + 14 - } else { - return offsetCodes[(off>>14)&255] + 28 + return offsetCodes[uint8(off)] } + return offsetCodes14[uint8(off>>7)] } diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go index dd4f7fefb..51e00aaeb 100644 --- a/vendor/github.com/klauspost/compress/huff0/compress.go +++ b/vendor/github.com/klauspost/compress/huff0/compress.go @@ -54,6 +54,12 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error) canReuse = s.canUseTable(s.prevTable) } + // We want the output size to be less than this: + wantSize := len(in) + if s.WantLogLess > 0 { + wantSize -= wantSize >> s.WantLogLess + } + // Reset for next run. s.clearCount = true s.maxCount = 0 @@ -77,7 +83,7 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error) s.cTable = s.prevTable s.Out, err = compressor(in) s.cTable = keepTable - if err == nil && len(s.Out) < len(in) { + if err == nil && len(s.Out) < wantSize { s.OutData = s.Out return s.Out, true, nil } @@ -100,13 +106,16 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error) hSize := len(s.Out) oldSize := s.prevTable.estimateSize(s.count[:s.symbolLen]) newSize := s.cTable.estimateSize(s.count[:s.symbolLen]) - if oldSize <= hSize+newSize || hSize+12 >= len(in) { + if oldSize <= hSize+newSize || hSize+12 >= wantSize { // Retain cTable even if we re-use. keepTable := s.cTable s.cTable = s.prevTable s.Out, err = compressor(in) s.cTable = keepTable - if len(s.Out) >= len(in) { + if err != nil { + return nil, false, err + } + if len(s.Out) >= wantSize { return nil, false, ErrIncompressible } s.OutData = s.Out @@ -128,7 +137,7 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error) s.OutTable = nil return nil, false, err } - if len(s.Out) >= len(in) { + if len(s.Out) >= wantSize { s.OutTable = nil return nil, false, ErrIncompressible } diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go index 43b4815b3..7e68a4eb4 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress.go @@ -276,6 +276,7 @@ func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) { // Use temp table to avoid bound checks/append penalty. var tmp = s.huffWeight[:256] var off uint8 + var decoded int // Decode 2 values from each decoder/loop. const bufoff = 256 / 4 @@ -306,6 +307,7 @@ bigloop: copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4]) off = 0 dstOut = dstOut[bufoff:] + decoded += 256 // There must at least be 3 buffers left. if len(dstOut) < dstEvery*3 { return nil, errors.New("corruption detected: stream overrun 2") @@ -321,9 +323,11 @@ bigloop: copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2]) copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3]) copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4]) + decoded += int(off) * 4 dstOut = dstOut[off:] } + // Decode remaining. for i := range br { offset := dstEvery * i br := &br[i] @@ -335,12 +339,15 @@ bigloop: dstOut[offset] = decode(br) offset++ } + decoded += offset - dstEvery*i err = br.close() if err != nil { return nil, err } } - + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } return s.Out, nil } diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go index 6f823f94d..6bc23bbf0 100644 --- a/vendor/github.com/klauspost/compress/huff0/huff0.go +++ b/vendor/github.com/klauspost/compress/huff0/huff0.go @@ -89,6 +89,12 @@ type Scratch struct { // Reuse will specify the reuse policy Reuse ReusePolicy + // WantLogLess allows to specify a log 2 reduction that should at least be achieved, + // otherwise the block will be returned as incompressible. + // The reduction should then at least be (input size >> WantLogLess) + // If WantLogLess == 0 any improvement will do. + WantLogLess uint8 + // MaxDecodedSize will set the maximum allowed output size. // This value will automatically be set to BlockSizeMax if not set. // Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded. diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index 3e161ea15..47cc21d6d 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -11,6 +11,7 @@ import ( "sync" "github.com/klauspost/compress/huff0" + "github.com/klauspost/compress/zstd/internal/xxhash" ) type blockType uint8 @@ -160,7 +161,8 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { b.data, err = br.readBig(cSize, b.dataStorage) if err != nil { if debug { - println("Reading block:", err) + println("Reading block:", err, "(", cSize, ")", len(b.data)) + printf("%T", br) } return err } @@ -275,7 +277,7 @@ func (b *blockDec) decodeBuf(hist *history) error { hist.b = nil err := b.decodeCompressed(hist) if debug { - println("Decompressed to total", len(b.dst), "bytes, error:", err) + println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err) } hist.b = b.dst b.dst = saved @@ -368,7 +370,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { } } if debug { - println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize", litCompSize) + println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams) } var literals []byte var huff *huff0.Scratch @@ -426,7 +428,6 @@ func (b *blockDec) decodeCompressed(hist *history) error { } literals = in[:litCompSize] in = in[litCompSize:] - huff = huffDecoderPool.Get().(*huff0.Scratch) var err error // Ensure we have space to store it. @@ -637,7 +638,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { hist.huffTree = huff } if debug { - println("Final literals:", len(literals), "and", nSeqs, "sequences.") + println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.") } if nSeqs == 0 { diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go index 9d9151a0e..8383279d2 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockenc.go +++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go @@ -51,7 +51,7 @@ func (b *blockEnc) init() { b.coders.llEnc = &fseEncoder{} b.coders.llPrev = &fseEncoder{} } - b.litEnc = &huff0.Scratch{} + b.litEnc = &huff0.Scratch{WantLogLess: 4} b.reset(nil) } @@ -391,6 +391,52 @@ func (b *blockEnc) encodeLits() error { return nil } +// fuzzFseEncoder can be used to fuzz the FSE encoder. +func fuzzFseEncoder(data []byte) int { + if len(data) > maxSequences || len(data) < 2 { + return 0 + } + enc := fseEncoder{} + hist := enc.Histogram()[:256] + maxSym := uint8(0) + for i, v := range data { + v = v & 63 + data[i] = v + hist[v]++ + if v > maxSym { + maxSym = v + } + } + if maxSym == 0 { + // All 0 + return 0 + } + maxCount := func(a []uint32) int { + var max uint32 + for _, v := range a { + if v > max { + max = v + } + } + return int(max) + } + cnt := maxCount(hist[:maxSym]) + if cnt == len(data) { + // RLE + return 0 + } + enc.HistogramFinished(maxSym, cnt) + err := enc.normalizeCount(len(data)) + if err != nil { + return 0 + } + _, err = enc.writeCount(nil) + if err != nil { + panic(err) + } + return 1 +} + // encode will encode the block and put the output in b.output. func (b *blockEnc) encode() error { if len(b.sequences) == 0 { @@ -415,16 +461,10 @@ func (b *blockEnc) encode() error { if len(b.literals) >= 1024 { // Use 4 Streams. out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc) - if len(out) > len(b.literals)-len(b.literals)>>4 { - err = huff0.ErrIncompressible - } } else if len(b.literals) > 32 { // Use 1 stream single = true out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc) - if len(out) > len(b.literals)-len(b.literals)>>4 { - err = huff0.ErrIncompressible - } } else { err = huff0.ErrIncompressible } @@ -711,7 +751,7 @@ func (b *blockEnc) encode() error { return nil } -var errIncompressible = errors.New("uncompressible") +var errIncompressible = errors.New("incompressible") func (b *blockEnc) genCodes() { if len(b.sequences) == 0 { diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 3538063f1..07321acb1 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -101,6 +101,9 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) { dst = make([]byte, n) } n2, err := io.ReadFull(r.r, dst[:n]) + if err == io.EOF && n > 0 { + err = io.ErrUnexpectedEOF + } return dst[:n2], err } diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index f4db3096a..1de94eef0 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -75,6 +75,7 @@ var ( // The Reset function can be used to initiate a new stream, which is will considerably // reduce the allocations normally caused by NewReader. func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { + initPredefined() var d Decoder d.o.setDefault() for _, o := range opts { @@ -123,7 +124,9 @@ func (d *Decoder) Read(p []byte) (int, error) { if d.current.err != nil { break } - d.nextBlock() + if !d.nextBlock(n == 0) { + return n, nil + } } } if len(d.current.b) > 0 { @@ -251,7 +254,7 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) { if d.current.err != nil { break } - d.nextBlock() + d.nextBlock(true) } err := d.current.err if err != nil { @@ -328,7 +331,10 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { // nextBlock returns the next block. // If an error occurs d.err will be set. -func (d *Decoder) nextBlock() { +// Optionally the function can block for new output. +// If non-blocking mode is used the returned boolean will be false +// if no data was available without blocking. +func (d *Decoder) nextBlock(blocking bool) (ok bool) { if d.current.d != nil { if debug { printf("re-adding current decoder %p", d.current.d) @@ -338,12 +344,22 @@ func (d *Decoder) nextBlock() { } if d.current.err != nil { // Keep error state. - return + return blocking + } + + if blocking { + d.current.decodeOutput = <-d.current.output + } else { + select { + case d.current.decodeOutput = <-d.current.output: + default: + return false + } } - d.current.decodeOutput = <-d.current.output if debug { println("got", len(d.current.b), "bytes, error:", d.current.err) } + return true } // Close will release all resources. diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go index e120625d8..2f41bcd0d 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go @@ -235,7 +235,7 @@ encodeLoop: if debug && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } - if debug { + if debugMatches { println("long match") } break @@ -259,7 +259,7 @@ encodeLoop: // but the likelihood of both the first 4 bytes and the hash matching should be enough. t = candidateL.offset - e.cur s += checkAt - if debug { + if debugMatches { println("long match (after short)") } break @@ -275,7 +275,7 @@ encodeLoop: if debug && t < 0 { panic("t<0") } - if debug { + if debugMatches { println("short match") } break diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index b7011be29..d79188271 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -59,6 +59,7 @@ type encoderState struct { // NewWriter will create a new Zstandard encoder. // If the encoder will be used for encoding blocks a nil writer can be used. func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) { + initPredefined() var e Encoder e.o.setDefault() for _, o := range opts { @@ -393,12 +394,31 @@ func (e *Encoder) Close() error { // EncodeAll will encode all input in src and append it to dst. // This function can be called concurrently, but each call will only run on a single goroutine. -// If empty input is given, nothing is returned. +// If empty input is given, nothing is returned, unless WithZeroFrames is specified. // Encoded blocks can be concatenated and the result will be the combined input stream. // Data compressed with EncodeAll can be decoded with the Decoder, // using either a stream or DecodeAll. func (e *Encoder) EncodeAll(src, dst []byte) []byte { if len(src) == 0 { + if e.o.fullZero { + // Add frame header. + fh := frameHeader{ + ContentSize: 0, + WindowSize: MinWindowSize, + SingleSegment: true, + // Adding a checksum would be a waste of space. + Checksum: false, + DictID: 0, + } + dst, _ = fh.appendTo(dst) + + // Write raw block as last one only. + var blk blockHeader + blk.setSize(0) + blk.setType(blockTypeRaw) + blk.setLast(true) + dst = blk.appendTo(dst) + } return dst } e.init.Do(func() { diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index a8559e900..0f83a325a 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -1,6 +1,7 @@ package zstd import ( + "errors" "fmt" "runtime" "strings" @@ -18,6 +19,7 @@ type encoderOptions struct { blockSize int windowSize int level EncoderLevel + fullZero bool } func (o *encoderOptions) setDefault() { @@ -63,6 +65,30 @@ func WithEncoderConcurrency(n int) EOption { } } +// WithWindowSize will set the maximum allowed back-reference distance. +// The value must be a power of two between WindowSizeMin and WindowSizeMax. +// A larger value will enable better compression but allocate more memory and, +// for above-default values, take considerably longer. +// The default value is determined by the compression level. +func WithWindowSize(n int) EOption { + return func(o *encoderOptions) error { + switch { + case n < MinWindowSize: + return fmt.Errorf("window size must be at least %d", MinWindowSize) + case n > MaxWindowSize: + return fmt.Errorf("window size must be at most %d", MaxWindowSize) + case (n & (n - 1)) != 0: + return errors.New("window size must be a power of 2") + } + + o.windowSize = n + if o.blockSize > o.windowSize { + o.blockSize = o.windowSize + } + return nil + } +} + // WithEncoderPadding will add padding to all output so the size will be a multiple of n. // This can be used to obfuscate the exact output size or make blocks of a certain size. // The contents will be a skippable frame, so it will be invisible by the decoder. @@ -166,6 +192,16 @@ func WithEncoderLevel(l EncoderLevel) EOption { } } +// WithZeroFrames will encode 0 length input as full frames. +// This can be needed for compatibility with zstandard usage, +// but is not needed for this package. +func WithZeroFrames(b bool) EOption { + return func(o *encoderOptions) error { + o.fullZero = b + return nil + } +} + // WithSingleSegment will set the "single segment" flag when EncodeAll is used. // If this flag is set, data must be regenerated within a single continuous memory segment. // In this case, Window_Descriptor byte is skipped, but Frame_Content_Size is necessarily present. diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index 839a95fbf..9e00437a2 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -49,7 +49,8 @@ type frameDec struct { const ( // The minimum Window_Size is 1 KB. - minWindowSize = 1 << 10 + MinWindowSize = 1 << 10 + MaxWindowSize = 1 << 30 ) var ( @@ -60,7 +61,7 @@ var ( func newFrameDec(o decoderOptions) *frameDec { d := frameDec{ o: o, - maxWindowSize: 1 << 30, + maxWindowSize: MaxWindowSize, } if d.maxWindowSize > o.maxDecodedSize { d.maxWindowSize = o.maxDecodedSize @@ -193,14 +194,14 @@ func (d *frameDec) reset(br byteBuffer) error { // When FCS_Field_Size is 2, the offset of 256 is added. d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) + 256 case 4: - d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3] << 24)) + d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24) case 8: d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24) d.FrameContentSize = uint64(d1) | (uint64(d2) << 32) } if debug { - println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize])) + println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize) } } // Move this to shared. @@ -215,8 +216,8 @@ func (d *frameDec) reset(br byteBuffer) error { if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize - if d.WindowSize < minWindowSize { - d.WindowSize = minWindowSize + if d.WindowSize < MinWindowSize { + d.WindowSize = MinWindowSize } } @@ -225,7 +226,7 @@ func (d *frameDec) reset(br byteBuffer) error { return ErrWindowSizeExceeded } // The minimum Window_Size is 1 KB. - if d.WindowSize < minWindowSize { + if d.WindowSize < MinWindowSize { println("got window size: ", d.WindowSize) return ErrWindowSizeTooSmall } @@ -309,7 +310,9 @@ func (d *frameDec) checkCRC() error { } return ErrCRCMismatch } - println("CRC ok") + if debug { + println("CRC ok", tmp[:]) + } return nil } @@ -411,6 +414,7 @@ func (d *frameDec) startDecoder(output chan decodeOutput) { } written += int64(len(r.b)) if d.SingleSegment && uint64(written) > d.FrameContentSize { + println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize) r.err = ErrFrameSizeExceeded output <- r return @@ -461,6 +465,7 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { break } if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize { + println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize) err = ErrFrameSizeExceeded break } diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go index acac32527..4479cfe18 100644 --- a/vendor/github.com/klauspost/compress/zstd/frameenc.go +++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go @@ -5,7 +5,6 @@ package zstd import ( - "errors" "fmt" "io" "math" @@ -49,9 +48,7 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) { windowLog := (bits.Len32(f.WindowSize-1) - winLogMin) << 3 dst = append(dst, uint8(windowLog)) } - if f.SingleSegment && f.ContentSize == 0 { - return nil, errors.New("single segment, but no size set") - } + switch fcs { case 0: if f.SingleSegment { diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go index dfa6cf7ce..619836f52 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go @@ -502,13 +502,22 @@ func (s *fseEncoder) validateNorm() (err error) { // writeCount will write the normalized histogram count to header. // This is read back by readNCount. func (s *fseEncoder) writeCount(out []byte) ([]byte, error) { + if s.useRLE { + return append(out, s.rleVal), nil + } + if s.preDefined || s.reUsed { + // Never write predefined. + return out, nil + } + var ( tableLog = s.actualTableLog tableSize = 1 << tableLog previous0 bool charnum uint16 - maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3 + // maximum header size plus 2 extra bytes for final output if bitCount == 0. + maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3 + 2 // Write Table Size bitStream = uint32(tableLog - minEncTablelog) @@ -516,15 +525,12 @@ func (s *fseEncoder) writeCount(out []byte) ([]byte, error) { remaining = int16(tableSize + 1) /* +1 for extra accuracy */ threshold = int16(tableSize) nbBits = uint(tableLog + 1) + outP = len(out) ) - if s.useRLE { - return append(out, s.rleVal), nil + if cap(out) < outP+maxHeaderSize { + out = append(out, make([]byte, maxHeaderSize*3)...) + out = out[:len(out)-maxHeaderSize*3] } - if s.preDefined || s.reUsed { - // Never write predefined. - return out, nil - } - outP := len(out) out = out[:outP+maxHeaderSize] // stops at 1 @@ -594,11 +600,14 @@ func (s *fseEncoder) writeCount(out []byte) ([]byte, error) { } } + if outP+2 > len(out) { + return nil, fmt.Errorf("internal error: %d > %d, maxheader: %d, sl: %d, tl: %d, normcount: %v", outP+2, len(out), maxHeaderSize, s.symbolLen, int(tableLog), s.norm[:s.symbolLen]) + } out[outP] = byte(bitStream) out[outP+1] = byte(bitStream >> 8) outP += int((bitCount + 7) / 8) - if uint16(charnum) > s.symbolLen { + if charnum > s.symbolLen { return nil, errors.New("internal error: charnum > s.symbolLen") } return out[:outP], nil diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go index 5186de802..6c17dc17f 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go @@ -7,6 +7,7 @@ package zstd import ( "fmt" "math" + "sync" ) var ( @@ -69,85 +70,89 @@ func fillBase(dst []baseOffset, base uint32, bits ...uint8) { } } -func init() { - // Literals length codes - tmp := make([]baseOffset, 36) - for i := range tmp[:16] { - tmp[i] = baseOffset{ - baseLine: uint32(i), - addBits: 0, +var predef sync.Once + +func initPredefined() { + predef.Do(func() { + // Literals length codes + tmp := make([]baseOffset, 36) + for i := range tmp[:16] { + tmp[i] = baseOffset{ + baseLine: uint32(i), + addBits: 0, + } } - } - fillBase(tmp[16:], 16, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) - symbolTableX[tableLiteralLengths] = tmp - - // Match length codes - tmp = make([]baseOffset, 53) - for i := range tmp[:32] { - tmp[i] = baseOffset{ - // The transformation adds the 3 length. - baseLine: uint32(i) + 3, - addBits: 0, - } - } - fillBase(tmp[32:], 35, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) - symbolTableX[tableMatchLengths] = tmp - - // Offset codes - tmp = make([]baseOffset, maxOffsetBits+1) - tmp[1] = baseOffset{ - baseLine: 1, - addBits: 1, - } - fillBase(tmp[2:], 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30) - symbolTableX[tableOffsets] = tmp - - // Fill predefined tables and transform them. - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions - for i := range fsePredef[:] { - f := &fsePredef[i] - switch tableIndex(i) { - case tableLiteralLengths: - // https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L243 - f.actualTableLog = 6 - copy(f.norm[:], []int16{4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, - -1, -1, -1, -1}) - f.symbolLen = 36 - case tableOffsets: - // https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L281 - f.actualTableLog = 5 - copy(f.norm[:], []int16{ - 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}) - f.symbolLen = 29 - case tableMatchLengths: - //https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L304 - f.actualTableLog = 6 - copy(f.norm[:], []int16{ - 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, - -1, -1, -1, -1, -1}) - f.symbolLen = 53 + fillBase(tmp[16:], 16, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + symbolTableX[tableLiteralLengths] = tmp + + // Match length codes + tmp = make([]baseOffset, 53) + for i := range tmp[:32] { + tmp[i] = baseOffset{ + // The transformation adds the 3 length. + baseLine: uint32(i) + 3, + addBits: 0, + } } - if err := f.buildDtable(); err != nil { - panic(fmt.Errorf("building table %v: %v", tableIndex(i), err)) + fillBase(tmp[32:], 35, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + symbolTableX[tableMatchLengths] = tmp + + // Offset codes + tmp = make([]baseOffset, maxOffsetBits+1) + tmp[1] = baseOffset{ + baseLine: 1, + addBits: 1, } - if err := f.transform(symbolTableX[i]); err != nil { - panic(fmt.Errorf("building table %v: %v", tableIndex(i), err)) + fillBase(tmp[2:], 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30) + symbolTableX[tableOffsets] = tmp + + // Fill predefined tables and transform them. + // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions + for i := range fsePredef[:] { + f := &fsePredef[i] + switch tableIndex(i) { + case tableLiteralLengths: + // https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L243 + f.actualTableLog = 6 + copy(f.norm[:], []int16{4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, + -1, -1, -1, -1}) + f.symbolLen = 36 + case tableOffsets: + // https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L281 + f.actualTableLog = 5 + copy(f.norm[:], []int16{ + 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}) + f.symbolLen = 29 + case tableMatchLengths: + //https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L304 + f.actualTableLog = 6 + copy(f.norm[:], []int16{ + 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, + -1, -1, -1, -1, -1}) + f.symbolLen = 53 + } + if err := f.buildDtable(); err != nil { + panic(fmt.Errorf("building table %v: %v", tableIndex(i), err)) + } + if err := f.transform(symbolTableX[i]); err != nil { + panic(fmt.Errorf("building table %v: %v", tableIndex(i), err)) + } + f.preDefined = true + + // Create encoder as well + enc := &fsePredefEnc[i] + copy(enc.norm[:], f.norm[:]) + enc.symbolLen = f.symbolLen + enc.actualTableLog = f.actualTableLog + if err := enc.buildCTable(); err != nil { + panic(fmt.Errorf("building encoding table %v: %v", tableIndex(i), err)) + } + enc.setBits(bitTables[i]) + enc.preDefined = true } - f.preDefined = true - - // Create encoder as well - enc := &fsePredefEnc[i] - copy(enc.norm[:], f.norm[:]) - enc.symbolLen = f.symbolLen - enc.actualTableLog = f.actualTableLog - if err := enc.buildCTable(); err != nil { - panic(fmt.Errorf("building encoding table %v: %v", tableIndex(i), err)) - } - enc.setBits(bitTables[i]) - enc.preDefined = true - } + }) } diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go index e9e518570..a048818f9 100644 --- a/vendor/github.com/klauspost/compress/zstd/snappy.go +++ b/vendor/github.com/klauspost/compress/zstd/snappy.go @@ -80,6 +80,7 @@ type SnappyConverter struct { // If any error is detected on the Snappy stream it is returned. // The number of bytes written is returned. func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { + initPredefined() r.err = nil r.r = in if r.block == nil { diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index b975954c1..57a8a2f5b 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -11,6 +11,7 @@ import ( const debug = false const debugSequences = false +const debugMatches = false // force encoder to use predefined tables. const forcePreDef = false diff --git a/vendor/github.com/klauspost/cpuid/.gitignore b/vendor/github.com/klauspost/cpuid/.gitignore deleted file mode 100644 index daf913b1b..000000000 --- a/vendor/github.com/klauspost/cpuid/.gitignore +++ /dev/null @@ -1,24 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof diff --git a/vendor/github.com/klauspost/cpuid/.travis.yml b/vendor/github.com/klauspost/cpuid/.travis.yml deleted file mode 100644 index 630192d59..000000000 --- a/vendor/github.com/klauspost/cpuid/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: go - -sudo: false - -os: - - linux - - osx -go: - - 1.8.x - - 1.9.x - - 1.10.x - - master - -script: - - go vet ./... - - go test -v ./... - - go test -race ./... - - diff <(gofmt -d .) <("") - -matrix: - allow_failures: - - go: 'master' - fast_finish: true diff --git a/vendor/github.com/klauspost/cpuid/CONTRIBUTING.txt b/vendor/github.com/klauspost/cpuid/CONTRIBUTING.txt deleted file mode 100644 index 2ef4714f7..000000000 --- a/vendor/github.com/klauspost/cpuid/CONTRIBUTING.txt +++ /dev/null @@ -1,35 +0,0 @@ -Developer Certificate of Origin
-Version 1.1
-
-Copyright (C) 2015- Klaus Post & Contributors.
-Email: klauspost@gmail.com
-
-Everyone is permitted to copy and distribute verbatim copies of this
-license document, but changing it is not allowed.
-
-
-Developer's Certificate of Origin 1.1
-
-By making a contribution to this project, I certify that:
-
-(a) The contribution was created in whole or in part by me and I
- have the right to submit it under the open source license
- indicated in the file; or
-
-(b) The contribution is based upon previous work that, to the best
- of my knowledge, is covered under an appropriate open source
- license and I have the right under that license to submit that
- work with modifications, whether created in whole or in part
- by me, under the same open source license (unless I am
- permitted to submit under a different license), as indicated
- in the file; or
-
-(c) The contribution was provided directly to me by some other
- person who certified (a), (b) or (c) and I have not modified
- it.
-
-(d) I understand and agree that this project and the contribution
- are public and that a record of the contribution (including all
- personal information I submit with it, including my sign-off) is
- maintained indefinitely and may be redistributed consistent with
- this project or the open source license(s) involved.
diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE deleted file mode 100644 index 5cec7ee94..000000000 --- a/vendor/github.com/klauspost/cpuid/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Klaus Post - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/vendor/github.com/klauspost/cpuid/README.md b/vendor/github.com/klauspost/cpuid/README.md deleted file mode 100644 index a7fb41fbe..000000000 --- a/vendor/github.com/klauspost/cpuid/README.md +++ /dev/null @@ -1,147 +0,0 @@ -# cpuid -Package cpuid provides information about the CPU running the current program. - -CPU features are detected on startup, and kept for fast access through the life of the application. -Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. - -You can access the CPU information by accessing the shared CPU variable of the cpuid library. - -Package home: https://github.com/klauspost/cpuid - -[![GoDoc][1]][2] [![Build Status][3]][4] - -[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg -[2]: https://godoc.org/github.com/klauspost/cpuid -[3]: https://travis-ci.org/klauspost/cpuid.svg -[4]: https://travis-ci.org/klauspost/cpuid - -# features -## CPU Instructions -* **CMOV** (i686 CMOV) -* **NX** (NX (No-Execute) bit) -* **AMD3DNOW** (AMD 3DNOW) -* **AMD3DNOWEXT** (AMD 3DNowExt) -* **MMX** (standard MMX) -* **MMXEXT** (SSE integer functions or AMD MMX ext) -* **SSE** (SSE functions) -* **SSE2** (P4 SSE functions) -* **SSE3** (Prescott SSE3 functions) -* **SSSE3** (Conroe SSSE3 functions) -* **SSE4** (Penryn SSE4.1 functions) -* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions) -* **SSE42** (Nehalem SSE4.2 functions) -* **AVX** (AVX functions) -* **AVX2** (AVX2 functions) -* **FMA3** (Intel FMA 3) -* **FMA4** (Bulldozer FMA4 functions) -* **XOP** (Bulldozer XOP functions) -* **F16C** (Half-precision floating-point conversion) -* **BMI1** (Bit Manipulation Instruction Set 1) -* **BMI2** (Bit Manipulation Instruction Set 2) -* **TBM** (AMD Trailing Bit Manipulation) -* **LZCNT** (LZCNT instruction) -* **POPCNT** (POPCNT instruction) -* **AESNI** (Advanced Encryption Standard New Instructions) -* **CLMUL** (Carry-less Multiplication) -* **HTT** (Hyperthreading (enabled)) -* **HLE** (Hardware Lock Elision) -* **RTM** (Restricted Transactional Memory) -* **RDRAND** (RDRAND instruction is available) -* **RDSEED** (RDSEED instruction is available) -* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions)) -* **SHA** (Intel SHA Extensions) -* **AVX512F** (AVX-512 Foundation) -* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions) -* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions) -* **AVX512PF** (AVX-512 Prefetch Instructions) -* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions) -* **AVX512CD** (AVX-512 Conflict Detection Instructions) -* **AVX512BW** (AVX-512 Byte and Word Instructions) -* **AVX512VL** (AVX-512 Vector Length Extensions) -* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions) -* **MPX** (Intel MPX (Memory Protection Extensions)) -* **ERMS** (Enhanced REP MOVSB/STOSB) -* **RDTSCP** (RDTSCP Instruction) -* **CX16** (CMPXCHG16B Instruction) -* **SGX** (Software Guard Extensions, with activation details) - -## Performance -* **RDTSCP()** Returns current cycle count. Can be used for benchmarking. -* **SSE2SLOW** (SSE2 is supported, but usually not faster) -* **SSE3SLOW** (SSE3 is supported, but usually not faster) -* **ATOM** (Atom processor, some SSSE3 instructions are slower) -* **Cache line** (Probable size of a cache line). -* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs. - -## Cpu Vendor/VM -* **Intel** -* **AMD** -* **VIA** -* **Transmeta** -* **NSC** -* **KVM** (Kernel-based Virtual Machine) -* **MSVM** (Microsoft Hyper-V or Windows Virtual PC) -* **VMware** -* **XenHVM** -* **Bhyve** -* **Hygon** - -# installing - -```go get github.com/klauspost/cpuid``` - -# example - -```Go -package main - -import ( - "fmt" - "github.com/klauspost/cpuid" -) - -func main() { - // Print basic CPU information: - fmt.Println("Name:", cpuid.CPU.BrandName) - fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores) - fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore) - fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores) - fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model) - fmt.Println("Features:", cpuid.CPU.Features) - fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine) - fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes") - fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1D, "bytes") - fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes") - fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes") - - // Test if we have a specific feature: - if cpuid.CPU.SSE() { - fmt.Println("We have Streaming SIMD Extensions") - } -} -``` - -Sample output: -``` ->go run main.go -Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz -PhysicalCores: 2 -ThreadsPerCore: 2 -LogicalCores: 4 -Family 6 Model: 42 -Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL -Cacheline bytes: 64 -We have Streaming SIMD Extensions -``` - -# private package - -In the "private" folder you can find an autogenerated version of the library you can include in your own packages. - -For this purpose all exports are removed, and functions and constants are lowercased. - -This is not a recommended way of using the library, but provided for convenience, if it is difficult for you to use external packages. - -# license - -This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go deleted file mode 100644 index db9591321..000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid.go +++ /dev/null @@ -1,1049 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// Package cpuid provides information about the CPU running the current program. -// -// CPU features are detected on startup, and kept for fast access through the life of the application. -// Currently x86 / x64 (AMD64) is supported. -// -// You can access the CPU information by accessing the shared CPU variable of the cpuid library. -// -// Package home: https://github.com/klauspost/cpuid -package cpuid - -import "strings" - -// Vendor is a representation of a CPU vendor. -type Vendor int - -const ( - Other Vendor = iota - Intel - AMD - VIA - Transmeta - NSC - KVM // Kernel-based Virtual Machine - MSVM // Microsoft Hyper-V or Windows Virtual PC - VMware - XenHVM - Bhyve - Hygon -) - -const ( - CMOV = 1 << iota // i686 CMOV - NX // NX (No-Execute) bit - AMD3DNOW // AMD 3DNOW - AMD3DNOWEXT // AMD 3DNowExt - MMX // standard MMX - MMXEXT // SSE integer functions or AMD MMX ext - SSE // SSE functions - SSE2 // P4 SSE functions - SSE3 // Prescott SSE3 functions - SSSE3 // Conroe SSSE3 functions - SSE4 // Penryn SSE4.1 functions - SSE4A // AMD Barcelona microarchitecture SSE4a instructions - SSE42 // Nehalem SSE4.2 functions - AVX // AVX functions - AVX2 // AVX2 functions - FMA3 // Intel FMA 3 - FMA4 // Bulldozer FMA4 functions - XOP // Bulldozer XOP functions - F16C // Half-precision floating-point conversion - BMI1 // Bit Manipulation Instruction Set 1 - BMI2 // Bit Manipulation Instruction Set 2 - TBM // AMD Trailing Bit Manipulation - LZCNT // LZCNT instruction - POPCNT // POPCNT instruction - AESNI // Advanced Encryption Standard New Instructions - CLMUL // Carry-less Multiplication - HTT // Hyperthreading (enabled) - HLE // Hardware Lock Elision - RTM // Restricted Transactional Memory - RDRAND // RDRAND instruction is available - RDSEED // RDSEED instruction is available - ADX // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA // Intel SHA Extensions - AVX512F // AVX-512 Foundation - AVX512DQ // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF // AVX-512 Prefetch Instructions - AVX512ER // AVX-512 Exponential and Reciprocal Instructions - AVX512CD // AVX-512 Conflict Detection Instructions - AVX512BW // AVX-512 Byte and Word Instructions - AVX512VL // AVX-512 Vector Length Extensions - AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions - MPX // Intel MPX (Memory Protection Extensions) - ERMS // Enhanced REP MOVSB/STOSB - RDTSCP // RDTSCP Instruction - CX16 // CMPXCHG16B Instruction - SGX // Software Guard Extensions - IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) - STIBP // Single Thread Indirect Branch Predictors - - // Performance indicators - SSE2SLOW // SSE2 is supported, but usually not faster - SSE3SLOW // SSE3 is supported, but usually not faster - ATOM // Atom processor, some SSSE3 instructions are slower -) - -var flagNames = map[Flags]string{ - CMOV: "CMOV", // i686 CMOV - NX: "NX", // NX (No-Execute) bit - AMD3DNOW: "AMD3DNOW", // AMD 3DNOW - AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt - MMX: "MMX", // Standard MMX - MMXEXT: "MMXEXT", // SSE integer functions or AMD MMX ext - SSE: "SSE", // SSE functions - SSE2: "SSE2", // P4 SSE2 functions - SSE3: "SSE3", // Prescott SSE3 functions - SSSE3: "SSSE3", // Conroe SSSE3 functions - SSE4: "SSE4.1", // Penryn SSE4.1 functions - SSE4A: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions - SSE42: "SSE4.2", // Nehalem SSE4.2 functions - AVX: "AVX", // AVX functions - AVX2: "AVX2", // AVX functions - FMA3: "FMA3", // Intel FMA 3 - FMA4: "FMA4", // Bulldozer FMA4 functions - XOP: "XOP", // Bulldozer XOP functions - F16C: "F16C", // Half-precision floating-point conversion - BMI1: "BMI1", // Bit Manipulation Instruction Set 1 - BMI2: "BMI2", // Bit Manipulation Instruction Set 2 - TBM: "TBM", // AMD Trailing Bit Manipulation - LZCNT: "LZCNT", // LZCNT instruction - POPCNT: "POPCNT", // POPCNT instruction - AESNI: "AESNI", // Advanced Encryption Standard New Instructions - CLMUL: "CLMUL", // Carry-less Multiplication - HTT: "HTT", // Hyperthreading (enabled) - HLE: "HLE", // Hardware Lock Elision - RTM: "RTM", // Restricted Transactional Memory - RDRAND: "RDRAND", // RDRAND instruction is available - RDSEED: "RDSEED", // RDSEED instruction is available - ADX: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA: "SHA", // Intel SHA Extensions - AVX512F: "AVX512F", // AVX-512 Foundation - AVX512DQ: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF: "AVX512PF", // AVX-512 Prefetch Instructions - AVX512ER: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions - AVX512CD: "AVX512CD", // AVX-512 Conflict Detection Instructions - AVX512BW: "AVX512BW", // AVX-512 Byte and Word Instructions - AVX512VL: "AVX512VL", // AVX-512 Vector Length Extensions - AVX512VBMI: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions - MPX: "MPX", // Intel MPX (Memory Protection Extensions) - ERMS: "ERMS", // Enhanced REP MOVSB/STOSB - RDTSCP: "RDTSCP", // RDTSCP Instruction - CX16: "CX16", // CMPXCHG16B Instruction - SGX: "SGX", // Software Guard Extensions - IBPB: "IBPB", // Indirect Branch Restricted Speculation and Indirect Branch Predictor Barrier - STIBP: "STIBP", // Single Thread Indirect Branch Predictors - - // Performance indicators - SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster - SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster - ATOM: "ATOM", // Atom processor, some SSSE3 instructions are slower - -} - -// CPUInfo contains information about the detected system CPU. -type CPUInfo struct { - BrandName string // Brand name reported by the CPU - VendorID Vendor // Comparable CPU vendor ID - Features Flags // Features of the CPU - PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. - ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable. - LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. - Family int // CPU family number - Model int // CPU model number - CacheLine int // Cache line size in bytes. Will be 0 if undetectable. - Cache struct { - L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected - L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected - L2 int // L2 Cache (per core or shared). Will be -1 if undetected - L3 int // L3 Instruction Cache (per core or shared). Will be -1 if undetected - } - SGX SGXSupport - maxFunc uint32 - maxExFunc uint32 -} - -var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) -var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) -var xgetbv func(index uint32) (eax, edx uint32) -var rdtscpAsm func() (eax, ebx, ecx, edx uint32) - -// CPU contains information about the CPU as detected on startup, -// or when Detect last was called. -// -// Use this as the primary entry point to you data, -// this way queries are -var CPU CPUInfo - -func init() { - initCPU() - Detect() -} - -// Detect will re-detect current CPU info. -// This will replace the content of the exported CPU variable. -// -// Unless you expect the CPU to change while you are running your program -// you should not need to call this function. -// If you call this, you must ensure that no other goroutine is accessing the -// exported CPU variable. -func Detect() { - CPU.maxFunc = maxFunctionID() - CPU.maxExFunc = maxExtendedFunction() - CPU.BrandName = brandName() - CPU.CacheLine = cacheLine() - CPU.Family, CPU.Model = familyModel() - CPU.Features = support() - CPU.SGX = hasSGX(CPU.Features&SGX != 0) - CPU.ThreadsPerCore = threadsPerCore() - CPU.LogicalCores = logicalCores() - CPU.PhysicalCores = physicalCores() - CPU.VendorID = vendorID() - CPU.cacheSize() -} - -// Generated here: http://play.golang.org/p/BxFH2Gdc0G - -// Cmov indicates support of CMOV instructions -func (c CPUInfo) Cmov() bool { - return c.Features&CMOV != 0 -} - -// Amd3dnow indicates support of AMD 3DNOW! instructions -func (c CPUInfo) Amd3dnow() bool { - return c.Features&AMD3DNOW != 0 -} - -// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions -func (c CPUInfo) Amd3dnowExt() bool { - return c.Features&AMD3DNOWEXT != 0 -} - -// MMX indicates support of MMX instructions -func (c CPUInfo) MMX() bool { - return c.Features&MMX != 0 -} - -// MMXExt indicates support of MMXEXT instructions -// (SSE integer functions or AMD MMX ext) -func (c CPUInfo) MMXExt() bool { - return c.Features&MMXEXT != 0 -} - -// SSE indicates support of SSE instructions -func (c CPUInfo) SSE() bool { - return c.Features&SSE != 0 -} - -// SSE2 indicates support of SSE 2 instructions -func (c CPUInfo) SSE2() bool { - return c.Features&SSE2 != 0 -} - -// SSE3 indicates support of SSE 3 instructions -func (c CPUInfo) SSE3() bool { - return c.Features&SSE3 != 0 -} - -// SSSE3 indicates support of SSSE 3 instructions -func (c CPUInfo) SSSE3() bool { - return c.Features&SSSE3 != 0 -} - -// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions -func (c CPUInfo) SSE4() bool { - return c.Features&SSE4 != 0 -} - -// SSE42 indicates support of SSE4.2 instructions -func (c CPUInfo) SSE42() bool { - return c.Features&SSE42 != 0 -} - -// AVX indicates support of AVX instructions -// and operating system support of AVX instructions -func (c CPUInfo) AVX() bool { - return c.Features&AVX != 0 -} - -// AVX2 indicates support of AVX2 instructions -func (c CPUInfo) AVX2() bool { - return c.Features&AVX2 != 0 -} - -// FMA3 indicates support of FMA3 instructions -func (c CPUInfo) FMA3() bool { - return c.Features&FMA3 != 0 -} - -// FMA4 indicates support of FMA4 instructions -func (c CPUInfo) FMA4() bool { - return c.Features&FMA4 != 0 -} - -// XOP indicates support of XOP instructions -func (c CPUInfo) XOP() bool { - return c.Features&XOP != 0 -} - -// F16C indicates support of F16C instructions -func (c CPUInfo) F16C() bool { - return c.Features&F16C != 0 -} - -// BMI1 indicates support of BMI1 instructions -func (c CPUInfo) BMI1() bool { - return c.Features&BMI1 != 0 -} - -// BMI2 indicates support of BMI2 instructions -func (c CPUInfo) BMI2() bool { - return c.Features&BMI2 != 0 -} - -// TBM indicates support of TBM instructions -// (AMD Trailing Bit Manipulation) -func (c CPUInfo) TBM() bool { - return c.Features&TBM != 0 -} - -// Lzcnt indicates support of LZCNT instruction -func (c CPUInfo) Lzcnt() bool { - return c.Features&LZCNT != 0 -} - -// Popcnt indicates support of POPCNT instruction -func (c CPUInfo) Popcnt() bool { - return c.Features&POPCNT != 0 -} - -// HTT indicates the processor has Hyperthreading enabled -func (c CPUInfo) HTT() bool { - return c.Features&HTT != 0 -} - -// SSE2Slow indicates that SSE2 may be slow on this processor -func (c CPUInfo) SSE2Slow() bool { - return c.Features&SSE2SLOW != 0 -} - -// SSE3Slow indicates that SSE3 may be slow on this processor -func (c CPUInfo) SSE3Slow() bool { - return c.Features&SSE3SLOW != 0 -} - -// AesNi indicates support of AES-NI instructions -// (Advanced Encryption Standard New Instructions) -func (c CPUInfo) AesNi() bool { - return c.Features&AESNI != 0 -} - -// Clmul indicates support of CLMUL instructions -// (Carry-less Multiplication) -func (c CPUInfo) Clmul() bool { - return c.Features&CLMUL != 0 -} - -// NX indicates support of NX (No-Execute) bit -func (c CPUInfo) NX() bool { - return c.Features&NX != 0 -} - -// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions -func (c CPUInfo) SSE4A() bool { - return c.Features&SSE4A != 0 -} - -// HLE indicates support of Hardware Lock Elision -func (c CPUInfo) HLE() bool { - return c.Features&HLE != 0 -} - -// RTM indicates support of Restricted Transactional Memory -func (c CPUInfo) RTM() bool { - return c.Features&RTM != 0 -} - -// Rdrand indicates support of RDRAND instruction is available -func (c CPUInfo) Rdrand() bool { - return c.Features&RDRAND != 0 -} - -// Rdseed indicates support of RDSEED instruction is available -func (c CPUInfo) Rdseed() bool { - return c.Features&RDSEED != 0 -} - -// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions) -func (c CPUInfo) ADX() bool { - return c.Features&ADX != 0 -} - -// SHA indicates support of Intel SHA Extensions -func (c CPUInfo) SHA() bool { - return c.Features&SHA != 0 -} - -// AVX512F indicates support of AVX-512 Foundation -func (c CPUInfo) AVX512F() bool { - return c.Features&AVX512F != 0 -} - -// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions -func (c CPUInfo) AVX512DQ() bool { - return c.Features&AVX512DQ != 0 -} - -// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions -func (c CPUInfo) AVX512IFMA() bool { - return c.Features&AVX512IFMA != 0 -} - -// AVX512PF indicates support of AVX-512 Prefetch Instructions -func (c CPUInfo) AVX512PF() bool { - return c.Features&AVX512PF != 0 -} - -// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions -func (c CPUInfo) AVX512ER() bool { - return c.Features&AVX512ER != 0 -} - -// AVX512CD indicates support of AVX-512 Conflict Detection Instructions -func (c CPUInfo) AVX512CD() bool { - return c.Features&AVX512CD != 0 -} - -// AVX512BW indicates support of AVX-512 Byte and Word Instructions -func (c CPUInfo) AVX512BW() bool { - return c.Features&AVX512BW != 0 -} - -// AVX512VL indicates support of AVX-512 Vector Length Extensions -func (c CPUInfo) AVX512VL() bool { - return c.Features&AVX512VL != 0 -} - -// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions -func (c CPUInfo) AVX512VBMI() bool { - return c.Features&AVX512VBMI != 0 -} - -// MPX indicates support of Intel MPX (Memory Protection Extensions) -func (c CPUInfo) MPX() bool { - return c.Features&MPX != 0 -} - -// ERMS indicates support of Enhanced REP MOVSB/STOSB -func (c CPUInfo) ERMS() bool { - return c.Features&ERMS != 0 -} - -// RDTSCP Instruction is available. -func (c CPUInfo) RDTSCP() bool { - return c.Features&RDTSCP != 0 -} - -// CX16 indicates if CMPXCHG16B instruction is available. -func (c CPUInfo) CX16() bool { - return c.Features&CX16 != 0 -} - -// TSX is split into HLE (Hardware Lock Elision) and RTM (Restricted Transactional Memory) detection. -// So TSX simply checks that. -func (c CPUInfo) TSX() bool { - return c.Features&(HLE|RTM) == HLE|RTM -} - -// Atom indicates an Atom processor -func (c CPUInfo) Atom() bool { - return c.Features&ATOM != 0 -} - -// Intel returns true if vendor is recognized as Intel -func (c CPUInfo) Intel() bool { - return c.VendorID == Intel -} - -// AMD returns true if vendor is recognized as AMD -func (c CPUInfo) AMD() bool { - return c.VendorID == AMD -} - -// Hygon returns true if vendor is recognized as Hygon -func (c CPUInfo) Hygon() bool { - return c.VendorID == Hygon -} - -// Transmeta returns true if vendor is recognized as Transmeta -func (c CPUInfo) Transmeta() bool { - return c.VendorID == Transmeta -} - -// NSC returns true if vendor is recognized as National Semiconductor -func (c CPUInfo) NSC() bool { - return c.VendorID == NSC -} - -// VIA returns true if vendor is recognized as VIA -func (c CPUInfo) VIA() bool { - return c.VendorID == VIA -} - -// RTCounter returns the 64-bit time-stamp counter -// Uses the RDTSCP instruction. The value 0 is returned -// if the CPU does not support the instruction. -func (c CPUInfo) RTCounter() uint64 { - if !c.RDTSCP() { - return 0 - } - a, _, _, d := rdtscpAsm() - return uint64(a) | (uint64(d) << 32) -} - -// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. -// This variable is OS dependent, but on Linux contains information -// about the current cpu/core the code is running on. -// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. -func (c CPUInfo) Ia32TscAux() uint32 { - if !c.RDTSCP() { - return 0 - } - _, _, ecx, _ := rdtscpAsm() - return ecx -} - -// LogicalCPU will return the Logical CPU the code is currently executing on. -// This is likely to change when the OS re-schedules the running thread -// to another CPU. -// If the current core cannot be detected, -1 will be returned. -func (c CPUInfo) LogicalCPU() int { - if c.maxFunc < 1 { - return -1 - } - _, ebx, _, _ := cpuid(1) - return int(ebx >> 24) -} - -// VM Will return true if the cpu id indicates we are in -// a virtual machine. This is only a hint, and will very likely -// have many false negatives. -func (c CPUInfo) VM() bool { - switch c.VendorID { - case MSVM, KVM, VMware, XenHVM, Bhyve: - return true - } - return false -} - -// Flags contains detected cpu features and caracteristics -type Flags uint64 - -// String returns a string representation of the detected -// CPU features. -func (f Flags) String() string { - return strings.Join(f.Strings(), ",") -} - -// Strings returns and array of the detected features. -func (f Flags) Strings() []string { - s := support() - r := make([]string, 0, 20) - for i := uint(0); i < 64; i++ { - key := Flags(1 << i) - val := flagNames[key] - if s&key != 0 { - r = append(r, val) - } - } - return r -} - -func maxExtendedFunction() uint32 { - eax, _, _, _ := cpuid(0x80000000) - return eax -} - -func maxFunctionID() uint32 { - a, _, _, _ := cpuid(0) - return a -} - -func brandName() string { - if maxExtendedFunction() >= 0x80000004 { - v := make([]uint32, 0, 48) - for i := uint32(0); i < 3; i++ { - a, b, c, d := cpuid(0x80000002 + i) - v = append(v, a, b, c, d) - } - return strings.Trim(string(valAsString(v...)), " ") - } - return "unknown" -} - -func threadsPerCore() int { - mfi := maxFunctionID() - if mfi < 0x4 || vendorID() != Intel { - return 1 - } - - if mfi < 0xb { - _, b, _, d := cpuid(1) - if (d & (1 << 28)) != 0 { - // v will contain logical core count - v := (b >> 16) & 255 - if v > 1 { - a4, _, _, _ := cpuid(4) - // physical cores - v2 := (a4 >> 26) + 1 - if v2 > 0 { - return int(v) / int(v2) - } - } - } - return 1 - } - _, b, _, _ := cpuidex(0xb, 0) - if b&0xffff == 0 { - return 1 - } - return int(b & 0xffff) -} - -func logicalCores() int { - mfi := maxFunctionID() - switch vendorID() { - case Intel: - // Use this on old Intel processors - if mfi < 0xb { - if mfi < 1 { - return 0 - } - // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) - // that can be assigned to logical processors in a physical package. - // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. - _, ebx, _, _ := cpuid(1) - logical := (ebx >> 16) & 0xff - return int(logical) - } - _, b, _, _ := cpuidex(0xb, 1) - return int(b & 0xffff) - case AMD, Hygon: - _, b, _, _ := cpuid(1) - return int((b >> 16) & 0xff) - default: - return 0 - } -} - -func familyModel() (int, int) { - if maxFunctionID() < 0x1 { - return 0, 0 - } - eax, _, _, _ := cpuid(1) - family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) - model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) - return int(family), int(model) -} - -func physicalCores() int { - switch vendorID() { - case Intel: - return logicalCores() / threadsPerCore() - case AMD, Hygon: - if maxExtendedFunction() >= 0x80000008 { - _, _, c, _ := cpuid(0x80000008) - return int(c&0xff) + 1 - } - } - return 0 -} - -// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID -var vendorMapping = map[string]Vendor{ - "AMDisbetter!": AMD, - "AuthenticAMD": AMD, - "CentaurHauls": VIA, - "GenuineIntel": Intel, - "TransmetaCPU": Transmeta, - "GenuineTMx86": Transmeta, - "Geode by NSC": NSC, - "VIA VIA VIA ": VIA, - "KVMKVMKVMKVM": KVM, - "Microsoft Hv": MSVM, - "VMwareVMware": VMware, - "XenVMMXenVMM": XenHVM, - "bhyve bhyve ": Bhyve, - "HygonGenuine": Hygon, -} - -func vendorID() Vendor { - _, b, c, d := cpuid(0) - v := valAsString(b, d, c) - vend, ok := vendorMapping[string(v)] - if !ok { - return Other - } - return vend -} - -func cacheLine() int { - if maxFunctionID() < 0x1 { - return 0 - } - - _, ebx, _, _ := cpuid(1) - cache := (ebx & 0xff00) >> 5 // cflush size - if cache == 0 && maxExtendedFunction() >= 0x80000006 { - _, _, ecx, _ := cpuid(0x80000006) - cache = ecx & 0xff // cacheline size - } - // TODO: Read from Cache and TLB Information - return int(cache) -} - -func (c *CPUInfo) cacheSize() { - c.Cache.L1D = -1 - c.Cache.L1I = -1 - c.Cache.L2 = -1 - c.Cache.L3 = -1 - vendor := vendorID() - switch vendor { - case Intel: - if maxFunctionID() < 4 { - return - } - for i := uint32(0); ; i++ { - eax, ebx, ecx, _ := cpuidex(4, i) - cacheType := eax & 15 - if cacheType == 0 { - break - } - cacheLevel := (eax >> 5) & 7 - coherency := int(ebx&0xfff) + 1 - partitions := int((ebx>>12)&0x3ff) + 1 - associativity := int((ebx>>22)&0x3ff) + 1 - sets := int(ecx) + 1 - size := associativity * partitions * coherency * sets - switch cacheLevel { - case 1: - if cacheType == 1 { - // 1 = Data Cache - c.Cache.L1D = size - } else if cacheType == 2 { - // 2 = Instruction Cache - c.Cache.L1I = size - } else { - if c.Cache.L1D < 0 { - c.Cache.L1I = size - } - if c.Cache.L1I < 0 { - c.Cache.L1I = size - } - } - case 2: - c.Cache.L2 = size - case 3: - c.Cache.L3 = size - } - } - case AMD, Hygon: - // Untested. - if maxExtendedFunction() < 0x80000005 { - return - } - _, _, ecx, edx := cpuid(0x80000005) - c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) - c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) - - if maxExtendedFunction() < 0x80000006 { - return - } - _, _, ecx, _ = cpuid(0x80000006) - c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) - } - - return -} - -type SGXSupport struct { - Available bool - SGX1Supported bool - SGX2Supported bool - MaxEnclaveSizeNot64 int64 - MaxEnclaveSize64 int64 -} - -func hasSGX(available bool) (rval SGXSupport) { - rval.Available = available - - if !available { - return - } - - a, _, _, d := cpuidex(0x12, 0) - rval.SGX1Supported = a&0x01 != 0 - rval.SGX2Supported = a&0x02 != 0 - rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 - rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 - - return -} - -func support() Flags { - mfi := maxFunctionID() - vend := vendorID() - if mfi < 0x1 { - return 0 - } - rval := uint64(0) - _, _, c, d := cpuid(1) - if (d & (1 << 15)) != 0 { - rval |= CMOV - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 25)) != 0 { - rval |= MMXEXT - } - if (d & (1 << 25)) != 0 { - rval |= SSE - } - if (d & (1 << 26)) != 0 { - rval |= SSE2 - } - if (c & 1) != 0 { - rval |= SSE3 - } - if (c & 0x00000200) != 0 { - rval |= SSSE3 - } - if (c & 0x00080000) != 0 { - rval |= SSE4 - } - if (c & 0x00100000) != 0 { - rval |= SSE42 - } - if (c & (1 << 25)) != 0 { - rval |= AESNI - } - if (c & (1 << 1)) != 0 { - rval |= CLMUL - } - if c&(1<<23) != 0 { - rval |= POPCNT - } - if c&(1<<30) != 0 { - rval |= RDRAND - } - if c&(1<<29) != 0 { - rval |= F16C - } - if c&(1<<13) != 0 { - rval |= CX16 - } - if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { - if threadsPerCore() > 1 { - rval |= HTT - } - } - - // Check XGETBV, OXSAVE and AVX bits - if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { - // Check for OS support - eax, _ := xgetbv(0) - if (eax & 0x6) == 0x6 { - rval |= AVX - if (c & 0x00001000) != 0 { - rval |= FMA3 - } - } - } - - // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. - if mfi >= 7 { - _, ebx, ecx, edx := cpuidex(7, 0) - if (rval&AVX) != 0 && (ebx&0x00000020) != 0 { - rval |= AVX2 - } - if (ebx & 0x00000008) != 0 { - rval |= BMI1 - if (ebx & 0x00000100) != 0 { - rval |= BMI2 - } - } - if ebx&(1<<2) != 0 { - rval |= SGX - } - if ebx&(1<<4) != 0 { - rval |= HLE - } - if ebx&(1<<9) != 0 { - rval |= ERMS - } - if ebx&(1<<11) != 0 { - rval |= RTM - } - if ebx&(1<<14) != 0 { - rval |= MPX - } - if ebx&(1<<18) != 0 { - rval |= RDSEED - } - if ebx&(1<<19) != 0 { - rval |= ADX - } - if ebx&(1<<29) != 0 { - rval |= SHA - } - if edx&(1<<26) != 0 { - rval |= IBPB - } - if edx&(1<<27) != 0 { - rval |= STIBP - } - - // Only detect AVX-512 features if XGETBV is supported - if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { - // Check for OS support - eax, _ := xgetbv(0) - - // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and - // ZMM16-ZMM31 state are enabled by OS) - /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). - if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { - if ebx&(1<<16) != 0 { - rval |= AVX512F - } - if ebx&(1<<17) != 0 { - rval |= AVX512DQ - } - if ebx&(1<<21) != 0 { - rval |= AVX512IFMA - } - if ebx&(1<<26) != 0 { - rval |= AVX512PF - } - if ebx&(1<<27) != 0 { - rval |= AVX512ER - } - if ebx&(1<<28) != 0 { - rval |= AVX512CD - } - if ebx&(1<<30) != 0 { - rval |= AVX512BW - } - if ebx&(1<<31) != 0 { - rval |= AVX512VL - } - // ecx - if ecx&(1<<1) != 0 { - rval |= AVX512VBMI - } - } - } - } - - if maxExtendedFunction() >= 0x80000001 { - _, _, c, d := cpuid(0x80000001) - if (c & (1 << 5)) != 0 { - rval |= LZCNT - rval |= POPCNT - } - if (d & (1 << 31)) != 0 { - rval |= AMD3DNOW - } - if (d & (1 << 30)) != 0 { - rval |= AMD3DNOWEXT - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 22)) != 0 { - rval |= MMXEXT - } - if (c & (1 << 6)) != 0 { - rval |= SSE4A - } - if d&(1<<20) != 0 { - rval |= NX - } - if d&(1<<27) != 0 { - rval |= RDTSCP - } - - /* Allow for selectively disabling SSE2 functions on AMD processors - with SSE2 support but not SSE4a. This includes Athlon64, some - Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster - than SSE2 often enough to utilize this special-case flag. - AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case - so that SSE2 is used unless explicitly disabled by checking - AV_CPU_FLAG_SSE2SLOW. */ - if vendorID() != Intel && - rval&SSE2 != 0 && (c&0x00000040) == 0 { - rval |= SSE2SLOW - } - - /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be - * used unless the OS has AVX support. */ - if (rval & AVX) != 0 { - if (c & 0x00000800) != 0 { - rval |= XOP - } - if (c & 0x00010000) != 0 { - rval |= FMA4 - } - } - - if vendorID() == Intel { - family, model := familyModel() - if family == 6 && (model == 9 || model == 13 || model == 14) { - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and - * 6/14 (core1 "yonah") theoretically support sse2, but it's - * usually slower than mmx. */ - if (rval & SSE2) != 0 { - rval |= SSE2SLOW - } - if (rval & SSE3) != 0 { - rval |= SSE3SLOW - } - } - /* The Atom processor has SSSE3 support, which is useful in many cases, - * but sometimes the SSSE3 version is slower than the SSE2 equivalent - * on the Atom, but is generally faster on other processors supporting - * SSSE3. This flag allows for selectively disabling certain SSSE3 - * functions on the Atom. */ - if family == 6 && model == 28 { - rval |= ATOM - } - } - } - return Flags(rval) -} - -func valAsString(values ...uint32) []byte { - r := make([]byte, 4*len(values)) - for i, v := range values { - dst := r[i*4:] - dst[0] = byte(v & 0xff) - dst[1] = byte((v >> 8) & 0xff) - dst[2] = byte((v >> 16) & 0xff) - dst[3] = byte((v >> 24) & 0xff) - switch { - case dst[0] == 0: - return r[:i*4] - case dst[1] == 0: - return r[:i*4+1] - case dst[2] == 0: - return r[:i*4+2] - case dst[3] == 0: - return r[:i*4+3] - } - } - return r -} diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s deleted file mode 100644 index 4d731711e..000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_386.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build 386,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORL CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+4(FP) - MOVL BX, ebx+8(FP) - MOVL CX, ecx+12(FP) - MOVL DX, edx+16(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func xgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+4(FP) - MOVL DX, edx+8(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s deleted file mode 100644 index 3c1d60e42..000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -//+build amd64,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORQ CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmXgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+8(FP) - MOVL DX, edx+12(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go deleted file mode 100644 index a5f04dd6d..000000000 --- a/vendor/github.com/klauspost/cpuid/detect_intel.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build 386,!gccgo amd64,!gccgo - -package cpuid - -func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -func asmXgetbv(index uint32) (eax, edx uint32) -func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) - -func initCPU() { - cpuid = asmCpuid - cpuidex = asmCpuidex - xgetbv = asmXgetbv - rdtscpAsm = asmRdtscpAsm -} diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go deleted file mode 100644 index 909c5d9a7..000000000 --- a/vendor/github.com/klauspost/cpuid/detect_ref.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build !amd64,!386 gccgo - -package cpuid - -func initCPU() { - cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - xgetbv = func(index uint32) (eax, edx uint32) { - return 0, 0 - } - - rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } -} diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go deleted file mode 100644 index 90e7a98d2..000000000 --- a/vendor/github.com/klauspost/cpuid/generate.go +++ /dev/null @@ -1,4 +0,0 @@ -package cpuid - -//go:generate go run private-gen.go -//go:generate gofmt -w ./private |