12 files changed, 4848 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/flate/copy.go b/vendor/github.com/klauspost/compress/flate/copy.go
new file mode 100644
index 000000000..a3200a8f4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/copy.go
@@ -0,0 +1,32 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// forwardCopy is like the built-in copy function except that it always goes
+// forward from the start, even if the dst and src overlap.
+// It is equivalent to:
+//   for i := 0; i < n; i++ {
+//     mem[dst+i] = mem[src+i]
+//   }
+func forwardCopy(mem []byte, dst, src, n int) {
+	if dst <= src {
+		copy(mem[dst:dst+n], mem[src:src+n])
+		return
+	}
+	for {
+		if dst >= src+n {
+			copy(mem[dst:dst+n], mem[src:src+n])
+			return
+		}
+		// There is some forward overlap.  The destination
+		// will be filled with a repeated pattern of mem[src:src+k].
+		// We copy one instance of the pattern here, then repeat.
+		// Each time around this loop k will double.
+		k := dst - src
+		copy(mem[dst:dst+k], mem[src:src+k])
+		n -= k
+		dst += k
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
new file mode 100644
index 000000000..8298d309a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
@@ -0,0 +1,42 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package flate
+
+import (
+	"github.com/klauspost/cpuid"
+)
+
+// crc32sse returns a hash for the first 4 bytes of the slice
+// len(a) must be >= 4.
+//go:noescape
+func crc32sse(a []byte) uint32
+
+// crc32sseAll calculates hashes for each 4-byte set in a.
+// dst must be east len(a) - 4 in size.
+// The size is not checked by the assembly.
+//go:noescape
+func crc32sseAll(a []byte, dst []uint32)
+
+// matchLenSSE4 returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+//
+// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
+//
+//go:noescape
+func matchLenSSE4(a, b []byte, max int) int
+
+// histogram accumulates a histogram of b in h.
+// h must be at least 256 entries in length,
+// and must be cleared before calling this function.
+//go:noescape
+func histogram(b []byte, h []int32)
+
+// Detect SSE 4.2 feature.
+func init() {
+	useSSE42 = cpuid.CPU.SSE42()
+}
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
new file mode 100644
index 000000000..a79943727
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
@@ -0,0 +1,214 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+// func crc32sse(a []byte) uint32
+TEXT ·crc32sse(SB), 4, $0
+	MOVQ a+0(FP), R10
+	XORQ BX, BX
+
+	// CRC32   dword (R10), EBX
+	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0x1a
+
+	MOVL BX, ret+24(FP)
+	RET
+
+// func crc32sseAll(a []byte, dst []uint32)
+TEXT ·crc32sseAll(SB), 4, $0
+	MOVQ  a+0(FP), R8      // R8: src
+	MOVQ  a_len+8(FP), R10 // input length
+	MOVQ  dst+24(FP), R9   // R9: dst
+	SUBQ  $4, R10
+	JS    end
+	JZ    one_crc
+	MOVQ  R10, R13
+	SHRQ  $2, R10          // len/4
+	ANDQ  $3, R13          // len&3
+	XORQ  BX, BX
+	ADDQ  $1, R13
+	TESTQ R10, R10
+	JZ    rem_loop
+
+crc_loop:
+	MOVQ (R8), R11
+	XORQ BX, BX
+	XORQ DX, DX
+	XORQ DI, DI
+	MOVQ R11, R12
+	SHRQ $8, R11
+	MOVQ R12, AX
+	MOVQ R11, CX
+	SHRQ $16, R12
+	SHRQ $16, R11
+	MOVQ R12, SI
+
+	// CRC32   EAX, EBX
+	BYTE $0xF2; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+
+	// CRC32   ECX, EDX
+	BYTE $0xF2; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0xd1
+
+	// CRC32   ESI, EDI
+	BYTE $0xF2; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
+	MOVL BX, (R9)
+	MOVL DX, 4(R9)
+	MOVL DI, 8(R9)
+
+	XORQ BX, BX
+	MOVL R11, AX
+
+	// CRC32   EAX, EBX
+	BYTE $0xF2; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+	MOVL BX, 12(R9)
+
+	ADDQ $16, R9
+	ADDQ $4, R8
+	XORQ BX, BX
+	SUBQ $1, R10
+	JNZ  crc_loop
+
+rem_loop:
+	MOVL (R8), AX
+
+	// CRC32   EAX, EBX
+	BYTE $0xF2; BYTE $0x0f
+	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+
+	MOVL BX, (R9)
+	ADDQ $4, R9
+	ADDQ $1, R8
+	XORQ BX, BX
+	SUBQ $1, R13
+	JNZ  rem_loop
+
+end:
+	RET
+
+one_crc:
+	MOVQ $1, R13
+	XORQ BX, BX
+	JMP  rem_loop
+
+// func matchLenSSE4(a, b []byte, max int) int
+TEXT ·matchLenSSE4(SB), 4, $0
+	MOVQ a_base+0(FP), SI
+	MOVQ b_base+24(FP), DI
+	MOVQ DI, DX
+	MOVQ max+48(FP), CX
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of max, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ CX, $8
+	JLT  cmp1
+	MOVQ (SI), AX
+	MOVQ (DI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, SI
+	ADDQ $8, DI
+	SUBQ $8, CX
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, DI
+
+	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
+	SUBQ DX, DI
+	MOVQ DI, ret+56(FP)
+	RET
+
+cmp1:
+	// In the slices' tail, compare 1 byte at a time.
+	CMPQ CX, $0
+	JEQ  matchLenEnd
+	MOVB (SI), AX
+	MOVB (DI), BX
+	CMPB AX, BX
+	JNE  matchLenEnd
+	ADDQ $1, SI
+	ADDQ $1, DI
+	SUBQ $1, CX
+	JMP  cmp1
+
+matchLenEnd:
+	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
+	SUBQ DX, DI
+	MOVQ DI, ret+56(FP)
+	RET
+
+// func histogram(b []byte, h []int32)
+TEXT ·histogram(SB), 4, $0
+	MOVQ b+0(FP), SI     // SI: &b
+	MOVQ b_len+8(FP), R9 // R9: len(b)
+	MOVQ h+24(FP), DI    // DI: Histogram
+	MOVQ R9, R8
+	SHRQ $3, R8
+	JZ   hist1
+	XORQ R11, R11
+
+loop_hist8:
+	MOVQ (SI), R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	MOVB R10, R11
+	INCL (DI)(R11*4)
+	SHRQ $8, R10
+
+	INCL (DI)(R10*4)
+
+	ADDQ $8, SI
+	DECQ R8
+	JNZ  loop_hist8
+
+hist1:
+	ANDQ $7, R9
+	JZ   end_hist
+	XORQ R10, R10
+
+loop_hist1:
+	MOVB (SI), R10
+	INCL (DI)(R10*4)
+	INCQ SI
+	DECQ R9
+	JNZ  loop_hist1
+
+end_hist:
+	RET
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
new file mode 100644
index 000000000..dcf43bd50
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
@@ -0,0 +1,35 @@
+//+build !amd64 noasm appengine gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package flate
+
+func init() {
+	useSSE42 = false
+}
+
+// crc32sse should never be called.
+func crc32sse(a []byte) uint32 {
+	panic("no assembler")
+}
+
+// crc32sseAll should never be called.
+func crc32sseAll(a []byte, dst []uint32) {
+	panic("no assembler")
+}
+
+// matchLenSSE4 should never be called.
+func matchLenSSE4(a, b []byte, max int) int {
+	panic("no assembler")
+	return 0
+}
+
+// histogram accumulates a histogram of b in h.
+//
+// len(h) must be >= 256, and h's elements must be all zeroes.
+func histogram(b []byte, h []int32) {
+	h = h[:256]
+	for _, t := range b {
+		h[t]++
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
new file mode 100644
index 000000000..9e6e7ff0c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -0,0 +1,1353 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright (c) 2015 Klaus Post
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"io"
+	"math"
+)
+
+const (
+	NoCompression      = 0
+	BestSpeed          = 1
+	BestCompression    = 9
+	DefaultCompression = -1
+
+	// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
+	// entropy encoding. This mode is useful in compressing data that has
+	// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
+	// that lacks an entropy encoder. Compression gains are achieved when
+	// certain bytes in the input stream occur more frequently than others.
+	//
+	// Note that HuffmanOnly produces a compressed output that is
+	// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
+	// continue to be able to decompress this output.
+	HuffmanOnly         = -2
+	ConstantCompression = HuffmanOnly // compatibility alias.
+
+	logWindowSize    = 15
+	windowSize       = 1 << logWindowSize
+	windowMask       = windowSize - 1
+	logMaxOffsetSize = 15  // Standard DEFLATE
+	minMatchLength   = 4   // The smallest match that the compressor looks for
+	maxMatchLength   = 258 // The longest match for the compressor
+	minOffsetSize    = 1   // The shortest offset that makes any sense
+
+	// The maximum number of tokens we put into a single flat block, just too
+	// stop things from getting too large.
+	maxFlateBlockTokens = 1 << 14
+	maxStoreBlockSize   = 65535
+	hashBits            = 17 // After 17 performance degrades
+	hashSize            = 1 << hashBits
+	hashMask            = (1 << hashBits) - 1
+	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
+	maxHashOffset       = 1 << 24
+
+	skipNever = math.MaxInt32
+)
+
+var useSSE42 bool
+
+type compressionLevel struct {
+	good, lazy, nice, chain, fastSkipHashing, level int
+}
+
+// Compression levels have been rebalanced from zlib deflate defaults
+// to give a bigger spread in speed and compression.
+// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
+var levels = []compressionLevel{
+	{}, // 0
+	// Level 1-4 uses specialized algorithm - values not used
+	{0, 0, 0, 0, 0, 1},
+	{0, 0, 0, 0, 0, 2},
+	{0, 0, 0, 0, 0, 3},
+	{0, 0, 0, 0, 0, 4},
+	// For levels 5-6 we don't bother trying with lazy matches.
+	// Lazy matching is at least 30% slower, with 1.5% increase.
+	{6, 0, 12, 8, 12, 5},
+	{8, 0, 24, 16, 16, 6},
+	// Levels 7-9 use increasingly more lazy matching
+	// and increasingly stringent conditions for "good enough".
+	{8, 8, 24, 16, skipNever, 7},
+	{10, 16, 24, 64, skipNever, 8},
+	{32, 258, 258, 4096, skipNever, 9},
+}
+
+type compressor struct {
+	compressionLevel
+
+	w          *huffmanBitWriter
+	bulkHasher func([]byte, []uint32)
+
+	// compression algorithm
+	fill func(*compressor, []byte) int // copy data to window
+	step func(*compressor)             // process window
+	sync bool                          // requesting flush
+
+	// Input hash chains
+	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
+	// If hashHead[hashValue] is within the current window, then
+	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
+	// with the same hash value.
+	chainHead  int
+	hashHead   [hashSize]uint32
+	hashPrev   [windowSize]uint32
+	hashOffset int
+
+	// input window: unprocessed data is window[index:windowEnd]
+	index         int
+	window        []byte
+	windowEnd     int
+	blockStart    int  // window index where current tokens start
+	byteAvailable bool // if true, still need to process window[index-1].
+
+	// queued output tokens
+	tokens tokens
+
+	// deflate state
+	length         int
+	offset         int
+	hash           uint32
+	maxInsertIndex int
+	err            error
+	ii             uint16 // position of last match, intended to overflow to reset.
+
+	snap      snappyEnc
+	hashMatch [maxMatchLength + minMatchLength]uint32
+}
+
+func (d *compressor) fillDeflate(b []byte) int {
+	if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
+		// shift the window by windowSize
+		copy(d.window[:], d.window[windowSize:2*windowSize])
+		d.index -= windowSize
+		d.windowEnd -= windowSize
+		if d.blockStart >= windowSize {
+			d.blockStart -= windowSize
+		} else {
+			d.blockStart = math.MaxInt32
+		}
+		d.hashOffset += windowSize
+		if d.hashOffset > maxHashOffset {
+			delta := d.hashOffset - 1
+			d.hashOffset -= delta
+			d.chainHead -= delta
+			// Iterate over slices instead of arrays to avoid copying
+			// the entire table onto the stack (Issue #18625).
+			for i, v := range d.hashPrev[:] {
+				if int(v) > delta {
+					d.hashPrev[i] = uint32(int(v) - delta)
+				} else {
+					d.hashPrev[i] = 0
+				}
+			}
+			for i, v := range d.hashHead[:] {
+				if int(v) > delta {
+					d.hashHead[i] = uint32(int(v) - delta)
+				} else {
+					d.hashHead[i] = 0
+				}
+			}
+		}
+	}
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+func (d *compressor) writeBlock(tok tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		var window []byte
+		if d.blockStart <= index {
+			window = d.window[d.blockStart:index]
+		}
+		d.blockStart = index
+		d.w.writeBlock(tok.tokens[:tok.n], eof, window)
+		return d.w.err
+	}
+	return nil
+}
+
+// writeBlockSkip writes the current block and uses the number of tokens
+// to determine if the block should be stored on no matches, or
+// only huffman encoded.
+func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		if d.blockStart <= index {
+			window := d.window[d.blockStart:index]
+			// If we removed less than a 64th of all literals
+			// we huffman compress the block.
+			if int(tok.n) > len(window)-int(tok.n>>6) {
+				d.w.writeBlockHuff(eof, window)
+			} else {
+				// Write a dynamic huffman block.
+				d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window)
+			}
+		} else {
+			d.w.writeBlock(tok.tokens[:tok.n], eof, nil)
+		}
+		d.blockStart = index
+		return d.w.err
+	}
+	return nil
+}
+
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a start/reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only mode,
+	// use constant or Snappy compression.
+	switch d.compressionLevel.level {
+	case 0, 1, 2:
+		return
+	}
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window[d.windowEnd:], b)
+
+	// Calculate 256 hashes at the time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		startindex := j * 256
+		end := startindex + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		tocheck := d.window[startindex:end]
+		dstSize := len(tocheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := d.hashMatch[:dstSize]
+		d.bulkHasher(tocheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + startindex
+			newH = val & hashMask
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			d.hashPrev[di&windowMask] = d.hashHead[newH]
+			// Set the head of the hash chain to us.
+			d.hashHead[newH] = uint32(di + d.hashOffset)
+		}
+		d.hash = newH
+	}
+	// Update window information.
+	d.windowEnd += n
+	d.index = n
+}
+
+// Try to find a match starting at index whose length is greater than prevSize.
+// We only look at chainCount possibilities before giving up.
+// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
+func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+	minMatchLook := maxMatchLength
+	if lookahead < minMatchLook {
+		minMatchLook = lookahead
+	}
+
+	win := d.window[0 : pos+minMatchLook]
+
+	// We quit when we get a match that's at least nice long
+	nice := len(win) - pos
+	if d.nice < nice {
+		nice = d.nice
+	}
+
+	// If we've got a match that's good enough, only look in 1/4 the chain.
+	tries := d.chain
+	length = prevLength
+	if length >= d.good {
+		tries >>= 2
+	}
+
+	wEnd := win[pos+length]
+	wPos := win[pos:]
+	minIndex := pos - windowSize
+
+	for i := prevHead; tries > 0; tries-- {
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:], wPos, minMatchLook)
+
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
+				length = n
+				offset = pos - i
+				ok = true
+				if n >= nice {
+					// The match is good enough that we don't try to find a better one.
+					break
+				}
+				wEnd = win[pos+n]
+			}
+		}
+		if i == minIndex {
+			// hashPrev[i & windowMask] has already been overwritten, so stop now.
+			break
+		}
+		i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+		if i < minIndex || i < 0 {
+			break
+		}
+	}
+	return
+}
+
+// Try to find a match starting at index whose length is greater than prevSize.
+// We only look at chainCount possibilities before giving up.
+// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
+func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+	minMatchLook := maxMatchLength
+	if lookahead < minMatchLook {
+		minMatchLook = lookahead
+	}
+
+	win := d.window[0 : pos+minMatchLook]
+
+	// We quit when we get a match that's at least nice long
+	nice := len(win) - pos
+	if d.nice < nice {
+		nice = d.nice
+	}
+
+	// If we've got a match that's good enough, only look in 1/4 the chain.
+	tries := d.chain
+	length = prevLength
+	if length >= d.good {
+		tries >>= 2
+	}
+
+	wEnd := win[pos+length]
+	wPos := win[pos:]
+	minIndex := pos - windowSize
+
+	for i := prevHead; tries > 0; tries-- {
+		if wEnd == win[i+length] {
+			n := matchLenSSE4(win[i:], wPos, minMatchLook)
+
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
+				length = n
+				offset = pos - i
+				ok = true
+				if n >= nice {
+					// The match is good enough that we don't try to find a better one.
+					break
+				}
+				wEnd = win[pos+n]
+			}
+		}
+		if i == minIndex {
+			// hashPrev[i & windowMask] has already been overwritten, so stop now.
+			break
+		}
+		i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+		if i < minIndex || i < 0 {
+			break
+		}
+	}
+	return
+}
+
+func (d *compressor) writeStoredBlock(buf []byte) error {
+	if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.writeBytes(buf)
+	return d.w.err
+}
+
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < minMatchLength {
+		return
+	}
+	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	dst[0] = (hb * hashmul) >> (32 - hashBits)
+	end := len(b) - minMatchLength + 1
+	for i := 1; i < end; i++ {
+		hb = (hb << 8) | uint32(b[i+3])
+		dst[i] = (hb * hashmul) >> (32 - hashBits)
+	}
+}
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+	a = a[:max]
+	b = b[:len(a)]
+	for i, av := range a {
+		if b[i] != av {
+			return i
+		}
+	}
+	return max
+}
+
+func (d *compressor) initDeflate() {
+	d.window = make([]byte, 2*windowSize)
+	d.hashOffset = 1
+	d.length = minMatchLength - 1
+	d.offset = 0
+	d.byteAvailable = false
+	d.index = 0
+	d.hash = 0
+	d.chainHead = -1
+	d.bulkHasher = bulkHash4
+	if useSSE42 {
+		d.bulkHasher = crc32sseAll
+	}
+}
+
+// Assumes that d.fastSkipHashing != skipNever,
+// otherwise use deflateLazy
+func (d *compressor) deflate() {
+
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+
+	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+	if d.index < d.maxInsertIndex {
+		d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+	}
+
+	for {
+		if sanity && d.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - d.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && d.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				return
+			}
+		}
+		if d.index < d.maxInsertIndex {
+			// Update the hash
+			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+			ch := d.hashHead[d.hash&hashMask]
+			d.chainHead = int(ch)
+			d.hashPrev[d.index&windowMask] = ch
+			d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
+		}
+		d.length = minMatchLength - 1
+		d.offset = 0
+		minIndex := d.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
+			if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+				d.length = newLength
+				d.offset = newOffset
+			}
+		}
+		if d.length >= minMatchLength {
+			d.ii = 0
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			// "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3
+			d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
+			d.tokens.n++
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			if d.length <= d.fastSkipHashing {
+				var newIndex int
+				newIndex = d.index + d.length
+				// Calculate missing hashes
+				end := newIndex
+				if end > d.maxInsertIndex {
+					end = d.maxInsertIndex
+				}
+				end += minMatchLength - 1
+				startindex := d.index + 1
+				if startindex > d.maxInsertIndex {
+					startindex = d.maxInsertIndex
+				}
+				tocheck := d.window[startindex:end]
+				dstSize := len(tocheck) - minMatchLength + 1
+				if dstSize > 0 {
+					dst := d.hashMatch[:dstSize]
+					bulkHash4(tocheck, dst)
+					var newH uint32
+					for i, val := range dst {
+						di := i + startindex
+						newH = val & hashMask
+						// Get previous value with the same hash.
+						// Our chain should point to the previous value.
+						d.hashPrev[di&windowMask] = d.hashHead[newH]
+						// Set the head of the hash chain to us.
+						d.hashHead[newH] = uint32(di + d.hashOffset)
+					}
+					d.hash = newH
+				}
+				d.index = newIndex
+			} else {
+				// For matches this long, we don't bother inserting each individual
+				// item into the table.
+				d.index += d.length
+				if d.index < d.maxInsertIndex {
+					d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+				}
+			}
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+					return
+				}
+				d.tokens.n = 0
+			}
+		} else {
+			d.ii++
+			end := d.index + int(d.ii>>uint(d.fastSkipHashing)) + 1
+			if end > d.windowEnd {
+				end = d.windowEnd
+			}
+			for i := d.index; i < end; i++ {
+				d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
+				d.tokens.n++
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+			}
+			d.index = end
+		}
+	}
+}
+
+// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazy() {
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+
+	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+	if d.index < d.maxInsertIndex {
+		d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+	}
+
+	for {
+		if sanity && d.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - d.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && d.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				// Flush current output block if any.
+				if d.byteAvailable {
+					// There is still one pending token that needs to be flushed
+					d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+					d.tokens.n++
+					d.byteAvailable = false
+				}
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				return
+			}
+		}
+		if d.index < d.maxInsertIndex {
+			// Update the hash
+			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+			ch := d.hashHead[d.hash&hashMask]
+			d.chainHead = int(ch)
+			d.hashPrev[d.index&windowMask] = ch
+			d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
+		}
+		prevLength := d.length
+		prevOffset := d.offset
+		d.length = minMatchLength - 1
+		d.offset = 0
+		minIndex := d.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+			if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+				d.length = newLength
+				d.offset = newOffset
+			}
+		}
+		if prevLength >= minMatchLength && d.length <= prevLength {
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+			d.tokens.n++
+
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			var newIndex int
+			newIndex = d.index + prevLength - 1
+			// Calculate missing hashes
+			end := newIndex
+			if end > d.maxInsertIndex {
+				end = d.maxInsertIndex
+			}
+			end += minMatchLength - 1
+			startindex := d.index + 1
+			if startindex > d.maxInsertIndex {
+				startindex = d.maxInsertIndex
+			}
+			tocheck := d.window[startindex:end]
+			dstSize := len(tocheck) - minMatchLength + 1
+			if dstSize > 0 {
+				dst := d.hashMatch[:dstSize]
+				bulkHash4(tocheck, dst)
+				var newH uint32
+				for i, val := range dst {
+					di := i + startindex
+					newH = val & hashMask
+					// Get previous value with the same hash.
+					// Our chain should point to the previous value.
+					d.hashPrev[di&windowMask] = d.hashHead[newH]
+					// Set the head of the hash chain to us.
+					d.hashHead[newH] = uint32(di + d.hashOffset)
+				}
+				d.hash = newH
+			}
+
+			d.index = newIndex
+			d.byteAvailable = false
+			d.length = minMatchLength - 1
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+					return
+				}
+				d.tokens.n = 0
+			}
+		} else {
+			// Reset, if we got a match this run.
+			if d.length >= minMatchLength {
+				d.ii = 0
+			}
+			// We have a byte waiting. Emit it.
+			if d.byteAvailable {
+				d.ii++
+				d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+				d.tokens.n++
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				d.index++
+
+				// If we have a long run of no matches, skip additional bytes
+				// Resets when d.ii overflows after 64KB.
+				if d.ii > 31 {
+					n := int(d.ii >> 5)
+					for j := 0; j < n; j++ {
+						if d.index >= d.windowEnd-1 {
+							break
+						}
+
+						d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+						d.tokens.n++
+						if d.tokens.n == maxFlateBlockTokens {
+							if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+								return
+							}
+							d.tokens.n = 0
+						}
+						d.index++
+					}
+					// Flush last byte
+					d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+					d.tokens.n++
+					d.byteAvailable = false
+					// d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
+					if d.tokens.n == maxFlateBlockTokens {
+						if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+							return
+						}
+						d.tokens.n = 0
+					}
+				}
+			} else {
+				d.index++
+				d.byteAvailable = true
+			}
+		}
+	}
+}
+
+// Assumes that d.fastSkipHashing != skipNever,
+// otherwise use deflateLazySSE
+func (d *compressor) deflateSSE() {
+
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+
+	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+	if d.index < d.maxInsertIndex {
+		d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+	}
+
+	for {
+		if sanity && d.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - d.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && d.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				return
+			}
+		}
+		if d.index < d.maxInsertIndex {
+			// Update the hash
+			d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+			ch := d.hashHead[d.hash]
+			d.chainHead = int(ch)
+			d.hashPrev[d.index&windowMask] = ch
+			d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+		}
+		d.length = minMatchLength - 1
+		d.offset = 0
+		minIndex := d.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
+			if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+				d.length = newLength
+				d.offset = newOffset
+			}
+		}
+		if d.length >= minMatchLength {
+			d.ii = 0
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			// "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3
+			d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
+			d.tokens.n++
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			if d.length <= d.fastSkipHashing {
+				var newIndex int
+				newIndex = d.index + d.length
+				// Calculate missing hashes
+				end := newIndex
+				if end > d.maxInsertIndex {
+					end = d.maxInsertIndex
+				}
+				end += minMatchLength - 1
+				startindex := d.index + 1
+				if startindex > d.maxInsertIndex {
+					startindex = d.maxInsertIndex
+				}
+				tocheck := d.window[startindex:end]
+				dstSize := len(tocheck) - minMatchLength + 1
+				if dstSize > 0 {
+					dst := d.hashMatch[:dstSize]
+
+					crc32sseAll(tocheck, dst)
+					var newH uint32
+					for i, val := range dst {
+						di := i + startindex
+						newH = val & hashMask
+						// Get previous value with the same hash.
+						// Our chain should point to the previous value.
+						d.hashPrev[di&windowMask] = d.hashHead[newH]
+						// Set the head of the hash chain to us.
+						d.hashHead[newH] = uint32(di + d.hashOffset)
+					}
+					d.hash = newH
+				}
+				d.index = newIndex
+			} else {
+				// For matches this long, we don't bother inserting each individual
+				// item into the table.
+				d.index += d.length
+				if d.index < d.maxInsertIndex {
+					d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+				}
+			}
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+					return
+				}
+				d.tokens.n = 0
+			}
+		} else {
+			d.ii++
+			end := d.index + int(d.ii>>5) + 1
+			if end > d.windowEnd {
+				end = d.windowEnd
+			}
+			for i := d.index; i < end; i++ {
+				d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
+				d.tokens.n++
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+			}
+			d.index = end
+		}
+	}
+}
+
+// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazySSE() {
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+
+	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+	if d.index < d.maxInsertIndex {
+		d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+	}
+
+	for {
+		if sanity && d.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - d.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && d.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				// Flush current output block if any.
+				if d.byteAvailable {
+					// There is still one pending token that needs to be flushed
+					d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+					d.tokens.n++
+					d.byteAvailable = false
+				}
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				return
+			}
+		}
+		if d.index < d.maxInsertIndex {
+			// Update the hash
+			d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+			ch := d.hashHead[d.hash]
+			d.chainHead = int(ch)
+			d.hashPrev[d.index&windowMask] = ch
+			d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+		}
+		prevLength := d.length
+		prevOffset := d.offset
+		d.length = minMatchLength - 1
+		d.offset = 0
+		minIndex := d.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+			if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+				d.length = newLength
+				d.offset = newOffset
+			}
+		}
+		if prevLength >= minMatchLength && d.length <= prevLength {
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+			d.tokens.n++
+
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			var newIndex int
+			newIndex = d.index + prevLength - 1
+			// Calculate missing hashes
+			end := newIndex
+			if end > d.maxInsertIndex {
+				end = d.maxInsertIndex
+			}
+			end += minMatchLength - 1
+			startindex := d.index + 1
+			if startindex > d.maxInsertIndex {
+				startindex = d.maxInsertIndex
+			}
+			tocheck := d.window[startindex:end]
+			dstSize := len(tocheck) - minMatchLength + 1
+			if dstSize > 0 {
+				dst := d.hashMatch[:dstSize]
+				crc32sseAll(tocheck, dst)
+				var newH uint32
+				for i, val := range dst {
+					di := i + startindex
+					newH = val & hashMask
+					// Get previous value with the same hash.
+					// Our chain should point to the previous value.
+					d.hashPrev[di&windowMask] = d.hashHead[newH]
+					// Set the head of the hash chain to us.
+					d.hashHead[newH] = uint32(di + d.hashOffset)
+				}
+				d.hash = newH
+			}
+
+			d.index = newIndex
+			d.byteAvailable = false
+			d.length = minMatchLength - 1
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+					return
+				}
+				d.tokens.n = 0
+			}
+		} else {
+			// Reset, if we got a match this run.
+			if d.length >= minMatchLength {
+				d.ii = 0
+			}
+			// We have a byte waiting. Emit it.
+			if d.byteAvailable {
+				d.ii++
+				d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+				d.tokens.n++
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+						return
+					}
+					d.tokens.n = 0
+				}
+				d.index++
+
+				// If we have a long run of no matches, skip additional bytes
+				// Resets when d.ii overflows after 64KB.
+				if d.ii > 31 {
+					n := int(d.ii >> 6)
+					for j := 0; j < n; j++ {
+						if d.index >= d.windowEnd-1 {
+							break
+						}
+
+						d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+						d.tokens.n++
+						if d.tokens.n == maxFlateBlockTokens {
+							if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+								return
+							}
+							d.tokens.n = 0
+						}
+						d.index++
+					}
+					// Flush last byte
+					d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+					d.tokens.n++
+					d.byteAvailable = false
+					// d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
+					if d.tokens.n == maxFlateBlockTokens {
+						if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+							return
+						}
+						d.tokens.n = 0
+					}
+				}
+			} else {
+				d.index++
+				d.byteAvailable = true
+			}
+		}
+	}
+}
+
+func (d *compressor) store() {
+	if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		d.windowEnd = 0
+	}
+}
+
+// fillWindow will fill the buffer with data for huffman-only compression.
+// The number of bytes copied is returned.
+func (d *compressor) fillBlock(b []byte) int {
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+// storeHuff will compress and store the currently added data,
+// if enough has been accumulated or we at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) storeHuff() {
+	if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
+		return
+	}
+	d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
+// storeHuff will compress and store the currently added data,
+// if enough has been accumulated or we at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) storeSnappy() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < maxStoreBlockSize {
+		if !d.sync {
+			return
+		}
+		// Handle extremely small sizes.
+		if d.windowEnd < 128 {
+			if d.windowEnd == 0 {
+				return
+			}
+			if d.windowEnd <= 32 {
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+				d.tokens.n = 0
+				d.windowEnd = 0
+			} else {
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+				d.err = d.w.err
+			}
+			d.tokens.n = 0
+			d.windowEnd = 0
+			d.snap.Reset()
+			return
+		}
+	}
+
+	d.snap.Encode(&d.tokens, d.window[:d.windowEnd])
+	// If we made zero matches, store the block as is.
+	if int(d.tokens.n) == d.windowEnd {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		// If we removed less than 1/16th, huffman compress the block.
+	} else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+		d.err = d.w.err
+	} else {
+		d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd])
+		d.err = d.w.err
+	}
+	d.tokens.n = 0
+	d.windowEnd = 0
+}
+
+// write will add input byte to the stream.
+// Unless an error occurs all bytes will be consumed.
+func (d *compressor) write(b []byte) (n int, err error) {
+	if d.err != nil {
+		return 0, d.err
+	}
+	n = len(b)
+	for len(b) > 0 {
+		d.step(d)
+		b = b[d.fill(d, b):]
+		if d.err != nil {
+			return 0, d.err
+		}
+	}
+	return n, d.err
+}
+
+func (d *compressor) syncFlush() error {
+	d.sync = true
+	if d.err != nil {
+		return d.err
+	}
+	d.step(d)
+	if d.err == nil {
+		d.w.writeStoredHeader(0, false)
+		d.w.flush()
+		d.err = d.w.err
+	}
+	d.sync = false
+	return d.err
+}
+
+func (d *compressor) init(w io.Writer, level int) (err error) {
+	d.w = newHuffmanBitWriter(w)
+
+	switch {
+	case level == NoCompression:
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).store
+	case level == ConstantCompression:
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeHuff
+	case level >= 1 && level <= 4:
+		d.snap = newSnappy(level)
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeSnappy
+	case level == DefaultCompression:
+		level = 5
+		fallthrough
+	case 5 <= level && level <= 9:
+		d.compressionLevel = levels[level]
+		d.initDeflate()
+		d.fill = (*compressor).fillDeflate
+		if d.fastSkipHashing == skipNever {
+			if useSSE42 {
+				d.step = (*compressor).deflateLazySSE
+			} else {
+				d.step = (*compressor).deflateLazy
+			}
+		} else {
+			if useSSE42 {
+				d.step = (*compressor).deflateSSE
+			} else {
+				d.step = (*compressor).deflate
+
+			}
+		}
+	default:
+		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
+	}
+	return nil
+}
+
+// reset the state of the compressor.
+func (d *compressor) reset(w io.Writer) {
+	d.w.reset(w)
+	d.sync = false
+	d.err = nil
+	// We only need to reset a few things for Snappy.
+	if d.snap != nil {
+		d.snap.Reset()
+		d.windowEnd = 0
+		d.tokens.n = 0
+		return
+	}
+	switch d.compressionLevel.chain {
+	case 0:
+		// level was NoCompression or ConstantCompresssion.
+		d.windowEnd = 0
+	default:
+		d.chainHead = -1
+		for i := range d.hashHead {
+			d.hashHead[i] = 0
+		}
+		for i := range d.hashPrev {
+			d.hashPrev[i] = 0
+		}
+		d.hashOffset = 1
+		d.index, d.windowEnd = 0, 0
+		d.blockStart, d.byteAvailable = 0, false
+		d.tokens.n = 0
+		d.length = minMatchLength - 1
+		d.offset = 0
+		d.hash = 0
+		d.ii = 0
+		d.maxInsertIndex = 0
+	}
+}
+
+func (d *compressor) close() error {
+	if d.err != nil {
+		return d.err
+	}
+	d.sync = true
+	d.step(d)
+	if d.err != nil {
+		return d.err
+	}
+	if d.w.writeStoredHeader(0, true); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.flush()
+	return d.w.err
+}
+
+// NewWriter returns a new Writer compressing data at the given level.
+// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
+// higher levels typically run slower but compress more.
+// Level 0 (NoCompression) does not attempt any compression; it only adds the
+// necessary DEFLATE framing.
+// Level -1 (DefaultCompression) uses the default compression level.
+// Level -2 (ConstantCompression) will use Huffman compression only, giving
+// a very fast compression for all types of input, but sacrificing considerable
+// compression efficiency.
+//
+// If level is in the range [-2, 9] then the error returned will be nil.
+// Otherwise the error returned will be non-nil.
+func NewWriter(w io.Writer, level int) (*Writer, error) {
+	var dw Writer
+	if err := dw.d.init(w, level); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
+
+// NewWriterDict is like NewWriter but initializes the new
+// Writer with a preset dictionary.  The returned Writer behaves
+// as if the dictionary had been written to it without producing
+// any compressed output.  The compressed data written to w
+// can only be decompressed by a Reader initialized with the
+// same dictionary.
+func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
+	dw := &dictWriter{w}
+	zw, err := NewWriter(dw, level)
+	if err != nil {
+		return nil, err
+	}
+	zw.d.fillWindow(dict)
+	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
+	return zw, err
+}
+
+type dictWriter struct {
+	w io.Writer
+}
+
+func (w *dictWriter) Write(b []byte) (n int, err error) {
+	return w.w.Write(b)
+}
+
+// A Writer takes data written to it and writes the compressed
+// form of that data to an underlying writer (see NewWriter).
+type Writer struct {
+	d    compressor
+	dict []byte
+}
+
+// Write writes data to w, which will eventually write the
+// compressed form of data to its underlying writer.
+func (w *Writer) Write(data []byte) (n int, err error) {
+	return w.d.write(data)
+}
+
+// Flush flushes any pending data to the underlying writer.
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet.
+// Flush does not return until the data has been written.
+// Calling Flush when there is no pending data still causes the Writer
+// to emit a sync marker of at least 4 bytes.
+// If the underlying writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (w *Writer) Flush() error {
+	// For more about flushing:
+	// http://www.bolet.org/~pornin/deflate-flush.html
+	return w.d.syncFlush()
+}
+
+// Close flushes and closes the writer.
+func (w *Writer) Close() error {
+	return w.d.close()
+}
+
+// Reset discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level and dictionary.
+func (w *Writer) Reset(dst io.Writer) {
+	if dw, ok := w.d.w.writer.(*dictWriter); ok {
+		// w was created with NewWriterDict
+		dw.w = dst
+		w.d.reset(dw)
+		w.d.fillWindow(w.dict)
+	} else {
+		// w was created with NewWriter
+		w.d.reset(dst)
+	}
+}
+
+// ResetDict discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level, but sets a specific dictionary.
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) {
+	w.dict = dict
+	w.d.reset(dst)
+	w.d.fillWindow(w.dict)
+}
diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
new file mode 100644
index 000000000..71c75a065
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
@@ -0,0 +1,184 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// dictDecoder implements the LZ77 sliding dictionary as used in decompression.
+// LZ77 decompresses data through sequences of two forms of commands:
+//
+//	* Literal insertions: Runs of one or more symbols are inserted into the data
+//	stream as is. This is accomplished through the writeByte method for a
+//	single symbol, or combinations of writeSlice/writeMark for multiple symbols.
+//	Any valid stream must start with a literal insertion if no preset dictionary
+//	is used.
+//
+//	* Backward copies: Runs of one or more symbols are copied from previously
+//	emitted data. Backward copies come as the tuple (dist, length) where dist
+//	determines how far back in the stream to copy from and length determines how
+//	many bytes to copy. Note that it is valid for the length to be greater than
+//	the distance. Since LZ77 uses forward copies, that situation is used to
+//	perform a form of run-length encoding on repeated runs of symbols.
+//	The writeCopy and tryWriteCopy are used to implement this command.
+//
+// For performance reasons, this implementation performs little to no sanity
+// checks about the arguments. As such, the invariants documented for each
+// method call must be respected.
+type dictDecoder struct {
+	hist []byte // Sliding window history
+
+	// Invariant: 0 <= rdPos <= wrPos <= len(hist)
+	wrPos int  // Current output position in buffer
+	rdPos int  // Have emitted hist[:rdPos] already
+	full  bool // Has a full window length been written yet?
+}
+
+// init initializes dictDecoder to have a sliding window dictionary of the given
+// size. If a preset dict is provided, it will initialize the dictionary with
+// the contents of dict.
+func (dd *dictDecoder) init(size int, dict []byte) {
+	*dd = dictDecoder{hist: dd.hist}
+
+	if cap(dd.hist) < size {
+		dd.hist = make([]byte, size)
+	}
+	dd.hist = dd.hist[:size]
+
+	if len(dict) > len(dd.hist) {
+		dict = dict[len(dict)-len(dd.hist):]
+	}
+	dd.wrPos = copy(dd.hist, dict)
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos = 0
+		dd.full = true
+	}
+	dd.rdPos = dd.wrPos
+}
+
+// histSize reports the total amount of historical data in the dictionary.
+func (dd *dictDecoder) histSize() int {
+	if dd.full {
+		return len(dd.hist)
+	}
+	return dd.wrPos
+}
+
+// availRead reports the number of bytes that can be flushed by readFlush.
+func (dd *dictDecoder) availRead() int {
+	return dd.wrPos - dd.rdPos
+}
+
+// availWrite reports the available amount of output buffer space.
+func (dd *dictDecoder) availWrite() int {
+	return len(dd.hist) - dd.wrPos
+}
+
+// writeSlice returns a slice of the available buffer to write data to.
+//
+// This invariant will be kept: len(s) <= availWrite()
+func (dd *dictDecoder) writeSlice() []byte {
+	return dd.hist[dd.wrPos:]
+}
+
+// writeMark advances the writer pointer by cnt.
+//
+// This invariant must be kept: 0 <= cnt <= availWrite()
+func (dd *dictDecoder) writeMark(cnt int) {
+	dd.wrPos += cnt
+}
+
+// writeByte writes a single byte to the dictionary.
+//
+// This invariant must be kept: 0 < availWrite()
+func (dd *dictDecoder) writeByte(c byte) {
+	dd.hist[dd.wrPos] = c
+	dd.wrPos++
+}
+
+// writeCopy copies a string at a given (dist, length) to the output.
+// This returns the number of bytes copied and may be less than the requested
+// length if the available space in the output buffer is too small.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) writeCopy(dist, length int) int {
+	dstBase := dd.wrPos
+	dstPos := dstBase
+	srcPos := dstPos - dist
+	endPos := dstPos + length
+	if endPos > len(dd.hist) {
+		endPos = len(dd.hist)
+	}
+
+	// Copy non-overlapping section after destination position.
+	//
+	// This section is non-overlapping in that the copy length for this section
+	// is always less than or equal to the backwards distance. This can occur
+	// if a distance refers to data that wraps-around in the buffer.
+	// Thus, a backwards copy is performed here; that is, the exact bytes in
+	// the source prior to the copy is placed in the destination.
+	if srcPos < 0 {
+		srcPos += len(dd.hist)
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:])
+		srcPos = 0
+	}
+
+	// Copy possibly overlapping section before destination position.
+	//
+	// This section can overlap if the copy length for this section is larger
+	// than the backwards distance. This is allowed by LZ77 so that repeated
+	// strings can be succinctly represented using (dist, length) pairs.
+	// Thus, a forwards copy is performed here; that is, the bytes copied is
+	// possibly dependent on the resulting bytes in the destination as the copy
+	// progresses along. This is functionally equivalent to the following:
+	//
+	//	for i := 0; i < endPos-dstPos; i++ {
+	//		dd.hist[dstPos+i] = dd.hist[srcPos+i]
+	//	}
+	//	dstPos = endPos
+	//
+	for dstPos < endPos {
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
+
+// tryWriteCopy tries to copy a string at a given (distance, length) to the
+// output. This specialized version is optimized for short distances.
+//
+// This method is designed to be inlined for performance reasons.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) tryWriteCopy(dist, length int) int {
+	dstPos := dd.wrPos
+	endPos := dstPos + length
+	if dstPos < dist || endPos > len(dd.hist) {
+		return 0
+	}
+	dstBase := dstPos
+	srcPos := dstPos - dist
+
+	// Copy possibly overlapping section before destination position.
+loop:
+	dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	if dstPos < endPos {
+		goto loop // Avoid for-loop so that this function can be inlined
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
+
+// readFlush returns a slice of the historical buffer that is ready to be
+// emitted to the user. The data returned by readFlush must be fully consumed
+// before calling any other dictDecoder methods.
+func (dd *dictDecoder) readFlush() []byte {
+	toRead := dd.hist[dd.rdPos:dd.wrPos]
+	dd.rdPos = dd.wrPos
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos, dd.rdPos = 0, 0
+		dd.full = true
+	}
+	return toRead
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
new file mode 100644
index 000000000..f9b2a699a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -0,0 +1,701 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"io"
+)
+
+const (
+	// The largest offset code.
+	offsetCodeCount = 30
+
+	// The special code used to mark the end of a block.
+	endBlockMarker = 256
+
+	// The first length code.
+	lengthCodesStart = 257
+
+	// The number of codegen codes.
+	codegenCodeCount = 19
+	badCode          = 255
+
+	// bufferFlushSize indicates the buffer size
+	// after which bytes are flushed to the writer.
+	// Should preferably be a multiple of 6, since
+	// we accumulate 6 bytes between writes to the buffer.
+	bufferFlushSize = 240
+
+	// bufferSize is the actual output byte buffer size.
+	// It must have additional headroom for a flush
+	// which can contain up to 8 bytes.
+	bufferSize = bufferFlushSize + 8
+)
+
+// The number of extra bits needed by length code X - LENGTH_CODES_START.
+var lengthExtraBits = []int8{
+	/* 257 */ 0, 0, 0,
+	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
+	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
+	/* 280 */ 4, 5, 5, 5, 5, 0,
+}
+
+// The length indicated by length code X - LENGTH_CODES_START.
+var lengthBase = []uint32{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
+	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+	64, 80, 96, 112, 128, 160, 192, 224, 255,
+}
+
+// offset code word extra bits.
+var offsetExtraBits = []int8{
+	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
+	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+	/* extended window */
+	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+}
+
+var offsetBase = []uint32{
+	/* normal deflate */
+	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+	/* extended window */
+	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
+	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
+	0x100000, 0x180000, 0x200000, 0x300000,
+}
+
+// The odd order in which the codegen code sizes are written.
+var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+type huffmanBitWriter struct {
+	// writer is the underlying writer.
+	// Do not use it directly; use the write method, which ensures
+	// that Write errors are sticky.
+	writer io.Writer
+
+	// Data waiting to be written is bytes[0:nbytes]
+	// and then the low nbits of bits.
+	bits            uint64
+	nbits           uint
+	bytes           [bufferSize]byte
+	codegenFreq     [codegenCodeCount]int32
+	nbytes          int
+	literalFreq     []int32
+	offsetFreq      []int32
+	codegen         []uint8
+	literalEncoding *huffmanEncoder
+	offsetEncoding  *huffmanEncoder
+	codegenEncoding *huffmanEncoder
+	err             error
+}
+
+func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
+	return &huffmanBitWriter{
+		writer:          w,
+		literalFreq:     make([]int32, maxNumLit),
+		offsetFreq:      make([]int32, offsetCodeCount),
+		codegen:         make([]uint8, maxNumLit+offsetCodeCount+1),
+		literalEncoding: newHuffmanEncoder(maxNumLit),
+		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
+		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
+	}
+}
+
+func (w *huffmanBitWriter) reset(writer io.Writer) {
+	w.writer = writer
+	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
+	w.bytes = [bufferSize]byte{}
+}
+
+func (w *huffmanBitWriter) flush() {
+	if w.err != nil {
+		w.nbits = 0
+		return
+	}
+	n := w.nbytes
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		if w.nbits > 8 { // Avoid underflow
+			w.nbits -= 8
+		} else {
+			w.nbits = 0
+		}
+		n++
+	}
+	w.bits = 0
+	w.write(w.bytes[:n])
+	w.nbytes = 0
+}
+
+func (w *huffmanBitWriter) write(b []byte) {
+	if w.err != nil {
+		return
+	}
+	_, w.err = w.writer.Write(b)
+}
+
+func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
+	if w.err != nil {
+		return
+	}
+	w.bits |= uint64(b) << w.nbits
+	w.nbits += nb
+	if w.nbits >= 48 {
+		bits := w.bits
+		w.bits >>= 48
+		w.nbits -= 48
+		n := w.nbytes
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
+		n += 6
+		if n >= bufferFlushSize {
+			w.write(w.bytes[:n])
+			n = 0
+		}
+		w.nbytes = n
+	}
+}
+
+func (w *huffmanBitWriter) writeBytes(bytes []byte) {
+	if w.err != nil {
+		return
+	}
+	n := w.nbytes
+	if w.nbits&7 != 0 {
+		w.err = InternalError("writeBytes with unfinished bits")
+		return
+	}
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		w.nbits -= 8
+		n++
+	}
+	if n != 0 {
+		w.write(w.bytes[:n])
+	}
+	w.nbytes = 0
+	w.write(bytes)
+}
+
+// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+// the literal and offset lengths arrays (which are concatenated into a single
+// array).  This method generates that run-length encoding.
+//
+// The result is written into the codegen array, and the frequencies
+// of each code is written into the codegenFreq array.
+// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+// information. Code badCode is an end marker
+//
+//  numLiterals      The number of literals in literalEncoding
+//  numOffsets       The number of offsets in offsetEncoding
+//  litenc, offenc   The literal and offset encoder to use
+func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {
+	for i := range w.codegenFreq {
+		w.codegenFreq[i] = 0
+	}
+	// Note that we are using codegen both as a temporary variable for holding
+	// a copy of the frequencies, and as the place where we put the result.
+	// This is fine because the output is always shorter than the input used
+	// so far.
+	codegen := w.codegen // cache
+	// Copy the concatenated code sizes to codegen. Put a marker at the end.
+	cgnl := codegen[:numLiterals]
+	for i := range cgnl {
+		cgnl[i] = uint8(litEnc.codes[i].len)
+	}
+
+	cgnl = codegen[numLiterals : numLiterals+numOffsets]
+	for i := range cgnl {
+		cgnl[i] = uint8(offEnc.codes[i].len)
+	}
+	codegen[numLiterals+numOffsets] = badCode
+
+	size := codegen[0]
+	count := 1
+	outIndex := 0
+	for inIndex := 1; size != badCode; inIndex++ {
+		// INVARIANT: We have seen "count" copies of size that have not yet
+		// had output generated for them.
+		nextSize := codegen[inIndex]
+		if nextSize == size {
+			count++
+			continue
+		}
+		// We need to generate codegen indicating "count" of size.
+		if size != 0 {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+			count--
+			for count >= 3 {
+				n := 6
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 16
+				outIndex++
+				codegen[outIndex] = uint8(n - 3)
+				outIndex++
+				w.codegenFreq[16]++
+				count -= n
+			}
+		} else {
+			for count >= 11 {
+				n := 138
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 18
+				outIndex++
+				codegen[outIndex] = uint8(n - 11)
+				outIndex++
+				w.codegenFreq[18]++
+				count -= n
+			}
+			if count >= 3 {
+				// count >= 3 && count <= 10
+				codegen[outIndex] = 17
+				outIndex++
+				codegen[outIndex] = uint8(count - 3)
+				outIndex++
+				w.codegenFreq[17]++
+				count = 0
+			}
+		}
+		count--
+		for ; count >= 0; count-- {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+		}
+		// Set up invariant for next time through the loop.
+		size = nextSize
+		count = 1
+	}
+	// Marker indicating the end of the codegen.
+	codegen[outIndex] = badCode
+}
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+	numCodegens = len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	header := 3 + 5 + 5 + 4 + (3 * numCodegens) +
+		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
+		int(w.codegenFreq[16])*2 +
+		int(w.codegenFreq[17])*3 +
+		int(w.codegenFreq[18])*7
+	size = header +
+		litEnc.bitLength(w.literalFreq) +
+		offEnc.bitLength(w.offsetFreq) +
+		extraBits
+
+	return size, numCodegens
+}
+
+// fixedSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) fixedSize(extraBits int) int {
+	return 3 +
+		fixedLiteralEncoding.bitLength(w.literalFreq) +
+		fixedOffsetEncoding.bitLength(w.offsetFreq) +
+		extraBits
+}
+
+// storedSize calculates the stored size, including header.
+// The function returns the size in bits and whether the block
+// fits inside a single block.
+func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
+	if in == nil {
+		return 0, false
+	}
+	if len(in) <= maxStoreBlockSize {
+		return (len(in) + 5) * 8, true
+	}
+	return 0, false
+}
+
+func (w *huffmanBitWriter) writeCode(c hcode) {
+	if w.err != nil {
+		return
+	}
+	w.bits |= uint64(c.code) << w.nbits
+	w.nbits += uint(c.len)
+	if w.nbits >= 48 {
+		bits := w.bits
+		w.bits >>= 48
+		w.nbits -= 48
+		n := w.nbytes
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
+		n += 6
+		if n >= bufferFlushSize {
+			w.write(w.bytes[:n])
+			n = 0
+		}
+		w.nbytes = n
+	}
+}
+
+// Write the header of a dynamic Huffman block to the output stream.
+//
+//  numLiterals  The number of literals specified in codegen
+//  numOffsets   The number of offsets specified in codegen
+//  numCodegens  The number of codegens used in codegen
+func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	var firstBits int32 = 4
+	if isEof {
+		firstBits = 5
+	}
+	w.writeBits(firstBits, 3)
+	w.writeBits(int32(numLiterals-257), 5)
+	w.writeBits(int32(numOffsets-1), 5)
+	w.writeBits(int32(numCodegens-4), 4)
+
+	for i := 0; i < numCodegens; i++ {
+		value := uint(w.codegenEncoding.codes[codegenOrder[i]].len)
+		w.writeBits(int32(value), 3)
+	}
+
+	i := 0
+	for {
+		var codeWord int = int(w.codegen[i])
+		i++
+		if codeWord == badCode {
+			break
+		}
+		w.writeCode(w.codegenEncoding.codes[uint32(codeWord)])
+
+		switch codeWord {
+		case 16:
+			w.writeBits(int32(w.codegen[i]), 2)
+			i++
+			break
+		case 17:
+			w.writeBits(int32(w.codegen[i]), 3)
+			i++
+			break
+		case 18:
+			w.writeBits(int32(w.codegen[i]), 7)
+			i++
+			break
+		}
+	}
+}
+
+func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	var flag int32
+	if isEof {
+		flag = 1
+	}
+	w.writeBits(flag, 3)
+	w.flush()
+	w.writeBits(int32(length), 16)
+	w.writeBits(int32(^uint16(length)), 16)
+}
+
+func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
+	if w.err != nil {
+		return
+	}
+	// Indicate that we are a fixed Huffman block
+	var value int32 = 2
+	if isEof {
+		value = 3
+	}
+	w.writeBits(value, 3)
+}
+
+// writeBlock will write a block of tokens with the smallest encoding.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is nil, the tokens will always be Huffman encoded.
+func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
+	if w.err != nil {
+		return
+	}
+
+	tokens = append(tokens, endBlockMarker)
+	numLiterals, numOffsets := w.indexTokens(tokens)
+
+	var extraBits int
+	storedSize, storable := w.storedSize(input)
+	if storable {
+		// We only bother calculating the costs of the extra bits required by
+		// the length of offset fields (which will be the same for both fixed
+		// and dynamic encoding), if we need to compare those two encodings
+		// against stored encoding.
+		for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
+			// First eight length codes have extra size = 0.
+			extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart])
+		}
+		for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
+			// First four offset codes have extra size = 0.
+			extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode])
+		}
+	}
+
+	// Figure out smallest code.
+	// Fixed Huffman baseline.
+	var literalEncoding = fixedLiteralEncoding
+	var offsetEncoding = fixedOffsetEncoding
+	var size = w.fixedSize(extraBits)
+
+	// Dynamic Huffman?
+	var numCodegens int
+
+	// Generate codegen and codegenFrequencies, which indicates how to encode
+	// the literalEncoding and the offsetEncoding.
+	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
+	dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+	if dynamicSize < size {
+		size = dynamicSize
+		literalEncoding = w.literalEncoding
+		offsetEncoding = w.offsetEncoding
+	}
+
+	// Stored bytes?
+	if storable && storedSize < size {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	// Huffman.
+	if literalEncoding == fixedLiteralEncoding {
+		w.writeFixedHeader(eof)
+	} else {
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	}
+
+	// Write the tokens.
+	w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes)
+}
+
+// writeBlockDynamic encodes a block using a dynamic Huffman table.
+// This should be used if the symbols used have a disproportionate
+// histogram distribution.
+// If input is supplied and the compression savings are below 1/16th of the
+// input size the block is stored.
+func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) {
+	if w.err != nil {
+		return
+	}
+
+	tokens = append(tokens, endBlockMarker)
+	numLiterals, numOffsets := w.indexTokens(tokens)
+
+	// Generate codegen and codegenFrequencies, which indicates how to encode
+	// the literalEncoding and the offsetEncoding.
+	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
+	size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0)
+
+	// Store bytes, if we don't get a reasonable improvement.
+	if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	// Write Huffman table.
+	w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+
+	// Write the tokens.
+	w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes)
+}
+
+// indexTokens indexes a slice of tokens, and updates
+// literalFreq and offsetFreq, and generates literalEncoding
+// and offsetEncoding.
+// The number of literal and offset tokens is returned.
+func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) {
+	for i := range w.literalFreq {
+		w.literalFreq[i] = 0
+	}
+	for i := range w.offsetFreq {
+		w.offsetFreq[i] = 0
+	}
+
+	for _, t := range tokens {
+		if t < matchType {
+			w.literalFreq[t.literal()]++
+			continue
+		}
+		length := t.length()
+		offset := t.offset()
+		w.literalFreq[lengthCodesStart+lengthCode(length)]++
+		w.offsetFreq[offsetCode(offset)]++
+	}
+
+	// get the number of literals
+	numLiterals = len(w.literalFreq)
+	for w.literalFreq[numLiterals-1] == 0 {
+		numLiterals--
+	}
+	// get the number of offsets
+	numOffsets = len(w.offsetFreq)
+	for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
+		numOffsets--
+	}
+	if numOffsets == 0 {
+		// We haven't found a single match. If we want to go with the dynamic encoding,
+		// we should count at least one offset to be sure that the offset huffman tree could be encoded.
+		w.offsetFreq[0] = 1
+		numOffsets = 1
+	}
+	w.literalEncoding.generate(w.literalFreq, 15)
+	w.offsetEncoding.generate(w.offsetFreq, 15)
+	return
+}
+
+// writeTokens writes a slice of tokens to the output.
+// codes for literal and offset encoding must be supplied.
+func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
+	if w.err != nil {
+		return
+	}
+	for _, t := range tokens {
+		if t < matchType {
+			w.writeCode(leCodes[t.literal()])
+			continue
+		}
+		// Write the length
+		length := t.length()
+		lengthCode := lengthCode(length)
+		w.writeCode(leCodes[lengthCode+lengthCodesStart])
+		extraLengthBits := uint(lengthExtraBits[lengthCode])
+		if extraLengthBits > 0 {
+			extraLength := int32(length - lengthBase[lengthCode])
+			w.writeBits(extraLength, extraLengthBits)
+		}
+		// Write the offset
+		offset := t.offset()
+		offsetCode := offsetCode(offset)
+		w.writeCode(oeCodes[offsetCode])
+		extraOffsetBits := uint(offsetExtraBits[offsetCode])
+		if extraOffsetBits > 0 {
+			extraOffset := int32(offset - offsetBase[offsetCode])
+			w.writeBits(extraOffset, extraOffsetBits)
+		}
+	}
+}
+
+// huffOffset is a static offset encoder used for huffman only encoding.
+// It can be reused since we will not be encoding offset values.
+var huffOffset *huffmanEncoder
+
+func init() {
+	w := newHuffmanBitWriter(nil)
+	w.offsetFreq[0] = 1
+	huffOffset = newHuffmanEncoder(offsetCodeCount)
+	huffOffset.generate(w.offsetFreq, 15)
+}
+
+// writeBlockHuff encodes a block of bytes as either
+// Huffman encoded literals or uncompressed bytes if the
+// results only gains very little from compression.
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
+	if w.err != nil {
+		return
+	}
+
+	// Clear histogram
+	for i := range w.literalFreq {
+		w.literalFreq[i] = 0
+	}
+
+	// Add everything as literals
+	histogram(input, w.literalFreq)
+
+	w.literalFreq[endBlockMarker] = 1
+
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+
+	w.literalEncoding.generate(w.literalFreq, 15)
+
+	// Figure out smallest code.
+	// Always use dynamic Huffman or Store
+	var numCodegens int
+
+	// Generate codegen and codegenFrequencies, which indicates how to encode
+	// the literalEncoding and the offsetEncoding.
+	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
+	size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
+
+	// Store bytes, if we don't get a reasonable improvement.
+	if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	// Huffman.
+	w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	encoding := w.literalEncoding.codes[:257]
+	n := w.nbytes
+	for _, t := range input {
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		w.bits |= uint64(c.code) << w.nbits
+		w.nbits += uint(c.len)
+		if w.nbits < 48 {
+			continue
+		}
+		// Store 6 bytes
+		bits := w.bits
+		w.bits >>= 48
+		w.nbits -= 48
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
+		n += 6
+		if n < bufferFlushSize {
+			continue
+		}
+		w.write(w.bytes[:n])
+		if w.err != nil {
+			return // Return early in the event of write failures
+		}
+		n = 0
+	}
+	w.nbytes = n
+	w.writeCode(encoding[endBlockMarker])
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
new file mode 100644
index 000000000..bdcbd823b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -0,0 +1,344 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"math"
+	"sort"
+)
+
+// hcode is a huffman code with a bit code and bit length.
+type hcode struct {
+	code, len uint16
+}
+
+type huffmanEncoder struct {
+	codes     []hcode
+	freqcache []literalNode
+	bitCount  [17]int32
+	lns       byLiteral // stored to avoid repeated allocation in generate
+	lfs       byFreq    // stored to avoid repeated allocation in generate
+}
+
+type literalNode struct {
+	literal uint16
+	freq    int32
+}
+
+// A levelInfo describes the state of the constructed tree for a given depth.
+type levelInfo struct {
+	// Our level.  for better printing
+	level int32
+
+	// The frequency of the last node at this level
+	lastFreq int32
+
+	// The frequency of the next character to add to this level
+	nextCharFreq int32
+
+	// The frequency of the next pair (from level below) to add to this level.
+	// Only valid if the "needed" value of the next lower level is 0.
+	nextPairFreq int32
+
+	// The number of chains remaining to generate for this level before moving
+	// up to the next level
+	needed int32
+}
+
+// set sets the code and length of an hcode.
+func (h *hcode) set(code uint16, length uint16) {
+	h.len = length
+	h.code = code
+}
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} }
+
+func newHuffmanEncoder(size int) *huffmanEncoder {
+	return &huffmanEncoder{codes: make([]hcode, size)}
+}
+
+// Generates a HuffmanCode corresponding to the fixed literal table
+func generateFixedLiteralEncoding() *huffmanEncoder {
+	h := newHuffmanEncoder(maxNumLit)
+	codes := h.codes
+	var ch uint16
+	for ch = 0; ch < maxNumLit; ch++ {
+		var bits uint16
+		var size uint16
+		switch {
+		case ch < 144:
+			// size 8, 000110000  .. 10111111
+			bits = ch + 48
+			size = 8
+			break
+		case ch < 256:
+			// size 9, 110010000 .. 111111111
+			bits = ch + 400 - 144
+			size = 9
+			break
+		case ch < 280:
+			// size 7, 0000000 .. 0010111
+			bits = ch - 256
+			size = 7
+			break
+		default:
+			// size 8, 11000000 .. 11000111
+			bits = ch + 192 - 280
+			size = 8
+		}
+		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+	}
+	return h
+}
+
+func generateFixedOffsetEncoding() *huffmanEncoder {
+	h := newHuffmanEncoder(30)
+	codes := h.codes
+	for ch := range codes {
+		codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
+	}
+	return h
+}
+
+var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
+var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()
+
+func (h *huffmanEncoder) bitLength(freq []int32) int {
+	var total int
+	for i, f := range freq {
+		if f != 0 {
+			total += int(f) * int(h.codes[i].len)
+		}
+	}
+	return total
+}
+
+const maxBitsLimit = 16
+
+// Return the number of literals assigned to each bit size in the Huffman encoding
+//
+// This method is only called when list.length >= 3
+// The cases of 0, 1, and 2 literals are handled by special case code.
+//
+// list  An array of the literals with non-zero frequencies
+//             and their associated frequencies. The array is in order of increasing
+//             frequency, and has as its last element a special element with frequency
+//             MaxInt32
+// maxBits     The maximum number of bits that should be used to encode any literal.
+//             Must be less than 16.
+// return      An integer array in which array[i] indicates the number of literals
+//             that should be encoded in i bits.
+func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
+	if maxBits >= maxBitsLimit {
+		panic("flate: maxBits too large")
+	}
+	n := int32(len(list))
+	list = list[0 : n+1]
+	list[n] = maxNode()
+
+	// The tree can't have greater depth than n - 1, no matter what. This
+	// saves a little bit of work in some small cases
+	if maxBits > n-1 {
+		maxBits = n - 1
+	}
+
+	// Create information about each of the levels.
+	// A bogus "Level 0" whose sole purpose is so that
+	// level1.prev.needed==0.  This makes level1.nextPairFreq
+	// be a legitimate value that never gets chosen.
+	var levels [maxBitsLimit]levelInfo
+	// leafCounts[i] counts the number of literals at the left
+	// of ancestors of the rightmost node at level i.
+	// leafCounts[i][j] is the number of literals at the left
+	// of the level j ancestor.
+	var leafCounts [maxBitsLimit][maxBitsLimit]int32
+
+	for level := int32(1); level <= maxBits; level++ {
+		// For every level, the first two items are the first two characters.
+		// We initialize the levels as if we had already figured this out.
+		levels[level] = levelInfo{
+			level:        level,
+			lastFreq:     list[1].freq,
+			nextCharFreq: list[2].freq,
+			nextPairFreq: list[0].freq + list[1].freq,
+		}
+		leafCounts[level][level] = 2
+		if level == 1 {
+			levels[level].nextPairFreq = math.MaxInt32
+		}
+	}
+
+	// We need a total of 2*n - 2 items at top level and have already generated 2.
+	levels[maxBits].needed = 2*n - 4
+
+	level := maxBits
+	for {
+		l := &levels[level]
+		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
+			// We've run out of both leafs and pairs.
+			// End all calculations for this level.
+			// To make sure we never come back to this level or any lower level,
+			// set nextPairFreq impossibly large.
+			l.needed = 0
+			levels[level+1].nextPairFreq = math.MaxInt32
+			level++
+			continue
+		}
+
+		prevFreq := l.lastFreq
+		if l.nextCharFreq < l.nextPairFreq {
+			// The next item on this row is a leaf node.
+			n := leafCounts[level][level] + 1
+			l.lastFreq = l.nextCharFreq
+			// Lower leafCounts are the same of the previous node.
+			leafCounts[level][level] = n
+			l.nextCharFreq = list[n].freq
+		} else {
+			// The next item on this row is a pair from the previous row.
+			// nextPairFreq isn't valid until we generate two
+			// more values in the level below
+			l.lastFreq = l.nextPairFreq
+			// Take leaf counts from the lower level, except counts[level] remains the same.
+			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			levels[l.level-1].needed = 2
+		}
+
+		if l.needed--; l.needed == 0 {
+			// We've done everything we need to do for this level.
+			// Continue calculating one level up. Fill in nextPairFreq
+			// of that level with the sum of the two nodes we've just calculated on
+			// this level.
+			if l.level == maxBits {
+				// All done!
+				break
+			}
+			levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq
+			level++
+		} else {
+			// If we stole from below, move down temporarily to replenish it.
+			for levels[level-1].needed > 0 {
+				level--
+			}
+		}
+	}
+
+	// Somethings is wrong if at the end, the top level is null or hasn't used
+	// all of the leaves.
+	if leafCounts[maxBits][maxBits] != n {
+		panic("leafCounts[maxBits][maxBits] != n")
+	}
+
+	bitCount := h.bitCount[:maxBits+1]
+	bits := 1
+	counts := &leafCounts[maxBits]
+	for level := maxBits; level > 0; level-- {
+		// chain.leafCount gives the number of literals requiring at least "bits"
+		// bits to encode.
+		bitCount[bits] = counts[level] - counts[level-1]
+		bits++
+	}
+	return bitCount
+}
+
+// Look at the leaves and assign them a bit count and an encoding as specified
+// in RFC 1951 3.2.2
+func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) {
+	code := uint16(0)
+	for n, bits := range bitCount {
+		code <<= 1
+		if n == 0 || bits == 0 {
+			continue
+		}
+		// The literals list[len(list)-bits] .. list[len(list)-bits]
+		// are encoded using "bits" bits, and get the values
+		// code, code + 1, ....  The code values are
+		// assigned in literal order (not frequency order).
+		chunk := list[len(list)-int(bits):]
+
+		h.lns.sort(chunk)
+		for _, node := range chunk {
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			code++
+		}
+		list = list[0 : len(list)-int(bits)]
+	}
+}
+
+// Update this Huffman Code object to be the minimum code for the specified frequency count.
+//
+// freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
+// maxBits  The maximum number of bits to use for any literal.
+func (h *huffmanEncoder) generate(freq []int32, maxBits int32) {
+	if h.freqcache == nil {
+		// Allocate a reusable buffer with the longest possible frequency table.
+		// Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit.
+		// The largest of these is maxNumLit, so we allocate for that case.
+		h.freqcache = make([]literalNode, maxNumLit+1)
+	}
+	list := h.freqcache[:len(freq)+1]
+	// Number of non-zero literals
+	count := 0
+	// Set list to be the set of all non-zero literals and their frequencies
+	for i, f := range freq {
+		if f != 0 {
+			list[count] = literalNode{uint16(i), f}
+			count++
+		} else {
+			list[count] = literalNode{}
+			h.codes[i].len = 0
+		}
+	}
+	list[len(freq)] = literalNode{}
+
+	list = list[:count]
+	if count <= 2 {
+		// Handle the small cases here, because they are awkward for the general case code. With
+		// two or fewer literals, everything has bit length 1.
+		for i, node := range list {
+			// "list" is in order of increasing literal value.
+			h.codes[node.literal].set(uint16(i), 1)
+		}
+		return
+	}
+	h.lfs.sort(list)
+
+	// Get the number of literals for each bit count
+	bitCount := h.bitCounts(list, maxBits)
+	// And do the assignment
+	h.assignEncodingAndSize(bitCount, list)
+}
+
+type byLiteral []literalNode
+
+func (s *byLiteral) sort(a []literalNode) {
+	*s = byLiteral(a)
+	sort.Sort(s)
+}
+
+func (s byLiteral) Len() int { return len(s) }
+
+func (s byLiteral) Less(i, j int) bool {
+	return s[i].literal < s[j].literal
+}
+
+func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+type byFreq []literalNode
+
+func (s *byFreq) sort(a []literalNode) {
+	*s = byFreq(a)
+	sort.Sort(s)
+}
+
+func (s byFreq) Len() int { return len(s) }
+
+func (s byFreq) Less(i, j int) bool {
+	if s[i].freq == s[j].freq {
+		return s[i].literal < s[j].literal
+	}
+	return s[i].freq < s[j].freq
+}
+
+func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
new file mode 100644
index 000000000..800d0ce9e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -0,0 +1,880 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package flate implements the DEFLATE compressed data format, described in
+// RFC 1951.  The gzip and zlib packages implement access to DEFLATE-based file
+// formats.
+package flate
+
+import (
+	"bufio"
+	"io"
+	"math/bits"
+	"strconv"
+	"sync"
+)
+
+const (
+	maxCodeLen     = 16 // max length of Huffman code
+	maxCodeLenMask = 15 // mask for max length of Huffman code
+	// The next three numbers come from the RFC section 3.2.7, with the
+	// additional proviso in section 3.2.5 which implies that distance codes
+	// 30 and 31 should never occur in compressed data.
+	maxNumLit  = 286
+	maxNumDist = 30
+	numCodes   = 19 // number of codes in Huffman meta-code
+)
+
+// Initialize the fixedHuffmanDecoder only once upon first use.
+var fixedOnce sync.Once
+var fixedHuffmanDecoder huffmanDecoder
+
+// A CorruptInputError reports the presence of corrupt input at a given offset.
+type CorruptInputError int64
+
+func (e CorruptInputError) Error() string {
+	return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
+}
+
+// An InternalError reports an error in the flate code itself.
+type InternalError string
+
+func (e InternalError) Error() string { return "flate: internal error: " + string(e) }
+
+// A ReadError reports an error encountered while reading input.
+//
+// Deprecated: No longer returned.
+type ReadError struct {
+	Offset int64 // byte offset where error occurred
+	Err    error // error returned by underlying Read
+}
+
+func (e *ReadError) Error() string {
+	return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
+}
+
+// A WriteError reports an error encountered while writing output.
+//
+// Deprecated: No longer returned.
+type WriteError struct {
+	Offset int64 // byte offset where error occurred
+	Err    error // error returned by underlying Write
+}
+
+func (e *WriteError) Error() string {
+	return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
+}
+
+// Resetter resets a ReadCloser returned by NewReader or NewReaderDict to
+// to switch to a new underlying Reader. This permits reusing a ReadCloser
+// instead of allocating a new one.
+type Resetter interface {
+	// Reset discards any buffered data and resets the Resetter as if it was
+	// newly initialized with the given reader.
+	Reset(r io.Reader, dict []byte) error
+}
+
+// The data structure for decoding Huffman tables is based on that of
+// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits),
+// For codes smaller than the table width, there are multiple entries
+// (each combination of trailing bits has the same value). For codes
+// larger than the table width, the table contains a link to an overflow
+// table. The width of each entry in the link table is the maximum code
+// size minus the chunk width.
+//
+// Note that you can do a lookup in the table even without all bits
+// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
+// have the property that shorter codes come before longer ones, the
+// bit length estimate in the result is a lower bound on the actual
+// number of bits.
+//
+// See the following:
+//	http://www.gzip.org/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+	huffmanChunkBits  = 9
+	huffmanNumChunks  = 1 << huffmanChunkBits
+	huffmanCountMask  = 15
+	huffmanValueShift = 4
+)
+
+type huffmanDecoder struct {
+	min      int                       // the minimum code length
+	chunks   *[huffmanNumChunks]uint32 // chunks as described above
+	links    [][]uint32                // overflow links
+	linkMask uint32                    // mask the width of the link table
+}
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(lengths []int) bool {
+	// Sanity enables additional runtime tests during Huffman
+	// table construction. It's intended to be used during
+	// development to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if h.chunks == nil {
+		h.chunks = &[huffmanNumChunks]uint32{}
+	}
+	if h.min != 0 {
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
+	}
+
+	// Count number of codes of each length,
+	// compute min and max length.
+	var count [maxCodeLen]int
+	var min, max int
+	for _, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		if min == 0 || n < min {
+			min = n
+		}
+		if n > max {
+			max = n
+		}
+		count[n&maxCodeLenMask]++
+	}
+
+	// Empty tree. The decompressor.huffSym function will fail later if the tree
+	// is used. Technically, an empty tree is only valid for the HDIST tree and
+	// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+	// is guaranteed to fail since it will attempt to use the tree to decode the
+	// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+	// guaranteed to fail later since the compressed data section must be
+	// composed of at least one symbol (the end-of-block marker).
+	if max == 0 {
+		return true
+	}
+
+	code := 0
+	var nextcode [maxCodeLen]int
+	for i := min; i <= max; i++ {
+		code <<= 1
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings. See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		return false
+	}
+
+	h.min = min
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint32, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(bits.Reverse16(uint16(j)))
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
+			}
+			h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			if cap(h.links[off]) < numLinks {
+				h.links[off] = make([]uint32, numLinks)
+			} else {
+				links := h.links[off][:0]
+				h.links[off] = links[:numLinks]
+			}
+		}
+	} else {
+		h.links = h.links[:0]
+	}
+
+	for i, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		code := nextcode[n]
+		nextcode[n]++
+		chunk := uint32(i<<huffmanValueShift | n)
+		reverse := int(bits.Reverse16(uint16(code)))
+		reverse >>= uint(16 - n)
+		if n <= huffmanChunkBits {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+				// We should never need to overwrite
+				// an existing chunk. Also, 0 is
+				// never a valid chunk, because the
+				// lower 4 "count" bits should be
+				// between 1 and 15.
+				if sanity && h.chunks[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				h.chunks[off] = chunk
+			}
+		} else {
+			j := reverse & (huffmanNumChunks - 1)
+			if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+				// Longer codes should have been
+				// associated with a link table above.
+				panic("impossible: not an indirect chunk")
+			}
+			value := h.chunks[j] >> huffmanValueShift
+			linktab := h.links[value]
+			reverse >>= huffmanChunkBits
+			for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+				if sanity && linktab[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				linktab[off] = chunk
+			}
+		}
+	}
+
+	if sanity {
+		// Above we've sanity checked that we never overwrote
+		// an existing entry. Here we additionally check that
+		// we filled the tables completely.
+		for i, chunk := range h.chunks {
+			if chunk == 0 {
+				// As an exception, in the degenerate
+				// single-code case, we allow odd
+				// chunks to be missing.
+				if code == 1 && i%2 == 1 {
+					continue
+				}
+				panic("impossible: missing chunk")
+			}
+		}
+		for _, linktab := range h.links {
+			for _, chunk := range linktab {
+				if chunk == 0 {
+					panic("impossible: missing chunk")
+				}
+			}
+		}
+	}
+
+	return true
+}
+
+// The actual read interface needed by NewReader.
+// If the passed in io.Reader does not also have ReadByte,
+// the NewReader will introduce its own buffering.
+type Reader interface {
+	io.Reader
+	io.ByteReader
+}
+
+// Decompress state.
+type decompressor struct {
+	// Input source.
+	r       Reader
+	roffset int64
+
+	// Input bits, in top of b.
+	b  uint32
+	nb uint
+
+	// Huffman decoders for literal/length, distance.
+	h1, h2 huffmanDecoder
+
+	// Length arrays used to define Huffman codes.
+	bits     *[maxNumLit + maxNumDist]int
+	codebits *[numCodes]int
+
+	// Output history, buffer.
+	dict dictDecoder
+
+	// Temporary buffer (avoids repeated allocation).
+	buf [4]byte
+
+	// Next step in the decompression,
+	// and decompression state.
+	step      func(*decompressor)
+	stepState int
+	final     bool
+	err       error
+	toRead    []byte
+	hl, hd    *huffmanDecoder
+	copyLen   int
+	copyDist  int
+}
+
+func (f *decompressor) nextBlock() {
+	for f.nb < 1+2 {
+		if f.err = f.moreBits(); f.err != nil {
+			return
+		}
+	}
+	f.final = f.b&1 == 1
+	f.b >>= 1
+	typ := f.b & 3
+	f.b >>= 2
+	f.nb -= 1 + 2
+	switch typ {
+	case 0:
+		f.dataBlock()
+	case 1:
+		// compressed, fixed Huffman tables
+		f.hl = &fixedHuffmanDecoder
+		f.hd = nil
+		f.huffmanBlock()
+	case 2:
+		// compressed, dynamic Huffman tables
+		if f.err = f.readHuffman(); f.err != nil {
+			break
+		}
+		f.hl = &f.h1
+		f.hd = &f.h2
+		f.huffmanBlock()
+	default:
+		// 3 is reserved.
+		f.err = CorruptInputError(f.roffset)
+	}
+}
+
+func (f *decompressor) Read(b []byte) (int, error) {
+	for {
+		if len(f.toRead) > 0 {
+			n := copy(b, f.toRead)
+			f.toRead = f.toRead[n:]
+			if len(f.toRead) == 0 {
+				return n, f.err
+			}
+			return n, nil
+		}
+		if f.err != nil {
+			return 0, f.err
+		}
+		f.step(f)
+		if f.err != nil && len(f.toRead) == 0 {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+		}
+	}
+}
+
+// Support the io.WriteTo interface for io.Copy and friends.
+func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	flushed := false
+	for {
+		if len(f.toRead) > 0 {
+			n, err := w.Write(f.toRead)
+			total += int64(n)
+			if err != nil {
+				f.err = err
+				return total, err
+			}
+			if n != len(f.toRead) {
+				return total, io.ErrShortWrite
+			}
+			f.toRead = f.toRead[:0]
+		}
+		if f.err != nil && flushed {
+			if f.err == io.EOF {
+				return total, nil
+			}
+			return total, f.err
+		}
+		if f.err == nil {
+			f.step(f)
+		}
+		if len(f.toRead) == 0 && f.err != nil && !flushed {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+			flushed = true
+		}
+	}
+}
+
+func (f *decompressor) Close() error {
+	if f.err == io.EOF {
+		return nil
+	}
+	return f.err
+}
+
+// RFC 1951 section 3.2.7.
+// Compression with dynamic Huffman codes
+
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+func (f *decompressor) readHuffman() error {
+	// HLIT[5], HDIST[5], HCLEN[4].
+	for f.nb < 5+5+4 {
+		if err := f.moreBits(); err != nil {
+			return err
+		}
+	}
+	nlit := int(f.b&0x1F) + 257
+	if nlit > maxNumLit {
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	ndist := int(f.b&0x1F) + 1
+	if ndist > maxNumDist {
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	nclen := int(f.b&0xF) + 4
+	// numCodes is 19, so nclen is always valid.
+	f.b >>= 4
+	f.nb -= 5 + 5 + 4
+
+	// (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
+	for i := 0; i < nclen; i++ {
+		for f.nb < 3 {
+			if err := f.moreBits(); err != nil {
+				return err
+			}
+		}
+		f.codebits[codeOrder[i]] = int(f.b & 0x7)
+		f.b >>= 3
+		f.nb -= 3
+	}
+	for i := nclen; i < len(codeOrder); i++ {
+		f.codebits[codeOrder[i]] = 0
+	}
+	if !f.h1.init(f.codebits[0:]) {
+		return CorruptInputError(f.roffset)
+	}
+
+	// HLIT + 257 code lengths, HDIST + 1 code lengths,
+	// using the code length Huffman code.
+	for i, n := 0, nlit+ndist; i < n; {
+		x, err := f.huffSym(&f.h1)
+		if err != nil {
+			return err
+		}
+		if x < 16 {
+			// Actual length.
+			f.bits[i] = x
+			i++
+			continue
+		}
+		// Repeat previous length or zero.
+		var rep int
+		var nb uint
+		var b int
+		switch x {
+		default:
+			return InternalError("unexpected length code")
+		case 16:
+			rep = 3
+			nb = 2
+			if i == 0 {
+				return CorruptInputError(f.roffset)
+			}
+			b = f.bits[i-1]
+		case 17:
+			rep = 3
+			nb = 3
+			b = 0
+		case 18:
+			rep = 11
+			nb = 7
+			b = 0
+		}
+		for f.nb < nb {
+			if err := f.moreBits(); err != nil {
+				return err
+			}
+		}
+		rep += int(f.b & uint32(1<<nb-1))
+		f.b >>= nb
+		f.nb -= nb
+		if i+rep > n {
+			return CorruptInputError(f.roffset)
+		}
+		for j := 0; j < rep; j++ {
+			f.bits[i] = b
+			i++
+		}
+	}
+
+	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+		return CorruptInputError(f.roffset)
+	}
+
+	// As an optimization, we can initialize the min bits to read at a time
+	// for the HLIT tree to the length of the EOB marker since we know that
+	// every block must terminate with one. This preserves the property that
+	// we never read any extra bytes after the end of the DEFLATE stream.
+	if f.h1.min < f.bits[endBlockMarker] {
+		f.h1.min = f.bits[endBlockMarker]
+	}
+
+	return nil
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBlock() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		v, err := f.huffSym(f.hl)
+		if err != nil {
+			f.err = err
+			return
+		}
+		var n uint // number of bits extra
+		var length int
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBlock
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = f.moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = f.moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = f.moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBlock // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// Copy a single uncompressed data block from input to output.
+func (f *decompressor) dataBlock() {
+	// Uncompressed.
+	// Discard current half-byte.
+	f.nb = 0
+	f.b = 0
+
+	// Length then ones-complement of length.
+	nr, err := io.ReadFull(f.r, f.buf[0:4])
+	f.roffset += int64(nr)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+	n := int(f.buf[0]) | int(f.buf[1])<<8
+	nn := int(f.buf[2]) | int(f.buf[3])<<8
+	if uint16(nn) != uint16(^n) {
+		f.err = CorruptInputError(f.roffset)
+		return
+	}
+
+	if n == 0 {
+		f.toRead = f.dict.readFlush()
+		f.finishBlock()
+		return
+	}
+
+	f.copyLen = n
+	f.copyData()
+}
+
+// copyData copies f.copyLen bytes from the underlying reader into f.hist.
+// It pauses for reads when f.hist is full.
+func (f *decompressor) copyData() {
+	buf := f.dict.writeSlice()
+	if len(buf) > f.copyLen {
+		buf = buf[:f.copyLen]
+	}
+
+	cnt, err := io.ReadFull(f.r, buf)
+	f.roffset += int64(cnt)
+	f.copyLen -= cnt
+	f.dict.writeMark(cnt)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+
+	if f.dict.availWrite() == 0 || f.copyLen > 0 {
+		f.toRead = f.dict.readFlush()
+		f.step = (*decompressor).copyData
+		return
+	}
+	f.finishBlock()
+}
+
+func (f *decompressor) finishBlock() {
+	if f.final {
+		if f.dict.availRead() > 0 {
+			f.toRead = f.dict.readFlush()
+		}
+		f.err = io.EOF
+	}
+	f.step = (*decompressor).nextBlock
+}
+
+// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
+func noEOF(e error) error {
+	if e == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return e
+}
+
+func (f *decompressor) moreBits() error {
+	c, err := f.r.ReadByte()
+	if err != nil {
+		return noEOF(err)
+	}
+	f.roffset++
+	f.b |= uint32(c) << f.nb
+	f.nb += 8
+	return nil
+}
+
+// Read the next Huffman-encoded symbol from f according to h.
+func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
+	// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+	// with single element, huffSym must error on these two edge cases. In both
+	// cases, the chunks slice will be 0 for the invalid sequence, leading it
+	// satisfy the n == 0 check below.
+	n := uint(h.min)
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	nb, b := f.nb, f.b
+	for {
+		for nb < n {
+			c, err := f.r.ReadByte()
+			if err != nil {
+				f.b = b
+				f.nb = nb
+				return 0, noEOF(err)
+			}
+			f.roffset++
+			b |= uint32(c) << (nb & 31)
+			nb += 8
+		}
+		chunk := h.chunks[b&(huffmanNumChunks-1)]
+		n = uint(chunk & huffmanCountMask)
+		if n > huffmanChunkBits {
+			chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask]
+			n = uint(chunk & huffmanCountMask)
+		}
+		if n <= nb {
+			if n == 0 {
+				f.b = b
+				f.nb = nb
+				f.err = CorruptInputError(f.roffset)
+				return 0, f.err
+			}
+			f.b = b >> (n & 31)
+			f.nb = nb - n
+			return int(chunk >> huffmanValueShift), nil
+		}
+	}
+}
+
+func makeReader(r io.Reader) Reader {
+	if rr, ok := r.(Reader); ok {
+		return rr
+	}
+	return bufio.NewReader(r)
+}
+
+func fixedHuffmanDecoderInit() {
+	fixedOnce.Do(func() {
+		// These come from the RFC section 3.2.6.
+		var bits [288]int
+		for i := 0; i < 144; i++ {
+			bits[i] = 8
+		}
+		for i := 144; i < 256; i++ {
+			bits[i] = 9
+		}
+		for i := 256; i < 280; i++ {
+			bits[i] = 7
+		}
+		for i := 280; i < 288; i++ {
+			bits[i] = 8
+		}
+		fixedHuffmanDecoder.init(bits[:])
+	})
+}
+
+func (f *decompressor) Reset(r io.Reader, dict []byte) error {
+	*f = decompressor{
+		r:        makeReader(r),
+		bits:     f.bits,
+		codebits: f.codebits,
+		h1:       f.h1,
+		h2:       f.h2,
+		dict:     f.dict,
+		step:     (*decompressor).nextBlock,
+	}
+	f.dict.init(maxMatchOffset, dict)
+	return nil
+}
+
+// NewReader returns a new ReadCloser that can be used
+// to read the uncompressed version of r.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+// It is the caller's responsibility to call Close on the ReadCloser
+// when finished reading.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReader(r io.Reader) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = (*decompressor).nextBlock
+	f.dict.init(maxMatchOffset, nil)
+	return &f
+}
+
+// NewReaderDict is like NewReader but initializes the reader
+// with a preset dictionary. The returned Reader behaves as if
+// the uncompressed data stream started with the given dictionary,
+// which has already been read. NewReaderDict is typically used
+// to read data compressed by NewWriterDict.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = (*decompressor).nextBlock
+	f.dict.init(maxMatchOffset, dict)
+	return &f
+}
diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
new file mode 100644
index 000000000..c1a02720d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
@@ -0,0 +1,48 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+var reverseByte = [256]byte{
+	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
+	0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+	0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
+	0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
+	0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+	0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
+	0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
+	0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+	0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
+	0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
+	0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+	0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
+	0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
+	0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+	0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
+	0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
+	0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+}
+
+func reverseUint16(v uint16) uint16 {
+	return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8
+}
+
+func reverseBits(number uint16, bitLength byte) uint16 {
+	return reverseUint16(number << uint8(16-bitLength))
+}
diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go
new file mode 100644
index 000000000..d853320a7
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/snappy.go
@@ -0,0 +1,900 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+func emitLiteral(dst *tokens, lit []byte) {
+	ol := int(dst.n)
+	for i, v := range lit {
+		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+	}
+	dst.n += uint16(len(lit))
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+func emitCopy(dst *tokens, offset, length int) {
+	dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize))
+	dst.n++
+}
+
+type snappyEnc interface {
+	Encode(dst *tokens, src []byte)
+	Reset()
+}
+
+func newSnappy(level int) snappyEnc {
+	switch level {
+	case 1:
+		return &snappyL1{}
+	case 2:
+		return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
+	case 3:
+		return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
+	case 4:
+		return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}}
+	default:
+		panic("invalid level specified")
+	}
+}
+
+const (
+	tableBits       = 14             // Bits used in the table
+	tableSize       = 1 << tableBits // Size of the table
+	tableMask       = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+	baseMatchOffset = 1              // The smallest match offset
+	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
+	maxMatchOffset  = 1 << 15        // The largest match offset
+)
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> tableShift
+}
+
+// snappyL1 encapsulates level 1 compression
+type snappyL1 struct{}
+
+func (e *snappyL1) Reset() {}
+
+func (e *snappyL1) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 16 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Initialize the hash table.
+	//
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+	var table [tableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s))
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS))
+			if s-candidate <= maxMatchOffset && load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of Snappy's:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			s1 := base + maxMatchLength
+			if s1 > len(src) {
+				s1 = len(src)
+			}
+			a := src[s:s1]
+			b := src[candidate+4:]
+			b = b[:len(a)]
+			l := len(a)
+			for i := range a {
+				if a[i] != b[i] {
+					l = i
+					break
+				}
+			}
+			s += l
+
+			// matchToken is flate's equivalent of Snappy's emitCopy.
+			dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset))
+			dst.n++
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x >> 0))
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x >> 8))
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if s-candidate > maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x >> 16))
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+type tableEntry struct {
+	val    uint32
+	offset int32
+}
+
+func load3232(b []byte, i int32) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+// snappyGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type snappyGen struct {
+	prev []byte
+	cur  int32
+}
+
+// snappyGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type snappyL2 struct {
+	snappyGen
+	table [tableSize]tableEntry
+}
+
+// EncodeL2 uses a similar algorithm to level 1, but is capable
+// of matching across blocks giving better compression at a small slowdown.
+func (e *snappyL2) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 8 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	if e.cur > 1<<30 {
+		for i := range e.table[:] {
+			e.table[i] = tableEntry{}
+		}
+		e.cur = maxStoreBlockSize
+	}
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		e.cur += maxStoreBlockSize
+		e.prev = e.prev[:0]
+		return
+	}
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := int32(0)
+	s := int32(0)
+	cv := load3232(src, s)
+	nextHash := hash(cv)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := int32(32)
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = e.table[nextHash&tableMask]
+			now := load3232(src, nextS)
+			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
+			nextHash = hash(now)
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || cv != candidate.val {
+				// Out of range or not matched.
+				cv = now
+				continue
+			}
+			break
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			//
+			s += 4
+			t := candidate.offset - e.cur + 4
+			l := e.matchlen(s, t, src)
+
+			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+			dst.n++
+			s += l
+			nextEmit = s
+			if s >= sLimit {
+				t += l
+				// Index first pair after match end.
+				if int(t+4) < len(src) && t > 0 {
+					cv := load3232(src, t)
+					e.table[hash(cv)&tableMask] = tableEntry{offset: t + e.cur, val: cv}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-1)
+			prevHash := hash(uint32(x))
+			e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidate = e.table[currHash&tableMask]
+			e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x) != candidate.val {
+				cv = uint32(x >> 8)
+				nextHash = hash(cv)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		emitLiteral(dst, src[nextEmit:])
+	}
+	e.cur += int32(len(src))
+	e.prev = e.prev[:len(src)]
+	copy(e.prev, src)
+}
+
+type tableEntryPrev struct {
+	Cur  tableEntry
+	Prev tableEntry
+}
+
+// snappyL3
+type snappyL3 struct {
+	snappyGen
+	table [tableSize]tableEntryPrev
+}
+
+// Encode uses a similar algorithm to level 2, will check up to two candidates.
+func (e *snappyL3) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 8 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	if e.cur > 1<<30 {
+		for i := range e.table[:] {
+			e.table[i] = tableEntryPrev{}
+		}
+		e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]}
+	}
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		e.cur += maxStoreBlockSize
+		e.prev = e.prev[:0]
+		return
+	}
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := int32(0)
+	s := int32(0)
+	cv := load3232(src, s)
+	nextHash := hash(cv)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := int32(32)
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidates := e.table[nextHash&tableMask]
+			now := load3232(src, nextS)
+			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+			nextHash = hash(now)
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if cv == candidate.val {
+				offset := s - (candidate.offset - e.cur)
+				if offset <= maxMatchOffset {
+					break
+				}
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						break
+					}
+				}
+			}
+			cv = now
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			//
+			s += 4
+			t := candidate.offset - e.cur + 4
+			l := e.matchlen(s, t, src)
+
+			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+			dst.n++
+			s += l
+			nextEmit = s
+			if s >= sLimit {
+				t += l
+				// Index first pair after match end.
+				if int(t+4) < len(src) && t > 0 {
+					cv := load3232(src, t)
+					nextHash = hash(cv)
+					e.table[nextHash&tableMask] = tableEntryPrev{
+						Prev: e.table[nextHash&tableMask].Cur,
+						Cur:  tableEntry{offset: e.cur + t, val: cv},
+					}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-3 to s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-3)
+			prevHash := hash(uint32(x))
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+			}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidates := e.table[currHash&tableMask]
+			cv = uint32(x)
+			e.table[currHash&tableMask] = tableEntryPrev{
+				Prev: candidates.Cur,
+				Cur:  tableEntry{offset: s + e.cur, val: cv},
+			}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if cv == candidate.val {
+				offset := s - (candidate.offset - e.cur)
+				if offset <= maxMatchOffset {
+					continue
+				}
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						continue
+					}
+				}
+			}
+			cv = uint32(x >> 8)
+			nextHash = hash(cv)
+			s++
+			break
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		emitLiteral(dst, src[nextEmit:])
+	}
+	e.cur += int32(len(src))
+	e.prev = e.prev[:len(src)]
+	copy(e.prev, src)
+}
+
+// snappyL4
+type snappyL4 struct {
+	snappyL3
+}
+
+// Encode uses a similar algorithm to level 3,
+// but will check up to two candidates if first isn't long enough.
+func (e *snappyL4) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 8 - 3
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		matchLenGood           = 12
+	)
+
+	// Protect against e.cur wraparound.
+	if e.cur > 1<<30 {
+		for i := range e.table[:] {
+			e.table[i] = tableEntryPrev{}
+		}
+		e.snappyGen = snappyGen{cur: maxStoreBlockSize, prev: e.prev[:0]}
+	}
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		e.cur += maxStoreBlockSize
+		e.prev = e.prev[:0]
+		return
+	}
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := int32(0)
+	s := int32(0)
+	cv := load3232(src, s)
+	nextHash := hash(cv)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := int32(32)
+
+		nextS := s
+		var candidate tableEntry
+		var candidateAlt tableEntry
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidates := e.table[nextHash&tableMask]
+			now := load3232(src, nextS)
+			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+			nextHash = hash(now)
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if cv == candidate.val {
+				offset := s - (candidate.offset - e.cur)
+				if offset < maxMatchOffset {
+					offset = s - (candidates.Prev.offset - e.cur)
+					if cv == candidates.Prev.val && offset < maxMatchOffset {
+						candidateAlt = candidates.Prev
+					}
+					break
+				}
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset < maxMatchOffset {
+						break
+					}
+				}
+			}
+			cv = now
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			//
+			s += 4
+			t := candidate.offset - e.cur + 4
+			l := e.matchlen(s, t, src)
+			// Try alternative candidate if match length < matchLenGood.
+			if l < matchLenGood-4 && candidateAlt.offset != 0 {
+				t2 := candidateAlt.offset - e.cur + 4
+				l2 := e.matchlen(s, t2, src)
+				if l2 > l {
+					l = l2
+					t = t2
+				}
+			}
+			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+			dst.n++
+			s += l
+			nextEmit = s
+			if s >= sLimit {
+				t += l
+				// Index first pair after match end.
+				if int(t+4) < len(src) && t > 0 {
+					cv := load3232(src, t)
+					nextHash = hash(cv)
+					e.table[nextHash&tableMask] = tableEntryPrev{
+						Prev: e.table[nextHash&tableMask].Cur,
+						Cur:  tableEntry{offset: e.cur + t, val: cv},
+					}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-3 to s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-3)
+			prevHash := hash(uint32(x))
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash&tableMask] = tableEntryPrev{
+				Prev: e.table[prevHash&tableMask].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+			}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidates := e.table[currHash&tableMask]
+			cv = uint32(x)
+			e.table[currHash&tableMask] = tableEntryPrev{
+				Prev: candidates.Cur,
+				Cur:  tableEntry{offset: s + e.cur, val: cv},
+			}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			candidateAlt = tableEntry{}
+			if cv == candidate.val {
+				offset := s - (candidate.offset - e.cur)
+				if offset <= maxMatchOffset {
+					offset = s - (candidates.Prev.offset - e.cur)
+					if cv == candidates.Prev.val && offset <= maxMatchOffset {
+						candidateAlt = candidates.Prev
+					}
+					continue
+				}
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						continue
+					}
+				}
+			}
+			cv = uint32(x >> 8)
+			nextHash = hash(cv)
+			s++
+			break
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		emitLiteral(dst, src[nextEmit:])
+	}
+	e.cur += int32(len(src))
+	e.prev = e.prev[:len(src)]
+	copy(e.prev, src)
+}
+
+func (e *snappyGen) matchlen(s, t int32, src []byte) int32 {
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// If we are inside the current block
+	if t >= 0 {
+		b := src[t:]
+		a := src[s:s1]
+		b = b[:len(a)]
+		// Extend the match to be as long as possible.
+		for i := range a {
+			if a[i] != b[i] {
+				return int32(i)
+			}
+		}
+		return int32(len(a))
+	}
+
+	// We found a match in the previous block.
+	tp := int32(len(e.prev)) + t
+	if tp < 0 {
+		return 0
+	}
+
+	// Extend the match to be as long as possible.
+	a := src[s:s1]
+	b := e.prev[tp:]
+	if len(b) > len(a) {
+		b = b[:len(a)]
+	}
+	a = a[:len(b)]
+	for i := range b {
+		if a[i] != b[i] {
+			return int32(i)
+		}
+	}
+
+	// If we reached our limit, we matched everything we are
+	// allowed to in the previous block and we return.
+	n := int32(len(b))
+	if int(s+n) == s1 {
+		return n
+	}
+
+	// Continue looking for more matches in the current block.
+	a = src[s+n : s1]
+	b = src[:len(a)]
+	for i := range a {
+		if a[i] != b[i] {
+			return int32(i) + n
+		}
+	}
+	return int32(len(a)) + n
+}
+
+// Reset the encoding table.
+func (e *snappyGen) Reset() {
+	e.prev = e.prev[:0]
+	e.cur += maxMatchOffset
+}
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
new file mode 100644
index 000000000..4f275ea61
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -0,0 +1,115 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import "fmt"
+
+const (
+	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
+	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
+	// 22 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
+	lengthShift = 22
+	offsetMask  = 1<<lengthShift - 1
+	typeMask    = 3 << 30
+	literalType = 0 << 30
+	matchType   = 1 << 30
+)
+
+// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
+// is lengthCodes[length - MIN_MATCH_LENGTH]
+var lengthCodes = [...]uint32{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
+	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
+	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
+	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 28,
+}
+
+var offsetCodes = [...]uint32{
+	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+}
+
+type token uint32
+
+type tokens struct {
+	tokens [maxStoreBlockSize + 1]token
+	n      uint16 // Must be able to contain maxStoreBlockSize
+}
+
+// Convert a literal into a literal token.
+func literalToken(literal uint32) token { return token(literalType + literal) }
+
+// Convert a < xlength, xoffset > pair into a match token.
+func matchToken(xlength uint32, xoffset uint32) token {
+	return token(matchType + xlength<<lengthShift + xoffset)
+}
+
+func matchTokend(xlength uint32, xoffset uint32) token {
+	if xlength > maxMatchLength || xoffset > maxMatchOffset {
+		panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset))
+		return token(matchType)
+	}
+	return token(matchType + xlength<<lengthShift + xoffset)
+}
+
+// Returns the type of a token
+func (t token) typ() uint32 { return uint32(t) & typeMask }
+
+// Returns the literal of a literal token
+func (t token) literal() uint32 { return uint32(t - literalType) }
+
+// Returns the extra offset of a match token
+func (t token) offset() uint32 { return uint32(t) & offsetMask }
+
+func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) }
+
+func lengthCode(len uint32) uint32 { return lengthCodes[len] }
+
+// Returns the offset code corresponding to a specific offset
+func offsetCode(off uint32) uint32 {
+	if off < uint32(len(offsetCodes)) {
+		return offsetCodes[off]
+	} else if off>>7 < uint32(len(offsetCodes)) {
+		return offsetCodes[off>>7] + 14
+	} else {
+		return offsetCodes[off>>14] + 28
+	}
+}