Diffstat (limited to 'vendor/github.com/klauspost')
29 files changed, 2245 insertions, 391 deletions
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index d9948ab40..2b101d26b 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -48,6 +48,8 @@ const ( maxHashOffset = 1 << 24 skipNever = math.MaxInt32 + + debugDeflate = false ) type compressionLevel struct { @@ -59,15 +61,13 @@ type compressionLevel struct { // See https://blog.klauspost.com/rebalancing-deflate-compression-levels/ var levels = []compressionLevel{ {}, // 0 - // Level 1-4 uses specialized algorithm - values not used + // Level 1-6 uses specialized algorithm - values not used {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0, 2}, {0, 0, 0, 0, 0, 3}, {0, 0, 0, 0, 0, 4}, - // For levels 5-6 we don't bother trying with lazy matches. - // Lazy matching is at least 30% slower, with 1.5% increase. - {6, 0, 12, 8, 12, 5}, - {8, 0, 24, 16, 16, 6}, + {0, 0, 0, 0, 0, 5}, + {0, 0, 0, 0, 0, 6}, // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". {8, 8, 24, 16, skipNever, 7}, @@ -203,9 +203,8 @@ func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { // This is much faster than doing a full encode. // Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode, - // use constant or Snappy compression. - if d.level == 0 { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { return } if d.fast != nil { @@ -368,7 +367,7 @@ func (d *compressor) deflateLazy() { // Sanity enables additional runtime tests. // It's intended to be used during development // to supplement the currently ad-hoc unit tests. - const sanity = false + const sanity = debugDeflate if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return @@ -667,6 +666,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) { default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } + d.level = level return nil } @@ -720,6 +720,7 @@ func (d *compressor) close() error { return d.w.err } d.w.flush() + d.w.reset(nil) return d.w.err } @@ -750,8 +751,7 @@ func NewWriter(w io.Writer, level int) (*Writer, error) { // can only be decompressed by a Reader initialized with the // same dictionary. func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) + zw, err := NewWriter(w, level) if err != nil { return nil, err } @@ -760,14 +760,6 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { return zw, err } -type dictWriter struct { - w io.Writer -} - -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see NewWriter). type Writer struct { @@ -805,11 +797,12 @@ func (w *Writer) Close() error { // the result of NewWriter or NewWriterDict called with dst // and w's level and dictionary. 
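Editor's sketch (not part of the patch): with the dictWriter shim removed, NewWriterDict hands dst straight to NewWriter and keeps the dict on the Writer so Reset can refill the window. A minimal round-trip under the assumption that the package mirrors the standard library flate API:

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"

	"github.com/klauspost/compress/flate"
)

func main() {
	dict := []byte("hello world")
	var buf bytes.Buffer

	// NewWriterDict no longer wraps dst in a dictWriter; dst is used directly.
	w, err := flate.NewWriterDict(&buf, flate.BestCompression, dict)
	if err != nil {
		panic(err)
	}
	w.Write([]byte("hello world, hello again"))
	w.Close()

	r := flate.NewReaderDict(&buf, dict)
	plain, _ := ioutil.ReadAll(r)
	fmt.Printf("%s\n", plain)

	// Reset(dst) re-primes the window from the retained dict because
	// len(w.dict) > 0 and dst is non-nil, per the patched Reset.
	var buf2 bytes.Buffer
	w.Reset(&buf2)
}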
func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { + if len(w.dict) > 0 { // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } } else { // w was created with NewWriter w.d.reset(dst) diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index 3d2fdcd77..6d4c1e98b 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -35,16 +35,16 @@ func newFastEnc(level int) fastEnc { } const ( - tableBits = 16 // Bits used in the table + tableBits = 15 // Bits used in the table tableSize = 1 << tableBits // Size of the table tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. baseMatchOffset = 1 // The smallest match offset baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 maxMatchOffset = 1 << 15 // The largest match offset - bTableBits = 18 // Bits used in the big tables + bTableBits = 17 // Bits used in the big tables bTableSize = 1 << bTableBits // Size of the table - allocHistory = maxStoreBlockSize * 20 // Size to preallocate for history. + allocHistory = maxStoreBlockSize * 10 // Size to preallocate for history. bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. ) @@ -92,7 +92,6 @@ func hash(u uint32) uint32 { } type tableEntry struct { - val uint32 offset int32 } diff --git a/vendor/github.com/klauspost/compress/flate/gen_inflate.go b/vendor/github.com/klauspost/compress/flate/gen_inflate.go new file mode 100644 index 000000000..c74a95fe7 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/gen_inflate.go @@ -0,0 +1,274 @@ +// +build generate + +//go:generate go run $GOFILE && gofmt -w inflate_gen.go + +package main + +import ( + "os" + "strings" +) + +func main() { + f, err := os.Create("inflate_gen.go") + if err != nil { + panic(err) + } + defer f.Close() + types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"} + names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"} + imports := []string{"bytes", "bufio", "io", "strings", "math/bits"} + f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT. + +package flate + +import ( +`) + + for _, imp := range imports { + f.WriteString("\t\"" + imp + "\"\n") + } + f.WriteString(")\n\n") + + template := ` + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) $FUNCNAME$() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.($TYPE$) + moreBits := func() error { + c, err := fr.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << f.nb + f.nb += 8 + return nil + } + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. 
In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + nb, b := f.nb, f.b + for { + for nb < n { + c, err := fr.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var n uint // number of bits extra + var length int + var err error + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).$FUNCNAME$ + f.stepState = stateInit + return + } + goto readLiteral + case v == 256: + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + n = 0 + case v < 269: + length = v*2 - (265*2 - 11) + n = 1 + case v < 273: + length = v*4 - (269*4 - 19) + n = 2 + case v < 277: + length = v*8 - (273*8 - 35) + n = 3 + case v < 281: + length = v*16 - (277*16 - 67) + n = 4 + case v < 285: + length = v*32 - (281*32 - 131) + n = 5 + case v < maxNumLit: + length = 258 + n = 0 + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + return + } + if n > 0 { + for f.nb < n { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + } + length += int(f.b & uint32(1<<n-1)) + f.b >>= n + f.nb -= n + } + + var dist int + if f.hd == nil { + for f.nb < 5 { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + } + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + f.b >>= 5 + f.nb -= 5 + } else { + if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } + f.err = err + return + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << nb + for f.nb < nb { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + } + extra |= int(f.b & uint32(1<<nb-1)) + f.b >>= nb + f.nb -= nb + dist = 1<<(nb+1) + 1 + extra + default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, dist + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. 
+ { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).$FUNCNAME$ // We need to continue this work + f.stepState = stateDict + return + } + goto readLiteral + } +} + +` + for i, t := range types { + s := strings.Replace(template, "$FUNCNAME$", "huffman"+names[i], -1) + s = strings.Replace(s, "$TYPE$", t, -1) + f.WriteString(s) + } + f.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n") + f.WriteString("\tswitch f.r.(type) {\n") + for i, t := range types { + f.WriteString("\t\tcase " + t + ":\n") + f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n") + } + f.WriteString("\t\tdefault:\n") + f.WriteString("\t\t\treturn f.huffmanBlockGeneric") + f.WriteString("\t}\n}\n") +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index 56ee6dc8b..53fe1d06e 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -484,6 +484,9 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } } +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return @@ -493,6 +496,16 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { w.writeCode(w.literalEncoding.codes[endBlockMarker]) w.lastHeader = 0 } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. 
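Hedged illustration of the writeStoredHeader change above (assumed effect, not taken from the patch): closing an empty stream now emits a 10-bit fixed-Huffman EOB instead of a 5-byte stored header, so an empty deflate stream should come out at roughly 2 bytes:

package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer
	w, _ := flate.NewWriter(&buf, flate.DefaultCompression)
	w.Close() // final empty block: BFINAL=1, BTYPE=01, 7-bit EOB = 10 bits

	// Expect about 2 bytes here instead of the old 5-byte stored header.
	fmt.Println(buf.Len())
}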
+ if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + var flag int32 if isEof { flag = 1 diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 9d8e81ad6..4c39a3018 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -109,8 +109,8 @@ func generateFixedOffsetEncoding() *huffmanEncoder { return h } -var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() -var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() +var fixedLiteralEncoding = generateFixedLiteralEncoding() +var fixedOffsetEncoding = generateFixedOffsetEncoding() func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index 6dc5b5d06..7f175a4ec 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -106,7 +106,7 @@ const ( ) type huffmanDecoder struct { - min int // the minimum code length + maxRead int // the maximum number of bits we can read and not overread chunks *[huffmanNumChunks]uint16 // chunks as described above links [][]uint16 // overflow links linkMask uint32 // mask the width of the link table @@ -126,12 +126,12 @@ func (h *huffmanDecoder) init(lengths []int) bool { if h.chunks == nil { h.chunks = &[huffmanNumChunks]uint16{} } - if h.min != 0 { + if h.maxRead != 0 { *h = huffmanDecoder{chunks: h.chunks, links: h.links} } // Count number of codes of each length, - // compute min and max length. + // compute maxRead and max length. var count [maxCodeLen]int var min, max int for _, n := range lengths { @@ -178,7 +178,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { return false } - h.min = min + h.maxRead = min chunks := h.chunks[:] for i := range chunks { chunks[i] = 0 @@ -342,7 +342,7 @@ func (f *decompressor) nextBlock() { // compressed, fixed Huffman tables f.hl = &fixedHuffmanDecoder f.hd = nil - f.huffmanBlock() + f.huffmanBlockDecoder()() case 2: // compressed, dynamic Huffman tables if f.err = f.readHuffman(); f.err != nil { @@ -350,7 +350,7 @@ func (f *decompressor) nextBlock() { } f.hl = &f.h1 f.hd = &f.h2 - f.huffmanBlock() + f.huffmanBlockDecoder()() default: // 3 is reserved. if debugDecode { @@ -543,12 +543,18 @@ func (f *decompressor) readHuffman() error { return CorruptInputError(f.roffset) } - // As an optimization, we can initialize the min bits to read at a time + // As an optimization, we can initialize the maxRead bits to read at a time // for the HLIT tree to the length of the EOB marker since we know that // every block must terminate with one. This preserves the property that // we never read any extra bytes after the end of the DEFLATE stream. - if f.h1.min < f.bits[endBlockMarker] { - f.h1.min = f.bits[endBlockMarker] + if f.h1.maxRead < f.bits[endBlockMarker] { + f.h1.maxRead = f.bits[endBlockMarker] + } + if !f.final { + // If not the final block, the smallest block possible is + // a predefined table, BTYPE=01, with a single EOB marker. + // This will take up 3 + 7 bits. + f.h1.maxRead += 10 } return nil @@ -558,7 +564,7 @@ func (f *decompressor) readHuffman() error { // hl and hd are the Huffman states for the lit/length values // and the distance values, respectively. 
If hd == nil, using the // fixed distance encoding associated with fixed Huffman blocks. -func (f *decompressor) huffmanBlock() { +func (f *decompressor) huffmanBlockGeneric() { const ( stateInit = iota // Zero value must be stateInit stateDict @@ -574,19 +580,64 @@ func (f *decompressor) huffmanBlock() { readLiteral: // Read literal and/or (length, distance) according to RFC section 3.2.3. { - v, err := f.huffSym(f.hl) - if err != nil { - f.err = err - return + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + nb, b := f.nb, f.b + for { + for nb < n { + c, err := f.r.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } } + var n uint // number of bits extra var length int + var err error switch { case v < 256: f.dict.writeByte(byte(v)) if f.dict.availWrite() == 0 { f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlock + f.step = (*decompressor).huffmanBlockGeneric f.stepState = stateInit return } @@ -714,7 +765,7 @@ copyHistory: if f.dict.availWrite() == 0 || f.copyLen > 0 { f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlock // We need to continue this work + f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work f.stepState = stateDict return } @@ -726,21 +777,33 @@ copyHistory: func (f *decompressor) dataBlock() { // Uncompressed. // Discard current half-byte. - f.nb = 0 - f.b = 0 + left := (f.nb) & 7 + f.nb -= left + f.b >>= left + + offBytes := f.nb >> 3 + // Unfilled values will be overwritten. + f.buf[0] = uint8(f.b) + f.buf[1] = uint8(f.b >> 8) + f.buf[2] = uint8(f.b >> 16) + f.buf[3] = uint8(f.b >> 24) + + f.roffset += int64(offBytes) + f.nb, f.b = 0, 0 // Length then ones-complement of length. 
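Standalone sketch of the reworked stored-block header check in dataBlock: LEN and NLEN are little-endian uint16 values and NLEN must be the ones-complement of LEN; comparing directly as uint16 lets the patched code drop the int conversions.

package main

import "fmt"

func main() {
	// buf holds the 4 header bytes of a stored block: LEN then NLEN.
	buf := []byte{0x05, 0x00, 0xfa, 0xff}
	n := uint16(buf[0]) | uint16(buf[1])<<8
	nn := uint16(buf[2]) | uint16(buf[3])<<8
	if nn != ^n {
		fmt.Println("corrupt input")
		return
	}
	fmt.Println("stored block length:", n) // 5
}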
- nr, err := io.ReadFull(f.r, f.buf[0:4]) + nr, err := io.ReadFull(f.r, f.buf[offBytes:4]) f.roffset += int64(nr) if err != nil { f.err = noEOF(err) return } - n := int(f.buf[0]) | int(f.buf[1])<<8 - nn := int(f.buf[2]) | int(f.buf[3])<<8 - if uint16(nn) != uint16(^n) { + n := uint16(f.buf[0]) | uint16(f.buf[1])<<8 + nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8 + if nn != ^n { if debugDecode { - fmt.Println("uint16(nn) != uint16(^n)", nn, ^n) + ncomp := ^n + fmt.Println("uint16(nn) != uint16(^n)", nn, ncomp) } f.err = CorruptInputError(f.roffset) return @@ -752,7 +815,7 @@ func (f *decompressor) dataBlock() { return } - f.copyLen = n + f.copyLen = int(n) f.copyData() } @@ -816,7 +879,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { // with single element, huffSym must error on these two edge cases. In both // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. - n := uint(h.min) + n := uint(h.maxRead) // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go new file mode 100644 index 000000000..397dc1b1a --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go @@ -0,0 +1,922 @@ +// Code generated by go generate gen_inflate.go. DO NOT EDIT. + +package flate + +import ( + "bufio" + "bytes" + "fmt" + "math/bits" + "strings" +) + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanBytesBuffer() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(*bytes.Buffer) + moreBits := func() error { + c, err := fr.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << f.nb + f.nb += 8 + return nil + } + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. 
+ nb, b := f.nb, f.b + for { + for nb < n { + c, err := fr.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var n uint // number of bits extra + var length int + var err error + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBytesBuffer + f.stepState = stateInit + return + } + goto readLiteral + case v == 256: + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + n = 0 + case v < 269: + length = v*2 - (265*2 - 11) + n = 1 + case v < 273: + length = v*4 - (269*4 - 19) + n = 2 + case v < 277: + length = v*8 - (273*8 - 35) + n = 3 + case v < 281: + length = v*16 - (277*16 - 67) + n = 4 + case v < 285: + length = v*32 - (281*32 - 131) + n = 5 + case v < maxNumLit: + length = 258 + n = 0 + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + return + } + if n > 0 { + for f.nb < n { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + } + length += int(f.b & uint32(1<<n-1)) + f.b >>= n + f.nb -= n + } + + var dist int + if f.hd == nil { + for f.nb < 5 { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + } + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + f.b >>= 5 + f.nb -= 5 + } else { + if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } + f.err = err + return + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << nb + for f.nb < nb { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + } + extra |= int(f.b & uint32(1<<nb-1)) + f.b >>= nb + f.nb -= nb + dist = 1<<(nb+1) + 1 + extra + default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, dist + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work + f.stepState = stateDict + return + } + goto readLiteral + } +} + +// Decode a single Huffman block from f. 
+// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanBytesReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(*bytes.Reader) + moreBits := func() error { + c, err := fr.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << f.nb + f.nb += 8 + return nil + } + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + nb, b := f.nb, f.b + for { + for nb < n { + c, err := fr.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var n uint // number of bits extra + var length int + var err error + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBytesReader + f.stepState = stateInit + return + } + goto readLiteral + case v == 256: + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + n = 0 + case v < 269: + length = v*2 - (265*2 - 11) + n = 1 + case v < 273: + length = v*4 - (269*4 - 19) + n = 2 + case v < 277: + length = v*8 - (273*8 - 35) + n = 3 + case v < 281: + length = v*16 - (277*16 - 67) + n = 4 + case v < 285: + length = v*32 - (281*32 - 131) + n = 5 + case v < maxNumLit: + length = 258 + n = 0 + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + return + } + if n > 0 { + for f.nb < n { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + } + length += int(f.b & uint32(1<<n-1)) + f.b >>= n + f.nb -= n + } + + var dist int + if f.hd == nil { + for f.nb < 5 { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + } + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + f.b >>= 5 + f.nb -= 5 + } else { + if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } + f.err = err + return + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := 
uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << nb + for f.nb < nb { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + } + extra |= int(f.b & uint32(1<<nb-1)) + f.b >>= nb + f.nb -= nb + dist = 1<<(nb+1) + 1 + extra + default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, dist + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBytesReader // We need to continue this work + f.stepState = stateDict + return + } + goto readLiteral + } +} + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanBufioReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(*bufio.Reader) + moreBits := func() error { + c, err := fr.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << f.nb + f.nb += 8 + return nil + } + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. 
+ nb, b := f.nb, f.b + for { + for nb < n { + c, err := fr.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var n uint // number of bits extra + var length int + var err error + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBufioReader + f.stepState = stateInit + return + } + goto readLiteral + case v == 256: + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + n = 0 + case v < 269: + length = v*2 - (265*2 - 11) + n = 1 + case v < 273: + length = v*4 - (269*4 - 19) + n = 2 + case v < 277: + length = v*8 - (273*8 - 35) + n = 3 + case v < 281: + length = v*16 - (277*16 - 67) + n = 4 + case v < 285: + length = v*32 - (281*32 - 131) + n = 5 + case v < maxNumLit: + length = 258 + n = 0 + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + return + } + if n > 0 { + for f.nb < n { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + } + length += int(f.b & uint32(1<<n-1)) + f.b >>= n + f.nb -= n + } + + var dist int + if f.hd == nil { + for f.nb < 5 { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + } + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + f.b >>= 5 + f.nb -= 5 + } else { + if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } + f.err = err + return + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << nb + for f.nb < nb { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + } + extra |= int(f.b & uint32(1<<nb-1)) + f.b >>= nb + f.nb -= nb + dist = 1<<(nb+1) + 1 + extra + default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, dist + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanBufioReader // We need to continue this work + f.stepState = stateDict + return + } + goto readLiteral + } +} + +// Decode a single Huffman block from f. 
+// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanStringsReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(*strings.Reader) + moreBits := func() error { + c, err := fr.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << f.nb + f.nb += 8 + return nil + } + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + nb, b := f.nb, f.b + for { + for nb < n { + c, err := fr.ReadByte() + if err != nil { + f.b = b + f.nb = nb + f.err = noEOF(err) + return + } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 + } + chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + f.b = b >> (n & 31) + f.nb = nb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var n uint // number of bits extra + var length int + var err error + switch { + case v < 256: + f.dict.writeByte(byte(v)) + if f.dict.availWrite() == 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanStringsReader + f.stepState = stateInit + return + } + goto readLiteral + case v == 256: + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + n = 0 + case v < 269: + length = v*2 - (265*2 - 11) + n = 1 + case v < 273: + length = v*4 - (269*4 - 19) + n = 2 + case v < 277: + length = v*8 - (273*8 - 35) + n = 3 + case v < 281: + length = v*16 - (277*16 - 67) + n = 4 + case v < 285: + length = v*32 - (281*32 - 131) + n = 5 + case v < maxNumLit: + length = 258 + n = 0 + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + return + } + if n > 0 { + for f.nb < n { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + } + length += int(f.b & uint32(1<<n-1)) + f.b >>= n + f.nb -= n + } + + var dist int + if f.hd == nil { + for f.nb < 5 { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + } + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + f.b >>= 5 + f.nb -= 5 + } else { + if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } + f.err = err + return + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := 
uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << nb + for f.nb < nb { + if err = moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + } + extra |= int(f.b & uint32(1<<nb-1)) + f.b >>= nb + f.nb -= nb + dist = 1<<(nb+1) + 1 + extra + default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, dist + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).huffmanStringsReader // We need to continue this work + f.stepState = stateDict + return + } + goto readLiteral + } +} + +func (f *decompressor) huffmanBlockDecoder() func() { + switch f.r.(type) { + case *bytes.Buffer: + return f.huffmanBytesBuffer + case *bytes.Reader: + return f.huffmanBytesReader + case *bufio.Reader: + return f.huffmanBufioReader + case *strings.Reader: + return f.huffmanStringsReader + default: + return f.huffmanBlockGeneric + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 102fc74c7..1e5eea396 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -16,7 +16,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } @@ -81,12 +81,12 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } now := load6432(src, nextS) - e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + e.table[nextHash] = tableEntry{offset: s + e.cur} nextHash = hash(uint32(now)) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == candidate.val { - e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } @@ -96,11 +96,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { nextS++ candidate = e.table[nextHash] now >>= 8 - e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == candidate.val { - e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } cv = uint32(now) @@ -139,7 +139,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Index first pair after match end. 
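Usage sketch for the huffmanBlockDecoder dispatch above: the generated decoders type-assert the concrete reader, so handing NewReader one of the four known types (here *bytes.Reader) selects a specialized loop whose ReadByte calls avoid interface dispatch; any other reader falls back to huffmanBlockGeneric.

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer
	w, _ := flate.NewWriter(&buf, flate.BestSpeed)
	w.Write([]byte("specialized readers avoid interface call overhead"))
	w.Close()

	// *bytes.Reader dispatches to huffmanBytesReader via huffmanBlockDecoder.
	r := flate.NewReader(bytes.NewReader(buf.Bytes()))
	out, _ := ioutil.ReadAll(r)
	r.Close()
	fmt.Printf("%s\n", out)
}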
if int(s+l+4) < len(src) { cv := load3232(src, s) - e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv} + e.table[hash(cv)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -153,14 +153,14 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { x := load6432(src, s-2) o := e.cur + s - 2 prevHash := hash(uint32(x)) - e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} + e.table[prevHash] = tableEntry{offset: o} x >>= 16 currHash := hash(uint32(x)) candidate = e.table[currHash] - e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)} + e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { + if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { cv = uint32(x >> 8) s++ break diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index dc6b1d314..5b986a194 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -18,7 +18,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } @@ -83,12 +83,12 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { } candidate = e.table[nextHash] now := load6432(src, nextS) - e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + e.table[nextHash] = tableEntry{offset: s + e.cur} nextHash = hash4u(uint32(now), bTableBits) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == candidate.val { - e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } @@ -98,10 +98,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { nextS++ candidate = e.table[nextHash] now >>= 8 - e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == candidate.val { + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { break } cv = uint32(now) @@ -148,7 +148,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // Index first pair after match end. 
if int(s+l+4) < len(src) { cv := load3232(src, s) - e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv} + e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -157,15 +157,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { for i := s - l + 2; i < s-5; i += 7 { x := load6432(src, int32(i)) nextHash := hash4u(uint32(x), bTableBits) - e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)} + e.table[nextHash] = tableEntry{offset: e.cur + i} // Skip one x >>= 16 nextHash = hash4u(uint32(x), bTableBits) - e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)} + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} // Skip one x >>= 16 nextHash = hash4u(uint32(x), bTableBits) - e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)} + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} } // We could immediately start working at s now, but to improve @@ -178,14 +178,14 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { o := e.cur + s - 2 prevHash := hash4u(uint32(x), bTableBits) prevHash2 := hash4u(uint32(x>>8), bTableBits) - e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} - e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)} + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} currHash := hash4u(uint32(x>>16), bTableBits) candidate = e.table[currHash] - e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)} + e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x>>16) != candidate.val { + if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { cv = uint32(x >> 24) s++ break diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index 1a3ff9b6b..c22b4244a 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -15,7 +15,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } @@ -81,22 +81,26 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { } candidates := e.table[nextHash] now := load3232(src, nextS) - e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} // Check both candidates candidate = candidates.Cur - offset := s - (candidate.offset - e.cur) - if cv == candidate.val { - if offset > maxMatchOffset { - cv = now - // Previous will also be invalid, we have nothing. - continue - } - o2 := s - (candidates.Prev.offset - e.cur) - if cv != candidates.Prev.val || o2 > maxMatchOffset { + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if cv == load3232(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { break } // Both match and are valid, pick longest. 
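Minimal sketch of the new candidate validation used across the level encoders (names mirror the diff; the helper candidateMatches is illustrative and not in the patch): with the val field dropped from tableEntry, the four bytes are re-read from src at the candidate's stored offset, halving the table's memory at the cost of one load.

package flate

import "encoding/binary"

const maxMatchOffset = 1 << 15 // The largest match offset

type tableEntry struct {
	offset int32
}

func load3232(b []byte, i int32) uint32 {
	return binary.LittleEndian.Uint32(b[i:])
}

// candidateMatches reports whether the candidate is still inside the
// window and whether its four bytes equal cv.
func candidateMatches(src []byte, cv uint32, cand tableEntry, s, cur int32) bool {
	offset := s - (cand.offset - cur)
	return offset < maxMatchOffset && cv == load3232(src, cand.offset-cur)
}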
+ offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) if l2 > l1 { candidate = candidates.Prev @@ -106,11 +110,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // We only check if value mismatches. // Offset will always be invalid in other cases. candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - break - } + if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + break } } cv = now @@ -158,7 +159,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { nextHash := hash(cv) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, - Cur: tableEntry{offset: e.cur + t, val: cv}, + Cur: tableEntry{offset: e.cur + t}, } } goto emitRemainder @@ -170,21 +171,21 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { prevHash := hash(uint32(x)) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, + Cur: tableEntry{offset: e.cur + s - 3}, } x >>= 8 prevHash = hash(uint32(x)) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, + Cur: tableEntry{offset: e.cur + s - 2}, } x >>= 8 prevHash = hash(uint32(x)) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, + Cur: tableEntry{offset: e.cur + s - 1}, } x >>= 8 currHash := hash(uint32(x)) @@ -192,21 +193,18 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { cv = uint32(x) e.table[currHash] = tableEntryPrev{ Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, + Cur: tableEntry{offset: s + e.cur}, } // Check both candidates candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } - } else { + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) { // We only check if value mismatches. // Offset will always be invalid in other cases. candidate = candidates.Prev - if cv == candidate.val { + if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { offset := s - (candidate.offset - e.cur) if offset <= maxMatchOffset { continue diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index f3ecc9c4d..e62f0c02b 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -13,7 +13,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } // Protect against e.cur wraparound. @@ -92,24 +92,24 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { sCandidate := e.table[nextHashS] lCandidate := e.bTable[nextHashL] next := load6432(src, nextS) - entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + entry := tableEntry{offset: s + e.cur} e.table[nextHashS] = entry e.bTable[nextHashL] = entry t = lCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == lCandidate.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) { // We got a long match. Use that. 
break } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { // Found a 4 match... lCandidate = e.bTable[hash7(next, tableBits)] // If the next long is a candidate, check if we should use that instead... lOff := nextS - (lCandidate.offset - e.cur) - if lOff < maxMatchOffset && lCandidate.val == uint32(next) { + if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) { l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) if l2 > l1 { s = nextS @@ -137,7 +137,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { if nextEmit < s { emitLiteral(dst, src[nextEmit:s]) } - if false { + if debugDeflate { if t >= s { panic("s-t") } @@ -160,8 +160,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // Index first pair after match end. if int(s+8) < len(src) { cv := load6432(src, s) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} - e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -171,20 +171,20 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { i := nextS if i < s-1 { cv := load6432(src, i) - t := tableEntry{offset: i + e.cur, val: uint32(cv)} - t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(t2.val, tableBits)] = t2 + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 i += 3 for ; i < s-1; i += 3 { cv := load6432(src, i) - t := tableEntry{offset: i + e.cur, val: uint32(cv)} - t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(t2.val, tableBits)] = t2 + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 } } } @@ -195,8 +195,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { o := e.cur + s - 1 prevHashS := hash4x64(x, tableBits) prevHashL := hash7(x, tableBits) - e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} - e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)} + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} cv = x >> 8 } diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index 4e3916825..d513f1ffd 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -13,7 +13,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } @@ -100,7 +100,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { sCandidate := e.table[nextHashS] lCandidate := e.bTable[nextHashL] next := load6432(src, nextS) - entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + entry := tableEntry{offset: s + e.cur} e.table[nextHashS] = entry eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = entry, eLong.Cur @@ -110,14 +110,14 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { t 
= lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { - if uint32(cv) == lCandidate.Cur.val { + if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { // Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur t2 := lCandidate.Prev.offset - e.cur - if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { l = e.matchlen(s+4, t+4, src) + 4 ml1 := e.matchlen(s+4, t2+4, src) + 4 if ml1 > l { @@ -129,30 +129,30 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { break } t = lCandidate.Prev.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { // Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur break } } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { // Found a 4 match... l = e.matchlen(s+4, t+4, src) + 4 lCandidate = e.bTable[nextHashL] // Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur // If the next long is a candidate, use that... t2 := lCandidate.Cur.offset - e.cur if nextS-t2 < maxMatchOffset { - if lCandidate.Cur.val == uint32(next) { + if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 @@ -163,7 +163,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { } // If the previous long is a candidate, use that... 
t2 = lCandidate.Prev.offset - e.cur - if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 @@ -197,7 +197,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { if nextEmit < s { emitLiteral(dst, src[nextEmit:s]) } - if false { + if debugDeflate { if t >= s { panic(fmt.Sprintln("s-t", s, t)) } @@ -226,31 +226,31 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { i := s - l + 1 if i < s-1 { cv := load6432(src, i) - t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t := tableEntry{offset: i + e.cur} e.table[hash4x64(cv, tableBits)] = t eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur // Do an long at i+1 cv >>= 8 - t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + t = tableEntry{offset: t.offset + 1} eLong = &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur // We only have enough bits for a short entry at i+2 cv >>= 8 - t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + t = tableEntry{offset: t.offset + 1} e.table[hash4x64(cv, tableBits)] = t // Skip one - otherwise we risk hitting 's' i += 4 for ; i < s-1; i += hashEvery { cv := load6432(src, i) - t := tableEntry{offset: i + e.cur, val: uint32(cv)} - t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur - e.table[hash4u(t2.val, tableBits)] = t2 + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 } } } @@ -261,9 +261,9 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { o := e.cur + s - 1 prevHashS := hash4x64(x, tableBits) prevHashL := hash7(x, tableBits) - e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} + e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur cv = x >> 8 } diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index 00a311977..a52c80ea4 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -13,7 +13,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debugDecode && e.cur < 0 { + if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) } @@ -101,7 +101,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { sCandidate := e.table[nextHashS] lCandidate := e.bTable[nextHashL] next := load6432(src, nextS) - entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + entry := tableEntry{offset: s + e.cur} e.table[nextHashS] = entry eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = entry, eLong.Cur @@ -112,17 +112,17 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { - if uint32(cv) == lCandidate.Cur.val { + if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { // Long candidate matches at least 4 bytes. 
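Sketch of the two-generation bucket the long-hash table uses in levels 5-6 (types as in the diff; insertLong is an illustrative helper): inserting shifts Cur into Prev, so both the newest and the previous candidate remain probeable.

package flate

type tableEntry struct {
	offset int32
}

// tableEntryPrev keeps the current and previous entry for one bucket.
type tableEntryPrev struct {
	Cur, Prev tableEntry
}

// insertLong records a new entry while retaining the old one as Prev.
func insertLong(bucket *tableEntryPrev, entry tableEntry) {
	bucket.Cur, bucket.Prev = entry, bucket.Cur
}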
// Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur // Check the previous long candidate as well. t2 := lCandidate.Prev.offset - e.cur - if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { l = e.matchlen(s+4, t+4, src) + 4 ml1 := e.matchlen(s+4, t2+4, src) + 4 if ml1 > l { @@ -135,17 +135,17 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { } // Current value did not match, but check if previous long value does. t = lCandidate.Prev.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { // Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur break } } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { // Found a 4 match... l = e.matchlen(s+4, t+4, src) + 4 @@ -153,9 +153,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { lCandidate = e.bTable[nextHashL] // Store the next match - e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] - eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur // Check repeat at s + repOff const repOff = 1 @@ -174,7 +174,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // If the next long is a candidate, use that... t2 = lCandidate.Cur.offset - e.cur if nextS-t2 < maxMatchOffset { - if lCandidate.Cur.val == uint32(next) { + if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 @@ -185,7 +185,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { } // If the previous long is a candidate, use that... t2 = lCandidate.Prev.offset - e.cur - if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 @@ -244,9 +244,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Index after match end. 
for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := load6432(src, i) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} eLong := &e.bTable[hash7(cv, tableBits)] - eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } goto emitRemainder } @@ -255,8 +255,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { if true { for i := nextS + 1; i < s-1; i += 2 { cv := load6432(src, i) - t := tableEntry{offset: i + e.cur, val: uint32(cv)} - t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong2 := &e.bTable[hash7(cv>>8, tableBits)] e.table[hash4x64(cv, tableBits)] = t diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index 099c0ddbc..f9abf606d 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -262,7 +262,7 @@ func (t *tokens) EstimatedBits() int { // AddMatch adds a match to the tokens. // This function is very sensitive to inlining and right on the border. func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { - if debugDecode { + if debugDeflate { if xlength >= maxMatchLength+baseMatchLength { panic(fmt.Errorf("invalid length: %v", xlength)) } @@ -281,7 +281,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { // AddMatchLong adds a match to the tokens, potentially longer than max match length. // Length should NOT have the base subtracted, only offset should. func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { - if debugDecode { + if debugDeflate { if xoffset >= maxMatchOffset+baseMatchOffset { panic(fmt.Errorf("invalid offset: %v", xoffset)) } diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go index 507757d52..4f0eba22f 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockenc.go +++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go @@ -806,7 +806,7 @@ func (b *blockEnc) genCodes() { mlH[v]++ if v > mlMax { mlMax = v - if debug && mlMax > maxMatchLengthSymbol { + if debugAsserts && mlMax > maxMatchLengthSymbol { panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen)) } } @@ -821,13 +821,13 @@ func (b *blockEnc) genCodes() { } return int(max) } - if mlMax > maxMatchLengthSymbol { + if debugAsserts && mlMax > maxMatchLengthSymbol { panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax)) } - if ofMax > maxOffsetBits { + if debugAsserts && ofMax > maxOffsetBits { panic(fmt.Errorf("ofMax > maxOffsetBits (%d)", ofMax)) } - if llMax > maxLiteralLengthSymbol { + if debugAsserts && llMax > maxLiteralLengthSymbol { panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax)) } diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 07321acb1..658ef7838 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -30,7 +30,7 @@ type byteBuffer interface { type byteBuf []byte func (b *byteBuf) readSmall(n int) []byte { - if debug && n > 8 { + if debugAsserts && n > 8 { panic(fmt.Errorf("small read > 8 (%d). 
use readBig", n)) } bb := *b @@ -82,7 +82,7 @@ type readerWrapper struct { } func (r *readerWrapper) readSmall(n int) []byte { - if debug && n > 8 { + if debugAsserts && n > 8 { panic(fmt.Errorf("small read > 8 (%d). use readBig", n)) } n2, err := io.ReadFull(r.r, r.tmp[:n]) diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index 35a3cda91..86553c2c3 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -66,7 +66,7 @@ var ( // A Decoder can be used in two modes: // // 1) As a stream, or -// 2) For stateless decoding using DecodeAll or DecodeBuffer. +// 2) For stateless decoding using DecodeAll. // // Only a single stream can be decoded concurrently, but the same decoder // can run multiple concurrent stateless decodes. It is even possible to @@ -315,7 +315,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { if size > 1<<20 { size = 1 << 20 } - dst = make([]byte, 0, frame.WindowSize) + dst = make([]byte, 0, size) } dst, err = frame.runDecoder(dst, block) diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go new file mode 100644 index 000000000..4375e08b4 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go @@ -0,0 +1,521 @@ +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. +// Based on work by Yann Collet, released under BSD License. + +package zstd + +import "fmt" + +const ( + betterLongTableBits = 19 // Bits used in the long match table + betterLongTableSize = 1 << betterLongTableBits // Size of the table + + // Note: Increasing the short table bits or making the hash shorter + // can actually lead to compression degradation since it will 'steal' more from the + // long match table and match offsets are quite big. + // This greatly depends on the type of input. + betterShortTableBits = 13 // Bits used in the short match table + betterShortTableSize = 1 << betterShortTableBits // Size of the table +) + +type prevEntry struct { + offset int32 + prev int32 +} + +// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches. +// The long match table contains the previous entry with the same hash, +// effectively making it a "chain" of length 2. +// When we find a long match we choose between the two values and select the longest. +// When we find a short match, after checking the long, we check if we can find a long at n+1 +// and that it is longer (lazy matching). +type betterFastEncoder struct { + fastBase + table [betterShortTableSize]tableEntry + longTable [betterLongTableSize]prevEntry +} + +// Encode improves compression... +func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { + const ( + // Input margin is the number of bytes we read (8) + // and the maximum we will read ahead (2) + inputMargin = 8 + 2 + minNonLiteralBlockSize = 16 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.longTable[:] { + e.longTable[i] = prevEntry{} + } + e.cur = e.maxMatchOff + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff + for i := range e.table[:] { + v := e.table[i].offset + if v < minOff { + v = 0 + } else { + v = v - e.cur + e.maxMatchOff + } + e.table[i].offset = v + } + for i := range e.longTable[:] { + v := e.longTable[i].offset + v2 := e.longTable[i].prev + if v < minOff { + v = 0 + v2 = 0 + } else { + v = v - e.cur + e.maxMatchOff + if v2 < minOff { + v2 = 0 + } else { + v2 = v2 - e.cur + e.maxMatchOff + } + } + e.longTable[i] = prevEntry{ + offset: v, + prev: v2, + } + } + e.cur = e.maxMatchOff + break + } + + s := e.addBlock(src) + blk.size = len(src) + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + // Override src + src = e.hist + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. + // It should be >= 1. + stepSize := int32(e.o.targetLength) + if stepSize == 0 { + stepSize++ + } + + const kSearchStrength = 9 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debug { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + var t int32 + // We allow the encoder to optionally turn off repeat offsets across blocks + canRepeat := len(blk.sequences) > 2 + var matched int32 + + for { + if debugAsserts && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHashS := hash5(cv, betterShortTableBits) + nextHashL := hash8(cv, betterLongTableBits) + candidateL := e.longTable[nextHashL] + candidateS := e.table[nextHashS] + + const repOff = 1 + repIndex := s - offset1 + repOff + off := s + e.cur + e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset} + e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)} + + if canRepeat { + if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { + // Consider history as well. + var seq seq + lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + + seq.matchLen = uint32(lenght - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Index match start+1 (long) -> s - 1 + index0 := s + repOff + s += lenght + repOff + + nextEmit = s + if s >= sLimit { + if debug { + println("repeat ended", s, lenght) + + } + break encodeLoop + } + // Index skipped... 
+ for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hash8(cv0, betterLongTableBits) + off := index0 + e.cur + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + } + cv = load6432(src, s) + continue + } + const repOff2 = 1 + + // We deviate from the reference encoder and also check offset 2. + // Still slower and not much better, so disabled. + // repIndex = s - offset2 + repOff2 + if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) { + // Consider history as well. + var seq seq + lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src) + + seq.matchLen = uint32(lenght - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff2 + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 2 + seq.offset = 2 + if debugSequences { + println("repeat sequence 2", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + index0 := s + repOff2 + s += lenght + repOff2 + nextEmit = s + if s >= sLimit { + if debug { + println("repeat ended", s, lenght) + + } + break encodeLoop + } + + // Index skipped... + for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hash8(cv0, betterLongTableBits) + off := index0 + e.cur + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + } + cv = load6432(src, s) + // Swap offsets + offset1, offset2 = offset2, offset1 + continue + } + } + // Find the offsets of our two matches. + coffsetL := candidateL.offset - e.cur + coffsetLP := candidateL.prev - e.cur + + // Check if we have a long match. + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matched = e.matchlen(s+8, coffsetL+8, src) + 8 + t = coffsetL + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + + if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) { + // Found a long match, at least 8 bytes. + prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8 + if prevMatch > matched { + matched = prevMatch + t = coffsetLP + } + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + } + break + } + + // Check if we have a long match on prev. + if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) { + // Found a long match, at least 8 bytes. + matched = e.matchlen(s+8, coffsetLP+8, src) + 8 + t = coffsetLP + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + break + } + + coffsetS := candidateS.offset - e.cur + + // Check if we have a short match. 
+ if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val { + // found a regular match + matched = e.matchlen(s+4, coffsetS+4, src) + 4 + + // See if we can find a long match at s+1 + const checkAt = 1 + cv := load6432(src, s+checkAt) + nextHashL = hash8(cv, betterLongTableBits) + candidateL = e.longTable[nextHashL] + coffsetL = candidateL.offset - e.cur + + // We can store it, since we have at least a 4 byte match. + e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset} + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8 + if matchedNext > matched { + t = coffsetL + s += checkAt + matched = matchedNext + if debugMatches { + println("long match (after short)") + } + break + } + } + + // Check prev long... + coffsetL = candidateL.prev - e.cur + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8 + if matchedNext > matched { + t = coffsetL + s += checkAt + matched = matchedNext + if debugMatches { + println("prev long match (after short)") + } + break + } + } + t = coffsetS + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugAsserts && t < 0 { + panic("t<0") + } + if debugMatches { + println("short match") + } + break + } + + // No match found, move forward in input. + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + + // A 4-byte match has been found. Update recent offsets. + // We'll later see if more than 4 bytes. + offset2 = offset1 + offset1 = s - t + + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + + if debugAsserts && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the n-byte match as long as possible. + l := matched + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength { + s-- + t-- + l++ + } + + // Write our sequence + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) + } + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + + // Index match start+1 (long) -> s - 1 + index0 := s - l + 1 + for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hash8(cv0, betterLongTableBits) + off := index0 + e.cur + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + } + + cv = load6432(src, s) + if !canRepeat { + continue + } + + // Check offset 2 + for { + o2 := s - offset2 + if load3232(src, o2) != uint32(cv) { + // Do regular search + break + } + + // Store this, since we have it. + nextHashS := hash5(cv, betterShortTableBits) + nextHashL := hash8(cv, betterLongTableBits) + + // We have at least 4 byte match. + // No need to check backwards. 
We come straight from a match + l := 4 + e.matchlen(s+4, o2+4, src) + + e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset} + e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)} + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + // Finished + break encodeLoop + } + cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + blk.recentOffsets[0] = uint32(offset1) + blk.recentOffsets[1] = uint32(offset2) + if debug { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } +} + +// EncodeNoHist will encode a block with no history and no following blocks. +// Most notable difference is that src will not be copied for history and +// we do not need to check for max match length. +func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { + e.Encode(blk, src) +} diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go index ee3b09b02..d640e6a9f 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go @@ -4,6 +4,8 @@ package zstd +import "fmt" + const ( dFastLongTableBits = 17 // Bits used in the long match table dFastLongTableSize = 1 << dFastLongTableBits // Size of the table @@ -29,7 +31,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur > (1<<30)+e.maxMatchOff { + for e.cur >= bufferReset { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -61,6 +63,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) { e.longTable[i].offset = v } e.cur = e.maxMatchOff + break } s := e.addBlock(src) @@ -110,7 +113,7 @@ encodeLoop: canRepeat := len(blk.sequences) > 2 for { - if debug && canRepeat && offset1 == 0 { + if debugAsserts && canRepeat && offset1 == 0 { panic("offset0 was 0") } @@ -169,55 +172,6 @@ encodeLoop: cv = load6432(src, s) continue } - const repOff2 = 1 - // We deviate from the reference encoder and also check offset 2. - // Slower and not consistently better, so disabled. - // repIndex = s - offset2 + repOff2 - if false && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff2*8)) { - // Consider history as well. - var seq seq - lenght := 4 + e.matchlen(s+4+repOff2, repIndex+4, src) - - seq.matchLen = uint32(lenght - zstdMinMatch) - - // We might be able to match backwards. - // Extend as long as we can. - start := s + repOff2 - // We end the search early, so we don't risk 0 literals - // and have to do special offset treatment. 
- startLimit := nextEmit + 1 - - tMin := s - e.maxMatchOff - if tMin < 0 { - tMin = 0 - } - for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { - repIndex-- - start-- - seq.matchLen++ - } - addLiterals(&seq, start) - - // rep 2 - seq.offset = 2 - if debugSequences { - println("repeat sequence 2", seq, "next s:", s) - } - blk.sequences = append(blk.sequences, seq) - s += lenght + repOff2 - nextEmit = s - if s >= sLimit { - if debug { - println("repeat ended", s, lenght) - - } - break encodeLoop - } - cv = load6432(src, s) - // Swap offsets - offset1, offset2 = offset2, offset1 - continue - } } // Find the offsets of our two matches. coffsetL := s - (candidateL.offset - e.cur) @@ -229,10 +183,10 @@ encodeLoop: // Reference encoder checks all 8 bytes, we only check 4, // but the likelihood of both the first 4 bytes and the hash matching should be enough. t = candidateL.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } if debugMatches { @@ -266,13 +220,13 @@ encodeLoop: } t = candidateS.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } - if debug && t < 0 { + if debugAsserts && t < 0 { panic("t<0") } if debugMatches { @@ -294,11 +248,11 @@ encodeLoop: offset2 = offset1 offset1 = s - t - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && canRepeat && int(offset1) > len(src) { + if debugAsserts && canRepeat && int(offset1) > len(src) { panic("invalid offset") } @@ -369,7 +323,7 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hash5(cv1>>8, dFastShortTableBits) + nextHashS := hash5(cv, dFastShortTableBits) nextHashL := hash8(cv, dFastLongTableBits) // We have at least 4 byte match. @@ -424,7 +378,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - if e.cur > (1<<30)+e.maxMatchOff { + if e.cur >= bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -545,10 +499,10 @@ encodeLoop: // Reference encoder checks all 8 bytes, we only check 4, // but the likelihood of both the first 4 bytes and the hash matching should be enough. t = candidateL.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } if debugMatches { @@ -582,13 +536,13 @@ encodeLoop: } t = candidateS.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } - if debug && t < 0 { + if debugAsserts && t < 0 { panic("t<0") } if debugMatches { @@ -610,8 +564,8 @@ encodeLoop: offset2 = offset1 offset1 = s - t - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } // Extend the 4-byte match as long as possible. 
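The zstd encoders in this change all guard their main loop with `for e.cur >= bufferReset`, shifting every stored offset down so e.cur can restart at e.maxMatchOff before int32 arithmetic overflows (and the added `break` ensures the loop body runs once). A rough sketch of the shift, assuming entries that fall below the new window are simply dropped:

```go
package main

import (
	"fmt"
	"math"
)

const (
	maxWindowSize = 1 << 29
	// bufferReset mirrors the constant introduced in zstd.go: reset
	// table offsets before int32 arithmetic can overflow.
	bufferReset = math.MaxInt32 - maxWindowSize
)

type tableEntry struct{ offset int32 }

// shiftTable rebases all stored offsets so that cur can restart at
// maxMatchOff. Entries outside the window are zeroed (unreachable).
func shiftTable(table []tableEntry, cur, histLen, maxMatchOff int32) int32 {
	minOff := cur + histLen - maxMatchOff
	for i := range table {
		v := table[i].offset
		if v < minOff {
			v = 0 // too far back: no longer reachable
		} else {
			v = v - cur + maxMatchOff
		}
		table[i].offset = v
	}
	return maxMatchOff // the new value for cur
}

func main() {
	table := []tableEntry{{offset: 100}, {offset: 2_000_000_000}}
	cur := int32(bufferReset)
	cur = shiftTable(table, cur, 0, maxWindowSize)
	fmt.Println(table, "cur:", cur)
}
```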
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index 0bdddac5b..1387b8082 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -5,6 +5,8 @@ package zstd import ( + "fmt" + "math" "math/bits" "github.com/klauspost/compress/zstd/internal/xxhash" @@ -22,7 +24,7 @@ type tableEntry struct { offset int32 } -type fastEncoder struct { +type fastBase struct { o encParams // cur is the offset at the start of hist cur int32 @@ -30,18 +32,22 @@ type fastEncoder struct { maxMatchOff int32 hist []byte crc *xxhash.Digest - table [tableSize]tableEntry tmp [8]byte blk *blockEnc } +type fastEncoder struct { + fastBase + table [tableSize]tableEntry +} + // CRC returns the underlying CRC writer. -func (e *fastEncoder) CRC() *xxhash.Digest { +func (e *fastBase) CRC() *xxhash.Digest { return e.crc } // AppendCRC will append the CRC to the destination slice and return it. -func (e *fastEncoder) AppendCRC(dst []byte) []byte { +func (e *fastBase) AppendCRC(dst []byte) []byte { crc := e.crc.Sum(e.tmp[:0]) dst = append(dst, crc[7], crc[6], crc[5], crc[4]) return dst @@ -49,7 +55,7 @@ func (e *fastEncoder) AppendCRC(dst []byte) []byte { // WindowSize returns the window size of the encoder, // or a window size small enough to contain the input size, if > 0. -func (e *fastEncoder) WindowSize(size int) int32 { +func (e *fastBase) WindowSize(size int) int32 { if size > 0 && size < int(e.maxMatchOff) { b := int32(1) << uint(bits.Len(uint(size))) // Keep minimum window. @@ -62,7 +68,7 @@ func (e *fastEncoder) WindowSize(size int) int32 { } // Block returns the current block. -func (e *fastEncoder) Block() *blockEnc { +func (e *fastBase) Block() *blockEnc { return e.blk } @@ -74,7 +80,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur > (1<<30)+e.maxMatchOff { + for e.cur >= bufferReset { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -94,6 +100,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { e.table[i].offset = v } e.cur = e.maxMatchOff + break } s := e.addBlock(src) @@ -151,7 +158,7 @@ encodeLoop: canRepeat := len(blk.sequences) > 2 for { - if debug && canRepeat && offset1 == 0 { + if debugAsserts && canRepeat && offset1 == 0 { panic("offset0 was 0") } @@ -167,9 +174,22 @@ encodeLoop: if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - lenght := 4 + e.matchlen(s+6, repIndex+4, src) + var length int32 + // length = 4 + e.matchlen(s+6, repIndex+4, src) + { + a := src[s+6:] + b := src[repIndex+4:] + endI := len(a) & (math.MaxInt32 - 7) + length = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } - seq.matchLen = uint32(lenght - zstdMinMatch) + seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. // Extend as long as we can. 
@@ -195,11 +215,11 @@ encodeLoop: println("repeat sequence", seq, "next s:", s) } blk.sequences = append(blk.sequences, seq) - s += lenght + 2 + s += length + 2 nextEmit = s if s >= sLimit { if debug { - println("repeat ended", s, lenght) + println("repeat ended", s, length) } break encodeLoop @@ -212,10 +232,10 @@ encodeLoop: if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val { // found a regular match t = candidate.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } break @@ -225,13 +245,13 @@ encodeLoop: // found a regular match t = candidate2.offset - e.cur s++ - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } - if debug && t < 0 { + if debugAsserts && t < 0 { panic("t<0") } break @@ -246,16 +266,29 @@ encodeLoop: offset2 = offset1 offset1 = s - t - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && canRepeat && int(offset1) > len(src) { + if debugAsserts && canRepeat && int(offset1) > len(src) { panic("invalid offset") } // Extend the 4-byte match as long as possible. - l := e.matchlen(s+4, t+4, src) + 4 + //l := e.matchlen(s+4, t+4, src) + 4 + var l int32 + { + a := src[s+4:] + b := src[t+4:] + endI := len(a) & (math.MaxInt32 - 7) + l = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } // Extend backwards tMin := s - e.maxMatchOff @@ -292,7 +325,20 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - l := 4 + e.matchlen(s+4, o2+4, src) + //l := 4 + e.matchlen(s+4, o2+4, src) + var l int32 + { + a := src[s+4:] + b := src[o2+4:] + endI := len(a) & (math.MaxInt32 - 7) + l = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } // Store this, since we have it. nextHash := hash6(cv, hashLog) @@ -343,7 +389,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { } } // Protect against e.cur wraparound. - if e.cur > (1<<30)+e.maxMatchOff { + if e.cur >= bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -410,10 +456,23 @@ encodeLoop: if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - // lenght := 4 + e.matchlen(s+6, repIndex+4, src) - lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) + // length := 4 + e.matchlen(s+6, repIndex+4, src) + // length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) + var length int32 + { + a := src[s+6:] + b := src[repIndex+4:] + endI := len(a) & (math.MaxInt32 - 7) + length = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } - seq.matchLen = uint32(lenght - zstdMinMatch) + seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. // Extend as long as we can. 
@@ -439,11 +498,11 @@ encodeLoop: println("repeat sequence", seq, "next s:", s) } blk.sequences = append(blk.sequences, seq) - s += lenght + 2 + s += length + 2 nextEmit = s if s >= sLimit { if debug { - println("repeat ended", s, lenght) + println("repeat ended", s, length) } break encodeLoop @@ -456,10 +515,10 @@ encodeLoop: if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val { // found a regular match t = candidate.offset - e.cur - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } break @@ -469,13 +528,13 @@ encodeLoop: // found a regular match t = candidate2.offset - e.cur s++ - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debug && s-t > e.maxMatchOff { + if debugAsserts && s-t > e.maxMatchOff { panic("s - t >e.maxMatchOff") } - if debug && t < 0 { + if debugAsserts && t < 0 { panic("t<0") } break @@ -490,13 +549,26 @@ encodeLoop: offset2 = offset1 offset1 = s - t - if debug && s <= t { - panic("s <= t") + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } // Extend the 4-byte match as long as possible. //l := e.matchlenNoHist(s+4, t+4, src) + 4 - l := int32(matchLen(src[s+4:], src[t+4:])) + 4 + // l := int32(matchLen(src[s+4:], src[t+4:])) + 4 + var l int32 + { + a := src[s+4:] + b := src[t+4:] + endI := len(a) & (math.MaxInt32 - 7) + l = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } // Extend backwards tMin := s - e.maxMatchOff @@ -534,7 +606,20 @@ encodeLoop: // We have at least 4 byte match. // No need to check backwards. We come straight from a match //l := 4 + e.matchlenNoHist(s+4, o2+4, src) - l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) + // l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) + var l int32 + { + a := src[s+4:] + b := src[o2+4:] + endI := len(a) & (math.MaxInt32 - 7) + l = int32(endI) + 4 + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 + break + } + } + } // Store this, since we have it. nextHash := hash6(cv, hashLog) @@ -569,7 +654,10 @@ encodeLoop: } } -func (e *fastEncoder) addBlock(src []byte) int32 { +func (e *fastBase) addBlock(src []byte) int32 { + if debugAsserts && e.cur > bufferReset { + panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset)) + } // check if we have space already if len(e.hist)+len(src) > cap(e.hist) { if cap(e.hist) == 0 { @@ -597,39 +685,41 @@ func (e *fastEncoder) addBlock(src []byte) int32 { // useBlock will replace the block with the provided one, // but transfer recent offsets from the previous. -func (e *fastEncoder) UseBlock(enc *blockEnc) { +func (e *fastBase) UseBlock(enc *blockEnc) { enc.reset(e.blk) e.blk = enc } -func (e *fastEncoder) matchlenNoHist(s, t int32, src []byte) int32 { +func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 { // Extend the match to be as long as possible. 
 	return int32(matchLen(src[s:], src[t:]))
 }
 
-func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
-	if debug {
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
+	if debugAsserts {
 		if s < 0 {
-			panic("s<0")
+			err := fmt.Sprintf("s (%d) < 0", s)
+			panic(err)
 		}
 		if t < 0 {
-			panic("t<0")
+			err := fmt.Sprintf("t (%d) < 0", t)
+			panic(err)
 		}
 		if s-t > e.maxMatchOff {
-			panic(s - t)
+			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
+			panic(err)
+		}
+		if len(src)-int(s) > maxCompressedBlockSize {
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
 		}
-	}
-	s1 := int(s) + maxMatchLength - 4
-	if s1 > len(src) {
-		s1 = len(src)
 	}
 
 	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:s1], src[t:]))
+	return int32(matchLen(src[s:], src[t:]))
 }
 
 // Reset the encoding table.
-func (e *fastEncoder) Reset() {
+func (e *fastBase) Reset() {
 	if e.blk == nil {
 		e.blk = &blockEnc{}
 		e.blk.init()
@@ -650,7 +740,10 @@
 		}
 		e.hist = make([]byte, 0, l)
 	}
-	// We offset current position so everything will be out of reach
-	e.cur += e.maxMatchOff + int32(len(e.hist))
+	// We offset current position so everything will be out of reach.
+	// If above reset line, history will be purged.
+	if e.cur < bufferReset {
+		e.cur += e.maxMatchOff + int32(len(e.hist))
+	}
 	e.hist = e.hist[:0]
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index 366dd66bd..67d45efb9 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -71,15 +71,14 @@ func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) {
 	}
 	if w != nil {
 		e.Reset(w)
-	} else {
-		e.init.Do(func() {
-			e.initialize()
-		})
 	}
 	return &e, nil
 }
 
 func (e *Encoder) initialize() {
+	if e.o.concurrent == 0 {
+		e.o.setDefault()
+	}
 	e.encoders = make(chan encoder, e.o.concurrent)
 	for i := 0; i < e.o.concurrent; i++ {
 		e.encoders <- e.o.encoder()
 	}
@@ -89,9 +88,6 @@
 // Reset will re-initialize the writer and new writes will encode to the supplied writer
 // as a new, independent stream.
 func (e *Encoder) Reset(w io.Writer) {
-	e.init.Do(func() {
-		e.initialize()
-	})
 	s := &e.state
 	s.wg.Wait()
 	s.wWg.Wait()
@@ -156,7 +152,7 @@ func (e *Encoder) Write(p []byte) (n int, err error) {
 		if err != nil {
 			return n, err
 		}
-		if debug && len(s.filling) > 0 {
+		if debugAsserts && len(s.filling) > 0 {
 			panic(len(s.filling))
 		}
 	}
@@ -422,10 +418,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		}
 		return dst
 	}
-	e.init.Do(func() {
-		e.o.setDefault()
-		e.initialize()
-	})
+	e.init.Do(e.initialize)
 	enc := <-e.encoders
 	defer func() {
 		// Release encoder reference to last block.
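The encoder.go changes above fold default-option setup into initialize and pass the method value straight to sync.Once, so every entry point can share one e.init.Do(e.initialize) call. A minimal sketch of that lazy-initialization shape, with the worker-channel details simplified:

```go
package main

import (
	"fmt"
	"runtime"
	"sync"
)

type options struct{ concurrent int }

func (o *options) setDefault() { o.concurrent = runtime.GOMAXPROCS(0) }

type Encoder struct {
	o       options
	init    sync.Once
	workers chan int
}

// initialize is safe to pass as a method value to sync.Once.Do,
// mirroring the e.init.Do(e.initialize) simplification in the diff.
func (e *Encoder) initialize() {
	if e.o.concurrent == 0 {
		// Options were never set explicitly; fall back to defaults.
		e.o.setDefault()
	}
	e.workers = make(chan int, e.o.concurrent)
	for i := 0; i < e.o.concurrent; i++ {
		e.workers <- i
	}
}

func (e *Encoder) EncodeAll() {
	e.init.Do(e.initialize) // runs at most once, even concurrently
	w := <-e.workers
	defer func() { e.workers <- w }()
	fmt.Println("encoding on worker", w)
}

func main() {
	var e Encoder
	e.EncodeAll()
	e.EncodeAll()
}
```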
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index 40eb45733..0ff970dac 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -39,9 +39,11 @@ func (o *encoderOptions) setDefault() { func (o encoderOptions) encoder() encoder { switch o.level { case SpeedDefault: - return &doubleFastEncoder{fastEncoder: fastEncoder{maxMatchOff: int32(o.windowSize)}} + return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}} + case SpeedBetterCompression: + return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}} case SpeedFastest: - return &fastEncoder{maxMatchOff: int32(o.windowSize)} + return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}} } panic("unknown compression level") } @@ -67,7 +69,7 @@ func WithEncoderConcurrency(n int) EOption { } // WithWindowSize will set the maximum allowed back-reference distance. -// The value must be a power of two between WindowSizeMin and WindowSizeMax. +// The value must be a power of two between MinWindowSize and MaxWindowSize. // A larger value will enable better compression but allocate more memory and, // for above-default values, take considerably longer. // The default value is determined by the compression level. @@ -130,18 +132,18 @@ const ( // This is roughly equivalent to the default Zstandard mode (level 3). SpeedDefault + // SpeedBetterCompression will yield better compression than the default. + // Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage. + // By using this, notice that CPU usage may go up in the future. + SpeedBetterCompression + // speedLast should be kept as the last actual compression option. // The is not for external usage, but is used to keep track of the valid options. speedLast - // SpeedBetterCompression will (in the future) yield better compression than the default, - // but at approximately 4x the CPU usage of the default. - // For now this is not implemented. - SpeedBetterCompression = SpeedDefault - // SpeedBestCompression will choose the best available compression option. // For now this is not implemented. - SpeedBestCompression = SpeedDefault + SpeedBestCompression = SpeedBetterCompression ) // EncoderLevelFromString will convert a string representation of an encoding level back @@ -163,8 +165,10 @@ func EncoderLevelFromZstd(level int) EncoderLevel { switch { case level < 3: return SpeedFastest - case level >= 3: + case level >= 3 && level < 6: return SpeedDefault + case level > 5: + return SpeedBetterCompression } return SpeedDefault } @@ -176,6 +180,8 @@ func (e EncoderLevel) String() string { return "fastest" case SpeedDefault: return "default" + case SpeedBetterCompression: + return "better" default: return "invalid" } diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index 40790747a..cda590b5f 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -50,7 +50,7 @@ type frameDec struct { const ( // The minimum Window_Size is 1 KB. 
MinWindowSize = 1 << 10 - MaxWindowSize = 1 << 30 + MaxWindowSize = 1 << 29 ) var ( diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go index 9efe34feb..e002be98b 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go @@ -118,7 +118,7 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error { if int32(bitStream)&(threshold-1) < max { count = int32(bitStream) & (threshold - 1) - if debug && nbBits < 1 { + if debugAsserts && nbBits < 1 { panic("nbBits underflow") } bitCount += nbBits - 1 diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go index 619836f52..aa9eba88b 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go @@ -327,7 +327,7 @@ func (s *fseEncoder) normalizeCount(length int) error { if err != nil { return err } - if debug { + if debugAsserts { err = s.validateNorm() if err != nil { return err @@ -336,7 +336,7 @@ func (s *fseEncoder) normalizeCount(length int) error { return s.buildCTable() } s.norm[largest] += stillToDistribute - if debug { + if debugAsserts { err := s.validateNorm() if err != nil { return err @@ -619,7 +619,7 @@ func (s *fseEncoder) writeCount(out []byte) ([]byte, error) { func (s *fseEncoder) bitCost(symbolValue uint8, accuracyLog uint32) uint32 { minNbBits := s.ct.symbolTT[symbolValue].deltaNbBits >> 16 threshold := (minNbBits + 1) << 16 - if debug { + if debugAsserts { if !(s.actualTableLog < 16) { panic("!s.actualTableLog < 16") } @@ -633,7 +633,7 @@ func (s *fseEncoder) bitCost(symbolValue uint8, accuracyLog uint32) uint32 { // linear interpolation (very approximate) normalizedDeltaFromThreshold := (deltaFromThreshold << accuracyLog) >> s.actualTableLog bitMultiplier := uint32(1) << accuracyLog - if debug { + if debugAsserts { if s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold { panic("s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold") } diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s index d580e32ae..2c9c5357a 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s @@ -179,13 +179,13 @@ TEXT ·writeBlocks(SB), NOSPLIT, $0-40 MOVQ ·prime2v(SB), R14 // Load slice. - MOVQ b_base+8(FP), CX - MOVQ b_len+16(FP), DX + MOVQ arg1_base+8(FP), CX + MOVQ arg1_len+16(FP), DX LEAQ (CX)(DX*1), BX SUBQ $32, BX // Load vN from d. - MOVQ d+0(FP), AX + MOVQ arg+0(FP), AX MOVQ 0(AX), R8 // v1 MOVQ 8(AX), R9 // v2 MOVQ 16(AX), R10 // v3 @@ -209,7 +209,7 @@ blockLoop: MOVQ R11, 24(AX) // The number of bytes written is CX minus the old base pointer. - SUBQ b_base+8(FP), CX + SUBQ arg1_base+8(FP), CX MOVQ CX, ret+32(FP) RET diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index 57a8a2f5b..0807719c8 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -6,11 +6,20 @@ package zstd import ( "errors" "log" + "math" "math/bits" ) +// enable debug printing const debug = false + +// Enable extra assertions. 
+const debugAsserts = debug || false + +// print sequence details const debugSequences = false + +// print detailed matching information const debugMatches = false // force encoder to use predefined tables. @@ -19,6 +28,9 @@ const forcePreDef = false // zstdMinMatch is the minimum zstd match length. const zstdMinMatch = 3 +// Reset the buffer offset when reaching this. +const bufferReset = math.MaxInt32 - MaxWindowSize + var ( // ErrReservedBlockType is returned when a reserved block type is found. // Typically this indicates wrong or corrupted input. @@ -75,6 +87,17 @@ func printf(format string, a ...interface{}) { } } +// matchLenFast does matching, but will not match the last up to 7 bytes. +func matchLenFast(a, b []byte) int { + endI := len(a) & (math.MaxInt32 - 7) + for i := 0; i < endI; i += 8 { + if diff := load64(a, i) ^ load64(b, i); diff != 0 { + return i + bits.TrailingZeros64(diff)>>3 + } + } + return endI +} + // matchLen returns the maximum length. // a must be the shortest of the two. // The function also returns whether all bytes matched. @@ -85,33 +108,18 @@ func matchLen(a, b []byte) int { return i + (bits.TrailingZeros64(diff) >> 3) } } + checked := (len(a) >> 3) << 3 a = a[checked:] b = b[checked:] - // TODO: We could do a 4 check. for i := range a { if a[i] != b[i] { - return int(i) + checked + return i + checked } } return len(a) + checked } -// matchLen returns a match length in src between index s and t -func matchLenIn(src []byte, s, t int32) int32 { - s1 := len(src) - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) -} - func load3232(b []byte, i int32) uint32 { // Help the compiler eliminate bounds checks on the read so it can be done in a single read. b = b[i:] diff --git a/vendor/github.com/klauspost/pgzip/README.md b/vendor/github.com/klauspost/pgzip/README.md index 81000996c..171b978fd 100644 --- a/vendor/github.com/klauspost/pgzip/README.md +++ b/vendor/github.com/klauspost/pgzip/README.md @@ -39,7 +39,6 @@ You might need to get/update the dependencies: ``` go get -u github.com/klauspost/compress -go get -u github.com/klauspost/crc32 ``` Usage @@ -65,7 +64,7 @@ Changes in [github.com/klauspost/compress](https://github.com/klauspost/compress ## Compression The simplest way to use this is to simply do the same as you would when using [compress/gzip](http://golang.org/pkg/compress/gzip). -To change the block size, use the added (*pgzip.Writer).SetConcurrency(blockSize, blocks int) function. With this you can control the approximate size of your blocks, as well as how many you want to be processing in parallel. Default values for this is SetConcurrency(250000, 16), meaning blocks are split at 250000 bytes and up to 16 blocks can be processing at once before the writer blocks. +To change the block size, use the added (*pgzip.Writer).SetConcurrency(blockSize, blocks int) function. With this you can control the approximate size of your blocks, as well as how many you want to be processing in parallel. Default values for this is SetConcurrency(1MB, runtime.GOMAXPROCS(0)), meaning blocks are split at 1 MB and up to the number of CPU threads blocks can be processing at once before the writer blocks. Example: @@ -99,19 +98,19 @@ See my blog post in [Benchmarks of Golang Gzip](https://blog.klauspost.com/go-gz Compression cost is usually about 0.2% with default settings with a block size of 250k. 
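For reference, a minimal pgzip usage sketch reflecting the new defaults described above (1 MB blocks, one block in flight per CPU thread); the explicit SetConcurrency call is optional and is shown only to make the defaults visible:

```go
package main

import (
	"log"
	"os"
	"runtime"

	"github.com/klauspost/pgzip"
)

func main() {
	f, err := os.Create("out.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	w := pgzip.NewWriter(f)
	// Matches the new defaults: 1 MB blocks, one block in flight
	// per CPU thread. Tune blockSize/blocks for your workload.
	if err := w.SetConcurrency(1<<20, runtime.GOMAXPROCS(0)); err != nil {
		log.Fatal(err)
	}

	if _, err := w.Write([]byte("hello, parallel gzip")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
}
```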
-Example with GOMAXPROC set to 8 (quad core with 8 hyperthreads)
+Example with GOMAXPROC set to 32 (16 core CPU)
 
 Content is [Matt Mahoneys 10GB corpus](http://mattmahoney.net/dc/10gb.html). Compression level 6.
 
 Compressor | MB/sec | speedup | size | size overhead (lower=better)
 ------------|----------|---------|------|---------
-[gzip](http://golang.org/pkg/compress/gzip) (golang) | 7.21MB/s | 1.0x | 4786608902 | 0%
-[gzip](http://github.com/klauspost/compress/gzip) (klauspost) | 10.98MB/s | 1.52x | 4781331645 | -0.11%
-[pgzip](https://github.com/klauspost/pgzip) (klauspost) | 50.76MB/s|7.04x | 4784121440 | -0.052%
-[bgzf](https://godoc.org/github.com/biogo/hts/bgzf) (biogo) | 38.65MB/s | 5.36x | 4924899484 | 2.889%
-[pargzip](https://godoc.org/github.com/golang/build/pargzip) (builder) | 32.00MB/s | 4.44x | 4791226567 | 0.096%
+[gzip](http://golang.org/pkg/compress/gzip) (golang) | 15.44MB/s (1 thread) | 1.0x | 4781329307 | 0%
+[gzip](http://github.com/klauspost/compress/gzip) (klauspost) | 135.04MB/s (1 thread) | 8.74x | 4894858258 | +2.37%
+[pgzip](https://github.com/klauspost/pgzip) (klauspost) | 1573.23MB/s| 101.9x | 4902285651 | +2.53%
+[bgzf](https://godoc.org/github.com/biogo/hts/bgzf) (biogo) | 361.40MB/s | 23.4x | 4869686090 | +1.85%
+[pargzip](https://godoc.org/github.com/golang/build/pargzip) (builder) | 306.01MB/s | 19.8x | 4786890417 | +0.12%
 
-pgzip also contains a [linear time compression](https://github.com/klauspost/compress#linear-time-compression) mode, that will allow compression at ~150MB per core per second, independent of the content.
+pgzip also contains a [linear time compression](https://github.com/klauspost/compress#linear-time-compression-huffman-only) mode, that will allow compression at ~250MB per core per second, independent of the content.
 
 See the [complete sheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) for different content types and compression settings.
diff --git a/vendor/github.com/klauspost/pgzip/gzip.go b/vendor/github.com/klauspost/pgzip/gzip.go
index 85d14e9cb..cb3dc0896 100644
--- a/vendor/github.com/klauspost/pgzip/gzip.go
+++ b/vendor/github.com/klauspost/pgzip/gzip.go
@@ -11,6 +11,7 @@ import (
 	"hash"
 	"hash/crc32"
 	"io"
+	"runtime"
 	"sync"
 	"time"
@@ -18,9 +19,9 @@ )
 
 const (
-	defaultBlockSize = 256 << 10
+	defaultBlockSize = 1 << 20
 	tailSize = 16384
-	defaultBlocks = 16
+	defaultBlocks = 4
 )
 
 // These constants are copied from the flate package, so that code that imports
@@ -68,8 +69,8 @@ type result struct {
 // With this you can control the approximate size of your blocks,
 // as well as how many you want to be processing in parallel.
 //
-// Default values for this is SetConcurrency(250000, 16),
-// meaning blocks are split at 250000 bytes and up to 16 blocks
+// Default values for this is SetConcurrency(defaultBlockSize, runtime.GOMAXPROCS(0)),
+// meaning blocks are split at 1 MB and up to the number of CPU threads
 // can be processing at once before the writer blocks.
func (z *Writer) SetConcurrency(blockSize, blocks int) error { if blockSize <= tailSize { @@ -84,7 +85,7 @@ func (z *Writer) SetConcurrency(blockSize, blocks int) error { z.blockSize = blockSize z.results = make(chan result, blocks) z.blocks = blocks - z.dstPool = sync.Pool{New: func() interface{} { return make([]byte, 0, blockSize+(blockSize)>>4) }} + z.dstPool.New = func() interface{} { return make([]byte, 0, blockSize+(blockSize)>>4) } return nil } @@ -115,7 +116,7 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) { return nil, fmt.Errorf("gzip: invalid compression level: %d", level) } z := new(Writer) - z.SetConcurrency(defaultBlockSize, defaultBlocks) + z.SetConcurrency(defaultBlockSize, runtime.GOMAXPROCS(0)) z.init(w, level) return z, nil } @@ -174,7 +175,7 @@ func (z *Writer) Reset(w io.Writer) { if z.results != nil && !z.closed { close(z.results) } - z.SetConcurrency(defaultBlockSize, defaultBlocks) + z.SetConcurrency(defaultBlockSize, runtime.GOMAXPROCS(0)) z.init(w, z.level) } @@ -239,36 +240,36 @@ func (z *Writer) writeString(s string) (err error) { // compressCurrent will compress the data currently buffered // This should only be called from the main writer/flush/closer func (z *Writer) compressCurrent(flush bool) { + c := z.currentBuffer + if len(c) > z.blockSize { + // This can never happen through the public interface. + panic("len(z.currentBuffer) > z.blockSize (most likely due to concurrent Write race)") + } + r := result{} r.result = make(chan []byte, 1) r.notifyWritten = make(chan struct{}, 0) + // Reserve a result slot select { case z.results <- r: case <-z.pushedErr: return } - // If block given is more than twice the block size, split it. - c := z.currentBuffer - if len(c) > z.blockSize*2 { - c = c[:z.blockSize] - z.wg.Add(1) - go z.compressBlock(c, z.prevTail, r, false) - z.prevTail = c[len(c)-tailSize:] - z.currentBuffer = z.currentBuffer[z.blockSize:] - z.compressCurrent(flush) - // Last one flushes if needed - return - } - z.wg.Add(1) - go z.compressBlock(c, z.prevTail, r, z.closed) + tail := z.prevTail if len(c) > tailSize { - z.prevTail = c[len(c)-tailSize:] + buf := z.dstPool.Get().([]byte) // Put in .compressBlock + // Copy tail from current buffer before handing the buffer over to the + // compressBlock goroutine. + buf = append(buf[:0], c[len(c)-tailSize:]...) + z.prevTail = buf } else { z.prevTail = nil } - z.currentBuffer = z.dstPool.Get().([]byte) + go z.compressBlock(c, tail, r, z.closed) + + z.currentBuffer = z.dstPool.Get().([]byte) // Put in .compressBlock z.currentBuffer = z.currentBuffer[:0] // Wait if flushing @@ -358,29 +359,37 @@ func (z *Writer) Write(p []byte) (int, error) { // Start receiving data from compressors go func() { listen := z.results + var failed bool for { r, ok := <-listen // If closed, we are finished. 
 			if !ok {
 				return
 			}
+			if failed {
+				close(r.notifyWritten)
+				continue
+			}
 			buf := <-r.result
 			n, err := z.w.Write(buf)
 			if err != nil {
 				z.pushError(err)
 				close(r.notifyWritten)
-				return
+				failed = true
+				continue
 			}
 			if n != len(buf) {
 				z.pushError(fmt.Errorf("gzip: short write %d should be %d", n, len(buf)))
+				failed = true
 				close(r.notifyWritten)
-				return
+				continue
 			}
 			z.dstPool.Put(buf)
 			close(r.notifyWritten)
 		}
 	}()
-	z.currentBuffer = make([]byte, 0, z.blockSize)
+	z.currentBuffer = z.dstPool.Get().([]byte)
+	z.currentBuffer = z.currentBuffer[:0]
 	}
 	q := p
 	for len(q) > 0 {
@@ -390,7 +399,10 @@
 	}
 	z.digest.Write(q[:length])
 	z.currentBuffer = append(z.currentBuffer, q[:length]...)
-	if len(z.currentBuffer) >= z.blockSize {
+	if len(z.currentBuffer) > z.blockSize {
+		panic("z.currentBuffer too large (most likely due to concurrent Write race)")
+	}
+	if len(z.currentBuffer) == z.blockSize {
 		z.compressCurrent(false)
 		if err := z.checkError(); err != nil {
 			return len(p) - len(q) - length, err
@@ -410,12 +422,13 @@ func (z *Writer) compressBlock(p, prevTail []byte, r result, closed bool) {
 		close(r.result)
 		z.wg.Done()
 	}()
-	buf := z.dstPool.Get().([]byte)
+	buf := z.dstPool.Get().([]byte) // Corresponding Put in .Write's result writer
 	dest := bytes.NewBuffer(buf[:0])
 
-	compressor := z.dictFlatePool.Get().(*flate.Writer)
+	compressor := z.dictFlatePool.Get().(*flate.Writer) // Put below
 	compressor.ResetDict(dest, prevTail)
 	compressor.Write(p)
+	z.dstPool.Put(p) // Corresponding Get in .Write and .compressCurrent
 
 	err := compressor.Flush()
 	if err != nil {
@@ -429,7 +442,12 @@
 		return
 	}
 	}
-	z.dictFlatePool.Put(compressor)
+	z.dictFlatePool.Put(compressor) // Get above
+
+	if prevTail != nil {
+		z.dstPool.Put(prevTail) // Get in .compressCurrent
+	}
+
 	// Read back buffer
 	buf = dest.Bytes()
 	r.result <- buf
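The dstPool bookkeeping above pairs every Get with a Put on the far side of a channel hand-off, which is why the diff annotates each call site with its counterpart. A stripped-down sketch of that ownership-transfer pattern with sync.Pool (names are illustrative):

```go
package main

import (
	"fmt"
	"sync"
)

var bufPool = sync.Pool{
	New: func() interface{} { return make([]byte, 0, 1<<20) },
}

// process takes ownership of buf: whoever receives from out is
// responsible for returning the buffer to the pool, mirroring the
// "Get in X / Put in Y" comments added in the diff.
func process(buf []byte, out chan<- []byte) {
	buf = append(buf, "compressed"...)
	out <- buf
}

func main() {
	out := make(chan []byte, 1)

	buf := bufPool.Get().([]byte)[:0] // Put happens in the receiver
	go process(buf, out)

	got := <-out
	fmt.Printf("%s\n", got)
	bufPool.Put(got[:0]) // hand the buffer back for reuse
}
```

Documenting the Get/Put pairing at each site makes the buffer's owner explicit at every step, which matters once buffers cross goroutine boundaries as they do here.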