20 files changed, 382 insertions, 206 deletions
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
index 744875676..1eb75ef68 100644
--- a/vendor/github.com/klauspost/compress/LICENSE
+++ b/vendor/github.com/klauspost/compress/LICENSE
@@ -1,4 +1,5 @@
 Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
index 202f36a99..413ec3b3c 100644
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -243,7 +243,7 @@ func (s *Scratch) buildDtable() error {
 			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
 			s.decTable[u].nbBits = nBits
 			newState := (nextState << nBits) - tableSize
-			if newState > tableSize {
+			if newState >= tableSize {
 				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
 			}
 			if newState == uint16(u) && nBits == 0 {
@@ -281,8 +281,12 @@ func (s *Scratch) decompress() error {
 			tmp[off+2] = s1.nextFast()
 			tmp[off+3] = s2.nextFast()
 			off += 4
+			// When off is 0, we have overflowed and should write.
 			if off == 0 {
 				s.Out = append(s.Out, tmp...)
+				if len(s.Out) >= s.DecompressLimit {
+					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
+				}
 			}
 		}
 	} else {
@@ -296,7 +300,7 @@ func (s *Scratch) decompress() error {
 			off += 4
 			if off == 0 {
 				s.Out = append(s.Out, tmp...)
-				off = 0
+				// When off is 0, we have overflowed and should write.
 				if len(s.Out) >= s.DecompressLimit {
 					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
 				}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 261c54274..43b4815b3 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -193,14 +193,26 @@ func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
 		tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
 		off += 4
 		if off == 0 {
+			if len(s.Out)+256 > s.MaxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
 			s.Out = append(s.Out, tmp...)
 		}
 	}
 
+	if len(s.Out)+int(off) > s.MaxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
 	s.Out = append(s.Out, tmp[:off]...)
 
 	for !br.finished() {
 		br.fill()
+		if len(s.Out) >= s.MaxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
 		s.Out = append(s.Out, decode())
 	}
 	return s.Out, br.close()
@@ -218,6 +230,9 @@ func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
 	if len(in) < 6+(4*1) {
 		return nil, errors.New("input too small")
 	}
+	if dstSize > s.MaxDecodedSize {
+		return nil, ErrMaxDecodedSizeExceeded
+	}
 	// TODO: We do not detect when we overrun a buffer, except if the last one does.
 
 	var br [4]bitReader
@@ -247,9 +262,13 @@ func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
 	dstOut := s.Out
 	dstEvery := (dstSize + 3) / 4
 
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := s.dt.single[:tlSize]
+
 	decode := func(br *bitReader) byte {
 		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := s.dt.single[val]
+		v := single[val&tlMask]
 		br.bitsRead += v.nBits
 		return v.byte
 	}
@@ -279,7 +298,7 @@ bigloop:
 		off += 2
 		if off == bufoff {
 			if bufoff > dstEvery {
-				return nil, errors.New("corruption detected: stream overrun")
+				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(dstOut, tmp[:bufoff])
 			copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
@@ -288,15 +307,15 @@ bigloop:
 			off = 0
 			dstOut = dstOut[bufoff:]
 			// There must at least be 3 buffers left.
-			if len(dstOut) < dstEvery*3+3 {
-				return nil, errors.New("corruption detected: stream overrun")
+			if len(dstOut) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(dstOut) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun")
+			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(dstOut, tmp[:off])
 		copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
@@ -311,7 +330,7 @@ bigloop:
 		for !br.finished() {
 			br.fill()
 			if offset >= len(dstOut) {
-				return nil, errors.New("corruption detected: stream overrun")
+				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 			dstOut[offset] = decode(br)
 			offset++
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 50d02e440..6f823f94d 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -35,6 +35,9 @@ var (
 
 	// ErrTooBig is return if input is too large for a single block.
 	ErrTooBig = errors.New("input too big")
+
+	// ErrMaxDecodedSizeExceeded is returned if the decoded output would exceed MaxDecodedSize.
+	ErrMaxDecodedSizeExceeded = errors.New("maximum output size exceeded")
 )
 
 type ReusePolicy uint8
@@ -86,6 +89,11 @@ type Scratch struct {
 	// Reuse will specify the reuse policy
 	Reuse ReusePolicy
 
+	// MaxDecodedSize will set the maximum allowed output size.
+	// This value will automatically be set to BlockSizeMax if not set.
+	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
+	MaxDecodedSize int
+
 	br             byteReader
 	symbolLen      uint16 // Length of active part of the symbol table.
 	maxCount       int    // count of the most probable symbol
@@ -116,6 +124,9 @@ func (s *Scratch) prepare(in []byte) (*Scratch, error) {
 	if s.TableLog > tableLogMax {
 		return nil, fmt.Errorf("tableLog (%d) > maxTableLog (%d)", s.TableLog, tableLogMax)
 	}
+	if s.MaxDecodedSize <= 0 || s.MaxDecodedSize > BlockSizeMax {
+		s.MaxDecodedSize = BlockSizeMax
+	}
 	if s.clearCount && s.maxCount == 0 {
 		for i := range s.count {
 			s.count[i] = 0
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
index e6179f65e..1c66e3723 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
+++ b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
@@ -184,9 +184,7 @@ tagLit60Plus:
 	// checks. In the asm version, we code it once instead of once per switch case.
 	ADDQ CX, SI
 	SUBQ $58, SI
-	MOVQ SI, BX
-	SUBQ R11, BX
-	CMPQ BX, R12
+	CMPQ SI, R13
 	JA   errCorrupt
 
 	// case x == 60:
@@ -232,9 +230,7 @@ tagCopy4:
 	ADDQ $5, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	MOVQ SI, BX
-	SUBQ R11, BX
-	CMPQ BX, R12
+	CMPQ SI, R13
 	JA   errCorrupt
 
 	// length = 1 + int(src[s-5])>>2
@@ -251,9 +247,7 @@ tagCopy2:
 	ADDQ $3, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	MOVQ SI, BX
-	SUBQ R11, BX
-	CMPQ BX, R12
+	CMPQ SI, R13
 	JA   errCorrupt
 
 	// length = 1 + int(src[s-3])>>2
@@ -277,9 +271,7 @@ tagCopy:
 	ADDQ $2, SI
 
 	// if uint(s) > uint(len(src)) { etc }
-	MOVQ SI, BX
-	SUBQ R11, BX
-	CMPQ BX, R12
+	CMPQ SI, R13
 	JA   errCorrupt
 
 	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/klauspost/compress/snappy/decode_other.go
index 8c9f2049b..94a96c5d7 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_other.go
+++ b/vendor/github.com/klauspost/compress/snappy/decode_other.go
@@ -85,14 +85,28 @@ func decode(dst, src []byte) int {
 		if offset <= 0 || d < offset || length > len(dst)-d {
 			return decodeErrCodeCorrupt
 		}
-		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
-		// the built-in copy function, this byte-by-byte copy always runs
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If no overlap, use the built-in copy:
+		if offset > length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
 		// forwards, even if the slices overlap. Conceptually, this is:
 		//
 		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
-		for end := d + length; d != end; d++ {
-			dst[d] = dst[d-offset]
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
 		}
+		d += length
 	}
 	if d != len(dst) {
 		return decodeErrCodeCorrupt
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index 670f98af4..d9d38b23f 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -34,7 +34,8 @@ For now, a high speed (fastest) and medium-fast (default) compressor has been im
 The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
 The "Default" compression ration is roughly equivalent to zstd level 3 (default).
 
-In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
+In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. 
+The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
 
 Compared to cgo zstd, the speed is around level 3 (default), but compression slightly worse, between level 1&2.
 
@@ -217,7 +218,8 @@ silesia.tar zstd    3   211947520   66793301    1377    146.79
 
 As part of the development process a *Snappy* -> *Zstandard* converter was also built.
 
-This can convert a *framed* [Snappy Stream](https://godoc.org/github.com/golang/snappy#Writer) to a zstd stream. Note that a single block is not framed.
+This can convert a *framed* [Snappy Stream](https://godoc.org/github.com/golang/snappy#Writer) to a zstd stream. 
+Note that a single block is not framed.
 
 Conversion is done by converting the stream directly from Snappy without intermediate full decoding.
 Therefore the compression ratio is much less than what can be done by a full decompression
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index aca1cb85d..3e161ea15 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -63,7 +63,8 @@ var (
 
 type blockDec struct {
 	// Raw source data of the block.
-	data []byte
+	data        []byte
+	dataStorage []byte
 
 	// Destination of the decoded data.
 	dst []byte
@@ -145,18 +146,18 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	}
 
 	// Read block data.
-	if cap(b.data) < cSize {
+	if cap(b.dataStorage) < cSize {
 		if b.lowMem {
-			b.data = make([]byte, 0, cSize)
+			b.dataStorage = make([]byte, 0, cSize)
 		} else {
-			b.data = make([]byte, 0, maxBlockSize)
+			b.dataStorage = make([]byte, 0, maxBlockSize)
 		}
 	}
 	if cap(b.dst) <= maxBlockSize {
 		b.dst = make([]byte, 0, maxBlockSize+1)
 	}
 	var err error
-	b.data, err = br.readBig(cSize, b.data[:0])
+	b.data, err = br.readBig(cSize, b.dataStorage)
 	if err != nil {
 		if debug {
 			println("Reading block:", err)
@@ -447,6 +448,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		// Use our out buffer.
 		huff.Out = b.literalBuf[:0]
+		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
 			literals, err = huff.Decompress4X(literals, litRegenSize)
 		} else {
@@ -609,6 +611,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		// Use our out buffer.
 		huff = hist.huffTree
 		huff.Out = b.literalBuf[:0]
+		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
 			literals, err = huff.Decompress4X(literals, litRegenSize)
 		} else {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index cba24c76d..9d9151a0e 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -155,14 +155,17 @@ func (h *literalsHeader) setSize(regenLen int) {
 }
 
 // setSizes will set the size of a compressed literals section and the input length.
-func (h *literalsHeader) setSizes(compLen, inLen int) {
+func (h *literalsHeader) setSizes(compLen, inLen int, single bool) {
 	compBits, inBits := bits.Len32(uint32(compLen)), bits.Len32(uint32(inLen))
 	// Only retain 2 bits
 	const mask = 3
 	lh := uint64(*h & mask)
 	switch {
 	case compBits <= 10 && inBits <= 10:
-		lh |= (1 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
+		if !single {
+			lh |= 1 << 2
+		}
+		lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
 		if debug {
 			const mmask = (1 << 24) - 1
 			n := (lh >> 4) & mmask
@@ -175,8 +178,14 @@ func (h *literalsHeader) setSizes(compLen, inLen int) {
 		}
 	case compBits <= 14 && inBits <= 14:
 		lh |= (2 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (14 + 4)) | (4 << 60)
+		if single {
+			panic("single stream used with more than 10 bits length.")
+		}
 	case compBits <= 18 && inBits <= 18:
 		lh |= (3 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (18 + 4)) | (5 << 60)
+		if single {
+			panic("single stream used with more than 10 bits length.")
+		}
 	default:
 		panic("internal error: block too big")
 	}
@@ -307,12 +316,30 @@ func (b *blockEnc) encodeLits() error {
 		return nil
 	}
 
-	// TODO: Switch to 1X when less than x bytes.
-	out, reUsed, err := huff0.Compress4X(b.literals, b.litEnc)
-	// Bail out of compression is too little.
-	if len(out) > (len(b.literals) - len(b.literals)>>4) {
+	var (
+		out            []byte
+		reUsed, single bool
+		err            error
+	)
+	if len(b.literals) >= 1024 {
+		// Use 4 Streams.
+		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
+		if len(out) > len(b.literals)-len(b.literals)>>4 {
+			// Bail out if compression is too little.
+			err = huff0.ErrIncompressible
+		}
+	} else if len(b.literals) > 32 {
+		// Use 1 stream
+		single = true
+		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+		if len(out) > len(b.literals)-len(b.literals)>>4 {
+			// Bail out if compression is too little.
+			err = huff0.ErrIncompressible
+		}
+	} else {
 		err = huff0.ErrIncompressible
 	}
+
 	switch err {
 	case huff0.ErrIncompressible:
 		if debug {
@@ -351,7 +378,7 @@ func (b *blockEnc) encodeLits() error {
 		lh.setType(literalsBlockCompressed)
 	}
 	// Set sizes
-	lh.setSizes(len(out), len(b.literals))
+	lh.setSizes(len(out), len(b.literals), single)
 	bh.setSize(uint32(len(out) + lh.size() + 1))
 
 	// Write block headers.
@@ -381,16 +408,23 @@ func (b *blockEnc) encode() error {
 	b.output = bh.appendTo(b.output)
 
 	var (
-		out    []byte
-		reUsed bool
-		err    error
+		out            []byte
+		reUsed, single bool
+		err            error
 	)
-	if len(b.literals) > 32 {
-		// TODO: Switch to 1X on small blocks.
+	if len(b.literals) >= 1024 {
+		// Use 4 Streams.
 		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
 		if len(out) > len(b.literals)-len(b.literals)>>4 {
 			err = huff0.ErrIncompressible
 		}
+	} else if len(b.literals) > 32 {
+		// Use 1 stream
+		single = true
+		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+		if len(out) > len(b.literals)-len(b.literals)>>4 {
+			err = huff0.ErrIncompressible
+		}
 	} else {
 		err = huff0.ErrIncompressible
 	}
@@ -435,7 +469,7 @@ func (b *blockEnc) encode() error {
 				}
 			}
 		}
-		lh.setSizes(len(out), len(b.literals))
+		lh.setSizes(len(out), len(b.literals), single)
 		if debug {
 			printf("Compressed %d literals to %d bytes", len(b.literals), len(out))
 			println("Adding literal header:", lh)
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 4a8460476..3538063f1 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -116,6 +116,9 @@ func (r *readerWrapper) readByte() (byte, error) {
 }
 
 func (r *readerWrapper) skipN(n int) error {
-	_, err := io.CopyN(ioutil.Discard, r.r, int64(n))
+	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
+	if n2 != int64(n) {
+		err = io.ErrUnexpectedEOF
+	}
 	return err
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index f06bff6f6..f4db3096a 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -127,6 +127,9 @@ func (d *Decoder) Read(p []byte) (int, error) {
 		}
 	}
 	if len(d.current.b) > 0 {
+		if debug {
+			println("returning", n, "still bytes left:", len(d.current.b))
+		}
 		// Only return error at end of block
 		return n, nil
 	}
@@ -159,6 +162,9 @@ func (d *Decoder) Reset(r io.Reader) error {
 
 	// If bytes buffer and < 1MB, do sync decoding anyway.
 	if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 {
+		if debug {
+			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
+		}
 		b := bb.Bytes()
 		dst, err := d.DecodeAll(b, nil)
 		if err == nil {
@@ -167,6 +173,9 @@ func (d *Decoder) Reset(r io.Reader) error {
 		d.current.b = dst
 		d.current.err = err
 		d.current.flushed = true
+		if debug {
+			println("sync decode to ", len(dst), "bytes, err:", err)
+		}
 		return nil
 	}
 
@@ -193,7 +202,9 @@ func (d *Decoder) drainOutput() {
 		d.current.cancel = nil
 	}
 	if d.current.d != nil {
-		println("re-adding current decoder", d.current.d, len(d.decoders))
+		if debug {
+			printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders))
+		}
 		d.decoders <- d.current.d
 		d.current.d = nil
 		d.current.b = nil
@@ -206,7 +217,9 @@ func (d *Decoder) drainOutput() {
 		select {
 		case v := <-d.current.output:
 			if v.d != nil {
-				println("got decoder", v.d)
+				if debug {
+					printf("re-adding decoder %p", v.d)
+				}
 				d.decoders <- v.d
 			}
 			if v.err == errEndOfStream {
@@ -259,20 +272,22 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	if d.current.err == ErrDecoderClosed {
 		return dst, ErrDecoderClosed
 	}
-	//println(len(d.frames), len(d.decoders), d.current)
+
+	// Grab a block decoder and frame decoder.
 	block, frame := <-d.decoders, <-d.frames
 	defer func() {
+		if debug {
+			printf("re-adding decoder: %p", block)
+		}
 		d.decoders <- block
 		frame.rawInput = nil
+		frame.bBuf = nil
 		d.frames <- frame
 	}()
-	if cap(dst) == 0 {
-		// Allocate 1MB by default.
-		dst = make([]byte, 0, 1<<20)
-	}
-	br := byteBuf(input)
+	frame.bBuf = input
+
 	for {
-		err := frame.reset(&br)
+		err := frame.reset(&frame.bBuf)
 		if err == io.EOF {
 			return dst, nil
 		}
@@ -290,11 +305,21 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 				dst = dst2
 			}
 		}
+		if cap(dst) == 0 {
+			// Allocate window size * 2 by default if nothing is provided and we didn't get frame content size.
+			size := frame.WindowSize * 2
+			// Cap to 1 MB so a large declared window cannot force a huge up-front allocation.
+			if size > 1<<20 {
+				size = 1 << 20
+			}
+			dst = make([]byte, 0, size)
+		}
+
 		dst, err = frame.runDecoder(dst, block)
 		if err != nil {
 			return dst, err
 		}
-		if len(br) == 0 {
+		if len(frame.bBuf) == 0 {
 			break
 		}
 	}
@@ -305,6 +330,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 // If an error occurs d.err will be set.
 func (d *Decoder) nextBlock() {
 	if d.current.d != nil {
+		if debug {
+			printf("re-adding current decoder %p", d.current.d)
+		}
 		d.decoders <- d.current.d
 		d.current.d = nil
 	}
@@ -377,6 +405,9 @@ func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
 	defer d.streamWg.Done()
 	frame := newFrameDec(d.o)
 	for stream := range inStream {
+		if debug {
+			println("got new stream")
+		}
 		br := readerWrapper{r: stream.r}
 	decodeStream:
 		for {
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 52c1eb066..2ac9cd2dd 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -50,15 +50,17 @@ func WithDecoderConcurrency(n int) DOption {
 }
 
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
-// (non-streaming) operations.
-// Maxmimum and default is 1 << 63 bytes.
+// non-streaming operations or maximum window size for streaming operations.
+// This can be used to control memory usage of potentially hostile content.
+// For streaming operations, the maximum window size is capped at 1<<30 bytes.
+// Maximum and default is 1 << 63 bytes.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
-			return errors.New("WithDecoderMaxmemory must be at least 1")
+			return errors.New("WithDecoderMaxMemory must be at least 1")
 		}
 		if n > 1<<63 {
-			return fmt.Errorf("WithDecoderMaxmemorymust be less than 1 << 63")
+			return fmt.Errorf("WithDecoderMaxmemory must be less than 1 << 63")
 		}
 		o.maxDecodedSize = n
 		return nil
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 02c79814f..e120625d8 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -82,16 +82,11 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 		stepSize++
 	}
 
-	// TEMPLATE
-
 	const kSearchStrength = 8
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
 	cv := load6432(src, s)
-	// nextHash is the hash at s
-	nextHashS := hash5(cv, dFastShortTableBits)
-	nextHashL := hash8(cv, dFastLongTableBits)
 
 	// Relative offsets
 	offset1 := int32(blk.recentOffsets[0])
@@ -119,8 +114,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS = nextHashS & dFastShortTableMask
-			nextHashL = nextHashL & dFastLongTableMask
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -172,8 +167,6 @@ encodeLoop:
 						break encodeLoop
 					}
 					cv = load6432(src, s)
-					nextHashS = hash5(cv, dFastShortTableBits)
-					nextHashL = hash8(cv, dFastLongTableBits)
 					continue
 				}
 				const repOff2 = 1
@@ -221,8 +214,6 @@ encodeLoop:
 						break encodeLoop
 					}
 					cv = load6432(src, s)
-					nextHashS = hash5(cv, dFastShortTableBits)
-					nextHashL = hash8(cv, dFastLongTableBits)
 					// Swap offsets
 					offset1, offset2 = offset2, offset1
 					continue
@@ -296,8 +287,6 @@ encodeLoop:
 				break encodeLoop
 			}
 			cv = load6432(src, s)
-			nextHashS = hash5(cv, dFastShortTableBits)
-			nextHashL = hash8(cv, dFastLongTableBits)
 		}
 
 		// A 4-byte match has been found. Update recent offsets.
@@ -345,38 +334,54 @@ encodeLoop:
 			break encodeLoop
 		}
 
-		// Index match start + 2 and end - 2
-		index0 := s - l + 2
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
 		index1 := s - 2
-		if l == 4 {
-			// if l is 4, we would check the same place twice, so index s-1 instead.
-			index1++
-		}
 
 		cv0 := load6432(src, index0)
 		cv1 := load6432(src, index1)
-		entry0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
-		entry1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		e.table[hash5(cv0, dFastShortTableBits)&dFastShortTableMask] = entry0
-		e.longTable[hash8(cv0, dFastLongTableBits)&dFastLongTableMask] = entry0
-		e.table[hash5(cv1, dFastShortTableBits)&dFastShortTableMask] = entry1
-		e.longTable[hash8(cv1, dFastLongTableBits)&dFastLongTableMask] = entry1
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		e.longTable[hash8(cv0, dFastLongTableBits)] = te0
+		e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		e.table[hash5(cv0, dFastShortTableBits)] = te0
+		e.table[hash5(cv1, dFastShortTableBits)] = te1
 
 		cv = load6432(src, s)
-		nextHashS = hash5(cv, dFastShortTableBits)
-		nextHashL = hash8(cv, dFastLongTableBits)
+
+		if !canRepeat {
+			continue
+		}
 
 		// Check offset 2
-		if o2 := s - offset2; canRepeat && o2 > 0 && load3232(src, o2) == uint32(cv) {
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv1>>8, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			l := 4 + e.matchlen(s+4, o2+4, src)
-			// Store this, since we have it.
+
 			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
-			e.longTable[nextHashL&dFastLongTableMask] = entry
-			e.table[nextHashS&dFastShortTableMask] = entry
+			e.longTable[nextHashL] = entry
+			e.table[nextHashS] = entry
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
+
 			// Since litlen is always 0, this is offset 1.
 			seq.offset = 1
 			s += l
@@ -389,12 +394,10 @@ encodeLoop:
 			// Swap offset 1 and 2.
 			offset1, offset2 = offset2, offset1
 			if s >= sLimit {
+				// Finished
 				break encodeLoop
 			}
-			// Prepare next loop.
 			cv = load6432(src, s)
-			nextHashS = hash5(cv, dFastShortTableBits)
-			nextHashL = hash8(cv, dFastLongTableBits)
 		}
 	}
 
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index a8edaa888..6f388de04 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -124,8 +124,6 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
 	cv := load6432(src, s)
-	// nextHash is the hash at s
-	nextHash := hash6(cv, hashLog)
 
 	// Relative offsets
 	offset1 := int32(blk.recentOffsets[0])
@@ -157,8 +155,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHash2 := hash6(cv>>8, hashLog) & tableMask
-			nextHash = nextHash & tableMask
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
 			candidate := e.table[nextHash]
 			candidate2 := e.table[nextHash2]
 			repIndex := s - offset1 + 2
@@ -207,8 +205,6 @@ encodeLoop:
 					break encodeLoop
 				}
 				cv = load6432(src, s)
-				//nextHash = hashLen(cv, hashLog, mls)
-				nextHash = hash6(cv, hashLog)
 				continue
 			}
 			coffset0 := s - (candidate.offset - e.cur)
@@ -245,7 +241,6 @@ encodeLoop:
 				break encodeLoop
 			}
 			cv = load6432(src, s)
-			nextHash = hash6(cv, hashLog)
 		}
 		// A 4-byte match has been found. We'll later see if more than 4 bytes.
 		offset2 = offset1
@@ -292,15 +287,16 @@ encodeLoop:
 			break encodeLoop
 		}
 		cv = load6432(src, s)
-		nextHash = hash6(cv, hashLog)
 
 		// Check offset 2
-		if o2 := s - offset2; canRepeat && o2 > 0 && load3232(src, o2) == uint32(cv) {
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			l := 4 + e.matchlen(s+4, o2+4, src)
+
 			// Store this, since we have it.
-			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
 			// Since litlen is always 0, this is offset 1.
@@ -319,7 +315,6 @@ encodeLoop:
 			}
 			// Prepare next loop.
 			cv = load6432(src, s)
-			nextHash = hash6(cv, hashLog)
 		}
 	}
 
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index ed028f5a7..b7011be29 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -211,6 +211,7 @@ func (e *Encoder) nextBlock(final bool) error {
 			s.wWg.Wait()
 			_, s.err = s.w.Write(blk.output)
 			s.nWritten += int64(len(blk.output))
+			s.eofWritten = true
 		}
 		return s.err
 	}
@@ -256,7 +257,12 @@ func (e *Encoder) nextBlock(final bool) error {
 				}
 				s.wWg.Done()
 			}()
-			err := blk.encode()
+			err := errIncompressible
+			// If we got the exact same number of literals as input,
+			// assume the literals cannot be compressed.
+			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+				err = blk.encode()
+			}
 			switch err {
 			case errIncompressible:
 				if debug {
@@ -443,7 +449,13 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		if len(src) == 0 {
 			blk.last = true
 		}
-		err := blk.encode()
+		err := errIncompressible
+		// If we got the exact same number of literals as input,
+		// assume the literals cannot be compressed.
+		if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
+			err = blk.encode()
+		}
+
 		switch err {
 		case errIncompressible:
 			if debug {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 6e210c4a0..a8559e900 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -6,7 +6,7 @@ import (
 	"strings"
 )
 
-// DOption is an option for creating a encoder.
+// EOption is an option for creating an encoder.
 type EOption func(*encoderOptions) error
 
 // options retains accumulated state of multiple options.
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 8fa264fc2..839a95fbf 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -39,6 +39,9 @@ type frameDec struct {
 
 	rawInput byteBuffer
 
+	// Byte buffer that can be reused for small input blocks.
+	bBuf byteBuf
+
 	// asyncRunning indicates whether the async routine processes input on 'decoding'.
 	asyncRunning   bool
 	asyncRunningMu sync.Mutex
@@ -59,6 +62,9 @@ func newFrameDec(o decoderOptions) *frameDec {
 		o:             o,
 		maxWindowSize: 1 << 30,
 	}
+	if d.maxWindowSize > o.maxDecodedSize {
+		d.maxWindowSize = o.maxDecodedSize
+	}
 	return &d
 }
 
@@ -232,7 +238,9 @@ func (d *frameDec) reset(br byteBuffer) error {
 
 // next will start decoding the next block from stream.
 func (d *frameDec) next(block *blockDec) error {
-	println("decoding new block")
+	if debug {
+		printf("decoding new block %p:%p", block, block.data)
+	}
 	err := block.reset(d.rawInput, d.WindowSize)
 	if err != nil {
 		println("block error:", err)
@@ -280,13 +288,13 @@ func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
-	var tmp [8]byte
-	gotB := d.crc.Sum(tmp[:0])
+	var tmp [4]byte
+	got := d.crc.Sum64()
 	// Flip to match file order.
-	gotB[0] = gotB[7]
-	gotB[1] = gotB[6]
-	gotB[2] = gotB[5]
-	gotB[3] = gotB[4]
+	tmp[0] = byte(got >> 0)
+	tmp[1] = byte(got >> 8)
+	tmp[2] = byte(got >> 16)
+	tmp[3] = byte(got >> 24)
 
 	// We can overwrite upper tmp now
 	want := d.rawInput.readSmall(4)
@@ -295,8 +303,10 @@ func (d *frameDec) checkCRC() error {
 		return io.ErrUnexpectedEOF
 	}
 
-	if !bytes.Equal(gotB[:4], want) {
-		println("CRC Check Failed:", gotB[:4], "!=", want)
+	if !bytes.Equal(tmp[:], want) {
+		if debug {
+			println("CRC Check Failed:", tmp[:], "!=", want)
+		}
 		return ErrCRCMismatch
 	}
 	println("CRC ok")
@@ -423,7 +433,7 @@ func (d *frameDec) startDecoder(output chan decodeOutput) {
 	}
 }
 
-// runDecoder will create a sync decoder that will decodeAsync a block of data.
+// runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	// TODO: Init to dictionary
 	d.history.reset()
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index a86d00bc3..9efe34feb 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -184,29 +184,75 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
-type decSymbol struct {
-	newState uint16
-	addBits  uint8 // Used for symbols until transformed.
-	nbBits   uint8
-	baseline uint32
+// Packed into one uint64 (bits 0-7 nbBits, 8-15 addBits, 16-31 newState, 32-63 baseline), which is faster than a struct with separate members.
+type decSymbol uint64
+
+func newDecSymbol(nbits, addBits uint8, newState uint16, baseline uint32) decSymbol {
+	return decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
+}
+
+func (d decSymbol) nbBits() uint8 {
+	return uint8(d)
+}
+
+func (d decSymbol) addBits() uint8 {
+	return uint8(d >> 8)
+}
+
+func (d decSymbol) newState() uint16 {
+	return uint16(d >> 16)
+}
+
+func (d decSymbol) baseline() uint32 {
+	return uint32(d >> 32)
+}
+
+func (d decSymbol) baselineInt() int {
+	return int(d >> 32)
+}
+
+func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
+	*d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
+}
+
+func (d *decSymbol) setNBits(nBits uint8) {
+	const mask = 0xffffffffffffff00
+	*d = (*d & mask) | decSymbol(nBits)
+}
+
+func (d *decSymbol) setAddBits(addBits uint8) {
+	const mask = 0xffffffffffff00ff
+	*d = (*d & mask) | (decSymbol(addBits) << 8)
+}
+
+func (d *decSymbol) setNewState(state uint16) {
+	const mask = 0xffffffff0000ffff
+	*d = (*d & mask) | decSymbol(state)<<16
+}
+
+func (d *decSymbol) setBaseline(baseline uint32) {
+	const mask = 0xffffffff
+	*d = (*d & mask) | decSymbol(baseline)<<32
+}
+
+func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
+	const mask = 0xffff00ff
+	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
 }
 
 // decSymbolValue returns the transformed decSymbol for the given symbol.
 func decSymbolValue(symb uint8, t []baseOffset) (decSymbol, error) {
 	if int(symb) >= len(t) {
-		return decSymbol{}, fmt.Errorf("rle symbol %d >= max %d", symb, len(t))
+		return 0, fmt.Errorf("rle symbol %d >= max %d", symb, len(t))
 	}
 	lu := t[symb]
-	return decSymbol{
-		addBits:  lu.addBits,
-		baseline: lu.baseLine,
-	}, nil
+	return newDecSymbol(0, lu.addBits, 0, lu.baseLine), nil
 }
 
 // setRLE will set the decoder til RLE mode.
 func (s *fseDecoder) setRLE(symbol decSymbol) {
 	s.actualTableLog = 0
-	s.maxBits = symbol.addBits
+	s.maxBits = symbol.addBits()
 	s.dt[0] = symbol
 }
 
@@ -220,7 +266,7 @@ func (s *fseDecoder) buildDtable() error {
 	{
 		for i, v := range s.norm[:s.symbolLen] {
 			if v == -1 {
-				s.dt[highThreshold].addBits = uint8(i)
+				s.dt[highThreshold].setAddBits(uint8(i))
 				highThreshold--
 				symbolNext[i] = 1
 			} else {
@@ -235,7 +281,7 @@ func (s *fseDecoder) buildDtable() error {
 		position := uint32(0)
 		for ss, v := range s.norm[:s.symbolLen] {
 			for i := 0; i < int(v); i++ {
-				s.dt[position].addBits = uint8(ss)
+				s.dt[position].setAddBits(uint8(ss))
 				position = (position + step) & tableMask
 				for position > highThreshold {
 					// lowprob area
@@ -253,11 +299,11 @@ func (s *fseDecoder) buildDtable() error {
 	{
 		tableSize := uint16(1 << s.actualTableLog)
 		for u, v := range s.dt[:tableSize] {
-			symbol := v.addBits
+			symbol := v.addBits()
 			nextState := symbolNext[symbol]
 			symbolNext[symbol] = nextState + 1
 			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
-			s.dt[u&maxTableMask].nbBits = nBits
+			s.dt[u&maxTableMask].setNBits(nBits)
 			newState := (nextState << nBits) - tableSize
 			if newState > tableSize {
 				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
@@ -266,7 +312,7 @@ func (s *fseDecoder) buildDtable() error {
 				// Seems weird that this is possible with nbits > 0.
 				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
 			}
-			s.dt[u&maxTableMask].newState = newState
+			s.dt[u&maxTableMask].setNewState(newState)
 		}
 	}
 	return nil
@@ -279,25 +325,21 @@ func (s *fseDecoder) transform(t []baseOffset) error {
 	tableSize := uint16(1 << s.actualTableLog)
 	s.maxBits = 0
 	for i, v := range s.dt[:tableSize] {
-		if int(v.addBits) >= len(t) {
-			return fmt.Errorf("invalid decoding table entry %d, symbol %d >= max (%d)", i, v.addBits, len(t))
+		add := v.addBits()
+		if int(add) >= len(t) {
+			return fmt.Errorf("invalid decoding table entry %d, symbol %d >= max (%d)", i, v.addBits(), len(t))
 		}
-		lu := t[v.addBits]
+		lu := t[add]
 		if lu.addBits > s.maxBits {
 			s.maxBits = lu.addBits
 		}
-		s.dt[i&maxTableMask] = decSymbol{
-			newState: v.newState,
-			nbBits:   v.nbBits,
-			addBits:  lu.addBits,
-			baseline: lu.baseLine,
-		}
+		v.setExt(lu.addBits, lu.baseLine)
+		s.dt[i] = v
 	}
 	return nil
 }
 
 type fseState struct {
-	// TODO: Check if *[1 << maxTablelog]decSymbol is faster.
 	dt    []decSymbol
 	state decSymbol
 }
@@ -312,26 +354,31 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
 // next returns the current symbol and sets the next state.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) next(br *bitReader) {
-	lowBits := uint16(br.getBits(s.state.nbBits))
-	s.state = s.dt[s.state.newState+lowBits]
+	lowBits := uint16(br.getBits(s.state.nbBits()))
+	s.state = s.dt[s.state.newState()+lowBits]
 }
 
 // finished returns true if all bits have been read from the bitstream
 // and the next state would require reading bits from the input.
 func (s *fseState) finished(br *bitReader) bool {
-	return br.finished() && s.state.nbBits > 0
+	return br.finished() && s.state.nbBits() > 0
 }
 
 // final returns the current state symbol without decoding the next.
 func (s *fseState) final() (int, uint8) {
-	return int(s.state.baseline), s.state.addBits
+	return s.state.baselineInt(), s.state.addBits()
+}
+
+// final returns the baseline and addBits of the symbol; it does not read from the bit stream.
+func (s decSymbol) final() (int, uint8) {
+	return s.baselineInt(), s.addBits()
 }
 
 // nextFast returns the next symbol and sets the next state.
 // This can only be used if no symbols are 0 bits.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
-	lowBits := uint16(br.getBitsFast(s.state.nbBits))
-	s.state = s.dt[s.state.newState+lowBits]
-	return s.state.baseline, s.state.addBits
+	lowBits := uint16(br.getBitsFast(s.state.nbBits()))
+	s.state = s.dt[s.state.newState()+lowBits]
+	return s.state.baseline(), s.state.addBits()
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
index 819d87f88..4a752067f 100644
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -64,7 +64,7 @@ func hash6(u uint64, h uint8) uint32 {
 	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
 }
 
-// hash6 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash7(u uint64, h uint8) uint32 {
 	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index cef69e35b..15a45f7b5 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -89,6 +89,10 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 // decode sequences from the stream with the provided history.
 func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 	startSize := len(s.out)
+	// Grab full-size tables to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
 			printf("reading sequence %d, exceeded available data\n", seqs-i)
@@ -96,10 +100,10 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		}
 		var litLen, matchOff, matchLen int
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			litLen, matchOff, matchLen = s.nextFast(br)
+			litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
 			br.fillFast()
 		} else {
-			litLen, matchOff, matchLen = s.next(br)
+			litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
 			br.fill()
 		}
 
@@ -175,30 +179,25 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			// This is the last sequence, so we shouldn't update state.
 			break
 		}
-		if true {
-			// Manually inlined, ~ 5-20% faster
-			// Update all 3 states at once. Approx 20% faster.
-			a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-
-			nBits := a.nbBits + b.nbBits + c.nbBits
-			if nBits == 0 {
-				s.litLengths.state.state = s.litLengths.state.dt[a.newState]
-				s.matchLengths.state.state = s.matchLengths.state.dt[b.newState]
-				s.offsets.state.state = s.offsets.state.dt[c.newState]
-			} else {
-				bits := br.getBitsFast(nBits)
-				lowBits := uint16(bits >> ((c.nbBits + b.nbBits) & 31))
-				s.litLengths.state.state = s.litLengths.state.dt[a.newState+lowBits]
-
-				lowBits = uint16(bits >> (c.nbBits & 31))
-				lowBits &= bitMask[b.nbBits&15]
-				s.matchLengths.state.state = s.matchLengths.state.dt[b.newState+lowBits]
 
-				lowBits = uint16(bits) & bitMask[c.nbBits&15]
-				s.offsets.state.state = s.offsets.state.dt[c.newState+lowBits]
-			}
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
-			s.updateAlt(br)
+			bits := br.getBitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
 		}
 	}
 
@@ -230,55 +229,49 @@ func (s *sequenceDecs) updateAlt(br *bitReader) {
 	// Update all 3 states at once. Approx 20% faster.
 	a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 
-	nBits := a.nbBits + b.nbBits + c.nbBits
+	nBits := a.nbBits() + b.nbBits() + c.nbBits()
 	if nBits == 0 {
-		s.litLengths.state.state = s.litLengths.state.dt[a.newState]
-		s.matchLengths.state.state = s.matchLengths.state.dt[b.newState]
-		s.offsets.state.state = s.offsets.state.dt[c.newState]
+		s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
+		s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
+		s.offsets.state.state = s.offsets.state.dt[c.newState()]
 		return
 	}
 	bits := br.getBitsFast(nBits)
-	lowBits := uint16(bits >> ((c.nbBits + b.nbBits) & 31))
-	s.litLengths.state.state = s.litLengths.state.dt[a.newState+lowBits]
+	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
+	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
 
-	lowBits = uint16(bits >> (c.nbBits & 31))
-	lowBits &= bitMask[b.nbBits&15]
-	s.matchLengths.state.state = s.matchLengths.state.dt[b.newState+lowBits]
+	lowBits = uint16(bits >> (c.nbBits() & 31))
+	lowBits &= bitMask[b.nbBits()&15]
+	s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
 
-	lowBits = uint16(bits) & bitMask[c.nbBits&15]
-	s.offsets.state.state = s.offsets.state.dt[c.newState+lowBits]
+	lowBits = uint16(bits) & bitMask[c.nbBits()&15]
+	s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
 }
 
 // nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
-func (s *sequenceDecs) nextFast(br *bitReader) (ll, mo, ml int) {
+func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
 	// Final will not read from stream.
-	ll, llB := s.litLengths.state.final()
-	ml, mlB := s.matchLengths.state.final()
-	mo, moB := s.offsets.state.final()
+	ll, llB := llState.final()
+	ml, mlB := mlState.final()
+	mo, moB := ofState.final()
 
 	// extra bits are stored in reverse order.
 	br.fillFast()
-	if s.maxBits <= 32 {
-		mo += br.getBits(moB)
-		ml += br.getBits(mlB)
-		ll += br.getBits(llB)
-	} else {
-		mo += br.getBits(moB)
+	mo += br.getBits(moB)
+	if s.maxBits > 32 {
 		br.fillFast()
-		// matchlength+literal length, max 32 bits
-		ml += br.getBits(mlB)
-		ll += br.getBits(llB)
 	}
+	ml += br.getBits(mlB)
+	ll += br.getBits(llB)
 
-	// mo = s.adjustOffset(mo, ll, moB)
-	// Inlined for rather big speedup
 	if moB > 1 {
 		s.prevOffset[2] = s.prevOffset[1]
 		s.prevOffset[1] = s.prevOffset[0]
 		s.prevOffset[0] = mo
 		return
 	}
-
+	// mo = s.adjustOffset(mo, ll, moB)
+	// Inlined for rather big speedup
 	if ll == 0 {
 		// There is an exception though, when current sequence's literals_length = 0.
 		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
@@ -312,11 +305,11 @@ func (s *sequenceDecs) nextFast(br *bitReader) (ll, mo, ml int) {
 	return
 }
 
-func (s *sequenceDecs) next(br *bitReader) (ll, mo, ml int) {
+func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
 	// Final will not read from stream.
-	ll, llB := s.litLengths.state.final()
-	ml, mlB := s.matchLengths.state.final()
-	mo, moB := s.offsets.state.final()
+	ll, llB := llState.final()
+	ml, mlB := mlState.final()
+	mo, moB := ofState.final()
 
 	// extra bits are stored in reverse order.
 	br.fill()