19 files changed, 1885 insertions, 254 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 9ddf39f6f..0e2dc116a 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,23 @@ This package provides various compression algorithms.
 
 # changelog
 
+* Mar 3, 2022 (v1.15.0)
+	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
+	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
+	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in [#507](https://github.com/klauspost/compress/pull/507)
+	* flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)
+	* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
+	* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
+
+<details>
+	<summary>See  Details</summary>
+Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
+
+Asynchronous stream decompression is now faster, since the goroutine allocation splits the workload much more effectively. On typical streams this will use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
+
+While the release has been extensively tested, it is recommended to test when upgrading.
+</details>
+
 * Feb 22, 2022 (v1.14.4)
 	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
 	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
new file mode 100644
index 000000000..ff2c69d60
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/autogen.go
@@ -0,0 +1,5 @@
+package huff0
+
+//go:generate go run generate.go
+//go:generate asmfmt -w decompress_amd64.s
+//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 03562db16..451160edd 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -165,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
+// peekTopBits(n) returns uint16(b.value >> n); for n < 64 it is equivalent to peekBitsFast(64 - n).
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+	return uint16(b.value >> n)
+}
+
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 3ae7d4677..04f652995 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -729,189 +729,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[0][:])
-			copy(out[dstEvery:], buf[1][:])
-			copy(out[dstEvery*2:], buf[2][:])
-			copy(out[dstEvery*3:], buf[3][:])
-			out = out[bufoff:]
-			decoded += bufoff * 4
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
 func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress4X8bitExactly(dst, src)
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
new file mode 100644
index 000000000..0d6cb1a96
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
@@ -0,0 +1,488 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+	MOVQ BP, 0(SP)
+
+	XORQ exhausted, exhausted // exhausted = false
+	XORQ off, off             // off = 0
+
+	MOVBQZX peekBits+32(FP), peek_bits
+	MOVQ    buf+40(FP), buffer
+	MOVQ    tbl+48(FP), table
+
+	MOVQ pbr0+0(FP), br0
+	MOVQ pbr1+8(FP), br1
+	MOVQ pbr2+16(FP), br2
+	MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+	// const stream = 0
+	// br0.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+	MOVQ    bitReaderShifted_value(br0), br_value
+	MOVQ    bitReaderShifted_off(br0), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill0
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br0), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill0:
+
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 0(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 0+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+	MOVQ br_value, bitReaderShifted_value(br0)
+	MOVQ br_offset, bitReaderShifted_off(br0)
+
+	// const stream = 1
+	// br1.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+	MOVQ    bitReaderShifted_value(br1), br_value
+	MOVQ    bitReaderShifted_off(br1), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill1
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br1), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill1:
+
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 256(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 256+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+	MOVQ br_value, bitReaderShifted_value(br1)
+	MOVQ br_offset, bitReaderShifted_off(br1)
+
+	// const stream = 2
+	// br2.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+	MOVQ    bitReaderShifted_value(br2), br_value
+	MOVQ    bitReaderShifted_off(br2), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill2
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br2), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill2:
+
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 512(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 512+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+	MOVQ br_value, bitReaderShifted_value(br2)
+	MOVQ br_offset, bitReaderShifted_off(br2)
+
+	// const stream = 3
+	// br3.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+	MOVQ    bitReaderShifted_value(br3), br_value
+	MOVQ    bitReaderShifted_off(br3), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill3
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br3), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill3:
+
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 768(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 768+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+	MOVQ br_value, bitReaderShifted_value(br3)
+	MOVQ br_offset, bitReaderShifted_off(br3)
+
+	ADDQ $4, off // off += 2
+
+	TESTB DH, DH // any br[i].ofs < 4?
+	JNZ   end
+
+	CMPQ off, $bufoff
+	JL   main_loop
+
+end:
+	MOVQ 0(SP), BP
+
+	MOVB off, ret+56(FP)
+	RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
new file mode 100644
index 000000000..6d477a2c1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
@@ -0,0 +1,197 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+
+#define bufoff      256     // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+    MOVQ    BP, 0(SP)
+
+    XORQ    exhausted, exhausted    // exhausted = false
+    XORQ    off, off                // off = 0
+
+    MOVBQZX peekBits+32(FP), peek_bits
+    MOVQ    buf+40(FP), buffer
+    MOVQ    tbl+48(FP), table
+
+    MOVQ    pbr0+0(FP), br0
+    MOVQ    pbr1+8(FP), br1
+    MOVQ    pbr2+16(FP), br2
+    MOVQ    pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+    // const stream = {{ var "id" }}
+    // br{{ var "id"}}.fillFast()
+    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
+    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+	// if b.bitsRead >= 32 {
+    CMPQ    br_bits_read, $32
+    JB      skip_fill{{ var "id" }}
+
+    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
+    SUBQ    $4, br_offset           // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
+    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+    MOVQ    br_bits_read, CX
+    SHLQ    CL, AX
+    ORQ     AX, br_value
+
+    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+    CMPQ    br_offset, $4
+    SETLT   DL
+    ORB     DL, DH
+    // }
+skip_fill{{ var "id" }}:
+
+    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v0 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v1 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CX, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off] = uint8(v0.entry >> 8)
+    // buf[stream][off+1] = uint8(v1.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
+
+    // SECOND PART:
+    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v2 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v3 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CX, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off+2] = uint8(v2.entry >> 8)
+    // buf[stream][off+3] = uint8(v3.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
+
+    // update the bit reader structure
+    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
+    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+    {{ set "id" "0" }}
+    {{ set "ofs" "0" }}
+    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "1" }}
+    {{ set "ofs" "8" }}
+    {{ set "bufofs" "256" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "2" }}
+    {{ set "ofs" "16" }}
+    {{ set "bufofs" "512" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "3" }}
+    {{ set "ofs" "24" }}
+    {{ set "bufofs" "768" }}
+    {{ template "decode_2_values_x86" . }}
+
+    ADDQ    $4, off     // off += 2
+
+    TESTB   DH, DH      // any br[i].ofs < 4?
+    JNZ     end
+
+    CMPQ    off, $bufoff
+    JL      main_loop
+end:
+    MOVQ    0(SP), BP
+
+    MOVB    off, ret+56(FP)
+    RET
+#undef  off
+#undef  buffer
+#undef  table
+
+#undef  br_bits_read
+#undef  br_value
+#undef  br_offset
+#undef  peek_bits
+#undef  exhausted
+
+#undef  br0
+#undef  br1
+#undef  br2
+#undef  br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 000000000..d47f6644f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,181 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of the main loop of Decompress4X when tablelog > 8.
+//go:noescape
+func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// decompress4x_8b_loop_x86 is an x86 assembler implementation
+// of the main loop of Decompress4X when tablelog <= 8, which
+// decodes 4 entries per stream in each loop iteration.
+//go:noescape
+func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// fallback8BitSize is the size where using Go version is faster.
+const fallback8BitSize = 800
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	const debug = false
+
+	// see: bitReaderShifted.peekBitsFast()
+	peekBits := uint8((64 - d.actualTableLog) & 63)
+
+	// Each asm call decodes from all 4 streams into buf; off == 0 is treated as "buf completely filled".
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		if use8BitTables {
+			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		} else {
+			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		}
+		if debug {
+			fmt.Print("DEBUG: ")
+			fmt.Printf("off=%d,", off)
+			for i := 0; i < 4; i++ {
+				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
+					i, br[i].bitsRead, br[i].value, br[i].off)
+			}
+			fmt.Println("")
+		}
+
+		if off != 0 {
+			break
+		}
+
+		if bufoff > dstEvery {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 1")
+		}
+		copy(out, buf[0][:])
+		copy(out[dstEvery:], buf[1][:])
+		copy(out[dstEvery*2:], buf[2][:])
+		copy(out[dstEvery*3:], buf[3][:])
+		out = out[bufoff:]
+		decoded += bufoff * 4
+		// There must at least be 3 buffers left.
+		if len(out) < dstEvery*3 {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 2")
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		if err = br.close(); err != nil {
+			d.bufs.Put(buf) // return the scratch buffer on this error path as well
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 000000000..2edad3ea5
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,506 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+	MOVQ BP, 0(SP) // save BP: it doubles as br3 below and is restored at end
+
+	XORQ exhausted, exhausted // exhausted = false (DH accumulates the flag, DL is scratch)
+	XORQ off, off             // off = 0
+
+	MOVBQZX peekBits+32(FP), peek_bits
+	MOVQ    buf+40(FP), buffer
+	MOVQ    tbl+48(FP), table
+
+	MOVQ pbr0+0(FP), br0
+	MOVQ pbr1+8(FP), br1
+	MOVQ pbr2+16(FP), br2
+	MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+	// const stream = 0
+	// br0.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+	MOVQ    bitReaderShifted_value(br0), br_value
+	MOVQ    bitReaderShifted_off(br0), br_offset
+
+	// Skip the refill while bitsRead <= 64-22 (at least 2 * max tablelog bits left)
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill0
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br0), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = load64(&b.in[b.off]) << (b.bitsRead & 63); NOTE(review): loads 8 bytes, not 4 as in the #else path - presumably the upper 4 only re-OR bits already in value, confirm
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill0:
+
+	// val0 := br0.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 0(buffer)(off*1)
+
+	// write the updated state back to the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+	MOVQ br_value, bitReaderShifted_value(br0)
+	MOVQ br_offset, bitReaderShifted_off(br0)
+
+	// const stream = 1
+	// br1.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+	MOVQ    bitReaderShifted_value(br1), br_value
+	MOVQ    bitReaderShifted_off(br1), br_offset
+
+	// Skip the refill while bitsRead <= 64-22 (at least 2 * max tablelog bits left)
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill1
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br1), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = load64(&b.in[b.off]) << (b.bitsRead & 63); NOTE(review): loads 8 bytes, not 4 as in the #else path - presumably the upper 4 only re-OR bits already in value, confirm
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill1:
+
+	// val0 := br1.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 256(buffer)(off*1)
+
+	// write the updated state back to the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+	MOVQ br_value, bitReaderShifted_value(br1)
+	MOVQ br_offset, bitReaderShifted_off(br1)
+
+	// const stream = 2
+	// br2.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+	MOVQ    bitReaderShifted_value(br2), br_value
+	MOVQ    bitReaderShifted_off(br2), br_offset
+
+	// Skip the refill while bitsRead <= 64-22 (at least 2 * max tablelog bits left)
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill2
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br2), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = load64(&b.in[b.off]) << (b.bitsRead & 63); NOTE(review): loads 8 bytes, not 4 as in the #else path - presumably the upper 4 only re-OR bits already in value, confirm
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill2:
+
+	// val0 := br2.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 512(buffer)(off*1)
+
+	// write the updated state back to the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+	MOVQ br_value, bitReaderShifted_value(br2)
+	MOVQ br_offset, bitReaderShifted_off(br2)
+
+	// const stream = 3
+	// br3.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+	MOVQ    bitReaderShifted_value(br3), br_value
+	MOVQ    bitReaderShifted_off(br3), br_offset
+
+	// Skip the refill while bitsRead <= 64-22 (at least 2 * max tablelog bits left)
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill3
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br3), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = load64(&b.in[b.off]) << (b.bitsRead & 63); NOTE(review): loads 8 bytes, not 4 as in the #else path - presumably the upper 4 only re-OR bits already in value, confirm
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill3:
+
+	// val0 := br3.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 768(buffer)(off*1)
+
+	// write the updated state back to the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+	MOVQ br_value, bitReaderShifted_value(br3)
+	MOVQ br_offset, bitReaderShifted_off(br3)
+
+	ADDQ $2, off // off += 2
+
+	TESTB DH, DH // any br[i].ofs < 4?
+	JNZ   end
+
+	CMPQ off, $bufoff
+	JL   main_loop
+
+end:
+	MOVQ 0(SP), BP // restore the BP saved on entry
+
+	MOVB off, ret+56(FP) // return the low byte of off; a full pass (off == 256) is returned as 0
+	RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
new file mode 100644
index 000000000..330d86ae1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
@@ -0,0 +1,195 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff      256     // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+    MOVQ    BP, 0(SP)               // save BP: it doubles as br3 below and is restored at end
+
+    XORQ    exhausted, exhausted    // exhausted = false (DH accumulates the flag, DL is scratch)
+    XORQ    off, off                // off = 0
+
+    MOVBQZX peekBits+32(FP), peek_bits
+    MOVQ    buf+40(FP), buffer
+    MOVQ    tbl+48(FP), table
+
+    MOVQ    pbr0+0(FP), br0
+    MOVQ    pbr1+8(FP), br1
+    MOVQ    pbr2+16(FP), br2
+    MOVQ    pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+    // const stream = {{ var "id" }}
+    // br{{ var "id"}}.fillFast()
+    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
+    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+    // Skip the refill while bitsRead <= 64-22 (at least 2 * max tablelog bits left)
+    CMPQ    br_bits_read, $64-22
+    JBE     skip_fill{{ var "id" }}
+
+    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
+    SUBQ    $4, br_offset           // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = load64(&b.in[b.off]) << (b.bitsRead & 63); NOTE(review): loads 8 bytes, not 4 as in the #else path - presumably the upper 4 only re-OR bits already in value, confirm
+#else
+    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
+    MOVQ    br_bits_read, CX
+    SHLQ    CL, AX
+#endif
+
+    ORQ     AX, br_value
+
+    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+    CMPQ    br_offset, $4
+    SETLT   DL
+    ORB     DL, DH
+    // }
+skip_fill{{ var "id" }}:
+
+    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+#endif
+
+    // v0 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+    MOVBQZX AL, CX
+    SHLXQ   AX, br_value, br_value // value <<= n
+#else
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+#endif
+
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+#ifdef GOAMD64_v3
+    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
+#else
+    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+#endif
+
+    // v1 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+    MOVBQZX AL, CX
+    SHLXQ   AX, br_value, br_value // value <<= n
+#else
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+#endif
+
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off] = uint8(v0.entry >> 8)
+    // buf[stream][off+1] = uint8(v1.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
+
+    // write the updated state back to the bit reader structure
+    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
+    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+    {{ set "id" "0" }}
+    {{ set "ofs" "0" }}
+    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "1" }}
+    {{ set "ofs" "8" }}
+    {{ set "bufofs" "256" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "2" }}
+    {{ set "ofs" "16" }}
+    {{ set "bufofs" "512" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "3" }}
+    {{ set "ofs" "24" }}
+    {{ set "bufofs" "768" }}
+    {{ template "decode_2_values_x86" . }}
+
+    ADDQ    $2, off     // off += 2
+
+    TESTB   DH, DH      // any br[i].ofs < 4?
+    JNZ     end
+
+    CMPQ    off, $bufoff
+    JL      main_loop
+end:
+    MOVQ    0(SP), BP               // restore the BP saved on entry
+
+    MOVB    off, ret+56(FP)         // return the low byte of off; a full pass (off == 256) is returned as 0
+    RET
+#undef  off
+#undef  buffer
+#undef  table
+
+#undef  br_bits_read
+#undef  br_value
+#undef  br_offset
+#undef  peek_bits
+#undef  exhausted
+
+#undef  br0
+#undef  br1
+#undef  br2
+#undef  br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 000000000..126b4d68a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) { // 6 byte jump table + at least 1 byte for each of the 4 streams
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table": three little-endian 16 bit lengths for streams 0-2; stream 3 gets the rest of src.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination; stream k writes to out[dstEvery*k:], so out always points at stream 0's next output byte.
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break // leave the fast loop; the per-stream loop below decodes the rest
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 { // the uint8 wrapped around: bufoff bytes were buffered per stream, flush them
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 { // flush the partially filled buffers
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining: one symbol at a time, stream by stream, straight into out.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out) // clamp to what is left of out
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		if err = br.close(); err != nil {
+			d.bufs.Put(buf) // hand the scratch buffer back, like every other exit after d.buffer()
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index c876c591a..e3445ac19 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -153,10 +153,10 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 
 This package:
 file    out     level   insize      outsize     millis  mb/s
-silesia.tar zskp    1   211947520   73101992    643     313.87
-silesia.tar zskp    2   211947520   67504318    969     208.38
-silesia.tar zskp    3   211947520   64595893    2007    100.68
-silesia.tar zskp    4   211947520   60995370    8825    22.90
+silesia.tar zskp    1   211947520   73821326    634     318.47
+silesia.tar zskp    2   211947520   67655404    1508    133.96
+silesia.tar zskp    3   211947520   64746933    3000    67.37
+silesia.tar zskp    4   211947520   60073508    16926   11.94
 
 cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
@@ -165,94 +165,94 @@ silesia.tar zstd    6   211947520   62916450    1913    105.66
 silesia.tar zstd    9   211947520   60212393    5063    39.92
 
 gzip, stdlib/this package:
-silesia.tar gzstd   1   211947520   80007735    1654    122.21
-silesia.tar gzkp    1   211947520   80136201    1152    175.45
+silesia.tar gzstd   1   211947520   80007735    1498    134.87
+silesia.tar gzkp    1   211947520   80088272    1009    200.31
 
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
 
 file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  235022249   3088    590.30
-gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  zskp    3   1911399616  175034659   9636    189.17
-gob-stream  zskp    4   1911399616  165609838   50369   36.19
+gob-stream  zskp    1   1911399616  233948096   3230    564.34
+gob-stream  zskp    2   1911399616  203997694   4997    364.73
+gob-stream  zskp    3   1911399616  173526523   13435   135.68
+gob-stream  zskp    4   1911399616  162195235   47559   38.33
 
 gob-stream  zstd    1   1911399616  249810424   2637    691.26
 gob-stream  zstd    3   1911399616  208192146   3490    522.31
 gob-stream  zstd    6   1911399616  193632038   6687    272.56
 gob-stream  zstd    9   1911399616  177620386   16175   112.70
 
-gob-stream  gzstd   1   1911399616  357382641   10251   177.82
-gob-stream  gzkp    1   1911399616  359753026   5438    335.20
+gob-stream  gzstd   1   1911399616  357382013   9046    201.49
+gob-stream  gzkp    1   1911399616  359136669   4885    373.08
 
 The test data for the Large Text Compression Benchmark is the first
 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
 http://mattmahoney.net/dc/textdata.html
 
 file    out level   insize      outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343848582   3609    264.18
-enwik9  zskp    2   1000000000  317276632   5746    165.97
-enwik9  zskp    3   1000000000  292243069   12162   78.41
-enwik9  zskp    4   1000000000  262183768   82837   11.51
+enwik9  zskp    1   1000000000  343833605   3687    258.64
+enwik9  zskp    2   1000000000  317001237   7672    124.29
+enwik9  zskp    3   1000000000  291915823   15923   59.89
+enwik9  zskp    4   1000000000  261710291   77697   12.27
 
 enwik9  zstd    1   1000000000  358072021   3110    306.65
 enwik9  zstd    3   1000000000  313734672   4784    199.35
 enwik9  zstd    6   1000000000  295138875   10290   92.68
 enwik9  zstd    9   1000000000  278348700   28549   33.40
 
-enwik9  gzstd   1   1000000000  382578136   9604    99.30
-enwik9  gzkp    1   1000000000  383825945   6544    145.73
+enwik9  gzstd   1   1000000000  382578136   8608    110.78
+enwik9  gzkp    1   1000000000  382781160   5628    169.45
 
 Highly compressible JSON file.
 https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 
 file                        out level   insize      outsize     millis  mb/s
-github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
-github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
-github-june-2days-2019.json zskp    3   6273951764  524340691   34043   175.75
-github-june-2days-2019.json zskp    4   6273951764  470320075   170190  35.16
+github-june-2days-2019.json zskp    1   6273951764  697439532   9789    611.17
+github-june-2days-2019.json zskp    2   6273951764  610876538   18553   322.49
+github-june-2days-2019.json zskp    3   6273951764  517662858   44186   135.41
+github-june-2days-2019.json zskp    4   6273951764  464617114   165373  36.18
 
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
 github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
 
-github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
-github-june-2days-2019.json gzkp    1   6273951764  1125417694  21788   274.61
+github-june-2days-2019.json gzstd   1   6273951764  1164397768  26793   223.32
+github-june-2days-2019.json gzkp    1   6273951764  1120631856  17693   338.16
 
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
 
 file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
-rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    zskp    3   8558382592  3158085214  77675   105.08
-rawstudio-mint14.tar    zskp    4   8558382592  2965110639  857750  9.52
+rawstudio-mint14.tar    zskp    1   8558382592  3718400221  18206   448.29
+rawstudio-mint14.tar    zskp    2   8558382592  3326118337  37074   220.15
+rawstudio-mint14.tar    zskp    3   8558382592  3163842361  87306   93.49
+rawstudio-mint14.tar    zskp    4   8558382592  2970480650  783862  10.41
 
 rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
 rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
 rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
 rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
 
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
-rawstudio-mint14.tar    gzkp    1   8558382592  3962605659  45113   180.92
+rawstudio-mint14.tar    gzstd   1   8558382592  3926234992  51345   158.96
+rawstudio-mint14.tar    gzkp    1   8558382592  3960117298  36722   222.26
 
 CSV data:
 https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 
 file                    out level   insize      outsize     millis  mb/s
-nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
-nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-nyc-taxi-data-10M.csv   zskp    3   3325605752  530289687   25239   125.66
-nyc-taxi-data-10M.csv   zskp    4   3325605752  476268884   135958  23.33
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641319332   9462    335.17
+nyc-taxi-data-10M.csv   zskp    2   3325605752  588976126   17570   180.50
+nyc-taxi-data-10M.csv   zskp    3   3325605752  529329260   32432   97.79
+nyc-taxi-data-10M.csv   zskp    4   3325605752  474949772   138025  22.98
 
 nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
 nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
 nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
 nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
 
-nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
-nyc-taxi-data-10M.csv   gzkp    1   3325605752  922257165   16780   189.00
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928654908   21270   149.11
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  922273214   13929   227.68
 ```
 
 ## Decompressor
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 607b62ee3..7d567a54a 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -167,6 +167,11 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			}
 			return ErrCompressedSizeTooBig
 		}
+		// Even an empty compressed block must be at least 2 bytes:
+		// one for Literals_Block_Type and one for Sequences_Section_Header.
+		if cSize < 2 {
+			return ErrBlockTooSmall
+		}
 	case blockTypeRaw:
 		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
 			if debugDecoder {
@@ -491,6 +496,9 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 }
 
 func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
+	if debugDecoder {
+		printf("prepareSequences: %d byte(s) input\n", len(in))
+	}
 	// Decode Sequences
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
 	if len(in) < 1 {
@@ -499,8 +507,6 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
 	var nSeqs int
 	seqHeader := in[0]
 	switch {
-	case seqHeader == 0:
-		in = in[1:]
 	case seqHeader < 128:
 		nSeqs = int(seqHeader)
 		in = in[1:]
@@ -517,6 +523,13 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
 		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
 		in = in[3:]
 	}
+	if nSeqs == 0 && len(in) != 0 {
+		// With no sequences, there must not be any data left in the block.
+		if debugDecoder {
+			printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
+		}
+		return ErrUnexpectedBlockSize
+	}
 
 	var seqs = &hist.decoders
 	seqs.nSeqs = nSeqs
@@ -635,6 +648,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
 		hist.decoders.seqSize = len(hist.decoders.literals)
 		return nil
 	}
+	hist.decoders.windowSize = hist.windowSize
 	hist.decoders.prevOffset = hist.recentOffsets
 	err := hist.decoders.decode(b.sequence)
 	hist.recentOffsets = hist.decoders.prevOffset
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index a93dfaf10..9fcdaac1d 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -348,10 +348,10 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 			frame.history.setDict(&dict)
 		}
 
-		if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
 			return dst, ErrDecoderSizeExceeded
 		}
-		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
+		if frame.FrameContentSize < 1<<30 {
 			// Never preallocate more than 1 GB up front.
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
 				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
@@ -514,7 +514,7 @@ func (d *Decoder) nextBlockSync() (ok bool) {
 
 		// Check frame size (before CRC)
 		d.syncStream.decodedFrame += uint64(len(d.current.b))
-		if d.frame.FrameContentSize > 0 && d.syncStream.decodedFrame > d.frame.FrameContentSize {
+		if d.syncStream.decodedFrame > d.frame.FrameContentSize {
 			if debugDecoder {
 				printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
 			}
@@ -523,7 +523,7 @@ func (d *Decoder) nextBlockSync() (ok bool) {
 		}
 
 		// Check FCS
-		if d.current.d.Last && d.frame.FrameContentSize > 0 && d.syncStream.decodedFrame != d.frame.FrameContentSize {
+		if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
 			if debugDecoder {
 				printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
 			}
@@ -700,6 +700,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 				}
 				hist.decoders = block.async.newHist.decoders
 				hist.recentOffsets = block.async.newHist.recentOffsets
+				hist.windowSize = block.async.newHist.windowSize
 				if block.async.newHist.dict != nil {
 					hist.setDict(block.async.newHist.dict)
 				}
@@ -811,11 +812,11 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 			}
 			if !hasErr {
 				decodedFrame += uint64(len(do.b))
-				if fcs > 0 && decodedFrame > fcs {
+				if decodedFrame > fcs {
 					println("fcs exceeded", block.Last, fcs, decodedFrame)
 					do.err = ErrFrameSizeExceeded
 					hasErr = true
-				} else if block.Last && fcs > 0 && decodedFrame != fcs {
+				} else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
 					do.err = ErrFrameSizeMismatch
 					hasErr = true
 				} else {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 29c3176b0..11089d223 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -197,7 +197,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	default:
 		fcsSize = 1 << v
 	}
-	d.FrameContentSize = 0
+	d.FrameContentSize = fcsUnknown
 	if fcsSize > 0 {
 		b, err := br.readSmall(fcsSize)
 		if err != nil {
@@ -343,12 +343,7 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 			err = ErrDecoderSizeExceeded
 			break
 		}
-		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
-			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
-			err = ErrFrameSizeExceeded
-			break
-		}
-		if d.FrameContentSize > 0 && uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
 			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
 			err = ErrFrameSizeExceeded
 			break
@@ -356,13 +351,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 		if dec.Last {
 			break
 		}
-		if debugDecoder && d.FrameContentSize > 0 {
+		if debugDecoder {
 			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
 		}
 	}
 	dst = d.history.b
 	if err == nil {
-		if d.FrameContentSize > 0 && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
 			err = ErrFrameSizeMismatch
 		} else if d.HasCheckSum {
 			var n int
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz.go b/vendor/github.com/klauspost/compress/zstd/fuzz.go
index fda8a7422..7f2210e05 100644
--- a/vendor/github.com/klauspost/compress/zstd/fuzz.go
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz.go
@@ -1,5 +1,5 @@
-//go:build gofuzz
-// +build gofuzz
+//go:build ignorecrc
+// +build ignorecrc
 
 // Copyright 2019+ Klaus Post. All rights reserved.
 // License information can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
index 0515b201c..6811c68a8 100644
--- a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
@@ -1,5 +1,5 @@
-//go:build !gofuzz
-// +build !gofuzz
+//go:build !ignorecrc
+// +build !ignorecrc
 
 // Copyright 2019+ Klaus Post. All rights reserved.
 // License information can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 213736ad7..819f1461b 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -107,7 +107,10 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	s.seqSize = 0
 	litRemain := len(s.literals)
-
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
 	for i := range seqs {
 		var ll, mo, ml int
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
@@ -192,7 +195,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 		}
 		s.seqSize += ll + ml
 		if s.seqSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size", s.seqSize)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 		}
 		litRemain -= ll
 		if litRemain < 0 {
@@ -230,7 +233,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	}
 	s.seqSize += litRemain
 	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size", s.seqSize)
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
@@ -347,6 +350,10 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	hist := history.b[history.ignoreBuffer:]
 	out := s.out
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
 
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
@@ -426,7 +433,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		}
 		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size", size)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
 		}
 		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
@@ -535,6 +542,11 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		}
 	}
 
+	// Check that the final literals fit within the max block size.
+	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+	}
+
 	// Add final literals
 	s.out = append(out, s.literals...)
 	return br.close()
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index 967f29b31..ffffcbc25 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -20,7 +20,7 @@ const ZipMethodPKWare = 20
 
 var zipReaderPool sync.Pool
 
-// newZipReader cannot be used since we would leak goroutines...
+// newZipReader creates a pooled zip decompressor.
 func newZipReader(r io.Reader) io.ReadCloser {
 	dec, ok := zipReaderPool.Get().(*Decoder)
 	if ok {
@@ -44,10 +44,14 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	if r.dec == nil {
-		return 0, errors.New("Read after Close")
+		return 0, errors.New("read after close or EOF")
 	}
 	dec, err := r.dec.Read(p)
-
+	if err == io.EOF {
+		err = r.dec.Reset(nil)
+		zipReaderPool.Put(r.dec)
+		r.dec = nil
+	}
 	return dec, err
 }
 
@@ -112,11 +116,5 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
 func ZipDecompressor() func(r io.Reader) io.ReadCloser {
-	return func(r io.Reader) io.ReadCloser {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
-		}
-		return d.IOReadCloser()
-	}
+	return newZipReader
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 0b0c2571d..c1c90b4a0 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -39,6 +39,9 @@ const zstdMinMatch = 3
 // Reset the buffer offset when reaching this.
 const bufferReset = math.MaxInt32 - MaxWindowSize
 
+// fcsUnknown is used for unknown frame content size.
+const fcsUnknown = math.MaxUint64
+
 var (
 	// ErrReservedBlockType is returned when a reserved block type is found.
 	// Typically this indicates wrong or corrupted input.
@@ -52,6 +55,10 @@ var (
 	// Typically returned on invalid input.
 	ErrBlockTooSmall = errors.New("block too small")
 
+	// ErrUnexpectedBlockSize is returned when a block has unexpected size.
+	// Typically returned on invalid input.
+	ErrUnexpectedBlockSize = errors.New("unexpected block size")
+
 	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
 	// Typically this indicates wrong or corrupted input.
 	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")