vendor: update c/storage and c/image

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
author: Giuseppe Scrivano <gscrivan@redhat.com> 2022-05-12 03:45:32 +0200
committer: Giuseppe Scrivano <gscrivan@redhat.com> 2022-05-12 03:47:42 +0200
commit: 16e22dbbe30bdde27c51c656a31b41486cbc76e4 (patch)
tree: 619dfc9e81720df59f9c78e2bded6cd7707fd06f /vendor/github.com/klauspost
parent: 1b8e9c2ca799063016e9fa765e124b9b8cec67a8 (diff)
download: podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.tar.gz
podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.tar.bz2
podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.zip
14 files changed, 777 insertions, 1516 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 5b7cf781a..c3ec9d8a7 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,16 @@ This package provides various compression algorithms.
 
 # changelog
 
+* May 5, 2022 (v1.15.3)
+	* zstd: Allow ignoring checksum checks by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+	* Minimum version is Go 1.16, added CI test on 1.18.
+
 * Mar 11, 2022 (v1.15.1)
 	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
 	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index 8d632cea0..61342b6b8 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -24,7 +24,7 @@ func (f *decompressor) huffmanBytesBuffer() {
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
-	fnb, fb := f.nb, f.b
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -82,9 +82,9 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
@@ -227,10 +227,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
 			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -243,14 +243,14 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
@@ -275,7 +275,7 @@ func (f *decompressor) huffmanBytesReader() {
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
-	fnb, fb := f.nb, f.b
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -333,9 +333,9 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
@@ -478,10 +478,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
 			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -494,14 +494,14 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
@@ -526,7 +526,7 @@ func (f *decompressor) huffmanBufioReader() {
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
-	fnb, fb := f.nb, f.b
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -584,9 +584,9 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
@@ -729,10 +729,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
 			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -745,14 +745,14 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
@@ -777,7 +777,7 @@ func (f *decompressor) huffmanStringsReader() {
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
-	fnb, fb := f.nb, f.b
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -835,9 +835,9 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanStringsReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
@@ -980,10 +980,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
 			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -996,14 +996,14 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
@@ -1028,7 +1028,7 @@ func (f *decompressor) huffmanGenericReader() {
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
-	fnb, fb := f.nb, f.b
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -1086,9 +1086,9 @@ readLiteral:
 		var length int
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanGenericReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
@@ -1231,10 +1231,10 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
+		if dist > uint32(dict.histSize()) {
 			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
@@ -1247,14 +1247,14 @@ readLiteral:
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
deleted file mode 100644
index ff2c69d60..000000000
--- a/vendor/github.com/klauspost/compress/huff0/autogen.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
deleted file mode 100644
index 0d6cb1a96..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
+++ /dev/null
@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
-
-main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill0
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill0:
-
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 0+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill1:
-
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 256+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill2:
-
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 512+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
-
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill3
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill3:
-
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 768+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
-
-	ADDQ $4, off // off += 2
-
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
-
-	CMPQ off, $bufoff
-	JL   main_loop
-
-end:
-	MOVQ 0(SP), BP
-
-	MOVB off, ret+56(FP)
-	RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
deleted file mode 100644
index 6d477a2c1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
+++ /dev/null
@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-	// if b.bitsRead >= 32 {
-    CMPQ    br_bits_read, $32
-    JB      skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // SECOND PART:
-    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v2 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v3 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off+2] = uint8(v2.entry >> 8)
-    // buf[stream][off+3] = uint8(v3.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $4, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index ce8e93bcd..3415e5da2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -13,19 +13,30 @@ import (
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
 //go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
 //go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 
 // fallback8BitSize is the size where using Go version is faster.
 const fallback8BitSize = 800
 
+type decompress4xContext struct {
+	pbr0     *bitReaderShifted
+	pbr1     *bitReaderShifted
+	pbr2     *bitReaderShifted
+	pbr3     *bitReaderShifted
+	peekBits uint8
+	out      *byte
+	dstEvery int
+	tbl      *dEntrySingle
+	decoded  int
+	limit    *byte
+}
+
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -42,6 +53,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if cap(dst) < fallback8BitSize && use8BitTables {
 		return d.decompress4X8bit(dst, src)
 	}
+
 	var br [4]bitReaderShifted
 	// Decode "jump table"
 	start := 6
@@ -71,70 +83,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
 	var decoded int
 
-	const debug = false
-
-	// see: bitReaderShifted.peekBitsFast()
-	peekBits := uint8((64 - d.actualTableLog) & 63)
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr0:     &br[0],
+			pbr1:     &br[1],
+			pbr2:     &br[2],
+			pbr3:     &br[3],
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
 		}
-
 		if use8BitTables {
-			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+			decompress4x_8b_main_loop_amd64(&ctx)
 		} else {
-			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
-		}
-		if debug {
-			fmt.Print("DEBUG: ")
-			fmt.Printf("off=%d,", off)
-			for i := 0; i < 4; i++ {
-				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
-					i, br[i].bitsRead, br[i].value, br[i].off)
-			}
-			fmt.Println("")
-		}
-
-		if off != 0 {
-			break
+			decompress4x_main_loop_amd64(&ctx)
 		}
 
-		if bufoff > dstEvery {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 1")
-		}
-		copy(out, buf[0][:])
-		copy(out[dstEvery:], buf[1][:])
-		copy(out[dstEvery*2:], buf[2][:])
-		copy(out[dstEvery*3:], buf[3][:])
-		out = out[bufoff:]
-		decoded += bufoff * 4
-		// There must at least be 3 buffers left.
-		if len(out) < dstEvery*3 {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 2")
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
+		decoded = ctx.decoded
+		out = out[decoded/4:]
 	}
 
 	// Decode remaining.
@@ -150,7 +120,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 		for bitsLeft > 0 {
 			br.fill()
 			if offset >= endsAt {
-				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
@@ -164,7 +133,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			offset++
 		}
 		if offset != endsAt {
-			d.bufs.Put(buf)
 			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
 		}
 		decoded += offset - dstEvery*i
@@ -173,7 +141,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			return nil, err
 		}
 	}
-	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 2edad3ea5..06287f568 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,506 +1,662 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
-
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 32(AX), SI
+	MOVQ    40(AX), DI
+	MOVQ    DI, BX
+	MOVQ    72(AX), CX
+	MOVQ    CX, (SP)
+	MOVQ    48(AX), R8
+	MOVQ    56(AX), R9
+	MOVQ    (AX), R10
+	MOVQ    8(AX), R11
+	MOVQ    16(AX), R12
+	MOVQ    24(AX), R13
+
+	// Main loop
 main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill0
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
+	MOVQ  BX, DI
+	CMPQ  DI, (SP)
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R10), R14
+	MOVBQZX 40(R10), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R10), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R10), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R10)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill0:
-
 	// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	MOVW (R9)(BP*2), CX
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br0.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
+	MOVQ R14, 32(R10)
+	MOVB R15, 40(R10)
+	ADDQ R8, DI
+
+	// br1.fillFast32()
+	MOVQ    32(R11), R14
+	MOVBQZX 40(R11), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill1
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R11), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R11)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill1:
-
 	// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	MOVW (R9)(BP*2), CX
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br1.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
+	MOVQ R14, 32(R11)
+	MOVB R15, 40(R11)
+	ADDQ R8, DI
+
+	// br2.fillFast32()
+	MOVQ    32(R12), R14
+	MOVBQZX 40(R12), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill2
+	MOVQ    24(R12), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R12), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R12)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill2:
-
 	// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
+	MOVW (R9)(BP*2), CX
 
-	// br2.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br2.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
-
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill3
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
+	MOVQ R14, 32(R12)
+	MOVB R15, 40(R12)
+	ADDQ R8, DI
+
+	// br3.fillFast32()
+	MOVQ    32(R13), R14
+	MOVBQZX 40(R13), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill3
+	MOVQ    24(R13), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R13), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R13)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill3:
-
 	// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
+	MOVW (R9)(BP*2), CX
 
-	// br3.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
-#endif
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
-#else
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+	// update the bitrader reader structure
+	MOVQ  R14, 32(R13)
+	MOVB  R15, 40(R13)
+	ADDQ  $0x02, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  BX, DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
 
-#endif
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 32(CX), BX
+	MOVQ    40(CX), SI
+	MOVQ    SI, (SP)
+	MOVQ    72(CX), DX
+	MOVQ    DX, 8(SP)
+	MOVQ    48(CX), DI
+	MOVQ    56(CX), R8
+	MOVQ    (CX), R9
+	MOVQ    8(CX), R10
+	MOVQ    16(CX), R11
+	MOVQ    24(CX), R12
+
+	// Main loop
+main_loop:
+	MOVQ  (SP), SI
+	CMPQ  SI, 8(SP)
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R9), R13
+	MOVBQZX 40(R9), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1000
+	MOVQ    24(R9), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R9), BP
 
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R9)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1000:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	// br3.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R9)
+	MOVB R14, 40(R9)
+	ADDQ DI, SI
+
+	// br1.fillFast32()
+	MOVQ    32(R10), R13
+	MOVBQZX 40(R10), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1001
+	MOVQ    24(R10), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R10), BP
 
-#endif
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R10)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1001:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R10)
+	MOVB R14, 40(R10)
+	ADDQ DI, SI
+
+	// br2.fillFast32()
+	MOVQ    32(R11), R13
+	MOVBQZX 40(R11), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1002
+	MOVQ    24(R11), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R11), BP
 
-	ADDQ $2, off // off += 2
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R11)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1002:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-	CMPQ off, $bufoff
-	JL   main_loop
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-end:
-	MOVQ 0(SP), BP
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R11)
+	MOVB R14, 40(R11)
+	ADDQ DI, SI
+
+	// br3.fillFast32()
+	MOVQ    32(R12), R13
+	MOVBQZX 40(R12), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1003
+	MOVQ    24(R12), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R12), BP
 
-	MOVB off, ret+56(FP)
-	RET
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R12)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1003:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-#undef off
-#undef buffer
-#undef table
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ  R13, 32(R12)
+	MOVB  R14, 40(R12)
+	ADDQ  $0x04, (SP)
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  (SP), DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
deleted file mode 100644
index 330d86ae1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
+++ /dev/null
@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-    // We must have at least 2 * max tablelog left
-    CMPQ    br_bits_read, $64-22
-    JBE     skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-#endif
-
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-#ifdef GOAMD64_v3
-    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
-#else
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $2, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index c65ea9795..36119f385 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -439,7 +439,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
 	}
 
-	if len(next.b) > 0 {
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
 		n, err := d.current.crc.Write(next.b)
 		if err == nil {
 			if n != len(next.b) {
@@ -451,7 +451,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		got := d.current.crc.Sum64()
 		var tmp [4]byte
 		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
-		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
 			if debugDecoder {
 				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
 			}
@@ -535,9 +535,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
 
 		// Update/Check CRC
 		if d.frame.HasCheckSum {
-			d.frame.crc.Write(d.current.b)
+			if !d.o.ignoreChecksum {
+				d.frame.crc.Write(d.current.b)
+			}
 			if d.current.d.Last {
-				d.current.err = d.frame.checkCRC()
+				if !d.o.ignoreChecksum {
+					d.current.err = d.frame.checkCRC()
+				} else {
+					d.current.err = d.frame.consumeCRC()
+				}
 				if d.current.err != nil {
 					println("CRC error:", d.current.err)
 					return false
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index fc52ebc40..c70e6fa0f 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -19,6 +19,7 @@ type decoderOptions struct {
 	maxDecodedSize uint64
 	maxWindowSize  uint64
 	dicts          []dict
+	ignoreChecksum bool
 }
 
 func (o *decoderOptions) setDefault() {
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
 		return nil
 	}
 }
+
+// IgnoreChecksum allows to forcibly ignore checksum checking.
+func IgnoreChecksum(b bool) DOption {
+	return func(o *decoderOptions) error {
+		o.ignoreChecksum = b
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 509d5cece..3ff109cce 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -290,13 +290,6 @@ func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
-	var tmp [4]byte
-	got := d.crc.Sum64()
-	// Flip to match file order.
-	tmp[0] = byte(got >> 0)
-	tmp[1] = byte(got >> 8)
-	tmp[2] = byte(got >> 16)
-	tmp[3] = byte(got >> 24)
 
 	// We can overwrite upper tmp now
 	want, err := d.rawInput.readSmall(4)
@@ -305,7 +298,19 @@ func (d *frameDec) checkCRC() error {
 		return err
 	}
 
-	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
+	if d.o.ignoreChecksum {
+		return nil
+	}
+
+	var tmp [4]byte
+	got := d.crc.Sum64()
+	// Flip to match file order.
+	tmp[0] = byte(got >> 0)
+	tmp[1] = byte(got >> 8)
+	tmp[2] = byte(got >> 16)
+	tmp[3] = byte(got >> 24)
+
+	if !bytes.Equal(tmp[:], want) {
 		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
 		}
@@ -317,6 +322,19 @@ func (d *frameDec) checkCRC() error {
 	return nil
 }
 
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+	if d.HasCheckSum {
+		_, err := d.rawInput.readSmall(4)
+		if err != nil {
+			println("CRC missing?", err)
+			return err
+		}
+	}
+
+	return nil
+}
+
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	saved := d.history.b
@@ -373,13 +391,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
 			err = ErrFrameSizeMismatch
 		} else if d.HasCheckSum {
-			var n int
-			n, err = d.crc.Write(dst[crcStart:])
-			if err == nil {
-				if n != len(dst)-crcStart {
-					err = io.ErrShortWrite
-				} else {
-					err = d.checkCRC()
+			if d.o.ignoreChecksum {
+				err = d.consumeCRC()
+			} else {
+				var n int
+				n, err = d.crc.Write(dst[crcStart:])
+				if err == nil {
+					if n != len(dst)-crcStart {
+						err = io.ErrShortWrite
+					} else {
+						err = d.checkCRC()
+					}
 				}
 			}
 		}
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz.go b/vendor/github.com/klauspost/compress/zstd/fuzz.go
deleted file mode 100644
index 7f2210e05..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build ignorecrc
-// +build ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = true
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
deleted file mode 100644
index 6811c68a8..000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build !ignorecrc
-// +build !ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = false
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 01cc23fa8..2585b2e98 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -1326,30 +1326,30 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	XORQ R12, R12
+	ADDQ R13, DI
+	MOVQ BX, R12
+	ADDQ R13, BX
 
 copy_2:
-	MOVUPS (R11)(R12*1), X0
-	MOVUPS X0, (BX)(R12*1)
+	MOVUPS (R11), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, R11
 	ADDQ   $0x10, R12
-	CMPQ   R12, R13
-	JB     copy_2
-	ADDQ   R13, BX
-	ADDQ   R13, DI
+	SUBQ   $0x10, R13
+	JHI    copy_2
 	JMP    handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
-	XORQ R12, R12
+	ADDQ R13, DI
 
 copy_slow_3:
-	MOVB (R11)(R12*1), R14
-	MOVB R14, (BX)(R12*1)
-	INCQ R12
-	CMPQ R12, R13
-	JB   copy_slow_3
-	ADDQ R13, BX
-	ADDQ R13, DI
+	MOVB (R11), R12
+	MOVB R12, (BX)
+	INCQ R11
+	INCQ BX
+	DECQ R13
+	JNZ  copy_slow_3
 
 handle_loop:
 	ADDQ $0x18, AX
@@ -1826,30 +1826,30 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	XORQ CX, CX
+	ADDQ R13, R12
+	MOVQ R10, CX
+	ADDQ R13, R10
 
 copy_2:
-	MOVUPS (AX)(CX*1), X0
-	MOVUPS X0, (R10)(CX*1)
+	MOVUPS (AX), X0
+	MOVUPS X0, (CX)
+	ADDQ   $0x10, AX
 	ADDQ   $0x10, CX
-	CMPQ   CX, R13
-	JB     copy_2
-	ADDQ   R13, R10
-	ADDQ   R13, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
 	JMP    handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
-	XORQ CX, CX
+	ADDQ R13, R12
 
 copy_slow_3:
-	MOVB (AX)(CX*1), R14
-	MOVB R14, (R10)(CX*1)
-	INCQ CX
-	CMPQ CX, R13
-	JB   copy_slow_3
-	ADDQ R13, R10
-	ADDQ R13, R12
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
 
 handle_loop:
 	MOVQ ctx+16(FP), AX
@@ -2333,30 +2333,30 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	XORQ R12, R12
+	ADDQ R13, R11
+	MOVQ R9, R12
+	ADDQ R13, R9
 
 copy_2:
-	MOVUPS (CX)(R12*1), X0
-	MOVUPS X0, (R9)(R12*1)
+	MOVUPS (CX), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, CX
 	ADDQ   $0x10, R12
-	CMPQ   R12, R13
-	JB     copy_2
-	ADDQ   R13, R9
-	ADDQ   R13, R11
+	SUBQ   $0x10, R13
+	JHI    copy_2
 	JMP    handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
-	XORQ R12, R12
+	ADDQ R13, R11
 
 copy_slow_3:
-	MOVB (CX)(R12*1), R14
-	MOVB R14, (R9)(R12*1)
-	INCQ R12
-	CMPQ R12, R13
-	JB   copy_slow_3
-	ADDQ R13, R9
-	ADDQ R13, R11
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
 
 handle_loop:
 	MOVQ ctx+16(FP), CX
@@ -2862,6 +2862,7 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
+	ADDQ  R13, R12
 	XORQ  CX, CX
 	TESTQ $0x00000001, R13
 	JZ    copy_2_word
@@ -2900,21 +2901,19 @@ copy_2_test:
 	CMPQ CX, R13
 	JB   copy_2
 	ADDQ R13, R10
-	ADDQ R13, R12
 	JMP  handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
-	XORQ CX, CX
+	ADDQ R13, R12
 
 copy_slow_3:
-	MOVB (AX)(CX*1), R14
-	MOVB R14, (R10)(CX*1)
-	INCQ CX
-	CMPQ CX, R13
-	JB   copy_slow_3
-	ADDQ R13, R10
-	ADDQ R13, R12
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
 
 handle_loop:
 	MOVQ ctx+16(FP), AX
@@ -3398,6 +3397,7 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
+	ADDQ  R13, R11
 	XORQ  R12, R12
 	TESTQ $0x00000001, R13
 	JZ    copy_2_word
@@ -3436,21 +3436,19 @@ copy_2_test:
 	CMPQ R12, R13
 	JB   copy_2
 	ADDQ R13, R9
-	ADDQ R13, R11
 	JMP  handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
-	XORQ R12, R12
+	ADDQ R13, R11
 
 copy_slow_3:
-	MOVB (CX)(R12*1), R14
-	MOVB R14, (R9)(R12*1)
-	INCQ R12
-	CMPQ R12, R13
-	JB   copy_slow_3
-	ADDQ R13, R9
-	ADDQ R13, R11
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
 
 handle_loop:
 	MOVQ ctx+16(FP), CX
author	Giuseppe Scrivano <gscrivan@redhat.com>	2022-05-12 03:45:32 +0200
committer	Giuseppe Scrivano <gscrivan@redhat.com>	2022-05-12 03:47:42 +0200
commit	16e22dbbe30bdde27c51c656a31b41486cbc76e4 (patch)
tree	619dfc9e81720df59f9c78e2bded6cd7707fd06f /vendor/github.com/klauspost
parent	1b8e9c2ca799063016e9fa765e124b9b8cec67a8 (diff)
download	podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.tar.gz podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.tar.bz2 podman-16e22dbbe30bdde27c51c656a31b41486cbc76e4.zip