18 files changed, 4357 insertions, 222 deletions
diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore
index b35f8449b..d31b37815 100644
--- a/vendor/github.com/klauspost/compress/.gitignore
+++ b/vendor/github.com/klauspost/compress/.gitignore
@@ -23,3 +23,10 @@ _testmain.go
 *.test
 *.prof
 /s2/cmd/_s2sx/sfx-exe
+
+# Linux perf files
+perf.data
+perf.data.old
+
+# gdb history
+.gdb_history
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 0e2dc116a..5b7cf781a 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,13 @@ This package provides various compression algorithms.
 
 # changelog
 
+* Mar 11, 2022 (v1.15.1)
+	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
 * Mar 3, 2022 (v1.15.0)
 	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
 	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
diff --git a/vendor/github.com/klauspost/compress/go.mod b/vendor/github.com/klauspost/compress/go.mod
index 5aa64a436..b605e2d52 100644
--- a/vendor/github.com/klauspost/compress/go.mod
+++ b/vendor/github.com/klauspost/compress/go.mod
@@ -1,3 +1,3 @@
 module github.com/klauspost/compress
 
-go 1.15
+go 1.16
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index d47f6644f..ce8e93bcd 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -12,14 +12,14 @@ import (
 
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
-// go:noescape
+//go:noescape
 func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
 	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
-// go:noescape
+//go:noescape
 func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
 	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
new file mode 100644
index 000000000..3954c5121
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
@@ -0,0 +1,34 @@
+// Package cpuinfo gives runtime info about the current CPU.
+//
+// This is a very limited module meant for use internally
+// in this project. For a more versatile solution, see
+// https://github.com/klauspost/cpuid.
+package cpuinfo
+
+// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
+func HasBMI1() bool {
+	return hasBMI1
+}
+
+// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
+func HasBMI2() bool {
+	return hasBMI2
+}
+
+// DisableBMI2 will disable BMI2, for testing purposes.
+// Call returned function to restore previous state.
+func DisableBMI2() func() {
+	old := hasBMI2
+	hasBMI2 = false
+	return func() {
+		hasBMI2 = old
+	}
+}
+
+// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
+func HasBMI() bool {
+	return HasBMI1() && HasBMI2()
+}
+
+var hasBMI1 bool
+var hasBMI2 bool
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
new file mode 100644
index 000000000..e802579c4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
@@ -0,0 +1,11 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package cpuinfo
+
+//go:noescape
+func x86extensions() (bmi1, bmi2 bool)
+
+func init() {
+	hasBMI1, hasBMI2 = x86extensions()
+}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
new file mode 100644
index 000000000..4465fbe9e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
@@ -0,0 +1,36 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+TEXT ·x86extensions(SB), NOSPLIT, $0
+	// 1. determine max EAX value
+	XORQ AX, AX
+	CPUID
+
+	CMPQ AX, $7
+	JB   unsupported
+
+	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
+	MOVQ $7, AX
+	MOVQ $0, CX
+	CPUID
+
+	BTQ   $3, BX // bit 3 = BMI1
+	SETCS AL
+
+	BTQ   $8, BX // bit 8 = BMI2
+	SETCS AH
+
+	MOVB AL, bmi1+0(FP)
+	MOVB AH, bmi2+1(FP)
+	RET
+
+unsupported:
+	XORQ AX, AX
+	MOVB AL, bmi1+0(FP)
+	MOVB AL, bmi2+1(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index e3445ac19..beb7fa872 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
   
 ### Benchmarks
 
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
 The first two are streaming decodes and the last are smaller inputs. 
- 
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
 ```
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
+
+Concurrent blocks, performance:
+
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
 ```
 
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
 
 ## Zstd inside ZIP files
 
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 7d567a54a..b2bca3301 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,9 +5,14 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"sync"
 
 	"github.com/klauspost/compress/huff0"
@@ -38,6 +43,9 @@ const (
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
 	maxCompressedBlockSize = 128 << 10
 
+	compressedBlockOverAlloc    = 16
+	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
 	// Maximum possible block size (all Raw+Uncompressed).
 	maxBlockSize = (1 << 21) - 1
 
@@ -136,7 +144,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
-	maxSize := maxBlockSize
+	maxSize := maxCompressedBlockSizeAlloc
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
@@ -157,9 +165,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
-		maxSize = maxCompressedBlockSize
+		maxSize = maxCompressedBlockSizeAlloc
 		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize)
+			maxSize = int(windowSize) + compressedBlockOverAlloc
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debugDecoder {
@@ -190,9 +198,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	// Read block data.
 	if cap(b.dataStorage) < cSize {
 		if b.lowMem || cSize > maxCompressedBlockSize {
-			b.dataStorage = make([]byte, 0, cSize)
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
 	if cap(b.dst) <= maxSize {
@@ -486,10 +494,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		b.dst = append(b.dst, hist.decoders.literals...)
 		return nil
 	}
-	err = hist.decoders.decodeSync(hist)
+	before := len(hist.decoders.out)
+	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
 	if err != nil {
 		return err
 	}
+	if hist.decoders.maxSyncLen > 0 {
+		hist.decoders.maxSyncLen += uint64(before)
+		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+	}
 	b.dst = hist.decoders.out
 	hist.recentOffsets = hist.decoders.prevOffset
 	return nil
@@ -632,6 +645,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
 		println("initializing sequences:", err)
 		return err
 	}
+	// Extract blocks...
+	if false && hist.dict == nil {
+		fatalErr := func(err error) {
+			if err != nil {
+				panic(err)
+			}
+		}
+		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+		var buf bytes.Buffer
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+		buf.Write(in)
+		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
+	}
+
 	return nil
 }
 
@@ -650,6 +679,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
 	}
 	hist.decoders.windowSize = hist.windowSize
 	hist.decoders.prevOffset = hist.recentOffsets
+
 	err := hist.decoders.decode(b.sequence)
 	hist.recentOffsets = hist.decoders.prevOffset
 	return err
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 9fcdaac1d..c65ea9795 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -347,18 +347,20 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 			}
 			frame.history.setDict(&dict)
 		}
-
-		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
-			return dst, ErrDecoderSizeExceeded
+		if frame.WindowSize > d.o.maxWindowSize {
+			return dst, ErrWindowSizeExceeded
 		}
-		if frame.FrameContentSize < 1<<30 {
-			// Never preallocate more than 1 GB up front.
+		if frame.FrameContentSize != fcsUnknown {
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+				return dst, ErrDecoderSizeExceeded
+			}
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
+
 		if cap(dst) == 0 {
 			// Allocate len(input) * 2 by default if nothing is provided
 			// and we didn't get frame content size.
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index fd05c9bb0..fc52ebc40 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -31,7 +31,7 @@ func (o *decoderOptions) setDefault() {
 	if o.concurrent > 4 {
 		o.concurrent = 4
 	}
-	o.maxDecodedSize = 1 << 63
+	o.maxDecodedSize = 64 << 30
 }
 
 // WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -66,7 +66,7 @@ func WithDecoderConcurrency(n int) DOption {
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
 // non-streaming operations or maximum window size for streaming operations.
 // This can be used to control memory usage of potentially hostile content.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 11089d223..509d5cece 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -326,6 +326,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
+	d.history.decoders.maxSyncLen = 0
+	if d.FrameContentSize != fcsUnknown {
+		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			return dst, ErrDecoderSizeExceeded
+		}
+		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+			// Alloc for output
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+			copy(dst2, dst)
+			dst = dst2
+		}
+	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index bb3d4fd6c..fde4e6b60 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 )
 
 const (
@@ -182,6 +184,29 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	return s.buildDtable()
 }
 
+// mustReadFrom fills s from r by reading each field listed below, in order,
+// as fixed-size little-endian data. It panics if any read fails.
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+	fatalErr := func(err error) {
+		if err != nil {
+			panic(err)
+		}
+	}
+	//	dt             [maxTablesize]decSymbol // Decompression table.
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
+	//	actualTableLog uint8                   // Selected tablelog.
+	//	maxBits        uint8                   // Maximum number of additional bits
+	//	stateTable [256]uint16, norm [maxSymbolValue + 1]int16 // Used for table creation to avoid allocations.
+	//	preDefined     bool
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 819f1461b..e80139dd9 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -73,6 +73,7 @@ type sequenceDecs struct {
 	seqSize      int
 	windowSize   int
 	maxBits      uint8
+	maxSyncLen   uint64
 }
 
 // initialize all 3 decoders from the stream input.
@@ -98,153 +99,13 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
 	return nil
 }
 
-// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs []seqVals) error {
-	br := s.br
-
-	// Grab full sizes tables, to avoid bounds checks.
-	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
-	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	s.seqSize = 0
-	litRemain := len(s.literals)
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-	for i := range seqs {
-		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			// inlined function:
-			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
-
-			// Final will not read from stream.
-			var llB, mlB, moB uint8
-			ll, llB = llState.final()
-			ml, mlB = mlState.final()
-			mo, moB = ofState.final()
-
-			// extra bits are stored in reverse order.
-			br.fillFast()
-			mo += br.getBits(moB)
-			if s.maxBits > 32 {
-				br.fillFast()
-			}
-			ml += br.getBits(mlB)
-			ll += br.getBits(llB)
-
-			if moB > 1 {
-				s.prevOffset[2] = s.prevOffset[1]
-				s.prevOffset[1] = s.prevOffset[0]
-				s.prevOffset[0] = mo
-			} else {
-				// mo = s.adjustOffset(mo, ll, moB)
-				// Inlined for rather big speedup
-				if ll == 0 {
-					// There is an exception though, when current sequence's literals_length = 0.
-					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-					mo++
-				}
-
-				if mo == 0 {
-					mo = s.prevOffset[0]
-				} else {
-					var temp int
-					if mo == 3 {
-						temp = s.prevOffset[0] - 1
-					} else {
-						temp = s.prevOffset[mo]
-					}
-
-					if temp == 0 {
-						// 0 is not valid; input is corrupted; force offset to 1
-						println("WARNING: temp was 0")
-						temp = 1
-					}
-
-					if mo != 1 {
-						s.prevOffset[2] = s.prevOffset[1]
-					}
-					s.prevOffset[1] = s.prevOffset[0]
-					s.prevOffset[0] = temp
-					mo = temp
-				}
-			}
-			br.fillFast()
-		} else {
-			if br.overread() {
-				if debugDecoder {
-					printf("reading sequence %d, exceeded available data\n", i)
-				}
-				return io.ErrUnexpectedEOF
-			}
-			ll, mo, ml = s.next(br, llState, mlState, ofState)
-			br.fill()
-		}
-
-		if debugSequences {
-			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
-		}
-		// Evaluate.
-		// We might be doing this async, so do it early.
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-		}
-		if ml > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-		}
-		s.seqSize += ll + ml
-		if s.seqSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
-		}
-		litRemain -= ll
-		if litRemain < 0 {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
-		}
-		seqs[i] = seqVals{
-			ll: ll,
-			ml: ml,
-			mo: mo,
-		}
-		if i == len(seqs)-1 {
-			// This is the last sequence, so we shouldn't update state.
-			break
-		}
-
-		// Manually inlined, ~ 5-20% faster
-		// Update all 3 states at once. Approx 20% faster.
-		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
-		if nBits == 0 {
-			llState = llTable[llState.newState()&maxTableMask]
-			mlState = mlTable[mlState.newState()&maxTableMask]
-			ofState = ofTable[ofState.newState()&maxTableMask]
-		} else {
-			bits := br.get32BitsFast(nBits)
-			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits >> (ofState.nbBits() & 31))
-			lowBits &= bitMask[mlState.nbBits()&15]
-			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
-		}
-	}
-	s.seqSize += litRemain
-	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
-	}
-	err := br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
-	return err
-}
-
 // execute will execute the decoded sequence with the provided history.
 // The sequence must be evaluated before being sent.
 func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+	if len(s.dict) == 0 {
+		return s.executeSimple(seqs, hist)
+	}
+
 	// Ensure we have enough output size...
 	if len(s.out)+s.seqSize > cap(s.out) {
 		addBytes := s.seqSize + len(s.out)
@@ -341,14 +202,19 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 }
 
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decodeSync(history *history) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+	if true {
+		supported, err := s.decodeSyncSimple(hist)
+		if supported {
+			return err
+		}
+	}
 	br := s.br
 	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	hist := history.b[history.ignoreBuffer:]
 	out := s.out
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
@@ -433,7 +299,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		}
 		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 		}
 		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
@@ -463,13 +329,13 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 
 		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 
 			// we may be in dictionary.
 			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
@@ -543,8 +409,8 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	}
 
 	// Check if space for literals
-	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
 	}
 
 	// Add final literals
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
new file mode 100644
index 000000000..4676b09cc
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -0,0 +1,350 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+	llTable     []decSymbol
+	mlTable     []decSymbol
+	ofTable     []decSymbol
+	llState     uint64
+	mlState     uint64
+	ofState     uint64
+	iteration   int
+	litRemain   int
+	out         []byte
+	outPosition int
+	literals    []byte
+	litPosition int
+	history     []byte
+	windowSize  int
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write beyond the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write beyond the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
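+// decodeSyncSimple decodes sequences from the stream into s.out using hist as history; it returns false, nil without decoding when a dictionary is set, or when s.maxSyncLen is 0 and s.out has under maxCompressedBlockSize bytes free.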
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	if len(s.dict) > 0 {
+		return false, nil
+	}
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+		return false, nil
+	}
+	useSafe := false
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+		useSafe = true
+	}
+	if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+		useSafe = true
+	}
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeSyncAsmContext{
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
+		llState:     uint64(s.litLengths.state.state),
+		mlState:     uint64(s.matchLengths.state.state),
+		ofState:     uint64(s.offsets.state.state),
+		iteration:   s.nSeqs - 1,
+		litRemain:   len(s.literals),
+		out:         s.out,
+		outPosition: len(s.out),
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+		history:     hist,
+	}
+
+	s.seqSize = 0
+	startSize := len(s.out)
+
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+		}
+	} else {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+		}
+	}
+	switch errCode {
+	case noError:
+		break
+
+	case errorMatchLenOfsMismatch:
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+	case errorMatchLenTooBig:
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+	case errorMatchOffTooBig:
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+	case errorNotEnoughLiterals:
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+			ctx.ll, ctx.litRemain+ctx.ll)
+
+	case errorNotEnoughSpace:
+		size := ctx.outPosition + ctx.ll + ctx.ml
+		if debugDecoder {
+			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+		}
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+	default:
+		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+		return true, err
+	}
+
+	s.literals = s.literals[ctx.litPosition:]
+	t := ctx.outPosition
+	s.out = s.out[:t]
+
+	// Add final literals
+	s.out = append(s.out, s.literals...)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(s.out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+		}
+	}
+
+	return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+	llTable   []decSymbol
+	mlTable   []decSymbol
+	ofTable   []decSymbol
+	llState   uint64
+	mlState   uint64
+	ofState   uint64
+	iteration int
+	seqs      []seqVals
+	litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode_amd64 implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode_56_amd64 implements the main loop of sequenceDecs in x86 asm for the case where decode computes lte56bits as true.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode_bmi2 implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode_56_bmi2 implements the main loop of sequenceDecs in x86 asm with BMI2 extensions for the case where decode computes lte56bits as true.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode reads the values of len(seqs) sequences from the stream into seqs; it needs no history because nothing is written to the output here.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeAsmContext{
+		llTable:   s.litLengths.fse.dt[:maxTablesize],
+		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:   s.offsets.fse.dt[:maxTablesize],
+		llState:   uint64(s.litLengths.state.state),
+		mlState:   uint64(s.matchLengths.state.state),
+		ofState:   uint64(s.offsets.state.state),
+		seqs:      seqs,
+		iteration: len(seqs) - 1,
+		litRemain: len(s.literals),
+	}
+
+	s.seqSize = 0
+	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+		}
+	} else {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+		}
+	}
+	if errCode != 0 {
+		i := len(seqs) - ctx.iteration - 1
+		switch errCode {
+		case errorMatchLenOfsMismatch:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+		case errorMatchLenTooBig:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+		case errorNotEnoughLiterals:
+			ll := ctx.seqs[i].ll
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+		}
+
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+	}
+
+	if ctx.litRemain < 0 {
+		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+			len(s.literals), len(s.literals)-ctx.litRemain)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+	seqs        []seqVals
+	seqIndex    int
+	out         []byte
+	history     []byte
+	literals    []byte
+	outPosition int
+	litPosition int
+	windowSize  int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	ctx := executeAsmContext{
+		seqs:        seqs,
+		seqIndex:    0,
+		out:         out,
+		history:     hist,
+		outPosition: t,
+		litPosition: 0,
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+	}
+
+	ok := sequenceDecs_executeSimple_amd64(&ctx)
+	if !ok {
+		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+	}
+	s.literals = s.literals[ctx.litPosition:]
+	t = ctx.outPosition
+
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
new file mode 100644
index 000000000..01cc23fa8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -0,0 +1,3519 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, DI
+
+sequenceDecs_decode_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_amd64_adjust_three
+	JMP  sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_amd64_adjust_end:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10)
+
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_56_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, DI
+
+sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_56_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_56_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_amd64_adjust_three
+	JMP  sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_56_amd64_adjust_end:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_56_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    104(CX), R9
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// Fill bitreader for state updates
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decode_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_bmi2_skip_update:
+	// Adjust offset
+	MOVQ 16(R9), CX
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000
+	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX
+	JMP   sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_bmi2_adjust_three
+	JMP  sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+	LEAQ -1(R10), R13
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12
+	MOVQ    R10, R11
+	MOVQ    R13, R10
+	MOVQ    R13, CX
+
+sequenceDecs_decode_bmi2_adjust_end:
+	MOVQ CX, 16(R9)
+
+	// Check values
+	MOVQ  8(R9), R13
+	MOVQ  (R9), R14
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_bmi2_main_loop
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX)
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 // BMI2 sequence decoder that refills the bit buffer once per sequence; returns 0 on success or a non-zero error code
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX // AX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(CX), DX // DX = number of bits already consumed from AX (byte at br+40)
+	MOVQ    24(CX), BX // BX = input bytes not yet loaded (br+24)
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP) // (SP) = input base + BX; the read pointer only ever moves downwards
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI // SI = literal length state entry
+	MOVQ    80(CX), DI // DI = match length state entry
+	MOVQ    88(CX), R8 // R8 = offset state entry
+	MOVQ    104(CX), R9 // R9 = next 24-byte output record: literal length at 0, match length at 8, offset at 16
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10 // R10, R11, R12 = three-entry offset history (s+144, s+152, s+160), newest in R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+	MOVQ (SP), R13 // R13 = current input read pointer
+
+	// Fill bitreader. This is the only refill in an iteration of this variant: offset, match length and literal length are all read from it.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX // CX = whole bytes consumed so far
+	SUBQ CX, R13 // step the pointer back by that many bytes
+	MOVQ (R13), AX // reload all 64 bits in one read
+	SUBQ CX, BX
+	ANDQ $0x07, DX // keep only the sub-byte part of the consumed count
+	JMP  sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00 // stop when no input bytes remain
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	CMPQ    DX, $0x07 // stop when fewer than 8 bits have been consumed
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	SHLQ    $0x08, AX // make room for one byte at the bottom
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+	// Update offset: value = (state entry >> 32) + the next n bits, where n is byte 1 of the state entry
+	MOVQ   $0x00000808, CX // BEXTR control: start bit 8, length 8
+	BEXTRQ CX, R8, R14 // R14 = n, the number of extra bits to read
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX // CX = consumed bits after this read
+	ROLQ   CL, R15 // rotate the wanted bits down to the bottom of R15
+	BZHIQ  R14, R15, R15 // keep only the low n bits
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX // baseline stored in the upper 32 bits of the entry
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length: same extraction, using the match length state entry in DI
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Update literal length: same extraction, using the literal length state entry in SI
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// No refill happens here: save the read pointer and keep the offset's bit count in R13 for the offset adjustment below
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13 // R13 = bit count of the offset just decoded (taken before R8 is replaced)
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00 // 96(ctx) is the loop counter decremented at the bottom; zero means this is the final iteration
+	JZ     sequenceDecs_decode_56_bmi2_skip_update
+
+	// Update Literal Length State: index = bits 16-31 of the entry + the next n bits, n = byte 0 of the entry
+	MOVBQZX SI, R14
+	MOVQ    $0x00001010, CX // BEXTR control: start bit 16, length 16
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, SI
+
+	// Load ctx.llTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State (same scheme as the literal length state)
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State (same scheme as the literal length state)
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_56_bmi2_skip_update:
+	// Adjust offset: turn the decoded value into a real offset and maintain the offset history in R10/R11/R12
+	MOVQ 16(R9), CX // CX = decoded offset value
+	CMPQ R13, $0x01 // more than one offset bit: the value is a new offset
+	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12 // push the new offset onto the history
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000 // literal length == 0 bumps the history selector by one
+	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX // selector 0: reuse the newest offset, history unchanged
+	JMP   sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01 // dispatch on the selector: 0, 1, 2, or anything above 2
+	JB   sequenceDecs_decode_56_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_bmi2_adjust_three
+	JMP  sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+	MOVQ R10, R13 // candidate = newest history entry
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+	MOVQ R11, R13 // candidate = second history entry
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+	MOVQ R12, R13 // candidate = oldest history entry
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+	LEAQ -1(R10), R13 // candidate = newest history entry minus one
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13 // a zero candidate is replaced by 1
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12 // unless the selector was 1, the second entry also shifts into the oldest slot
+	MOVQ    R10, R11
+	MOVQ    R13, R10 // the candidate becomes the newest history entry
+	MOVQ    R13, CX
+
+sequenceDecs_decode_56_bmi2_adjust_end:
+	MOVQ CX, 16(R9) // store the final offset in the output record
+
+	// Check values: account sizes, then validate literal budget, match length and the offset/match-length pairing
+	MOVQ  8(R9), R13 // R13 = match length
+	MOVQ  (R9), R14 // R14 = literal length
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15) // subtract the literal length from the counter at 128(ctx)
+	JS    error_not_enough_literals // the counter went negative
+	CMPQ  R13, $0x00020002 // match length limit: 131074
+	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+	TESTQ R13, R13 // offset == 0 is accepted only together with match length == 0
+	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9 // advance to the next 24-byte output record
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_56_bmi2_main_loop // loop while the counter is still non-negative
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX) // write the offset history back to s
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX) // write the bit reader state back to br
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success (0)
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (1): offset is zero but match length is not
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error (2)
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (3); no label precedes this, so it is never reached in this function
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error (4)
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (5); no label precedes this, so it is never reached in this function
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 // Executes decoded sequences (literal copy + match copy) into the output buffer; returns false if a match offset is invalid
+	MOVQ  ctx+0(FP), R10
+	MOVQ  8(R10), CX // CX = number of sequences (loop bound for DX)
+	TESTQ CX, CX
+	JZ    empty_seqs
+	MOVQ  (R10), AX // AX = seqsBase
+	MOVQ  24(R10), DX // DX = seqIndex
+	MOVQ  32(R10), BX // BX = outBase
+	MOVQ  80(R10), SI // SI = literals read pointer
+	MOVQ  104(R10), DI // DI = outPosition
+	MOVQ  120(R10), R8 // R8 = window size limit for match offsets
+	MOVQ  56(R10), R9 // R9 = history base
+	MOVQ  64(R10), R10 // R10 = len(hist); the ctx pointer is no longer needed in a register
+	ADDQ  R10, R9 // R9 = one past the last history byte
+
+	// seqsBase += 24 * seqIndex
+	LEAQ (DX)(DX*2), R11
+	SHLQ $0x03, R11
+	ADDQ R11, AX
+
+	// outBase += outPosition
+	ADDQ DI, BX
+
+main_loop:
+	MOVQ (AX), R11 // R11 = literal length
+	MOVQ 16(AX), R12 // R12 = match offset
+	MOVQ 8(AX), R13 // R13 = match length
+
+	// Copy literals: exactly R11 bytes. The low four bits of the length select 1/2/4/8-byte moves, the rest goes in 16-byte chunks
+	TESTQ R11, R11
+	JZ    check_offset
+	XORQ  R14, R14 // R14 = bytes copied so far
+	TESTQ $0x00000001, R11
+	JZ    copy_1_word
+	MOVB  (SI)(R14*1), R15
+	MOVB  R15, (BX)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_1_dword
+	MOVW  (SI)(R14*1), R15
+	MOVW  R15, (BX)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_1_qword
+	MOVL  (SI)(R14*1), R15
+	MOVL  R15, (BX)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_1_test
+	MOVQ  (SI)(R14*1), R15
+	MOVQ  R15, (BX)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (SI)(R14*1), X0
+	MOVUPS X0, (BX)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, R11
+	JB   copy_1
+	ADDQ R11, SI // advance literals pointer, output pointer and output position
+	ADDQ R11, BX
+	ADDQ R11, DI
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	LEAQ (DI)(R10*1), R11 // R11 = outPosition + len(hist)
+	CMPQ R12, R11
+	JG   error_match_off_too_big
+	CMPQ R12, R8
+	JG   error_match_off_too_big
+
+	// Copy match from history: needed only when the offset reaches back before the start of the output
+	MOVQ  R12, R11
+	SUBQ  DI, R11 // R11 = offset - outPosition = bytes the match starts inside history
+	JLS   copy_match // offset <= outPosition: the whole match lies in the current buffer
+	MOVQ  R9, R14
+	SUBQ  R11, R14 // R14 = source address inside history
+	CMPQ  R13, R11
+	JGE   copy_all_from_history // the match runs past the end of history into the current buffer
+	XORQ  R11, R11 // match lies entirely in history: copy exactly R13 bytes (R11 = bytes copied, R12 = scratch)
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(R11*1), R12
+	MOVB  R12, (BX)(R11*1)
+	ADDQ  $0x01, R11
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(R11*1), R12
+	MOVW  R12, (BX)(R11*1)
+	ADDQ  $0x02, R11
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(R11*1), R12
+	MOVL  R12, (BX)(R11*1)
+	ADDQ  $0x04, R11
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(R11*1), R12
+	MOVQ  R12, (BX)(R11*1)
+	ADDQ  $0x08, R11
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(R11*1), X0
+	MOVUPS X0, (BX)(R11*1)
+	ADDQ   $0x10, R11
+
+copy_4_test:
+	CMPQ R11, R13
+	JB   copy_4
+	ADDQ R13, DI
+	ADDQ R13, BX
+	ADDQ $0x18, AX // next 24-byte sequence record
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15 // copy exactly R11 bytes (the rest of history); R15 = bytes copied, BP = scratch
+	TESTQ $0x00000001, R11
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (BX)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (BX)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (BX)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (BX)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (BX)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, R11
+	JB   copy_5
+	ADDQ R11, BX
+	ADDQ R11, DI
+	SUBQ R11, R13 // R13 = match bytes still to be taken from the current buffer
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  BX, R11
+	SUBQ  R12, R11 // R11 = source = output pointer - offset
+
+	// ml <= mo: source and destination ranges do not overlap
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte chunks; the last chunk may read and write up to 15 bytes beyond the match length
+	XORQ R12, R12
+
+copy_2:
+	MOVUPS (R11)(R12*1), X0
+	MOVUPS X0, (BX)(R12*1)
+	ADDQ   $0x10, R12
+	CMPQ   R12, R13
+	JB     copy_2
+	ADDQ   R13, BX
+	ADDQ   R13, DI
+	JMP    handle_loop
+
+	// Copy overlapping match one byte at a time so bytes written earlier in this copy are read back as source
+copy_overlapping_match:
+	XORQ R12, R12
+
+copy_slow_3:
+	MOVB (R11)(R12*1), R14
+	MOVB R14, (BX)(R12*1)
+	INCQ R12
+	CMPQ R12, R13
+	JB   copy_slow_3
+	ADDQ R13, BX
+	ADDQ R13, DI
+
+handle_loop:
+	ADDQ $0x18, AX // next 24-byte sequence record
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+
+loop_finished:
+	// Return value: true
+	MOVB $0x01, ret+8(FP)
+
+	// Update the context: seqIndex, outPosition, and the literals consumed (pointer minus the base at 80(ctx)) stored at 112(ctx)
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+error_match_off_too_big:
+	// Return value: false
+	MOVB $0x00, ret+8(FP)
+
+	// Update the context with the progress made up to the failing sequence
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+empty_seqs:
+	// Return value: true; with no sequences the context is left untouched
+	MOVB $0x01, ret+8(FP)
+	RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 // Decodes each sequence and executes it immediately (literal + match copy into s.out); returns 0 or an error code 1-5
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX // DX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(AX), BX // BX = number of bits already consumed from DX (byte at br+40)
+	MOVQ    24(AX), SI // SI = input bytes not yet loaded (br+24)
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP) // (SP) = input base + SI; the read pointer only ever moves downwards
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI // DI = literal length state entry
+	MOVQ    80(AX), R8 // R8 = match length state entry
+	MOVQ    88(AX), R9 // R9 = offset state entry
+	MOVQ    112(AX), R10 // R10 = outBase
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP) // 32(SP) = capacity of s.out, turned into an end pointer below
+	MOVQ    144(AX), R11 // R11 = literals read pointer
+	MOVQ    136(AX), R12 // R12 = outPosition
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP) // 56(SP) = window size limit for match offsets
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP) // 48(SP) = history base, turned into an end pointer below
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP) // 40(SP) = len(hist)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP) // 48(SP) = one past the last history byte
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+	MOVQ (SP), R13 // R13 = current input read pointer
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX // AX = whole bytes consumed so far
+	SUBQ AX, R13 // step the pointer back by that many bytes
+	MOVQ (R13), DX // reload all 64 bits in one read
+	SUBQ AX, SI
+	ANDQ $0x07, BX // keep only the sub-byte part of the consumed count
+	JMP  sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00 // stop when no input bytes remain
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	CMPQ    BX, $0x07 // stop when fewer than 8 bits have been consumed
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+	// Update offset: value = (state entry >> 32) + the next n bits, where n is byte 1 of the state entry
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14 // discard the bits already consumed
+	MOVB    AH, CL // CL = n (byte 1 of the state entry)
+	ADDQ    CX, BX // consumed += n
+	NEGL    CX
+	SHRQ    CL, R14 // shift right by 64-n so only the top n bits remain
+	SHRQ    $0x20, AX // baseline stored in the upper 32 bits of the entry
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14 // n == 0: the shift above was a no-op, so force the extra bits to zero
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP) // 8(SP) = offset
+
+	// Update match length: same extraction, using the match length state entry in R8
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP) // 16(SP) = match length
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+	// Update literal length: same extraction, using the literal length state entry in DI
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP) // 24(SP) = literal length
+
+	// No refill happens here: save the read pointer and keep the offset's bit count in AX for the offset adjustment below
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX // AX = byte 1 of the offset state entry (taken before R9 is replaced)
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00 // 96(ctx) is the loop counter decremented at the bottom; zero means this is the final iteration
+	JZ      sequenceDecs_decodeSync_amd64_skip_update
+
+	// Update Literal Length State: index = bits 16-31 of the entry + the next n bits, n = byte 0 of the entry
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R13, $0x00 // n == 0: nothing to read, the index is the base alone
+	JZ      sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State (same scheme as the literal length state)
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State (same scheme as the literal length state)
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+	// Adjust offset: turn the decoded value into a real offset and maintain the three-entry offset history at s+144/152/160
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13 // R13 = decoded offset value
+	CMPQ   AX, $0x01 // more than one offset bit: the value is a new offset
+	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0 // load the two newest history entries
+	MOVQ   R13, 144(CX) // the new offset becomes the newest entry
+	MOVUPS X0, 152(CX) // the previous two shift down one slot each
+	JMP    sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000 // literal length == 0 bumps the history selector by one
+	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13 // selector 0: reuse the newest offset, history unchanged
+	JMP   sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX // AX = history index to read (the selector itself unless it is 3)
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX // selector 3: read history entry 0 ...
+	CMOVQEQ R15, R14 // ... and subtract one from it
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14 // R14 = candidate offset
+	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14 // a zero candidate is replaced by 1
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01 // unless the selector was 1, the second entry also shifts into the oldest slot
+	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX) // the candidate becomes the newest history entry
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_adjust_end:
+	MOVQ R13, 8(SP) // 8(SP) = final offset
+
+	// Check values: account sizes, then validate literal budget, match length and the offset/match-length pairing
+	MOVQ  16(SP), AX // AX = match length
+	MOVQ  24(SP), CX // CX = literal length
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14) // subtract the literal length from the counter at 104(ctx)
+	JS    error_not_enough_literals // the counter went negative
+	CMPQ  AX, $0x00020002 // match length limit: 131074
+	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+	TESTQ AX, AX // offset == 0 is accepted only together with match length == 0
+	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX // AX = literal length
+	MOVQ 8(SP), CX // CX = match offset
+	MOVQ 16(SP), R13 // R13 = match length
+
+	// Check if we have enough space in s.out for literal length + match length bytes
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals: exactly AX bytes. The low four bits of the length select 1/2/4/8-byte moves, the rest goes in 16-byte chunks
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14 // R14 = bytes copied so far
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11 // advance literals pointer, output pointer and output position
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX // AX = outPosition + len(hist)
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history: needed only when the offset reaches back before the start of the output
+	MOVQ  CX, AX
+	SUBQ  R12, AX // AX = offset - outPosition = bytes the match starts inside history
+	JLS   copy_match // offset <= outPosition: the whole match lies in the current buffer
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14 // R14 = source address inside history
+	CMPQ  R13, AX
+	JGE   copy_all_from_history // the match runs past the end of history into the current buffer
+	XORQ  AX, AX // match lies entirely in history: copy exactly R13 bytes (AX = bytes copied, CX = scratch)
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished // unreachable: directly follows an unconditional JMP
+
+copy_all_from_history:
+	XORQ  R15, R15 // copy exactly AX bytes (the rest of history); R15 = bytes copied, BP = scratch
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13 // R13 = match bytes still to be taken from the current buffer
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX // AX = source = output pointer - offset
+
+	// ml <= mo: source and destination ranges do not overlap
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte chunks; the last chunk may read and write up to 15 bytes beyond the match length
+	XORQ CX, CX
+
+copy_2:
+	MOVUPS (AX)(CX*1), X0
+	MOVUPS X0, (R10)(CX*1)
+	ADDQ   $0x10, CX
+	CMPQ   CX, R13
+	JB     copy_2
+	ADDQ   R13, R10
+	ADDQ   R13, R12
+	JMP    handle_loop
+
+	// Copy overlapping match one byte at a time so bytes written earlier in this copy are read back as source
+copy_overlapping_match:
+	XORQ CX, CX
+
+copy_slow_3:
+	MOVB (AX)(CX*1), R14
+	MOVB R14, (R10)(CX*1)
+	INCQ CX
+	CMPQ CX, R13
+	JB   copy_slow_3
+	ADDQ R13, R10
+	ADDQ R13, R12
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_amd64_main_loop // loop while the counter is still non-negative
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX) // write the bit reader state back to br
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context: outPosition, and the literals consumed (pointer minus the base at 144(ctx)) stored at 168(ctx)
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success (0)
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (1); the match length is reported at 216(ctx)
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error (2); the match length is reported at 216(ctx)
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (3); the offset is reported at 224(ctx) and outPosition is written back
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error (4); the literal length is reported at 208(ctx)
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (5); literal length, match length and outPosition are reported in ctx
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 // BMI2 variant: decodes each sequence and executes it immediately (literal + match copy into s.out); returns 0 or an error code 1-5
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX // AX = 64-bit bit buffer (br+32)
+	MOVBQZX 40(CX), DX // DX = number of bits already consumed from AX (byte at br+40)
+	MOVQ    24(CX), BX // BX = input bytes not yet loaded (br+24)
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP) // (SP) = input base + BX; the read pointer only ever moves downwards
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI // SI = literal length state entry
+	MOVQ    80(CX), DI // DI = match length state entry
+	MOVQ    88(CX), R8 // R8 = offset state entry
+	MOVQ    112(CX), R9 // R9 = outBase
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP) // 32(SP) = capacity of s.out, turned into an end pointer below
+	MOVQ    144(CX), R10 // R10 = literals read pointer
+	MOVQ    136(CX), R11 // R11 = outPosition
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP) // 56(SP) = window size limit for match offsets
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP) // 48(SP) = history base, turned into an end pointer below
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP) // 40(SP) = len(hist)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP) // 48(SP) = one past the last history byte
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+	MOVQ (SP), R12 // R12 = current input read pointer
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX // CX = whole bytes consumed so far
+	SUBQ CX, R12 // step the pointer back by that many bytes
+	MOVQ (R12), AX // reload all 64 bits in one read
+	SUBQ CX, BX
+	ANDQ $0x07, DX // keep only the sub-byte part of the consumed count
+	JMP  sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00 // stop when no input bytes remain
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	CMPQ    DX, $0x07 // stop when fewer than 8 bits have been consumed
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+	// Update offset: value = (state entry >> 32) + the next n bits, where n is byte 1 of the state entry
+	MOVQ   $0x00000808, CX // BEXTR control: start bit 8, length 8
+	BEXTRQ CX, R8, R13 // R13 = n, the number of extra bits to read
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX // CX = consumed bits after this read
+	ROLQ   CL, R14 // rotate the wanted bits down to the bottom of R14
+	BZHIQ  R13, R14, R14 // keep only the low n bits
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX // baseline stored in the upper 32 bits of the entry
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP) // 8(SP) = offset
+
+	// Update match length: same extraction, using the match length state entry in DI
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP) // 16(SP) = match length
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+	// Update literal length: same extraction, using the literal length state entry in SI
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP) // 24(SP) = literal length
+
+	// No refill happens here: save the read pointer and keep the offset's bit count in R12 for the offset adjustment below
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12 // R12 = byte 1 of the offset state entry (taken before R8 is replaced)
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00 // 96(ctx) is the loop counter decremented at the bottom; zero means this is the final iteration
+	JZ     sequenceDecs_decodeSync_bmi2_skip_update
+
+	// Update Literal Length State: index = bits 16-31 of the entry + the next n bits, n = byte 0 of the entry
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX // BEXTR control: start bit 16, length 16
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State (same scheme as the literal length state)
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State (same scheme as the literal length state)
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable and fetch the 8-byte entry for the new state
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+	// Adjust offset: turn the decoded value into a real offset and maintain the three-entry offset history at s+144/152/160
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13 // R13 = decoded offset value
+	CMPQ   R12, $0x01 // more than one offset bit: the value is a new offset
+	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0 // load the two newest history entries
+	MOVQ   R13, 144(CX) // the new offset becomes the newest entry
+	MOVUPS X0, 152(CX) // the previous two shift down one slot each
+	JMP    sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000 // literal length == 0 bumps the history selector by one
+	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13 // selector 0: reuse the newest offset, history unchanged
+	JMP   sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12 // R12 = history index to read (the selector itself unless it is 3)
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12 // selector 3: read history entry 0 ...
+	CMOVQEQ R15, R14 // ... and subtract one from it
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14 // R14 = candidate offset
+	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14 // a zero candidate is replaced by 1
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01 // unless the selector was 1, the second entry also shifts into the oldest slot
+	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX) // the candidate becomes the newest history entry
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_adjust_end:
+	MOVQ R13, 8(SP) // 8(SP) = final offset
+
+	// Check values: account sizes, then validate literal budget, match length and the offset/match-length pairing
+	MOVQ  16(SP), CX // CX = match length
+	MOVQ  24(SP), R12 // R12 = literal length
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15) // add literal length + match length to the running total at s+256
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14) // subtract the literal length from the counter at 104(ctx)
+	JS    error_not_enough_literals // the counter went negative
+	CMPQ  CX, $0x00020002 // match length limit: 131074
+	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+	TESTQ CX, CX // offset == 0 is accepted only together with match length == 0
+	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX // CX = literal length
+	MOVQ 8(SP), R12 // R12 = match offset
+	MOVQ 16(SP), R13 // R13 = match length
+
+	// Check if we have enough space in s.out for literal length + match length bytes
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals: exactly CX bytes. The low four bits of the length select 1/2/4/8-byte moves, the rest goes in 16-byte chunks
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14 // R14 = bytes copied so far
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10 // advance literals pointer, output pointer and output position
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX // CX = outPosition + len(hist)
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history: needed only when the offset reaches back before the start of the output
+	MOVQ  R12, CX
+	SUBQ  R11, CX // CX = offset - outPosition = bytes the match starts inside history
+	JLS   copy_match // offset <= outPosition: the whole match lies in the current buffer
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14 // R14 = source address inside history
+	CMPQ  R13, CX
+	JGE   copy_all_from_history // the match runs past the end of history into the current buffer
+	XORQ  CX, CX // match lies entirely in history: copy exactly R13 bytes (CX = bytes copied, R12 = scratch)
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished // unreachable: directly follows an unconditional JMP
+
+copy_all_from_history:
+	XORQ  R15, R15 // copy exactly CX bytes (the rest of history); R15 = bytes copied, BP = scratch
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13 // R13 = match bytes still to be taken from the current buffer
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX // CX = source = output pointer - offset
+
+	// ml <= mo: source and destination ranges do not overlap
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match in 16-byte chunks; the last chunk may read and write up to 15 bytes beyond the match length
+	XORQ R12, R12
+
+copy_2:
+	MOVUPS (CX)(R12*1), X0
+	MOVUPS X0, (R9)(R12*1)
+	ADDQ   $0x10, R12
+	CMPQ   R12, R13
+	JB     copy_2
+	ADDQ   R13, R9
+	ADDQ   R13, R11
+	JMP    handle_loop
+
+	// Copy overlapping match one byte at a time so bytes written earlier in this copy are read back as source
+copy_overlapping_match:
+	XORQ R12, R12
+
+copy_slow_3:
+	MOVB (CX)(R12*1), R14
+	MOVB R14, (R9)(R12*1)
+	INCQ R12
+	CMPQ R12, R13
+	JB   copy_slow_3
+	ADDQ R13, R9
+	ADDQ R13, R11
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_bmi2_main_loop // loop while the counter is still non-negative
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX) // write the bit reader state back to br
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context: outPosition, and the literals consumed (pointer minus the base at 144(ctx)) stored at 168(ctx)
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success (0)
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error (1); the match length is reported at 216(ctx)
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error (2); the match length is reported at 216(ctx)
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (3); the offset is reported at 224(ctx) and outPosition is written back
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error (4); the literal length is reported at 208(ctx)
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (5); literal length, match length and outPosition are reported in ctx
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+	// Update offset: upper 32 bits of the offset state (R9) plus extra bits from the bit buffer (DX) -> 8(SP)
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP)
+
+	// Update match length: same scheme using the match length state (R8) -> 16(SP)
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+	// Update literal length: same scheme using the literal length state (DI) -> 24(SP)
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP)
+
+	// Store the bit-reader pointer back to (SP); AX = bits 8-15 of the offset state; skip state updates if 96(ctx) == 0
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  CX, AX
+	SUBQ  R12, AX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14
+	CMPQ  R13, AX
+	JGE   copy_all_from_history
+	XORQ  AX, AX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished // unreachable: directly follows the unconditional JMP handle_loop
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX
+
+	// Non-overlapping copy is only valid when ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (AX)(CX*1), R14
+	MOVB  R14, (R10)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (AX)(CX*1), R14
+	MOVW  R14, (R10)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (AX)(CX*1), R14
+	MOVL  R14, (R10)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (AX)(CX*1), R14
+	MOVQ  R14, (R10)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (AX)(CX*1), X0
+	MOVUPS X0, (R10)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_2_test:
+	CMPQ CX, R13
+	JB   copy_2
+	ADDQ R13, R10
+	ADDQ R13, R12
+	JMP  handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	XORQ CX, CX
+
+copy_slow_3:
+	MOVB (AX)(CX*1), R14
+	MOVB R14, (R10)(CX*1)
+	INCQ CX
+	CMPQ CX, R13
+	JB   copy_slow_3
+	ADDQ R13, R10
+	ADDQ R13, R12
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+	// Update offset: upper 32 bits of the offset state (R8) plus extra bits from the bit buffer (AX) -> 8(SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length: same scheme using the match length state (DI) -> 16(SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+	// Update literal length: same scheme using the literal length state (SI) -> 24(SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Store the bit-reader pointer back to (SP); R12 = bits 8-15 of the offset state; skip state updates if 96(ctx) == 0
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decodeSync_safe_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  R12, CX
+	SUBQ  R11, CX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14
+	CMPQ  R13, CX
+	JGE   copy_all_from_history
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished // unreachable: directly follows the unconditional JMP handle_loop
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX
+
+	// Non-overlapping copy is only valid when ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	XORQ  R12, R12
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (CX)(R12*1), R14
+	MOVB  R14, (R9)(R12*1)
+	ADDQ  $0x01, R12
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (CX)(R12*1), R14
+	MOVW  R14, (R9)(R12*1)
+	ADDQ  $0x02, R12
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (CX)(R12*1), R14
+	MOVL  R14, (R9)(R12*1)
+	ADDQ  $0x04, R12
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (CX)(R12*1), R14
+	MOVQ  R14, (R9)(R12*1)
+	ADDQ  $0x08, R12
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (CX)(R12*1), X0
+	MOVUPS X0, (R9)(R12*1)
+	ADDQ   $0x10, R12
+
+copy_2_test:
+	CMPQ R12, R13
+	JB   copy_2
+	ADDQ R13, R9
+	ADDQ R13, R11
+	JMP  handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	XORQ R12, R12
+
+copy_slow_3:
+	MOVB (CX)(R12*1), R14
+	MOVB R14, (R9)(R12*1)
+	INCQ R12
+	CMPQ R12, R13
+	JB   copy_slow_3
+	ADDQ R13, R9
+	ADDQ R13, R11
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
new file mode 100644
index 000000000..c3452bc3a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+)
+
+// decodeSyncSimple is a stub in this generic (non-assembly) build: it ignores hist, decodes nothing and always returns false, nil.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	return false, nil
+}
+
+// decode reads len(seqs) sequences from s.br into seqs, validating each one, and accumulates the output size in s.seqSize; no history is used.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	// Grab full-size tables, to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	s.seqSize = 0
+	litRemain := len(s.literals)
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+	for i := range seqs {
+		var ll, mo, ml int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("WARNING: temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
+			br.fillFast()
+		} else {
+			if br.overread() {
+				if debugDecoder {
+					printf("reading sequence %d, exceeded available data\n", i)
+				}
+				return io.ErrUnexpectedEOF
+			}
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+		}
+		// Evaluate.
+		// We might be doing this async, so do it early.
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+		}
+		s.seqSize += ll + ml
+		if s.seqSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		}
+		litRemain -= ll
+		if litRemain < 0 {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+		}
+		seqs[i] = seqVals{
+			ll: ll,
+			ml: ml,
+			mo: mo,
+		}
+		if i == len(seqs)-1 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+	s.seqSize += litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// executeSimple applies seqs to s.out, copying literals from s.literals and matches from hist or earlier output; no dictionary is used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Malformed input: the offset reaches past the available output plus history, or exceeds the window size.
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into the current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+
+		// We must be in the current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at a time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination is the space we just added.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index ffffcbc25..b53f606a1 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -21,23 +21,34 @@ const ZipMethodPKWare = 20
 var zipReaderPool sync.Pool
 
 // newZipReader creates a pooled zip decompressor.
-func newZipReader(r io.Reader) io.ReadCloser {
-	dec, ok := zipReaderPool.Get().(*Decoder)
-	if ok {
-		dec.Reset(r)
-	} else {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	pool := &zipReaderPool
+	if len(opts) > 0 {
+		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+		// Force concurrency 1
+		opts = append(opts, WithDecoderConcurrency(1))
+		// Create our own pool
+		pool = &sync.Pool{}
+	}
+	return func(r io.Reader) io.ReadCloser {
+		dec, ok := pool.Get().(*Decoder)
+		if ok {
+			dec.Reset(r)
+		} else {
+			d, err := NewReader(r, opts...)
+			if err != nil {
+				panic(err)
+			}
+			dec = d
 		}
-		dec = d
+		return &pooledZipReader{dec: dec, pool: pool}
 	}
-	return &pooledZipReader{dec: dec}
 }
 
 type pooledZipReader struct {
-	mu  sync.Mutex // guards Close and Read
-	dec *Decoder
+	mu   sync.Mutex // guards Close and Read
+	pool *sync.Pool
+	dec  *Decoder
 }
 
 func (r *pooledZipReader) Read(p []byte) (n int, err error) {
@@ -48,8 +59,8 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	}
 	dec, err := r.dec.Read(p)
 	if err == io.EOF {
-		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.dec.Reset(nil)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return dec, err
@@ -61,7 +72,7 @@ func (r *pooledZipReader) Close() error {
 	var err error
 	if r.dec != nil {
 		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return err
@@ -115,6 +126,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
-	return newZipReader
+// Options can be specified. WithDecoderConcurrency(1) is forced,
+// and by default a 128MB maximum decompression window is specified.
+// The window size can be overridden if required.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	return newZipReader(opts...)
 }