14 files changed, 1044 insertions, 146 deletions
diff --git a/vendor/github.com/containers/image/v5/copy/copy.go b/vendor/github.com/containers/image/v5/copy/copy.go
index 9fc0e5123..7482fb458 100644
--- a/vendor/github.com/containers/image/v5/copy/copy.go
+++ b/vendor/github.com/containers/image/v5/copy/copy.go
@@ -659,7 +659,7 @@ func (c *copier) copyOneImage(ctx context.Context, policyContext *signature.Poli
 		// With !ic.canModifyManifest, that would just be a string of repeated failures for the same reason,
 		// so let’s bail out early and with a better error message.
 		if !ic.canModifyManifest {
-			return nil, "", "", errors.Wrap(err, "Writing manifest failed (and converting it is not possible)")
+			return nil, "", "", errors.Wrap(err, "Writing manifest failed (and converting it is not possible, image is signed or the destination specifies a digest)")
 		}
 
 		// errs is a list of errors when trying various manifest types. Also serves as an "upload succeeded" flag when set to nil.
@@ -757,7 +757,7 @@ func (ic *imageCopier) updateEmbeddedDockerReference() error {
 	}
 
 	if !ic.canModifyManifest {
-		return errors.Errorf("Copying a schema1 image with an embedded Docker reference to %s (Docker reference %s) would invalidate existing signatures. Explicitly enable signature removal to proceed anyway",
+		return errors.Errorf("Copying a schema1 image with an embedded Docker reference to %s (Docker reference %s) would change the manifest, which is not possible (image is signed or the destination specifies a digest)",
 			transports.ImageName(ic.c.dest.Reference()), destRef.String())
 	}
 	ic.manifestUpdates.EmbeddedDockerReference = destRef
@@ -784,7 +784,7 @@ func (ic *imageCopier) copyLayers(ctx context.Context) error {
 	// If we only need to check authorization, no updates required.
 	if updatedSrcInfos != nil && !reflect.DeepEqual(srcInfos, updatedSrcInfos) {
 		if !ic.canModifyManifest {
-			return errors.Errorf("Internal error: copyLayers() needs to use an updated manifest but that was known to be forbidden")
+			return errors.Errorf("Copying this image requires changing layer representation, which is not possible (image is signed or the destination specifies a digest)")
 		}
 		srcInfos = updatedSrcInfos
 		srcInfosUpdated = true
@@ -1060,6 +1060,14 @@ func (ic *imageCopier) copyLayer(ctx context.Context, srcInfo types.BlobInfo, to
 			logrus.Debugf("Skipping blob %s (already present):", srcInfo.Digest)
 			bar := ic.c.createProgressBar(pool, srcInfo, "blob", "skipped: already exists")
 			bar.SetTotal(0, true)
+
+			// Throw an event that the layer has been skipped
+			if ic.c.progress != nil && ic.c.progressInterval > 0 {
+				ic.c.progress <- types.ProgressProperties{
+					Event:    types.ProgressEventSkipped,
+					Artifact: srcInfo,
+				}
+			}
 			return blobInfo, cachedDiffID, nil
 		}
 	}
diff --git a/vendor/github.com/containers/image/v5/oci/layout/oci_dest.go b/vendor/github.com/containers/image/v5/oci/layout/oci_dest.go
index fb0449ca5..48a32315b 100644
--- a/vendor/github.com/containers/image/v5/oci/layout/oci_dest.go
+++ b/vendor/github.com/containers/image/v5/oci/layout/oci_dest.go
@@ -279,7 +279,7 @@ func (d *ociImageDestination) addManifest(desc *imgspecv1.Descriptor) {
 	// If it has the same digest as another entry in the index, we already overwrote the file,
 	// so just pick up the other information.
 	for i, manifest := range d.index.Manifests {
-		if manifest.Digest == desc.Digest {
+		if manifest.Digest == desc.Digest && manifest.Annotations[imgspecv1.AnnotationRefName] == "" {
 			// Replace it completely.
 			d.index.Manifests[i] = *desc
 			return
diff --git a/vendor/github.com/containers/image/v5/types/types.go b/vendor/github.com/containers/image/v5/types/types.go
index d469e03b5..4f624cf33 100644
--- a/vendor/github.com/containers/image/v5/types/types.go
+++ b/vendor/github.com/containers/image/v5/types/types.go
@@ -604,6 +604,10 @@ const (
 	// ProgressEventDone is fired when the data transfer has been finished for
 	// the specific artifact
 	ProgressEventDone
+
+	// ProgressEventSkipped is fired when the artifact has been skipped because
+	// its already available at the destination
+	ProgressEventSkipped
 )
 
 // ProgressProperties is used to pass information from the copy code to a monitor which
diff --git a/vendor/github.com/containers/image/v5/version/version.go b/vendor/github.com/containers/image/v5/version/version.go
index 7cf412723..114bce387 100644
--- a/vendor/github.com/containers/image/v5/version/version.go
+++ b/vendor/github.com/containers/image/v5/version/version.go
@@ -8,10 +8,10 @@ const (
 	// VersionMinor is for functionality in a backwards-compatible manner
 	VersionMinor = 5
 	// VersionPatch is for backwards-compatible bug fixes
-	VersionPatch = 0
+	VersionPatch = 1
 
 	// VersionDev indicates development branch. Releases will be empty string.
-	VersionDev = "-dev"
+	VersionDev = ""
 )
 
 // Version is the specification version that the package types support.
diff --git a/vendor/github.com/klauspost/compress/fse/bitreader.go b/vendor/github.com/klauspost/compress/fse/bitreader.go
index b9db204f5..f65eb3909 100644
--- a/vendor/github.com/klauspost/compress/fse/bitreader.go
+++ b/vendor/github.com/klauspost/compress/fse/bitreader.go
@@ -6,6 +6,7 @@
 package fse
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,8 +68,9 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
@@ -77,7 +83,8 @@ func (b *bitReader) fill() {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -91,9 +98,17 @@ func (b *bitReader) fill() {
 	}
 }
 
+// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.bitsRead >= 64 && b.off == 0
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
diff --git a/vendor/github.com/klauspost/compress/fse/bytereader.go b/vendor/github.com/klauspost/compress/fse/bytereader.go
index f228a46cd..abade2d60 100644
--- a/vendor/github.com/klauspost/compress/fse/bytereader.go
+++ b/vendor/github.com/klauspost/compress/fse/bytereader.go
@@ -25,19 +25,10 @@ func (b *byteReader) advance(n uint) {
 	b.off += int(n)
 }
 
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
-	v3 := int32(b2[3])
-	v2 := int32(b2[2])
-	v1 := int32(b2[1])
-	v0 := int32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
 // Uint32 returns a little endian uint32 starting at current offset.
 func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
index 0a8448ce9..e12da4db2 100644
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -12,8 +12,6 @@ but it can be used as a secondary step to compressors (like Snappy) that does no
 
 * [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
 
-THIS PACKAGE IS NOT CONSIDERED STABLE AND API OR ENCODING MAY CHANGE IN THE FUTURE.
-
 ## News
 
  * Mar 2018: First implementation released. Consider this beta software for now.
@@ -75,6 +73,8 @@ which can be given to the decompressor.
 Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
 or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
 
+For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
+
 You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
 your input was likely corrupted. 
 
@@ -84,4 +84,4 @@ There are no integrity checks, so relying on errors from the decompressor does n
 # Contributing
 
 Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
-\ No newline at end of file
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 7d0903c70..a4979e886 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -6,6 +6,7 @@
 package huff0
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,29 +35,16 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
 	return nil
 }
 
-// getBits will return n bits. n can be 0.
-func (b *bitReader) getBits(n uint8) uint16 {
-	if n == 0 || b.bitsRead >= 64 {
-		return 0
-	}
-	return b.getBitsFast(n)
-}
-
-// getBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
-}
-
 // peekBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReader) peekBitsFast(n uint8) uint16 {
@@ -71,21 +59,36 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+
+	// 2 bounds checks.
 	v := b.in[b.off-4 : b.off]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 
+func (b *bitReader) advance(n uint8) {
+	b.bitsRead += n
+}
+
+// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -113,3 +116,214 @@ func (b *bitReader) close() error {
 	}
 	return nil
 }
+
+// bitReader reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderBytes struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReaderBytes) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+}
+
+// peekBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReaderBytes) peekByteFast() uint8 {
+	got := uint8(b.value >> 56)
+	return got
+}
+
+func (b *bitReaderBytes) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderBytes) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << (b.bitsRead - 32)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read.
+func (b *bitReaderBytes) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderBytes) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << (b.bitsRead - 32)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderBytes) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderBytes) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+// bitReaderShifted reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderShifted struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReaderShifted) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+}
+
+// peekBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
+	return uint16(b.value >> ((64 - n) & 63))
+}
+
+func (b *bitReaderShifted) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderShifted) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read.
+func (b *bitReaderShifted) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderShifted) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderShifted) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderShifted) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index fb42a398b..a03b2634a 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -25,6 +25,9 @@ type dEntryDouble struct {
 	len   uint8
 }
 
+// Uses special code for all tables that are < 8 bits.
+const use8BitTables = true
+
 // ReadTable will read a table from the input.
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
@@ -83,6 +86,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		}
 		v2 := v & 15
 		rankStats[v2]++
+		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
 		weightTotal += (1 << v2) >> 1
 	}
 	if weightTotal == 0 {
@@ -142,12 +146,14 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
 		}
-		single := s.dt.single[rankStats[w] : rankStats[w]+length]
+		rank := &rankStats[w]
+		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
 		}
-		rankStats[w] += length
+		*rank += length
 	}
+
 	return s, in, nil
 }
 
@@ -208,7 +214,10 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
-	var br bitReader
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress1X8Bit(dst, src)
+	}
+	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
@@ -216,17 +225,6 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 
-	decode := func() byte {
-		val := br.peekBitsFast(d.actualTableLog) /* note : actualTableLog >= 1 */
-		v := d.dt.single[val]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-	hasDec := func(v dEntrySingle) byte {
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-
 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
@@ -238,11 +236,25 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 
 	for br.off >= 8 {
 		br.fillFast()
-		buf[off+0] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
-		buf[off+1] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		// Refill
 		br.fillFast()
-		buf[off+2] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
-		buf[off+3] = hasDec(dt[br.peekBitsFast(d.actualTableLog)&tlMask])
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
@@ -259,13 +271,196 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	}
 	dst = append(dst, buf[:off]...)
 
-	for !br.finished() {
+	// br < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
 		br.fill()
+		if false && br.bitsRead >= 32 {
+			if br.off >= 4 {
+				v := br.in[br.off-4:]
+				v = v[:4]
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+				br.value = (br.value << 32) | uint64(low)
+				br.bitsRead -= 32
+				br.off -= 4
+			} else {
+				for br.off > 0 {
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
+					br.bitsRead -= 8
+					br.off--
+				}
+			}
+		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
-		dst = append(dst, decode())
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
+
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress1X8BitExactly(dst, src)
+	}
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+
+	shift := (8 - d.actualTableLog) & 7
+
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
+
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+
+	const shift = 0
+
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }
@@ -274,15 +469,18 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
-func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(s.dt.single) == 0 {
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if len(src) < 6+(4*1) {
 		return nil, errors.New("input too small")
 	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
 
-	var br [4]bitReader
+	var br [4]bitReaderShifted
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
@@ -308,14 +506,7 @@ func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
-	single := s.dt.single[:tlSize]
-
-	decode := func(br *bitReader) byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := single[val&tlMask]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
+	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
@@ -324,69 +515,484 @@ func (s *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 
 	// Decode 2 values from each decoder/loop.
 	const bufoff = 256 / 4
-bigloop:
 	for {
-		for i := range br {
-			br := &br[i]
-			if br.off < 4 {
-				break bigloop
-			}
-			br.fillFast()
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
 		}
 
 		{
 			const stream = 0
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
+
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}
 
 		{
-			const stream = 1
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
+
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := br.off*8 + uint(64-br.bitsRead)
+		for bitsLeft > 0 {
+			br.fill()
+			if false && br.bitsRead >= 32 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value = (br.value << 32) | uint64(low)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value = (br.value << 8) | uint64(br.in[br.off-1])
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress4X8bitExactly(dst, src)
+	}
+
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	shift := (8 - d.actualTableLog) & 7
+
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
 		}
 
 		{
 			const stream = 2
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		off += 4
+
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const shift = 0
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
 		}
 
 		{
-			const stream = 3
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			buf[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
 		}
 
-		off += 2
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		off += 4
 
 		if off == bufoff {
 			if bufoff > dstEvery {
@@ -422,12 +1028,38 @@ bigloop:
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
-		for !br.finished() {
-			br.fill()
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
-			out[offset] = decode(br)
+
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 15d79d439..854458537 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -5,6 +5,7 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 	"math/bits"
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,21 +68,31 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 
+// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off >= 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 4a14242c7..c8ec6e331 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -83,6 +83,10 @@ type blockDec struct {
 	err         error
 	decWG       sync.WaitGroup
 
+	// Frame to use for singlethreaded decoding.
+	// Should not be used by the decoder itself since parent may be another frame.
+	localFrame *frameDec
+
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index f708df1c4..2c4fca17f 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -4,8 +4,6 @@
 
 package zstd
 
-import "encoding/binary"
-
 // byteReader provides a byte reader that reads
 // little endian values from a byte stream.
 // The input stream is manually advanced.
@@ -33,7 +31,8 @@ func (b *byteReader) overread() bool {
 
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
@@ -57,7 +56,25 @@ func (b byteReader) Uint32() uint32 {
 		}
 		return v
 	}
-	return binary.LittleEndian.Uint32(b.b[b.off : b.off+4])
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// Uint32NC returns a little endian uint32 starting at current offset.
+// The caller must be sure if there are at least 4 bytes left.
+func (b byteReader) Uint32NC() uint32 {
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
 }
 
 // unread returns the unread portion of the input.
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 8e34479ff..75bf05bc9 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -23,9 +23,6 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 
-	// Unreferenced decoders, ready for use.
-	frames chan *frameDec
-
 	// Streams ready to be decoded.
 	stream chan decodeStream
 
@@ -90,10 +87,10 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 
 	// Create decoders
 	d.decoders = make(chan *blockDec, d.o.concurrent)
-	d.frames = make(chan *frameDec, d.o.concurrent)
 	for i := 0; i < d.o.concurrent; i++ {
-		d.frames <- newFrameDec(d.o)
-		d.decoders <- newBlockDec(d.o.lowMem)
+		dec := newBlockDec(d.o.lowMem)
+		dec.localFrame = newFrameDec(d.o)
+		d.decoders <- dec
 	}
 
 	if r == nil {
@@ -283,15 +280,15 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	}
 
 	// Grab a block decoder and frame decoder.
-	block, frame := <-d.decoders, <-d.frames
+	block := <-d.decoders
+	frame := block.localFrame
 	defer func() {
 		if debug {
 			printf("re-adding decoder: %p", block)
 		}
-		d.decoders <- block
 		frame.rawInput = nil
 		frame.bBuf = nil
-		d.frames <- frame
+		d.decoders <- block
 	}()
 	frame.bBuf = input
 
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index 957cfeb79..e6d3d49b3 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -55,7 +55,7 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	if b.remain() < 4 {
 		return errors.New("input too small")
 	}
-	bitStream := b.Uint32()
+	bitStream := b.Uint32NC()
 	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
 	if nbBits > tablelogAbsoluteMax {
 		println("Invalid tablelog:", nbBits)
@@ -79,7 +79,8 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				n0 += 24
 				if r := b.remain(); r > 5 {
 					b.advance(2)
-					bitStream = b.Uint32() >> bitCount
+					// The check above should make sure we can read 32 bits
+					bitStream = b.Uint32NC() >> bitCount
 				} else {
 					// end of bit stream
 					bitStream >>= 16
@@ -104,10 +105,11 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				charnum++
 			}
 
-			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 				b.advance(bitCount >> 3)
 				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
+				// The check above should make sure we can read 32 bits
+				bitStream = b.Uint32NC() >> bitCount
 			} else {
 				bitStream >>= 2
 			}
@@ -148,17 +150,16 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 			threshold >>= 1
 		}
 
-		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
-		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
+			// The check above should make sure we can read 32 bits
+			bitStream = b.Uint32NC() >> (bitCount & 31)
 		} else {
 			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
 			b.off = len(b.b) - 4
-			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+			bitStream = b.Uint32() >> (bitCount & 31)
 		}
-		bitStream = b.Uint32() >> (bitCount & 31)
-		//printf("bitstream is now: 0b%b", bitStream)
 	}
 	s.symbolLen = charnum
 	if s.symbolLen <= 1 {