Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd')
12 files changed, 340 insertions, 82 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index bc977a302..f2a80b5d0 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -309,6 +309,20 @@
 The decoder can be used for *concurrent* decompression of multiple buffers.
 It will only allow a certain number of concurrent operations to run.
 To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+### Dictionaries
+
+Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
+
+Dictionaries are added individually to Decoders.
+Dictionaries are generated by the `zstd --train` command and contains an initial state for the decoder.
+To add a dictionary use the `RegisterDict(data)` with the dictionary data before starting any decompression.
+
+The dictionary will be used automatically for the data that specifies them.
+
+A re-used Decoder will still contain the dictionaries registered.
+
+When registering a dictionary with the same ID it will override the existing.
+
 ### Allocation-less operation
 
 The decoder has been designed to operate without allocations after a warmup.
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 19181caea..4a14242c7 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -461,26 +461,22 @@ func (b *blockDec) decodeCompressed(hist *history) error {
         if huff == nil {
             huff = &huff0.Scratch{}
         }
-        huff.Out = b.literalBuf[:0]
         huff, literals, err = huff0.ReadTable(literals, huff)
         if err != nil {
             println("reading huffman table:", err)
             return err
         }
         // Use our out buffer.
-        huff.Out = b.literalBuf[:0]
-        huff.MaxDecodedSize = litRegenSize
         if fourStreams {
-            literals, err = huff.Decompress4X(literals, litRegenSize)
+            literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
         } else {
-            literals, err = huff.Decompress1X(literals)
+            literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
         }
         if err != nil {
             println("decoding compressed literals:", err)
             return err
         }
         // Make sure we don't leak our literals buffer
-        huff.Out = nil
         if len(literals) != litRegenSize {
             return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
         }
@@ -631,15 +627,12 @@ func (b *blockDec) decodeCompressed(hist *history) error {
             var err error
             // Use our out buffer.
             huff = hist.huffTree
-            huff.Out = b.literalBuf[:0]
-            huff.MaxDecodedSize = litRegenSize
             if fourStreams {
-                literals, err = huff.Decompress4X(literals, litRegenSize)
+                literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
             } else {
-                literals, err = huff.Decompress1X(literals)
+                literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
             }
             // Make sure we don't leak our literals buffer
-            huff.Out = nil
             if err != nil {
                 println("decompressing literals:", err)
                 return err
@@ -649,12 +642,13 @@ func (b *blockDec) decodeCompressed(hist *history) error {
         }
     } else {
         if hist.huffTree != nil && huff != nil {
-            huffDecoderPool.Put(hist.huffTree)
+            if hist.dict == nil || hist.dict.litDec != hist.huffTree {
+                huffDecoderPool.Put(hist.huffTree)
+            }
             hist.huffTree = nil
         }
     }
     if huff != nil {
-        huff.Out = nil
         hist.huffTree = huff
     }
     if debug {
@@ -687,14 +681,20 @@ func (b *blockDec) decodeCompressed(hist *history) error {
     // If only recent offsets were not transferred, this would be an obvious win.
     // Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
 
-    if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
-        println("initializing sequences:", err)
-        return err
-    }
     hbytes := hist.b
     if len(hbytes) > hist.windowSize {
         hbytes = hbytes[len(hbytes)-hist.windowSize:]
+        // We do not need history any more.
+        if hist.dict != nil {
+            hist.dict.content = nil
+        }
     }
+
+    if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
+        println("initializing sequences:", err)
+        return err
+    }
+
     err = seqs.decode(nSeqs, br, hbytes)
     if err != nil {
         return err
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index dc4378b64..f708df1c4 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -4,6 +4,8 @@
 
 package zstd
 
+import "encoding/binary"
+
 // byteReader provides a byte reader that reads
 // little endian values from a byte stream.
 // The input stream is manually advanced.
@@ -55,12 +57,7 @@ func (b byteReader) Uint32() uint32 {
         }
         return v
     }
-    b2 := b.b[b.off : b.off+4 : b.off+4]
-    v3 := uint32(b2[3])
-    v2 := uint32(b2[2])
-    v1 := uint32(b2[1])
-    v0 := uint32(b2[0])
-    return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+    return binary.LittleEndian.Uint32(b.b[b.off : b.off+4])
 }
 
 // unread returns the unread portion of the input.
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 324347623..8e34479ff 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -32,8 +32,9 @@ type Decoder struct {
     // Current read position used for Reader functionality.
     current decoderState
 
-    // Custom dictionaries
-    dicts map[uint32]struct{}
+    // Custom dictionaries.
+    // Always uses copies.
+    dicts map[uint32]dict
 
     // streamWg is the waitgroup for all streams
     streamWg sync.WaitGroup
@@ -295,10 +296,18 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
     frame.bBuf = input
 
     for {
+        frame.history.reset()
         err := frame.reset(&frame.bBuf)
         if err == io.EOF {
             return dst, nil
         }
+        if frame.DictionaryID != nil {
+            dict, ok := d.dicts[*frame.DictionaryID]
+            if !ok {
+                return nil, ErrUnknownDictionary
+            }
+            frame.history.setDict(&dict)
+        }
         if err != nil {
             return dst, err
         }
@@ -393,6 +402,19 @@ func (d *Decoder) Close() {
     d.current.err = ErrDecoderClosed
 }
 
+// RegisterDict will load a dictionary
+func (d *Decoder) RegisterDict(b []byte) error {
+    dc, err := loadDict(b)
+    if err != nil {
+        return err
+    }
+    if d.dicts == nil {
+        d.dicts = make(map[uint32]dict, 1)
+    }
+    d.dicts[dc.id] = *dc
+    return nil
+}
+
 // IOReadCloser returns the decoder as an io.ReadCloser for convenience.
 // Any changes to the decoder will be reflected, so the returned ReadCloser
 // can be reused along with the decoder.
@@ -466,6 +488,14 @@ func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
             if debug && err != nil {
                 println("Frame decoder returned", err)
             }
+            if err == nil && frame.DictionaryID != nil {
+                dict, ok := d.dicts[*frame.DictionaryID]
+                if !ok {
+                    err = ErrUnknownDictionary
+                } else {
+                    frame.history.setDict(&dict)
+                }
+            }
             if err != nil {
                 stream.output <- decodeOutput{
                     err: err,
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
new file mode 100644
index 000000000..8eb6f6ba3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -0,0 +1,104 @@
+package zstd
+
+import (
+    "bytes"
+    "encoding/binary"
+    "errors"
+    "fmt"
+    "io"
+
+    "github.com/klauspost/compress/huff0"
+)
+
+type dict struct {
+    id uint32
+
+    litDec              *huff0.Scratch
+    llDec, ofDec, mlDec sequenceDec
+    offsets             [3]int
+    content             []byte
+}
+
+var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+
+// Load a dictionary as described in
+// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+func loadDict(b []byte) (*dict, error) {
+    // Check static field size.
+    if len(b) <= 8+(3*4) {
+        return nil, io.ErrUnexpectedEOF
+    }
+    d := dict{
+        llDec: sequenceDec{fse: &fseDecoder{}},
+        ofDec: sequenceDec{fse: &fseDecoder{}},
+        mlDec: sequenceDec{fse: &fseDecoder{}},
+    }
+    if !bytes.Equal(b[:4], dictMagic[:]) {
+        return nil, ErrMagicMismatch
+    }
+    d.id = binary.LittleEndian.Uint32(b[4:8])
+    if d.id == 0 {
+        return nil, errors.New("dictionaries cannot have ID 0")
+    }
+
+    // Read literal table
+    var err error
+    d.litDec, b, err = huff0.ReadTable(b[8:], nil)
+    if err != nil {
+        return nil, err
+    }
+
+    br := byteReader{
+        b:   b,
+        off: 0,
+    }
+    readDec := func(i tableIndex, dec *fseDecoder) error {
+        if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
+            return err
+        }
+        if br.overread() {
+            return io.ErrUnexpectedEOF
+        }
+        err = dec.transform(symbolTableX[i])
+        if err != nil {
+            println("Transform table error:", err)
+            return err
+        }
+        if debug {
+            println("Read table ok", "symbolLen:", dec.symbolLen)
+        }
+        // Set decoders as predefined so they aren't reused.
+        dec.preDefined = true
+        return nil
+    }
+
+    if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
+        return nil, err
+    }
+    if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
+        return nil, err
+    }
+    if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
+        return nil, err
+    }
+    if br.remain() < 12 {
+        return nil, io.ErrUnexpectedEOF
+    }
+
+    d.offsets[0] = int(br.Uint32())
+    br.advance(4)
+    d.offsets[1] = int(br.Uint32())
+    br.advance(4)
+    d.offsets[2] = int(br.Uint32())
+    br.advance(4)
+    if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
+        return nil, errors.New("invalid offset in dictionary")
+    }
+    d.content = make([]byte, br.remain())
+    copy(d.content, br.unread())
+    if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
+        return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
+    }
+
+    return &d, nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 5ebead9dc..50276bcde 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -671,4 +671,8 @@ encodeLoop:
         println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
     }
+    // We do not store history, so we must offset e.cur to avoid false matches for next user.
+    if e.cur < bufferReset {
+        e.cur += int32(len(src))
+    }
 }
 
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index d1d3658e6..4104b456c 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -383,6 +383,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
             panic("src too big")
         }
     }
+
     // Protect against e.cur wraparound.
     if e.cur >= bufferReset {
         for i := range e.table[:] {
@@ -516,6 +517,9 @@ encodeLoop:
             if debugAsserts && s-t > e.maxMatchOff {
                 panic("s - t >e.maxMatchOff")
             }
+            if debugAsserts && t < 0 {
+                panic(fmt.Sprintf("t (%d) < 0, candidate.offset: %d, e.cur: %d, coffset0: %d, e.maxMatchOff: %d", t, candidate.offset, e.cur, coffset0, e.maxMatchOff))
+            }
             break
         }
 
@@ -548,6 +552,9 @@ encodeLoop:
             panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
         }
 
+        if debugAsserts && t < 0 {
+            panic(fmt.Sprintf("t (%d) < 0 ", t))
+        }
         // Extend the 4-byte match as long as possible.
         //l := e.matchlenNoHist(s+4, t+4, src) + 4
         // l := int32(matchLen(src[s+4:], src[t+4:])) + 4
@@ -647,6 +654,10 @@ encodeLoop:
     if debug {
         println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
     }
+    // We do not store history, so we must offset e.cur to avoid false matches for next user.
+    if e.cur < bufferReset {
+        e.cur += int32(len(src))
+    }
 }
 
 func (e *fastBase) addBlock(src []byte) int32 {
@@ -714,7 +725,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 }
 
 // Reset the encoding table.
-func (e *fastBase) Reset() {
+func (e *fastBase) Reset(singleBlock bool) {
     if e.blk == nil {
         e.blk = &blockEnc{}
         e.blk.init()
@@ -727,7 +738,7 @@ func (e *fastBase) Reset() {
     } else {
         e.crc.Reset()
     }
-    if cap(e.hist) < int(e.maxMatchOff*2) {
+    if !singleBlock && cap(e.hist) < int(e.maxMatchOff*2) {
         l := e.maxMatchOff * 2
         // Make it at least 1MB.
         if l < 1<<20 {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index af4f00b73..bf42bb1cf 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -35,7 +35,7 @@ type encoder interface {
     AppendCRC([]byte) []byte
     WindowSize(size int) int32
     UseBlock(*blockEnc)
-    Reset()
+    Reset(singleBlock bool)
 }
 
 type encoderState struct {
@@ -82,7 +82,10 @@ func (e *Encoder) initialize() {
     }
     e.encoders = make(chan encoder, e.o.concurrent)
     for i := 0; i < e.o.concurrent; i++ {
-        e.encoders <- e.o.encoder()
+        enc := e.o.encoder()
+        // If not single block, history will be allocated on first use.
+        enc.Reset(true)
+        e.encoders <- enc
     }
 }
 
@@ -112,7 +115,7 @@ func (e *Encoder) Reset(w io.Writer) {
     s.filling = s.filling[:0]
     s.current = s.current[:0]
     s.previous = s.previous[:0]
-    s.encoder.Reset()
+    s.encoder.Reset(false)
     s.headerWritten = false
     s.eofWritten = false
     s.fullFrameWritten = false
@@ -445,11 +448,10 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
     enc := <-e.encoders
     defer func() {
         // Release encoder reference to last block.
-        enc.Reset()
+        // If a non-single block is needed the encoder will reset again.
+        enc.Reset(true)
         e.encoders <- enc
     }()
-    enc.Reset()
-    blk := enc.Block()
     // Use single segments when above minimum window and below 1MB.
     single := len(src) < 1<<20 && len(src) > MinWindowSize
     if e.o.single != nil {
@@ -472,12 +474,13 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
         panic(err)
     }
 
-    if len(src) <= e.o.blockSize && len(src) <= maxBlockSize {
+    // If we can do everything in one block, prefer that.
+    if len(src) <= maxCompressedBlockSize {
         // Slightly faster with no history and everything in one block.
         if e.o.crc {
             _, _ = enc.CRC().Write(src)
         }
-        blk.reset(nil)
+        blk := enc.Block()
         blk.last = true
         enc.EncodeNoHist(blk, src)
 
@@ -504,6 +507,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
         }
         blk.output = oldout
     } else {
+        enc.Reset(false)
+        blk := enc.Block()
         for len(src) > 0 {
             todo := src
             if len(todo) > e.o.blockSize {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 780880ebe..fc4a566d3 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -40,7 +40,7 @@ type frameDec struct {
     FrameContentSize uint64
     frameDone        sync.WaitGroup
 
-    DictionaryID  uint32
+    DictionaryID  *uint32
     HasCheckSum   bool
     SingleSegment bool
 
@@ -142,7 +142,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 
     // Read Dictionary_ID
     // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-    d.DictionaryID = 0
+    d.DictionaryID = nil
     if size := fhd & 3; size != 0 {
         if size == 3 {
             size = 4
@@ -154,19 +154,22 @@ func (d *frameDec) reset(br byteBuffer) error {
             }
             return io.ErrUnexpectedEOF
         }
+        var id uint32
         switch size {
         case 1:
-            d.DictionaryID = uint32(b[0])
+            id = uint32(b[0])
         case 2:
-            d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+            id = uint32(b[0]) | (uint32(b[1]) << 8)
         case 4:
-            d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+            id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
         }
         if debug {
-            println("Dict size", size, "ID:", d.DictionaryID)
+            println("Dict size", size, "ID:", id)
         }
-        if d.DictionaryID != 0 {
-            return ErrUnknownDictionary
+        if id > 0 {
+            // ID 0 means "sorry, no dictionary anyway".
+            // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
+            d.DictionaryID = &id
         }
     }
 
@@ -351,8 +354,6 @@ func (d *frameDec) initAsync() {
 // When the frame has finished decoding the *bufio.Reader
 // containing the remaining input will be sent on frameDec.frameDone.
 func (d *frameDec) startDecoder(output chan decodeOutput) {
-    // TODO: Init to dictionary
-    d.history.reset()
     written := int64(0)
 
     defer func() {
@@ -445,8 +446,6 @@ func (d *frameDec) startDecoder(output chan decodeOutput) {
 
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
-    // TODO: Init to dictionary
-    d.history.reset()
     saved := d.history.b
 
     // We use the history for output to avoid copying it.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e002be98b..957cfeb79 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -19,7 +19,7 @@ const (
      * Increasing memory usage improves compression ratio
      * Reduced memory usage can improve speed, due to cache effect
      * Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-    maxMemoryUsage = 11
+    maxMemoryUsage = tablelogAbsoluteMax + 2
 
     maxTableLog  = maxMemoryUsage - 2
     maxTablesize = 1 << maxTableLog
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index e8c419bd5..f418f50fc 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -17,6 +17,7 @@ type history struct {
     windowSize int
     maxSize    int
     error      bool
+    dict       *dict
 }
 
 // reset will reset the history to initial state of a frame.
@@ -36,12 +37,27 @@ func (h *history) reset() {
     }
     h.decoders = sequenceDecs{}
     if h.huffTree != nil {
-        huffDecoderPool.Put(h.huffTree)
+        if h.dict == nil || h.dict.litDec != h.huffTree {
+            huffDecoderPool.Put(h.huffTree)
+        }
     }
     h.huffTree = nil
+    h.dict = nil
     //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
 }
 
+func (h *history) setDict(dict *dict) {
+    if dict == nil {
+        return
+    }
+    h.dict = dict
+    h.decoders.litLengths = dict.llDec
+    h.decoders.offsets = dict.ofDec
+    h.decoders.matchLengths = dict.mlDec
+    h.recentOffsets = dict.offsets
+    h.huffTree = dict.litDec
+}
+
 // append bytes to history.
 // This function will make sure there is space for it,
 // if the buffer has been allocated with enough extra space.
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 39238e16a..7ff870400 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -62,6 +62,7 @@ type sequenceDecs struct {
     matchLengths sequenceDec
     prevOffset   [3]int
     hist         []byte
+    dict         []byte
     literals     []byte
     out          []byte
     windowSize   int
@@ -85,6 +86,10 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
     s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
     s.windowSize = hist.windowSize
     s.out = out
+    s.dict = nil
+    if hist.dict != nil {
+        s.dict = hist.dict.content
+    }
     return nil
 }
 
@@ -100,23 +105,78 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
             printf("reading sequence %d, exceeded available data\n", seqs-i)
             return io.ErrUnexpectedEOF
         }
-        var litLen, matchOff, matchLen int
+        var ll, mo, ml int
         if br.off > 4+((maxOffsetBits+16+16)>>3) {
-            litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
+            // inlined function:
+            // ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+            // Final will not read from stream.
+            var llB, mlB, moB uint8
+            ll, llB = llState.final()
+            ml, mlB = mlState.final()
+            mo, moB = ofState.final()
+
+            // extra bits are stored in reverse order.
+            br.fillFast()
+            mo += br.getBits(moB)
+            if s.maxBits > 32 {
+                br.fillFast()
+            }
+            ml += br.getBits(mlB)
+            ll += br.getBits(llB)
+
+            if moB > 1 {
+                s.prevOffset[2] = s.prevOffset[1]
+                s.prevOffset[1] = s.prevOffset[0]
+                s.prevOffset[0] = mo
+            } else {
+                // mo = s.adjustOffset(mo, ll, moB)
+                // Inlined for rather big speedup
+                if ll == 0 {
+                    // There is an exception though, when current sequence's literals_length = 0.
+                    // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+                    // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+                    mo++
+                }
+
+                if mo == 0 {
+                    mo = s.prevOffset[0]
+                } else {
+                    var temp int
+                    if mo == 3 {
+                        temp = s.prevOffset[0] - 1
+                    } else {
+                        temp = s.prevOffset[mo]
+                    }
+
+                    if temp == 0 {
+                        // 0 is not valid; input is corrupted; force offset to 1
+                        println("temp was 0")
+                        temp = 1
+                    }
+
+                    if mo != 1 {
+                        s.prevOffset[2] = s.prevOffset[1]
+                    }
+                    s.prevOffset[1] = s.prevOffset[0]
+                    s.prevOffset[0] = temp
+                    mo = temp
+                }
+            }
             br.fillFast()
         } else {
-            litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
+            ll, mo, ml = s.next(br, llState, mlState, ofState)
             br.fill()
         }
 
         if debugSequences {
-            println("Seq", seqs-i-1, "Litlen:", litLen, "matchOff:", matchOff, "(abs) matchLen:", matchLen)
+            println("Seq", seqs-i-1, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
         }
 
-        if litLen > len(s.literals) {
-            return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", litLen, len(s.literals))
+        if ll > len(s.literals) {
+            return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
         }
-        size := litLen + matchLen + len(s.out)
+        size := ll + ml + len(s.out)
         if size-startSize > maxBlockSize {
             return fmt.Errorf("output (%d) bigger than max block size", size)
         }
@@ -127,52 +187,70 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
             s.out = append(s.out, make([]byte, maxBlockSize)...)
             s.out = s.out[:len(s.out)-maxBlockSize]
         }
-        if matchLen > maxMatchLen {
-            return fmt.Errorf("match len (%d) bigger than max allowed length", matchLen)
-        }
-        if matchOff > len(s.out)+len(hist)+litLen {
-            return fmt.Errorf("match offset (%d) bigger than current history (%d)", matchOff, len(s.out)+len(hist)+litLen)
-        }
-        if matchOff > s.windowSize {
-            return fmt.Errorf("match offset (%d) bigger than window size (%d)", matchOff, s.windowSize)
-        }
-        if matchOff == 0 && matchLen > 0 {
-            return fmt.Errorf("zero matchoff and matchlen > 0")
+        if ml > maxMatchLen {
+            return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
         }
 
-        s.out = append(s.out, s.literals[:litLen]...)
-        s.literals = s.literals[litLen:]
+        // Add literals
+        s.out = append(s.out, s.literals[:ll]...)
+        s.literals = s.literals[ll:]
         out := s.out
 
+        if mo > len(s.out)+len(hist) || mo > s.windowSize {
+            if len(s.dict) == 0 {
+                return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+            }
+
+            // we may be in dictionary.
+            dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+            if dictO < 0 || dictO >= len(s.dict) {
+                return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+            }
+            end := dictO + ml
+            if end > len(s.dict) {
+                out = append(out, s.dict[dictO:]...)
+                mo -= len(s.dict) - dictO
+                ml -= len(s.dict) - dictO
+            } else {
+                out = append(out, s.dict[dictO:end]...)
+                mo = 0
+                ml = 0
+            }
+        }
+
+        if mo == 0 && ml > 0 {
+            return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+        }
+
         // Copy from history.
         // TODO: Blocks without history could be made to ignore this completely.
-        if v := matchOff - len(s.out); v > 0 {
+        if v := mo - len(s.out); v > 0 {
             // v is the start position in history from end.
             start := len(s.hist) - v
-            if matchLen > v {
+            if ml > v {
                 // Some goes into current block.
                 // Copy remainder of history
                 out = append(out, s.hist[start:]...)
-                matchOff -= v
-                matchLen -= v
+                mo -= v
+                ml -= v
             } else {
-                out = append(out, s.hist[start:start+matchLen]...)
-                matchLen = 0
+                out = append(out, s.hist[start:start+ml]...)
+                ml = 0
             }
         }
         // We must be in current buffer now
-        if matchLen > 0 {
-            start := len(s.out) - matchOff
-            if matchLen <= len(s.out)-start {
+        if ml > 0 {
+            start := len(s.out) - mo
+            if ml <= len(s.out)-start {
                 // No overlap
-                out = append(out, s.out[start:start+matchLen]...)
+                out = append(out, s.out[start:start+ml]...)
             } else {
                 // Overlapping copy
                 // Extend destination slice and copy one byte at the time.
-                out = out[:len(out)+matchLen]
-                src := out[start : start+matchLen]
+                out = out[:len(out)+ml]
+                src := out[start : start+ml]
                 // Destination is the space we just added.
-                dst := out[len(out)-matchLen:]
+                dst := out[len(out)-ml:]
                 dst = dst[:len(src)]
                 for i := range src {
                     dst[i] = src[i]
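
The README hunk above documents the new decoder-side dictionary support. A minimal usage sketch using the `RegisterDict` and `DecodeAll` API added in this patch; the file names and error handling are illustrative only, not part of the change:

package main

import (
    "fmt"
    "io/ioutil"

    "github.com/klauspost/compress/zstd"
)

func main() {
    // Dictionary produced by `zstd --train` (path is illustrative).
    dictData, err := ioutil.ReadFile("my.dict")
    if err != nil {
        panic(err)
    }
    compressed, err := ioutil.ReadFile("payload.zst")
    if err != nil {
        panic(err)
    }

    dec, err := zstd.NewReader(nil)
    if err != nil {
        panic(err)
    }
    defer dec.Close()

    // Register the dictionary before decompressing. Frames that reference its
    // ID pick it up automatically; frames without a dictionary ID are unaffected.
    if err := dec.RegisterDict(dictData); err != nil {
        panic(err)
    }

    out, err := dec.DecodeAll(compressed, nil)
    if err != nil {
        panic(err)
    }
    fmt.Println("decompressed", len(out), "bytes")
}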
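loadDict in the new dict.go walks the Zstandard dictionary layout: the 4-byte magic (0x37, 0xa4, 0x30, 0xec), a little-endian 4-byte dictionary ID, the Huffman literal table, three FSE tables (offsets, match lengths, literal lengths), three recent offsets, and the remaining bytes as dictionary content. A standalone sketch of only the fixed-size header checks, mirroring the first steps of loadDict; the dictHeader helper and the sample bytes are illustrative, not part of the patch:

package main

import (
    "bytes"
    "encoding/binary"
    "errors"
    "fmt"
)

var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}

// dictHeader validates only the fixed-size prefix that loadDict checks first
// and returns the dictionary ID plus the bytes that hold the entropy tables.
func dictHeader(b []byte) (id uint32, rest []byte, err error) {
    // Magic (4) + ID (4) + three 4-byte recent offsets must fit.
    if len(b) <= 8+(3*4) {
        return 0, nil, errors.New("dictionary too small")
    }
    if !bytes.Equal(b[:4], dictMagic[:]) {
        return 0, nil, errors.New("dictionary magic mismatch")
    }
    id = binary.LittleEndian.Uint32(b[4:8])
    if id == 0 {
        return 0, nil, errors.New("dictionaries cannot have ID 0")
    }
    // The Huffman/FSE tables and content follow; the real loader parses them next.
    return id, b[8:], nil
}

func main() {
    raw := append(dictMagic[:], []byte{0x2a, 0x00, 0x00, 0x00}...) // ID 42
    raw = append(raw, make([]byte, 16)...)                         // pad past the minimum size check
    id, _, err := dictHeader(raw)
    fmt.Println(id, err)
}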
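The seqdec.go hunks let a match offset reach back past the decoded output and window into the dictionary content. A sketch of that index arithmetic with the same variable names as the diff; resolveDictCopy is an illustrative helper, not the vendored decode loop:

package main

import "fmt"

// resolveDictCopy maps an offset that exceeds output+history into the
// dictionary content, as in the patched sequence decoder: dictO is where the
// match starts inside dict, counted back from its end.
func resolveDictCopy(mo, ml, outLen, histLen int, dict []byte) ([]byte, error) {
    dictO := len(dict) - (mo - (outLen + histLen))
    if dictO < 0 || dictO >= len(dict) {
        return nil, fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, outLen+histLen)
    }
    end := dictO + ml
    if end > len(dict) {
        // The match starts in the dictionary and continues into history/output.
        // Only the dictionary part is returned here; the real decoder then
        // continues the copy from history/output with reduced mo and ml.
        return dict[dictO:], nil
    }
    return dict[dictO:end], nil
}

func main() {
    dict := []byte("hello dictionary content")
    // 10 bytes of output+history exist; offset 20 starts 10 bytes before the dictionary end.
    part, err := resolveDictCopy(20, 4, 6, 4, dict)
    fmt.Printf("%q %v\n", part, err)
}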
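On the encoder side, Reset now takes a singleBlock flag so the pooled encoders skip allocating their history buffer until a multi-block EncodeAll or streaming encode actually needs it, and EncodeAll prefers the no-history EncodeNoHist path for inputs that fit in one block. A small usage sketch of the unchanged public API benefiting from that path (payload contents are illustrative):

package main

import (
    "fmt"

    "github.com/klauspost/compress/zstd"
)

func main() {
    enc, err := zstd.NewWriter(nil)
    if err != nil {
        panic(err)
    }
    defer enc.Close()

    // A payload that fits in a single block is compressed via EncodeNoHist;
    // the pooled encoder was reset with singleBlock=true and only allocates
    // its history buffer if a multi-block encode is actually requested.
    payload := []byte("small message that fits in a single block")
    compressed := enc.EncodeAll(payload, nil)
    fmt.Println("compressed to", len(compressed), "bytes")
}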