summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/README.md10
-rw-r--r--vendor/github.com/klauspost/compress/zstd/bytebuf.go15
-rw-r--r--vendor/github.com/klauspost/compress/zstd/decoder.go3
-rw-r--r--vendor/github.com/klauspost/compress/zstd/encoder.go4
-rw-r--r--vendor/github.com/klauspost/compress/zstd/encoder_options.go2
-rw-r--r--vendor/github.com/klauspost/compress/zstd/framedec.go21
-rw-r--r--vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go4
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go26
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s460
9 files changed, 329 insertions, 216 deletions
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index c7cf1a20c..ad5c63a82 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,16 @@ This package provides various compression algorithms.
# changelog
+* July 13, 2022 (v1.15.8)
+
+ * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
+ * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
+ * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
+ * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
+ * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
+ * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
+ * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
+
* June 29, 2022 (v1.15.7)
* s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 4493baa75..2ad02070d 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -23,7 +23,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
- skipN(n int) error
+ skipN(n int64) error
}
// in-memory buffer
@@ -62,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
return r, nil
}
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
bb := *b
- if len(bb) < n {
+ if n < 0 {
+ return fmt.Errorf("negative skip (%d) requested", n)
+ }
+ if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@@ -120,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
-func (r *readerWrapper) skipN(n int) error {
- n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
- if n2 != int64(n) {
+func (r *readerWrapper) skipN(n int64) error {
+ n2, err := io.CopyN(ioutil.Discard, r.r, n)
+ if n2 != n {
err = io.ErrUnexpectedEOF
}
return err
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 286c8f9d7..d212f4737 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -348,6 +348,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
frame.history.setDict(&dict)
}
if frame.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
+ }
return dst, ErrWindowSizeExceeded
}
if frame.FrameContentSize != fcsUnknown {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index e6b1d01cf..7aaaedb23 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
- // Use single segments when above minimum window and below 1MB.
- single := len(src) < 1<<20 && len(src) > MinWindowSize
+ // Use single segments when above minimum window and below window size.
+ single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 44d8dbd19..a7c5e1aac 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index fa0a633f3..9568a4ba3 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
- err = br.skipN(int(n))
+ err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
+ if d.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrWindowSizeExceeded
+ }
+
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
- }
-
- if d.WindowSize > uint64(d.o.maxWindowSize) {
- if debugDecoder {
- printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ if d.WindowSize > d.o.maxDecodedSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrDecoderSizeExceeded
}
- return ErrWindowSizeExceeded
}
+
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
index e74df436c..c881d28d8 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -34,8 +34,8 @@ const (
// buildDtable will build the decoding table.
func (s *fseDecoder) buildDtable() error {
ctx := buildDtableAsmContext{
- stateTable: (*uint16)(&s.stateTable[0]),
- norm: (*int16)(&s.norm[0]),
+ stateTable: &s.stateTable[0],
+ norm: &s.norm[0],
dt: (*uint64)(&s.dt[0]),
}
code := buildDtable_asm(s, &ctx)
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
index 847b322ae..7598c1018 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -55,16 +55,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
return false, nil
}
- useSafe := false
- if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
- useSafe = true
- }
- if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
- useSafe = true
- }
- if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
- useSafe = true
- }
+
+ // FIXME: Using unsafe memory copies leads to rare, random crashes
+ // with fuzz testing. It is therefore disabled for now.
+ const useSafe = true
+ /*
+ useSafe := false
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+ useSafe = true
+ }
+ if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+ useSafe = true
+ }
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ useSafe = true
+ }
+ */
br := s.br
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 71e64e061..27e76774c 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -52,34 +52,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte:
sequenceDecs_decode_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 16(R10)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 8(R10)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -107,19 +119,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte:
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, (R10)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ll_update_zero:
+ MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@@ -198,7 +216,7 @@ sequenceDecs_decode_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
- JMP sequenceDecs_decode_amd64_adjust_end
+ JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@@ -210,7 +228,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
MOVQ R11, CX
- JMP sequenceDecs_decode_amd64_adjust_end
+ JMP sequenceDecs_decode_amd64_after_adjust
sequenceDecs_decode_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -247,7 +265,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
-sequenceDecs_decode_amd64_adjust_end:
+sequenceDecs_decode_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@@ -356,49 +374,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte:
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 16(R10)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, 8(R10)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R15
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R15
- ADDQ R15, AX
- MOVQ AX, (R10)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ll_update_zero:
+ MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
@@ -477,7 +513,7 @@ sequenceDecs_decode_56_amd64_skip_update:
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
- JMP sequenceDecs_decode_56_amd64_adjust_end
+ JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
@@ -489,7 +525,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
MOVQ R11, CX
- JMP sequenceDecs_decode_56_amd64_adjust_end
+ JMP sequenceDecs_decode_56_amd64_after_adjust
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -526,7 +562,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid:
MOVQ AX, R11
MOVQ AX, CX
-sequenceDecs_decode_56_amd64_adjust_end:
+sequenceDecs_decode_56_amd64_after_adjust:
MOVQ CX, 16(R10)
// Check values
@@ -757,7 +793,7 @@ sequenceDecs_decode_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
- JMP sequenceDecs_decode_bmi2_adjust_end
+ JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@@ -769,7 +805,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
MOVQ R10, CX
- JMP sequenceDecs_decode_bmi2_adjust_end
+ JMP sequenceDecs_decode_bmi2_after_adjust
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -806,7 +842,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
-sequenceDecs_decode_bmi2_adjust_end:
+sequenceDecs_decode_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@@ -1012,7 +1048,7 @@ sequenceDecs_decode_56_bmi2_skip_update:
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
- JMP sequenceDecs_decode_56_bmi2_adjust_end
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
@@ -1024,7 +1060,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
MOVQ R10, CX
- JMP sequenceDecs_decode_56_bmi2_adjust_end
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
@@ -1061,7 +1097,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid:
MOVQ R13, R10
MOVQ R13, CX
-sequenceDecs_decode_56_bmi2_adjust_end:
+sequenceDecs_decode_56_bmi2_after_adjust:
MOVQ CX, 16(R9)
// Check values
@@ -1749,6 +1785,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@@ -1798,34 +1838,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 8(SP)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 16(SP)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -1853,19 +1905,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 24(SP)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@@ -1945,7 +2003,7 @@ sequenceDecs_decodeSync_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -1957,7 +2015,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@@ -1966,8 +2024,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(AX*8), R14
+ ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -1983,7 +2040,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_amd64_adjust_end:
+sequenceDecs_decodeSync_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -2280,6 +2337,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@@ -2452,7 +2513,7 @@ sequenceDecs_decodeSync_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -2464,7 +2525,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@@ -2473,8 +2534,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(R12*8), R14
+ ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -2490,7 +2550,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_bmi2_adjust_end:
+sequenceDecs_decodeSync_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -2787,6 +2847,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
@@ -2836,34 +2900,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 8(SP)
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
// Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 16(SP)
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
@@ -2891,19 +2967,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- ADDQ CX, BX
- NEGL CX
- SHRQ CL, R14
- SHRQ $0x20, AX
- TESTQ CX, CX
- CMOVQEQ CX, R14
- ADDQ R14, AX
- MOVQ AX, 24(SP)
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
@@ -2983,7 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -2995,7 +3077,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
@@ -3004,8 +3086,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(AX*8), R14
+ ADDQ 144(CX)(AX*8), R14
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -3021,7 +3102,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_safe_amd64_adjust_end:
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
MOVQ R13, 8(SP)
// Check values
@@ -3420,6 +3501,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
@@ -3592,7 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update:
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
@@ -3604,7 +3689,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
@@ -3613,8 +3698,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
- LEAQ 144(CX), R15
- ADDQ (R15)(R12*8), R14
+ ADDQ 144(CX)(R12*8), R14
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
@@ -3630,7 +3714,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
MOVQ R14, 144(CX)
MOVQ R14, R13
-sequenceDecs_decodeSync_safe_bmi2_adjust_end:
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
MOVQ R13, 8(SP)
// Check values