diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s | 3519 |
1 files changed, 3519 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s new file mode 100644 index 000000000..01cc23fa8 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -0,0 +1,3519 @@ +// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm + +// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_amd64(SB), $8-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_end + +sequenceDecs_decode_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_byte_by_byte + +sequenceDecs_decode_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, 8(R10) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_2_end + +sequenceDecs_decode_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte + +sequenceDecs_decode_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRQ $0x10, DI + MOVWQZX DI, DI + CMPQ R14, $0x00 + JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, DI + +sequenceDecs_decode_amd64_llState_updateState_skip_zero: + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + CMPQ R14, $0x00 + JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, R8 + +sequenceDecs_decode_amd64_mlState_updateState_skip_zero: + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + CMPQ R14, $0x00 + JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, R9 + +sequenceDecs_decode_amd64_ofState_updateState_skip_zero: + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_amd64_adjust_end + +sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_amd64_adjust_offset_nonzero + +sequenceDecs_decode_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_amd64_adjust_end + +sequenceDecs_decode_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_amd64_adjust_zero + JEQ sequenceDecs_decode_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_amd64_adjust_three + JMP sequenceDecs_decode_amd64_adjust_two + +sequenceDecs_decode_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_amd64_adjust_end: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_56_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decode_56_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_56_amd64_fill_end + +sequenceDecs_decode_56_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_56_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_56_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte + +sequenceDecs_decode_56_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, 8(R10) + + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R15 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R15 + ADDQ R15, AX + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRQ $0x10, DI + MOVWQZX DI, DI + CMPQ R14, $0x00 + JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, DI + +sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + CMPQ R14, $0x00 + JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, R8 + +sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + CMPQ R14, $0x00 + JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero + MOVQ BX, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX + NEGQ CX + SHRQ CL, R15 + ADDQ R15, R9 + +sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_56_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_56_amd64_adjust_end + +sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero + +sequenceDecs_decode_56_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_56_amd64_adjust_end + +sequenceDecs_decode_56_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_amd64_adjust_zero + JEQ sequenceDecs_decode_56_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_amd64_adjust_three + JMP sequenceDecs_decode_56_amd64_adjust_two + +sequenceDecs_decode_56_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_56_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_56_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_56_amd64_adjust_end: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_56_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_56_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_end + +sequenceDecs_decode_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_byte_by_byte + +sequenceDecs_decode_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_2_end + +sequenceDecs_decode_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte + +sequenceDecs_decode_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_bmi2_skip_update + + // Update Literal Length State + MOVBQZX SI, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, SI, SI + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + + // Update Match Length State + MOVBQZX DI, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, DI, DI + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Offset State + MOVBQZX R8, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, R8, R8 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + +sequenceDecs_decode_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_bmi2_adjust_end + +sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_bmi2_adjust_end + +sequenceDecs_decode_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_bmi2_adjust_zero + JEQ sequenceDecs_decode_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_bmi2_adjust_three + JMP sequenceDecs_decode_bmi2_adjust_two + +sequenceDecs_decode_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_three: + LEAQ -1(R10), R13 + +sequenceDecs_decode_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_bmi2_adjust_end: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_56_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_56_bmi2_fill_end + +sequenceDecs_decode_56_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_56_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_56_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte + +sequenceDecs_decode_56_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_bmi2_skip_update + + // Update Literal Length State + MOVBQZX SI, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, SI, SI + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + + // Update Match Length State + MOVBQZX DI, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, DI, DI + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Offset State + MOVBQZX R8, R14 + MOVQ $0x00001010, CX + BEXTRQ CX, R8, R8 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + +sequenceDecs_decode_56_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_56_bmi2_adjust_end + +sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_56_bmi2_adjust_end + +sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_bmi2_adjust_zero + JEQ sequenceDecs_decode_56_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_bmi2_adjust_three + JMP sequenceDecs_decode_56_bmi2_adjust_two + +sequenceDecs_decode_56_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_three: + LEAQ -1(R10), R13 + +sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_56_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_56_bmi2_adjust_end: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_56_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_56_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + XORQ R14, R14 + TESTQ $0x00000001, R11 + JZ copy_1_word + MOVB (SI)(R14*1), R15 + MOVB R15, (BX)(R14*1) + ADDQ $0x01, R14 + +copy_1_word: + TESTQ $0x00000002, R11 + JZ copy_1_dword + MOVW (SI)(R14*1), R15 + MOVW R15, (BX)(R14*1) + ADDQ $0x02, R14 + +copy_1_dword: + TESTQ $0x00000004, R11 + JZ copy_1_qword + MOVL (SI)(R14*1), R15 + MOVL R15, (BX)(R14*1) + ADDQ $0x04, R14 + +copy_1_qword: + TESTQ $0x00000008, R11 + JZ copy_1_test + MOVQ (SI)(R14*1), R15 + MOVQ R15, (BX)(R14*1) + ADDQ $0x08, R14 + JMP copy_1_test + +copy_1: + MOVUPS (SI)(R14*1), X0 + MOVUPS X0, (BX)(R14*1) + ADDQ $0x10, R14 + +copy_1_test: + CMPQ R14, R11 + JB copy_1 + ADDQ R11, SI + ADDQ R11, BX + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JGE copy_all_from_history + XORQ R11, R11 + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(R11*1), R12 + MOVB R12, (BX)(R11*1) + ADDQ $0x01, R11 + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(R11*1), R12 + MOVW R12, (BX)(R11*1) + ADDQ $0x02, R11 + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(R11*1), R12 + MOVL R12, (BX)(R11*1) + ADDQ $0x04, R11 + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(R11*1), R12 + MOVQ R12, (BX)(R11*1) + ADDQ $0x08, R11 + JMP copy_4_test + +copy_4: + MOVUPS (R14)(R11*1), X0 + MOVUPS X0, (BX)(R11*1) + ADDQ $0x10, R11 + +copy_4_test: + CMPQ R11, R13 + JB copy_4 + ADDQ R13, DI + ADDQ R13, BX + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, R11 + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (BX)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, R11 + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (BX)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, R11 + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (BX)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, R11 + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (BX)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (BX)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, R11 + JB copy_5 + ADDQ R11, BX + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + XORQ R12, R12 + +copy_2: + MOVUPS (R11)(R12*1), X0 + MOVUPS X0, (BX)(R12*1) + ADDQ $0x10, R12 + CMPQ R12, R13 + JB copy_2 + ADDQ R13, BX + ADDQ R13, DI + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + XORQ R12, R12 + +copy_slow_3: + MOVB (R11)(R12*1), R14 + MOVB R14, (BX)(R12*1) + INCQ R12 + CMPQ R12, R13 + JB copy_slow_3 + ADDQ R13, BX + ADDQ R13, DI + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_amd64_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_end + +sequenceDecs_decodeSync_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_2_end + +sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRQ $0x10, DI + MOVWQZX DI, DI + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, DI + +sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, R8 + +sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, R9 + +sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_amd64_adjust_end + +sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_amd64_adjust_end + +sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + LEAQ 144(CX), R15 + ADDQ (R15)(AX*8), R14 + JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_amd64_adjust_end: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + XORQ R14, R14 + TESTQ $0x00000001, AX + JZ copy_1_word + MOVB (R11)(R14*1), R15 + MOVB R15, (R10)(R14*1) + ADDQ $0x01, R14 + +copy_1_word: + TESTQ $0x00000002, AX + JZ copy_1_dword + MOVW (R11)(R14*1), R15 + MOVW R15, (R10)(R14*1) + ADDQ $0x02, R14 + +copy_1_dword: + TESTQ $0x00000004, AX + JZ copy_1_qword + MOVL (R11)(R14*1), R15 + MOVL R15, (R10)(R14*1) + ADDQ $0x04, R14 + +copy_1_qword: + TESTQ $0x00000008, AX + JZ copy_1_test + MOVQ (R11)(R14*1), R15 + MOVQ R15, (R10)(R14*1) + ADDQ $0x08, R14 + JMP copy_1_test + +copy_1: + MOVUPS (R11)(R14*1), X0 + MOVUPS X0, (R10)(R14*1) + ADDQ $0x10, R14 + +copy_1_test: + CMPQ R14, AX + JB copy_1 + ADDQ AX, R11 + ADDQ AX, R10 + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JGE copy_all_from_history + XORQ AX, AX + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(AX*1), CL + MOVB CL, (R10)(AX*1) + ADDQ $0x01, AX + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(AX*1), CX + MOVW CX, (R10)(AX*1) + ADDQ $0x02, AX + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(AX*1), CX + MOVL CX, (R10)(AX*1) + ADDQ $0x04, AX + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(AX*1), CX + MOVQ CX, (R10)(AX*1) + ADDQ $0x08, AX + JMP copy_4_test + +copy_4: + MOVUPS (R14)(AX*1), X0 + MOVUPS X0, (R10)(AX*1) + ADDQ $0x10, AX + +copy_4_test: + CMPQ AX, R13 + JB copy_4 + ADDQ R13, R12 + ADDQ R13, R10 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, AX + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (R10)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, AX + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (R10)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, AX + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (R10)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, AX + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (R10)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (R10)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, AX + JB copy_5 + ADDQ AX, R10 + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + XORQ CX, CX + +copy_2: + MOVUPS (AX)(CX*1), X0 + MOVUPS X0, (R10)(CX*1) + ADDQ $0x10, CX + CMPQ CX, R13 + JB copy_2 + ADDQ R13, R10 + ADDQ R13, R12 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + XORQ CX, CX + +copy_slow_3: + MOVB (AX)(CX*1), R14 + MOVB R14, (R10)(CX*1) + INCQ CX + CMPQ CX, R13 + JB copy_slow_3 + ADDQ R13, R10 + ADDQ R13, R12 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 112(CX), R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_end + +sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_2_end + +sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_bmi2_skip_update + + // Update Literal Length State + MOVBQZX SI, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, SI, SI + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + + // Update Match Length State + MOVBQZX DI, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, DI, DI + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Offset State + MOVBQZX R8, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, R8, R8 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + +sequenceDecs_decodeSync_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_bmi2_adjust_end + +sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_bmi2_adjust_end + +sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + LEAQ 144(CX), R15 + ADDQ (R15)(R12*8), R14 + JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_bmi2_adjust_end: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + XORQ R14, R14 + TESTQ $0x00000001, CX + JZ copy_1_word + MOVB (R10)(R14*1), R15 + MOVB R15, (R9)(R14*1) + ADDQ $0x01, R14 + +copy_1_word: + TESTQ $0x00000002, CX + JZ copy_1_dword + MOVW (R10)(R14*1), R15 + MOVW R15, (R9)(R14*1) + ADDQ $0x02, R14 + +copy_1_dword: + TESTQ $0x00000004, CX + JZ copy_1_qword + MOVL (R10)(R14*1), R15 + MOVL R15, (R9)(R14*1) + ADDQ $0x04, R14 + +copy_1_qword: + TESTQ $0x00000008, CX + JZ copy_1_test + MOVQ (R10)(R14*1), R15 + MOVQ R15, (R9)(R14*1) + ADDQ $0x08, R14 + JMP copy_1_test + +copy_1: + MOVUPS (R10)(R14*1), X0 + MOVUPS X0, (R9)(R14*1) + ADDQ $0x10, R14 + +copy_1_test: + CMPQ R14, CX + JB copy_1 + ADDQ CX, R10 + ADDQ CX, R9 + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JGE copy_all_from_history + XORQ CX, CX + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(CX*1), R12 + MOVB R12, (R9)(CX*1) + ADDQ $0x01, CX + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(CX*1), R12 + MOVW R12, (R9)(CX*1) + ADDQ $0x02, CX + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(CX*1), R12 + MOVL R12, (R9)(CX*1) + ADDQ $0x04, CX + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(CX*1), R12 + MOVQ R12, (R9)(CX*1) + ADDQ $0x08, CX + JMP copy_4_test + +copy_4: + MOVUPS (R14)(CX*1), X0 + MOVUPS X0, (R9)(CX*1) + ADDQ $0x10, CX + +copy_4_test: + CMPQ CX, R13 + JB copy_4 + ADDQ R13, R11 + ADDQ R13, R9 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, CX + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (R9)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, CX + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (R9)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, CX + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (R9)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, CX + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (R9)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (R9)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, CX + JB copy_5 + ADDQ CX, R9 + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + XORQ R12, R12 + +copy_2: + MOVUPS (CX)(R12*1), X0 + MOVUPS X0, (R9)(R12*1) + ADDQ $0x10, R12 + CMPQ R12, R13 + JB copy_2 + ADDQ R13, R9 + ADDQ R13, R11 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + XORQ R12, R12 + +copy_slow_3: + MOVB (CX)(R12*1), R14 + MOVB R14, (R9)(R12*1) + INCQ R12 + CMPQ R12, R13 + JB copy_slow_3 + ADDQ R13, R9 + ADDQ R13, R11 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_safe_amd64_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_end + +sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end + +sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + ADDQ CX, BX + NEGL CX + SHRQ CL, R14 + SHRQ $0x20, AX + TESTQ CX, CX + CMOVQEQ CX, R14 + ADDQ R14, AX + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_safe_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRQ $0x10, DI + MOVWQZX DI, DI + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, DI + +sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, R8 + +sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + CMPQ R13, $0x00 + JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero + MOVQ BX, CX + ADDQ R13, BX + MOVQ DX, R14 + SHLQ CL, R14 + MOVQ R13, CX + NEGQ CX + SHRQ CL, R14 + ADDQ R14, R9 + +sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_safe_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + +sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + LEAQ 144(CX), R15 + ADDQ (R15)(AX*8), R14 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_safe_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_amd64_adjust_end: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + XORQ R14, R14 + TESTQ $0x00000001, AX + JZ copy_1_word + MOVB (R11)(R14*1), R15 + MOVB R15, (R10)(R14*1) + ADDQ $0x01, R14 + +copy_1_word: + TESTQ $0x00000002, AX + JZ copy_1_dword + MOVW (R11)(R14*1), R15 + MOVW R15, (R10)(R14*1) + ADDQ $0x02, R14 + +copy_1_dword: + TESTQ $0x00000004, AX + JZ copy_1_qword + MOVL (R11)(R14*1), R15 + MOVL R15, (R10)(R14*1) + ADDQ $0x04, R14 + +copy_1_qword: + TESTQ $0x00000008, AX + JZ copy_1_test + MOVQ (R11)(R14*1), R15 + MOVQ R15, (R10)(R14*1) + ADDQ $0x08, R14 + JMP copy_1_test + +copy_1: + MOVUPS (R11)(R14*1), X0 + MOVUPS X0, (R10)(R14*1) + ADDQ $0x10, R14 + +copy_1_test: + CMPQ R14, AX + JB copy_1 + ADDQ AX, R11 + ADDQ AX, R10 + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JGE copy_all_from_history + XORQ AX, AX + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(AX*1), CL + MOVB CL, (R10)(AX*1) + ADDQ $0x01, AX + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(AX*1), CX + MOVW CX, (R10)(AX*1) + ADDQ $0x02, AX + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(AX*1), CX + MOVL CX, (R10)(AX*1) + ADDQ $0x04, AX + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(AX*1), CX + MOVQ CX, (R10)(AX*1) + ADDQ $0x08, AX + JMP copy_4_test + +copy_4: + MOVUPS (R14)(AX*1), X0 + MOVUPS X0, (R10)(AX*1) + ADDQ $0x10, AX + +copy_4_test: + CMPQ AX, R13 + JB copy_4 + ADDQ R13, R12 + ADDQ R13, R10 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, AX + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (R10)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, AX + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (R10)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, AX + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (R10)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, AX + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (R10)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (R10)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, AX + JB copy_5 + ADDQ AX, R10 + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + XORQ CX, CX + TESTQ $0x00000001, R13 + JZ copy_2_word + MOVB (AX)(CX*1), R14 + MOVB R14, (R10)(CX*1) + ADDQ $0x01, CX + +copy_2_word: + TESTQ $0x00000002, R13 + JZ copy_2_dword + MOVW (AX)(CX*1), R14 + MOVW R14, (R10)(CX*1) + ADDQ $0x02, CX + +copy_2_dword: + TESTQ $0x00000004, R13 + JZ copy_2_qword + MOVL (AX)(CX*1), R14 + MOVL R14, (R10)(CX*1) + ADDQ $0x04, CX + +copy_2_qword: + TESTQ $0x00000008, R13 + JZ copy_2_test + MOVQ (AX)(CX*1), R14 + MOVQ R14, (R10)(CX*1) + ADDQ $0x08, CX + JMP copy_2_test + +copy_2: + MOVUPS (AX)(CX*1), X0 + MOVUPS X0, (R10)(CX*1) + ADDQ $0x10, CX + +copy_2_test: + CMPQ CX, R13 + JB copy_2 + ADDQ R13, R10 + ADDQ R13, R12 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + XORQ CX, CX + +copy_slow_3: + MOVB (AX)(CX*1), R14 + MOVB R14, (R10)(CX*1) + INCQ CX + CMPQ CX, R13 + JB copy_slow_3 + ADDQ R13, R10 + ADDQ R13, R12 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_safe_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 112(CX), R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_safe_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_end + +sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end + +sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_safe_bmi2_skip_update + + // Update Literal Length State + MOVBQZX SI, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, SI, SI + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + + // Update Match Length State + MOVBQZX DI, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, DI, DI + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Offset State + MOVBQZX R8, R13 + MOVQ $0x00001010, CX + BEXTRQ CX, R8, R8 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + ADDQ R14, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + +sequenceDecs_decodeSync_safe_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + +sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + LEAQ 144(CX), R15 + ADDQ (R15)(R12*8), R14 + JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_safe_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_bmi2_adjust_end: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + XORQ R14, R14 + TESTQ $0x00000001, CX + JZ copy_1_word + MOVB (R10)(R14*1), R15 + MOVB R15, (R9)(R14*1) + ADDQ $0x01, R14 + +copy_1_word: + TESTQ $0x00000002, CX + JZ copy_1_dword + MOVW (R10)(R14*1), R15 + MOVW R15, (R9)(R14*1) + ADDQ $0x02, R14 + +copy_1_dword: + TESTQ $0x00000004, CX + JZ copy_1_qword + MOVL (R10)(R14*1), R15 + MOVL R15, (R9)(R14*1) + ADDQ $0x04, R14 + +copy_1_qword: + TESTQ $0x00000008, CX + JZ copy_1_test + MOVQ (R10)(R14*1), R15 + MOVQ R15, (R9)(R14*1) + ADDQ $0x08, R14 + JMP copy_1_test + +copy_1: + MOVUPS (R10)(R14*1), X0 + MOVUPS X0, (R9)(R14*1) + ADDQ $0x10, R14 + +copy_1_test: + CMPQ R14, CX + JB copy_1 + ADDQ CX, R10 + ADDQ CX, R9 + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JGE copy_all_from_history + XORQ CX, CX + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(CX*1), R12 + MOVB R12, (R9)(CX*1) + ADDQ $0x01, CX + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(CX*1), R12 + MOVW R12, (R9)(CX*1) + ADDQ $0x02, CX + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(CX*1), R12 + MOVL R12, (R9)(CX*1) + ADDQ $0x04, CX + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(CX*1), R12 + MOVQ R12, (R9)(CX*1) + ADDQ $0x08, CX + JMP copy_4_test + +copy_4: + MOVUPS (R14)(CX*1), X0 + MOVUPS X0, (R9)(CX*1) + ADDQ $0x10, CX + +copy_4_test: + CMPQ CX, R13 + JB copy_4 + ADDQ R13, R11 + ADDQ R13, R9 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, CX + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (R9)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, CX + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (R9)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, CX + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (R9)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, CX + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (R9)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (R9)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, CX + JB copy_5 + ADDQ CX, R9 + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + XORQ R12, R12 + TESTQ $0x00000001, R13 + JZ copy_2_word + MOVB (CX)(R12*1), R14 + MOVB R14, (R9)(R12*1) + ADDQ $0x01, R12 + +copy_2_word: + TESTQ $0x00000002, R13 + JZ copy_2_dword + MOVW (CX)(R12*1), R14 + MOVW R14, (R9)(R12*1) + ADDQ $0x02, R12 + +copy_2_dword: + TESTQ $0x00000004, R13 + JZ copy_2_qword + MOVL (CX)(R12*1), R14 + MOVL R14, (R9)(R12*1) + ADDQ $0x04, R12 + +copy_2_qword: + TESTQ $0x00000008, R13 + JZ copy_2_test + MOVQ (CX)(R12*1), R14 + MOVQ R14, (R9)(R12*1) + ADDQ $0x08, R12 + JMP copy_2_test + +copy_2: + MOVUPS (CX)(R12*1), X0 + MOVUPS X0, (R9)(R12*1) + ADDQ $0x10, R12 + +copy_2_test: + CMPQ R12, R13 + JB copy_2 + ADDQ R13, R9 + ADDQ R13, R11 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + XORQ R12, R12 + +copy_slow_3: + MOVB (CX)(R12*1), R14 + MOVB R14, (R9)(R12*1) + INCQ R12 + CMPQ R12, R13 + JB copy_slow_3 + ADDQ R13, R9 + ADDQ R13, R11 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_safe_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET |