aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s')
-rw-r--r--vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s1999
1 files changed, 1163 insertions, 836 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 212c6cac3..71e64e061 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -134,18 +134,17 @@ sequenceDecs_decode_amd64_fill_2_end:
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, DI
-sequenceDecs_decode_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -155,18 +154,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R8
-sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -176,18 +174,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R9
-sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -416,18 +413,17 @@ sequenceDecs_decode_56_amd64_fill_end:
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, DI
-sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -437,18 +433,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R8
-sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -458,18 +453,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R14, $0x00
- JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R14, BX
+ LEAQ (BX)(R14*1), CX
MOVQ DX, R15
- SHLQ CL, R15
- MOVQ R14, CX
- NEGQ CX
- SHRQ CL, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
ADDQ R15, R9
-sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -1181,52 +1175,65 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11
- JLS copy_match
- MOVQ R9, R14
- SUBQ R11, R14
- CMPQ R13, R11
- JGE copy_all_from_history
- XORQ R11, R11
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(R11*1), R12
- MOVB R12, (BX)(R11*1)
- ADDQ $0x01, R11
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(R11*1), R12
- MOVW R12, (BX)(R11*1)
- ADDQ $0x02, R11
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(R11*1), R12
- MOVL R12, (BX)(R11*1)
- ADDQ $0x04, R11
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(R11*1), R12
- MOVQ R12, (BX)(R11*1)
- ADDQ $0x08, R11
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(R11*1), X0
- MOVUPS X0, (BX)(R11*1)
- ADDQ $0x10, R11
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
-copy_4_test:
- CMPQ R11, R13
- JB copy_4
+copy_4_end:
ADDQ R13, DI
- ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
@@ -1234,53 +1241,74 @@ copy_4_test:
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, R11
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (BX)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, R11
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (BX)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, R11
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (BX)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, R11
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (BX)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (BX)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, R11
- JB copy_5
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ BX, R11
- SUBQ R12, R11
+ MOVQ BX, R11
+ SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
@@ -1382,45 +1410,67 @@ main_loop:
// Copy literals
TESTQ R11, R11
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, R11
- JZ copy_1_word
- MOVB (SI)(R14*1), R15
- MOVB R15, (BX)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, R11
- JZ copy_1_dword
- MOVW (SI)(R14*1), R15
- MOVW R15, (BX)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, R11
- JZ copy_1_qword
- MOVL (SI)(R14*1), R15
- MOVL R15, (BX)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, R11
- JZ copy_1_test
- MOVQ (SI)(R14*1), R15
- MOVQ R15, (BX)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ R11, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (SI), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, SI
+ ADDQ $0x10, BX
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(SI)(R14*1), SI
+ LEAQ 16(BX)(R14*1), BX
+ MOVUPS -16(SI), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ R11, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ R11, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (SI), R14
+ MOVB -1(SI)(R11*1), R15
+ MOVB R14, (BX)
+ MOVB R15, -1(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
-copy_1:
- MOVUPS (SI)(R14*1), X0
- MOVUPS X0, (BX)(R14*1)
- ADDQ $0x10, R14
+copy_1_move_3:
+ MOVW (SI), R14
+ MOVB 2(SI), R15
+ MOVW R14, (BX)
+ MOVB R15, 2(BX)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, R11
- JB copy_1
+copy_1_move_4through7:
+ MOVL (SI), R14
+ MOVL -4(SI)(R11*1), R15
+ MOVL R14, (BX)
+ MOVL R15, -4(BX)(R11*1)
ADDQ R11, SI
ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (SI), R14
+ MOVQ -8(SI)(R11*1), R15
+ MOVQ R14, (BX)
+ MOVQ R15, -8(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+
+copy_1_end:
ADDQ R11, DI
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -1432,52 +1482,65 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11
- JLS copy_match
- MOVQ R9, R14
- SUBQ R11, R14
- CMPQ R13, R11
- JGE copy_all_from_history
- XORQ R11, R11
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(R11*1), R12
- MOVB R12, (BX)(R11*1)
- ADDQ $0x01, R11
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(R11*1), R12
- MOVW R12, (BX)(R11*1)
- ADDQ $0x02, R11
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(R11*1), R12
- MOVL R12, (BX)(R11*1)
- ADDQ $0x04, R11
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(R11*1), R12
- MOVQ R12, (BX)(R11*1)
- ADDQ $0x08, R11
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(R11*1), X0
- MOVUPS X0, (BX)(R11*1)
- ADDQ $0x10, R11
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
-copy_4_test:
- CMPQ R11, R13
- JB copy_4
+copy_4_end:
ADDQ R13, DI
- ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
@@ -1485,99 +1548,143 @@ copy_4_test:
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, R11
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (BX)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, R11
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (BX)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, R11
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (BX)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, R11
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (BX)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (BX)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, R11
- JB copy_5
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
ADDQ R11, BX
+
+copy_5_end:
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ BX, R11
- SUBQ R12, R11
+ MOVQ BX, R11
+ SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, DI
- XORQ R12, R12
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (R11)(R12*1), R14
- MOVB R14, (BX)(R12*1)
- ADDQ $0x01, R12
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (R11)(R12*1), R14
- MOVW R14, (BX)(R12*1)
- ADDQ $0x02, R12
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (R11)(R12*1), R14
- MOVL R14, (BX)(R12*1)
- ADDQ $0x04, R12
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (R11)(R12*1), R14
- MOVQ R14, (BX)(R12*1)
- ADDQ $0x08, R12
- JMP copy_2_test
+ ADDQ R13, DI
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
-copy_2:
- MOVUPS (R11)(R12*1), X0
- MOVUPS X0, (BX)(R12*1)
- ADDQ $0x10, R12
+copy_2_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R11
+ ADDQ $0x10, BX
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(R11)(R12*1), R11
+ LEAQ 16(BX)(R12*1), BX
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (R11), R12
+ MOVB -1(R11)(R13*1), R14
+ MOVB R12, (BX)
+ MOVB R14, -1(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
-copy_2_test:
- CMPQ R12, R13
- JB copy_2
+copy_2_move_3:
+ MOVW (R11), R12
+ MOVB 2(R11), R14
+ MOVW R12, (BX)
+ MOVB R14, 2(BX)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (R11), R12
+ MOVL -4(R11)(R13*1), R14
+ MOVL R12, (BX)
+ MOVL R14, -4(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (R11), R12
+ MOVQ -8(R11)(R13*1), R14
+ MOVQ R12, (BX)
+ MOVQ R14, -8(BX)(R13*1)
+ ADDQ R13, R11
ADDQ R13, BX
- JMP handle_loop
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
@@ -1773,18 +1880,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end:
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, DI
-sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -1794,18 +1900,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R8
-sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -1815,18 +1920,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R9
-sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -1934,103 +2038,137 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JGE copy_all_from_history
- XORQ AX, AX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(AX*1), CL
- MOVB CL, (R10)(AX*1)
- ADDQ $0x01, AX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(AX*1), CX
- MOVW CX, (R10)(AX*1)
- ADDQ $0x02, AX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(AX*1), CX
- MOVL CX, (R10)(AX*1)
- ADDQ $0x04, AX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(AX*1), CX
- MOVQ CX, (R10)(AX*1)
- ADDQ $0x08, AX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(AX*1), X0
- MOVUPS X0, (R10)(AX*1)
- ADDQ $0x10, AX
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
-copy_4_test:
- CMPQ AX, R13
- JB copy_4
- ADDQ R13, R12
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, AX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R10)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, AX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R10)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, AX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R10)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, AX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R10)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R10)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, AX
- JB copy_5
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
ADDQ AX, R10
+
+copy_5_end:
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R10, AX
- SUBQ CX, AX
+ MOVQ R10, AX
+ SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
@@ -2407,103 +2545,137 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ CX, R14
- CMPQ R13, CX
- JGE copy_all_from_history
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(CX*1), R12
- MOVB R12, (R9)(CX*1)
- ADDQ $0x01, CX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(CX*1), R12
- MOVW R12, (R9)(CX*1)
- ADDQ $0x02, CX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(CX*1), R12
- MOVL R12, (R9)(CX*1)
- ADDQ $0x04, CX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(CX*1), R12
- MOVQ R12, (R9)(CX*1)
- ADDQ $0x08, CX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(CX*1), X0
- MOVUPS X0, (R9)(CX*1)
- ADDQ $0x10, CX
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
-copy_4_test:
- CMPQ CX, R13
- JB copy_4
+copy_4_end:
ADDQ R13, R11
- ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, CX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R9)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, CX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R9)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, CX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R9)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, CX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R9)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R9)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, CX
- JB copy_5
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R9, CX
- SUBQ R12, CX
+ MOVQ R9, CX
+ SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
@@ -2746,18 +2918,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end:
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, DI
-sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
@@ -2767,18 +2938,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R8
-sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
@@ -2788,18 +2958,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
- CMPQ R13, $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
- MOVQ BX, CX
- ADDQ R13, BX
+ LEAQ (BX)(R13*1), CX
MOVQ DX, R14
- SHLQ CL, R14
- MOVQ R13, CX
- NEGQ CX
- SHRQ CL, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
ADDQ R14, R9
-sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
@@ -2885,45 +3054,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
// Copy literals
TESTQ AX, AX
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, AX
- JZ copy_1_word
- MOVB (R11)(R14*1), R15
- MOVB R15, (R10)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, AX
- JZ copy_1_dword
- MOVW (R11)(R14*1), R15
- MOVW R15, (R10)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, AX
- JZ copy_1_qword
- MOVL (R11)(R14*1), R15
- MOVL R15, (R10)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, AX
- JZ copy_1_test
- MOVQ (R11)(R14*1), R15
- MOVQ R15, (R10)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ AX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
-copy_1:
- MOVUPS (R11)(R14*1), X0
- MOVUPS X0, (R10)(R14*1)
- ADDQ $0x10, R14
+copy_1_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R10
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R11)(R14*1), R11
+ LEAQ 16(R10)(R14*1), R10
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ AX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ AX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R11), R14
+ MOVB -1(R11)(AX*1), R15
+ MOVB R14, (R10)
+ MOVB R15, -1(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R11), R14
+ MOVB 2(R11), R15
+ MOVW R14, (R10)
+ MOVB R15, 2(R10)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R11), R14
+ MOVL -4(R11)(AX*1), R15
+ MOVL R14, (R10)
+ MOVL R15, -4(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, AX
- JB copy_1
+copy_1_move_8through16:
+ MOVQ (R11), R14
+ MOVQ -8(R11)(AX*1), R15
+ MOVQ R14, (R10)
+ MOVQ R15, -8(R10)(AX*1)
ADDQ AX, R11
ADDQ AX, R10
+
+copy_1_end:
ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -2936,149 +3127,206 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JGE copy_all_from_history
- XORQ AX, AX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(AX*1), CL
- MOVB CL, (R10)(AX*1)
- ADDQ $0x01, AX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(AX*1), CX
- MOVW CX, (R10)(AX*1)
- ADDQ $0x02, AX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(AX*1), CX
- MOVL CX, (R10)(AX*1)
- ADDQ $0x04, AX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(AX*1), CX
- MOVQ CX, (R10)(AX*1)
- ADDQ $0x08, AX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(AX*1), X0
- MOVUPS X0, (R10)(AX*1)
- ADDQ $0x10, AX
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
-copy_4_test:
- CMPQ AX, R13
- JB copy_4
- ADDQ R13, R12
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, AX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R10)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, AX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R10)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, AX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R10)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, AX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R10)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R10)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, AX
- JB copy_5
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R10, AX
- SUBQ CX, AX
+ MOVQ R10, AX
+ SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, R12
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (AX)(CX*1), R14
- MOVB R14, (R10)(CX*1)
- ADDQ $0x01, CX
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (AX)(CX*1), R14
- MOVW R14, (R10)(CX*1)
- ADDQ $0x02, CX
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (AX)(CX*1), R14
- MOVL R14, (R10)(CX*1)
- ADDQ $0x04, CX
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (AX)(CX*1), R14
- MOVQ R14, (R10)(CX*1)
- ADDQ $0x08, CX
- JMP copy_2_test
-
-copy_2:
- MOVUPS (AX)(CX*1), X0
- MOVUPS X0, (R10)(CX*1)
- ADDQ $0x10, CX
+ ADDQ R13, R12
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_2_small
-copy_2_test:
- CMPQ CX, R13
- JB copy_2
+copy_2_loop:
+ MOVUPS (AX), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, AX
+ ADDQ $0x10, R10
+ SUBQ $0x10, CX
+ JAE copy_2_loop
+ LEAQ 16(AX)(CX*1), AX
+ LEAQ 16(R10)(CX*1), R10
+ MOVUPS -16(AX), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (AX), CL
+ MOVB -1(AX)(R13*1), R14
+ MOVB CL, (R10)
+ MOVB R14, -1(R10)(R13*1)
+ ADDQ R13, AX
ADDQ R13, R10
- JMP handle_loop
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (AX), CX
+ MOVB 2(AX), R14
+ MOVW CX, (R10)
+ MOVB R14, 2(R10)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (AX), CX
+ MOVL -4(AX)(R13*1), R14
+ MOVL CX, (R10)
+ MOVL R14, -4(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (AX), CX
+ MOVQ -8(AX)(R13*1), R14
+ MOVQ CX, (R10)
+ MOVQ R14, -8(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
@@ -3415,45 +3663,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
// Copy literals
TESTQ CX, CX
JZ check_offset
- XORQ R14, R14
- TESTQ $0x00000001, CX
- JZ copy_1_word
- MOVB (R10)(R14*1), R15
- MOVB R15, (R9)(R14*1)
- ADDQ $0x01, R14
-
-copy_1_word:
- TESTQ $0x00000002, CX
- JZ copy_1_dword
- MOVW (R10)(R14*1), R15
- MOVW R15, (R9)(R14*1)
- ADDQ $0x02, R14
-
-copy_1_dword:
- TESTQ $0x00000004, CX
- JZ copy_1_qword
- MOVL (R10)(R14*1), R15
- MOVL R15, (R9)(R14*1)
- ADDQ $0x04, R14
-
-copy_1_qword:
- TESTQ $0x00000008, CX
- JZ copy_1_test
- MOVQ (R10)(R14*1), R15
- MOVQ R15, (R9)(R14*1)
- ADDQ $0x08, R14
- JMP copy_1_test
+ MOVQ CX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R10), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R10
+ ADDQ $0x10, R9
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R10)(R14*1), R10
+ LEAQ 16(R9)(R14*1), R9
+ MOVUPS -16(R10), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ CX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ CX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R10), R14
+ MOVB -1(R10)(CX*1), R15
+ MOVB R14, (R9)
+ MOVB R15, -1(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
-copy_1:
- MOVUPS (R10)(R14*1), X0
- MOVUPS X0, (R9)(R14*1)
- ADDQ $0x10, R14
+copy_1_move_3:
+ MOVW (R10), R14
+ MOVB 2(R10), R15
+ MOVW R14, (R9)
+ MOVB R15, 2(R9)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R10), R14
+ MOVL -4(R10)(CX*1), R15
+ MOVL R14, (R9)
+ MOVL R15, -4(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
-copy_1_test:
- CMPQ R14, CX
- JB copy_1
+copy_1_move_8through16:
+ MOVQ (R10), R14
+ MOVQ -8(R10)(CX*1), R15
+ MOVQ R14, (R9)
+ MOVQ R15, -8(R9)(CX*1)
ADDQ CX, R10
ADDQ CX, R9
+
+copy_1_end:
ADDQ CX, R11
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -3466,149 +3736,206 @@ check_offset:
JG error_match_off_too_big
// Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ CX, R14
- CMPQ R13, CX
- JGE copy_all_from_history
- XORQ CX, CX
- TESTQ $0x00000001, R13
- JZ copy_4_word
- MOVB (R14)(CX*1), R12
- MOVB R12, (R9)(CX*1)
- ADDQ $0x01, CX
-
-copy_4_word:
- TESTQ $0x00000002, R13
- JZ copy_4_dword
- MOVW (R14)(CX*1), R12
- MOVW R12, (R9)(CX*1)
- ADDQ $0x02, CX
-
-copy_4_dword:
- TESTQ $0x00000004, R13
- JZ copy_4_qword
- MOVL (R14)(CX*1), R12
- MOVL R12, (R9)(CX*1)
- ADDQ $0x04, CX
-
-copy_4_qword:
- TESTQ $0x00000008, R13
- JZ copy_4_test
- MOVQ (R14)(CX*1), R12
- MOVQ R12, (R9)(CX*1)
- ADDQ $0x08, CX
- JMP copy_4_test
-
-copy_4:
- MOVUPS (R14)(CX*1), X0
- MOVUPS X0, (R9)(CX*1)
- ADDQ $0x10, CX
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
-copy_4_test:
- CMPQ CX, R13
- JB copy_4
+copy_4_end:
ADDQ R13, R11
- ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
- XORQ R15, R15
- TESTQ $0x00000001, CX
- JZ copy_5_word
- MOVB (R14)(R15*1), BP
- MOVB BP, (R9)(R15*1)
- ADDQ $0x01, R15
-
-copy_5_word:
- TESTQ $0x00000002, CX
- JZ copy_5_dword
- MOVW (R14)(R15*1), BP
- MOVW BP, (R9)(R15*1)
- ADDQ $0x02, R15
-
-copy_5_dword:
- TESTQ $0x00000004, CX
- JZ copy_5_qword
- MOVL (R14)(R15*1), BP
- MOVL BP, (R9)(R15*1)
- ADDQ $0x04, R15
-
-copy_5_qword:
- TESTQ $0x00000008, CX
- JZ copy_5_test
- MOVQ (R14)(R15*1), BP
- MOVQ BP, (R9)(R15*1)
- ADDQ $0x08, R15
- JMP copy_5_test
-
-copy_5:
- MOVUPS (R14)(R15*1), X0
- MOVUPS X0, (R9)(R15*1)
- ADDQ $0x10, R15
-
-copy_5_test:
- CMPQ R15, CX
- JB copy_5
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
- TESTQ R13, R13
- JZ handle_loop
- MOVQ R9, CX
- SUBQ R12, CX
+ MOVQ R9, CX
+ SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
- ADDQ R13, R11
- XORQ R12, R12
- TESTQ $0x00000001, R13
- JZ copy_2_word
- MOVB (CX)(R12*1), R14
- MOVB R14, (R9)(R12*1)
- ADDQ $0x01, R12
-
-copy_2_word:
- TESTQ $0x00000002, R13
- JZ copy_2_dword
- MOVW (CX)(R12*1), R14
- MOVW R14, (R9)(R12*1)
- ADDQ $0x02, R12
-
-copy_2_dword:
- TESTQ $0x00000004, R13
- JZ copy_2_qword
- MOVL (CX)(R12*1), R14
- MOVL R14, (R9)(R12*1)
- ADDQ $0x04, R12
-
-copy_2_qword:
- TESTQ $0x00000008, R13
- JZ copy_2_test
- MOVQ (CX)(R12*1), R14
- MOVQ R14, (R9)(R12*1)
- ADDQ $0x08, R12
- JMP copy_2_test
-
-copy_2:
- MOVUPS (CX)(R12*1), X0
- MOVUPS X0, (R9)(R12*1)
- ADDQ $0x10, R12
+ ADDQ R13, R11
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
-copy_2_test:
- CMPQ R12, R13
- JB copy_2
+copy_2_loop:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R9
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(CX)(R12*1), CX
+ LEAQ 16(R9)(R12*1), R9
+ MOVUPS -16(CX), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (CX), R12
+ MOVB -1(CX)(R13*1), R14
+ MOVB R12, (R9)
+ MOVB R14, -1(R9)(R13*1)
+ ADDQ R13, CX
ADDQ R13, R9
- JMP handle_loop
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (CX), R12
+ MOVB 2(CX), R14
+ MOVW R12, (R9)
+ MOVB R14, 2(R9)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (CX), R12
+ MOVL -4(CX)(R13*1), R14
+ MOVL R12, (R9)
+ MOVL R14, -4(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (CX), R12
+ MOVQ -8(CX)(R13*1), R14
+ MOVQ R12, (R9)
+ MOVQ R14, -8(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+
+copy_2_end:
+ JMP handle_loop
// Copy overlapping match
copy_overlapping_match: