diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s | 1999 |
1 files changed, 1163 insertions, 836 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 212c6cac3..71e64e061 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -134,18 +134,17 @@ sequenceDecs_decode_amd64_fill_2_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -155,18 +154,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -176,18 +174,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -416,18 +413,17 @@ sequenceDecs_decode_56_amd64_fill_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -437,18 +433,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -458,18 +453,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1181,52 +1175,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,53 +1241,74 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1382,45 +1410,67 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_test - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1_test: - CMPQ R14, R11 - JB copy_1 +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,52 +1482,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,99 +1548,143 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, (BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 ADDQ R13, BX - JMP handle_loop + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1773,18 +1880,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -1794,18 +1900,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -1815,18 +1920,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1934,103 +2038,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2407,103 +2545,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -2746,18 +2918,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -2767,18 +2938,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -2788,18 +2958,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -2885,45 +3054,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end -copy_1_test: - CMPQ R14, AX - JB copy_1 +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,149 +3127,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - JMP copy_2_test - -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3415,45 +3663,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1_test: - CMPQ R14, CX - JB copy_1 +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) ADDQ CX, R10 ADDQ CX, R9 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,149 +3736,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: |