diff options
Diffstat (limited to 'vendor/github.com/golang/snappy/encode_arm64.s')
-rw-r--r-- | vendor/github.com/golang/snappy/encode_arm64.s | 722 |
1 files changed, 0 insertions, 722 deletions
diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s deleted file mode 100644 index bf83667d7..000000000 --- a/vendor/github.com/golang/snappy/encode_arm64.s +++ /dev/null @@ -1,722 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The asm code generally follows the pure Go code in encode_other.go, except -// where marked with a "!!!". - -// ---------------------------------------------------------------------------- - -// func emitLiteral(dst, lit []byte) int -// -// All local variables fit into registers. The register allocation: -// - R3 len(lit) -// - R4 n -// - R6 return value -// - R8 &dst[i] -// - R10 &lit[0] -// -// The 32 bytes of stack space is to call runtime·memmove. -// -// The unusual register allocation of local variables, such as R10 for the -// source pointer, matches the allocation used at the call site in encodeBlock, -// which makes it easier to manually inline this function. -TEXT ·emitLiteral(SB), NOSPLIT, $32-56 - MOVD dst_base+0(FP), R8 - MOVD lit_base+24(FP), R10 - MOVD lit_len+32(FP), R3 - MOVD R3, R6 - MOVW R3, R4 - SUBW $1, R4, R4 - - CMPW $60, R4 - BLT oneByte - CMPW $256, R4 - BLT twoBytes - -threeBytes: - MOVD $0xf4, R2 - MOVB R2, 0(R8) - MOVW R4, 1(R8) - ADD $3, R8, R8 - ADD $3, R6, R6 - B memmove - -twoBytes: - MOVD $0xf0, R2 - MOVB R2, 0(R8) - MOVB R4, 1(R8) - ADD $2, R8, R8 - ADD $2, R6, R6 - B memmove - -oneByte: - LSLW $2, R4, R4 - MOVB R4, 0(R8) - ADD $1, R8, R8 - ADD $1, R6, R6 - -memmove: - MOVD R6, ret+48(FP) - - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // R8, R10 and R3 as arguments. - MOVD R8, 8(RSP) - MOVD R10, 16(RSP) - MOVD R3, 24(RSP) - CALL runtime·memmove(SB) - RET - -// ---------------------------------------------------------------------------- - -// func emitCopy(dst []byte, offset, length int) int -// -// All local variables fit into registers. The register allocation: -// - R3 length -// - R7 &dst[0] -// - R8 &dst[i] -// - R11 offset -// -// The unusual register allocation of local variables, such as R11 for the -// offset, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - MOVD dst_base+0(FP), R8 - MOVD R8, R7 - MOVD offset+24(FP), R11 - MOVD length+32(FP), R3 - -loop0: - // for length >= 68 { etc } - CMPW $68, R3 - BLT step1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVD $0xfe, R2 - MOVB R2, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUB $64, R3, R3 - B loop0 - -step1: - // if length > 64 { etc } - CMP $64, R3 - BLE step2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVD $0xee, R2 - MOVB R2, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUB $60, R3, R3 - -step2: - // if length >= 12 || offset >= 2048 { goto step3 } - CMP $12, R3 - BGE step3 - CMPW $2048, R11 - BGE step3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(R8) - LSRW $3, R11, R11 - AND $0xe0, R11, R11 - SUB $4, R3, R3 - LSLW $2, R3 - AND $0xff, R3, R3 - ORRW R3, R11, R11 - ORRW $1, R11, R11 - MOVB R11, 0(R8) - ADD $2, R8, R8 - - // Return the number of bytes written. - SUB R7, R8, R8 - MOVD R8, ret+40(FP) - RET - -step3: - // Emit the remaining copy, encoded as 3 bytes. - SUB $1, R3, R3 - AND $0xff, R3, R3 - LSLW $2, R3, R3 - ORRW $2, R3, R3 - MOVB R3, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - - // Return the number of bytes written. - SUB R7, R8, R8 - MOVD R8, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func extendMatch(src []byte, i, j int) int -// -// All local variables fit into registers. The register allocation: -// - R6 &src[0] -// - R7 &src[j] -// - R13 &src[len(src) - 8] -// - R14 &src[len(src)] -// - R15 &src[i] -// -// The unusual register allocation of local variables, such as R15 for a source -// pointer, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·extendMatch(SB), NOSPLIT, $0-48 - MOVD src_base+0(FP), R6 - MOVD src_len+8(FP), R14 - MOVD i+24(FP), R15 - MOVD j+32(FP), R7 - ADD R6, R14, R14 - ADD R6, R15, R15 - ADD R6, R7, R7 - MOVD R14, R13 - SUB $8, R13, R13 - -cmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMP R13, R7 - BHI cmp1 - MOVD (R15), R3 - MOVD (R7), R4 - CMP R4, R3 - BNE bsf - ADD $8, R15, R15 - ADD $8, R7, R7 - B cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. - // RBIT reverses the bit order, then CLZ counts the leading zeros, the - // combination of which finds the least significant bit which is set. - // The arm64 architecture is little-endian, and the shift by 3 converts - // a bit index to a byte index. - EOR R3, R4, R4 - RBIT R4, R4 - CLZ R4, R4 - ADD R4>>3, R7, R7 - - // Convert from &src[ret] to ret. - SUB R6, R7, R7 - MOVD R7, ret+40(FP) - RET - -cmp1: - // In src's tail, compare 1 byte at a time. - CMP R7, R14 - BLS extendMatchEnd - MOVB (R15), R3 - MOVB (R7), R4 - CMP R4, R3 - BNE extendMatchEnd - ADD $1, R15, R15 - ADD $1, R7, R7 - B cmp1 - -extendMatchEnd: - // Convert from &src[ret] to ret. - SUB R6, R7, R7 - MOVD R7, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func encodeBlock(dst, src []byte) (d int) -// -// All local variables fit into registers, other than "var table". The register -// allocation: -// - R3 . . -// - R4 . . -// - R5 64 shift -// - R6 72 &src[0], tableSize -// - R7 80 &src[s] -// - R8 88 &dst[d] -// - R9 96 sLimit -// - R10 . &src[nextEmit] -// - R11 104 prevHash, currHash, nextHash, offset -// - R12 112 &src[base], skip -// - R13 . &src[nextS], &src[len(src) - 8] -// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x -// - R15 120 candidate -// - R16 . hash constant, 0x1e35a7bd -// - R17 . &table -// - . 128 table -// -// The second column (64, 72, etc) is the stack offset to spill the registers -// when calling other functions. We could pack this slightly tighter, but it's -// simpler to have a dedicated spill map independent of the function called. -// -// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An -// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill -// local variables (registers) during calls gives 32768 + 64 + 64 = 32896. -TEXT ·encodeBlock(SB), 0, $32896-56 - MOVD dst_base+0(FP), R8 - MOVD src_base+24(FP), R7 - MOVD src_len+32(FP), R14 - - // shift, tableSize := uint32(32-8), 1<<8 - MOVD $24, R5 - MOVD $256, R6 - MOVW $0xa7bd, R16 - MOVKW $(0x1e35<<16), R16 - -calcShift: - // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - // shift-- - // } - MOVD $16384, R2 - CMP R2, R6 - BGE varTable - CMP R14, R6 - BGE varTable - SUB $1, R5, R5 - LSL $1, R6, R6 - B calcShift - -varTable: - // var table [maxTableSize]uint16 - // - // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each - // iterations writes 64 bytes, so we can do only tableSize/32 writes - // instead of the 2048 writes that would zero-initialize all of table's - // 32768 bytes. This clear could overrun the first tableSize elements, but - // it won't overrun the allocated stack size. - ADD $128, RSP, R17 - MOVD R17, R4 - - // !!! R6 = &src[tableSize] - ADD R6<<1, R17, R6 - -memclr: - STP.P (ZR, ZR), 64(R4) - STP (ZR, ZR), -48(R4) - STP (ZR, ZR), -32(R4) - STP (ZR, ZR), -16(R4) - CMP R4, R6 - BHI memclr - - // !!! R6 = &src[0] - MOVD R7, R6 - - // sLimit := len(src) - inputMargin - MOVD R14, R9 - SUB $15, R9, R9 - - // !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't - // change for the rest of the function. - MOVD R5, 64(RSP) - MOVD R6, 72(RSP) - MOVD R9, 96(RSP) - - // nextEmit := 0 - MOVD R6, R10 - - // s := 1 - ADD $1, R7, R7 - - // nextHash := hash(load32(src, s), shift) - MOVW 0(R7), R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - -outer: - // for { etc } - - // skip := 32 - MOVD $32, R12 - - // nextS := s - MOVD R7, R13 - - // candidate := 0 - MOVD $0, R15 - -inner0: - // for { etc } - - // s := nextS - MOVD R13, R7 - - // bytesBetweenHashLookups := skip >> 5 - MOVD R12, R14 - LSR $5, R14, R14 - - // nextS = s + bytesBetweenHashLookups - ADD R14, R13, R13 - - // skip += bytesBetweenHashLookups - ADD R14, R12, R12 - - // if nextS > sLimit { goto emitRemainder } - MOVD R13, R3 - SUB R6, R3, R3 - CMP R9, R3 - BHI emitRemainder - - // candidate = int(table[nextHash]) - MOVHU 0(R17)(R11<<1), R15 - - // table[nextHash] = uint16(s) - MOVD R7, R3 - SUB R6, R3, R3 - - MOVH R3, 0(R17)(R11<<1) - - // nextHash = hash(load32(src, nextS), shift) - MOVW 0(R13), R11 - MULW R16, R11 - LSRW R5, R11, R11 - - // if load32(src, s) != load32(src, candidate) { continue } break - MOVW 0(R7), R3 - MOVW (R6)(R15*1), R4 - CMPW R4, R3 - BNE inner0 - -fourByteMatch: - // As per the encode_other.go code: - // - // A 4-byte match has been found. We'll later see etc. - - // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment - // on inputMargin in encode.go. - MOVD R7, R3 - SUB R10, R3, R3 - CMP $16, R3 - BLE emitLiteralFastPath - - // ---------------------------------------- - // Begin inline of the emitLiteral call. - // - // d += emitLiteral(dst[d:], src[nextEmit:s]) - - MOVW R3, R4 - SUBW $1, R4, R4 - - MOVW $60, R2 - CMPW R2, R4 - BLT inlineEmitLiteralOneByte - MOVW $256, R2 - CMPW R2, R4 - BLT inlineEmitLiteralTwoBytes - -inlineEmitLiteralThreeBytes: - MOVD $0xf4, R1 - MOVB R1, 0(R8) - MOVW R4, 1(R8) - ADD $3, R8, R8 - B inlineEmitLiteralMemmove - -inlineEmitLiteralTwoBytes: - MOVD $0xf0, R1 - MOVB R1, 0(R8) - MOVB R4, 1(R8) - ADD $2, R8, R8 - B inlineEmitLiteralMemmove - -inlineEmitLiteralOneByte: - LSLW $2, R4, R4 - MOVB R4, 0(R8) - ADD $1, R8, R8 - -inlineEmitLiteralMemmove: - // Spill local variables (registers) onto the stack; call; unspill. - // - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // R8, R10 and R3 as arguments. - MOVD R8, 8(RSP) - MOVD R10, 16(RSP) - MOVD R3, 24(RSP) - - // Finish the "d +=" part of "d += emitLiteral(etc)". - ADD R3, R8, R8 - MOVD R7, 80(RSP) - MOVD R8, 88(RSP) - MOVD R15, 120(RSP) - CALL runtime·memmove(SB) - MOVD 64(RSP), R5 - MOVD 72(RSP), R6 - MOVD 80(RSP), R7 - MOVD 88(RSP), R8 - MOVD 96(RSP), R9 - MOVD 120(RSP), R15 - ADD $128, RSP, R17 - MOVW $0xa7bd, R16 - MOVKW $(0x1e35<<16), R16 - B inner1 - -inlineEmitLiteralEnd: - // End inline of the emitLiteral call. - // ---------------------------------------- - -emitLiteralFastPath: - // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". - MOVB R3, R4 - SUBW $1, R4, R4 - AND $0xff, R4, R4 - LSLW $2, R4, R4 - MOVB R4, (R8) - ADD $1, R8, R8 - - // !!! Implement the copy from lit to dst as a 16-byte load and store. - // (Encode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only len(lit) bytes, but that's - // OK. Subsequent iterations will fix up the overrun. - // - // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - LDP 0(R10), (R0, R1) - STP (R0, R1), 0(R8) - ADD R3, R8, R8 - -inner1: - // for { etc } - - // base := s - MOVD R7, R12 - - // !!! offset := base - candidate - MOVD R12, R11 - SUB R15, R11, R11 - SUB R6, R11, R11 - - // ---------------------------------------- - // Begin inline of the extendMatch call. - // - // s = extendMatch(src, candidate+4, s+4) - - // !!! R14 = &src[len(src)] - MOVD src_len+32(FP), R14 - ADD R6, R14, R14 - - // !!! R13 = &src[len(src) - 8] - MOVD R14, R13 - SUB $8, R13, R13 - - // !!! R15 = &src[candidate + 4] - ADD $4, R15, R15 - ADD R6, R15, R15 - - // !!! s += 4 - ADD $4, R7, R7 - -inlineExtendMatchCmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMP R13, R7 - BHI inlineExtendMatchCmp1 - MOVD (R15), R3 - MOVD (R7), R4 - CMP R4, R3 - BNE inlineExtendMatchBSF - ADD $8, R15, R15 - ADD $8, R7, R7 - B inlineExtendMatchCmp8 - -inlineExtendMatchBSF: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. - // RBIT reverses the bit order, then CLZ counts the leading zeros, the - // combination of which finds the least significant bit which is set. - // The arm64 architecture is little-endian, and the shift by 3 converts - // a bit index to a byte index. - EOR R3, R4, R4 - RBIT R4, R4 - CLZ R4, R4 - ADD R4>>3, R7, R7 - B inlineExtendMatchEnd - -inlineExtendMatchCmp1: - // In src's tail, compare 1 byte at a time. - CMP R7, R14 - BLS inlineExtendMatchEnd - MOVB (R15), R3 - MOVB (R7), R4 - CMP R4, R3 - BNE inlineExtendMatchEnd - ADD $1, R15, R15 - ADD $1, R7, R7 - B inlineExtendMatchCmp1 - -inlineExtendMatchEnd: - // End inline of the extendMatch call. - // ---------------------------------------- - - // ---------------------------------------- - // Begin inline of the emitCopy call. - // - // d += emitCopy(dst[d:], base-candidate, s-base) - - // !!! length := s - base - MOVD R7, R3 - SUB R12, R3, R3 - -inlineEmitCopyLoop0: - // for length >= 68 { etc } - MOVW $68, R2 - CMPW R2, R3 - BLT inlineEmitCopyStep1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVD $0xfe, R1 - MOVB R1, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUBW $64, R3, R3 - B inlineEmitCopyLoop0 - -inlineEmitCopyStep1: - // if length > 64 { etc } - MOVW $64, R2 - CMPW R2, R3 - BLE inlineEmitCopyStep2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVD $0xee, R1 - MOVB R1, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUBW $60, R3, R3 - -inlineEmitCopyStep2: - // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } - MOVW $12, R2 - CMPW R2, R3 - BGE inlineEmitCopyStep3 - MOVW $2048, R2 - CMPW R2, R11 - BGE inlineEmitCopyStep3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(R8) - LSRW $8, R11, R11 - LSLW $5, R11, R11 - SUBW $4, R3, R3 - AND $0xff, R3, R3 - LSLW $2, R3, R3 - ORRW R3, R11, R11 - ORRW $1, R11, R11 - MOVB R11, 0(R8) - ADD $2, R8, R8 - B inlineEmitCopyEnd - -inlineEmitCopyStep3: - // Emit the remaining copy, encoded as 3 bytes. - SUBW $1, R3, R3 - LSLW $2, R3, R3 - ORRW $2, R3, R3 - MOVB R3, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - -inlineEmitCopyEnd: - // End inline of the emitCopy call. - // ---------------------------------------- - - // nextEmit = s - MOVD R7, R10 - - // if s >= sLimit { goto emitRemainder } - MOVD R7, R3 - SUB R6, R3, R3 - CMP R3, R9 - BLS emitRemainder - - // As per the encode_other.go code: - // - // We could immediately etc. - - // x := load64(src, s-1) - MOVD -1(R7), R14 - - // prevHash := hash(uint32(x>>0), shift) - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // table[prevHash] = uint16(s-1) - MOVD R7, R3 - SUB R6, R3, R3 - SUB $1, R3, R3 - - MOVHU R3, 0(R17)(R11<<1) - - // currHash := hash(uint32(x>>8), shift) - LSR $8, R14, R14 - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // candidate = int(table[currHash]) - MOVHU 0(R17)(R11<<1), R15 - - // table[currHash] = uint16(s) - ADD $1, R3, R3 - MOVHU R3, 0(R17)(R11<<1) - - // if uint32(x>>8) == load32(src, candidate) { continue } - MOVW (R6)(R15*1), R4 - CMPW R4, R14 - BEQ inner1 - - // nextHash = hash(uint32(x>>16), shift) - LSR $8, R14, R14 - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // s++ - ADD $1, R7, R7 - - // break out of the inner1 for loop, i.e. continue the outer loop. - B outer - -emitRemainder: - // if nextEmit < len(src) { etc } - MOVD src_len+32(FP), R3 - ADD R6, R3, R3 - CMP R3, R10 - BEQ encodeBlockEnd - - // d += emitLiteral(dst[d:], src[nextEmit:]) - // - // Push args. - MOVD R8, 8(RSP) - MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative. - MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative. - MOVD R10, 32(RSP) - SUB R10, R3, R3 - MOVD R3, 40(RSP) - MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative. - - // Spill local variables (registers) onto the stack; call; unspill. - MOVD R8, 88(RSP) - CALL ·emitLiteral(SB) - MOVD 88(RSP), R8 - - // Finish the "d +=" part of "d += emitLiteral(etc)". - MOVD 56(RSP), R1 - ADD R1, R8, R8 - -encodeBlockEnd: - MOVD dst_base+0(FP), R3 - SUB R3, R8, R8 - MOVD R8, d+48(FP) - RET |