diff options
Diffstat (limited to 'vendor/github.com/golang/snappy/encode_amd64.s')
-rw-r--r-- | vendor/github.com/golang/snappy/encode_amd64.s | 730 |
1 files changed, 0 insertions, 730 deletions
diff --git a/vendor/github.com/golang/snappy/encode_amd64.s b/vendor/github.com/golang/snappy/encode_amd64.s deleted file mode 100644 index adfd979fe..000000000 --- a/vendor/github.com/golang/snappy/encode_amd64.s +++ /dev/null @@ -1,730 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a -// Go toolchain regression. See https://github.com/golang/go/issues/15426 and -// https://github.com/golang/snappy/issues/29 -// -// As a workaround, the package was built with a known good assembler, and -// those instructions were disassembled by "objdump -d" to yield the -// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 -// style comments, in AT&T asm syntax. Note that rsp here is a physical -// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). -// The instructions were then encoded as "BYTE $0x.." sequences, which assemble -// fine on Go 1.6. - -// The asm code generally follows the pure Go code in encode_other.go, except -// where marked with a "!!!". - -// ---------------------------------------------------------------------------- - -// func emitLiteral(dst, lit []byte) int -// -// All local variables fit into registers. The register allocation: -// - AX len(lit) -// - BX n -// - DX return value -// - DI &dst[i] -// - R10 &lit[0] -// -// The 24 bytes of stack space is to call runtime·memmove. -// -// The unusual register allocation of local variables, such as R10 for the -// source pointer, matches the allocation used at the call site in encodeBlock, -// which makes it easier to manually inline this function. -TEXT ·emitLiteral(SB), NOSPLIT, $24-56 - MOVQ dst_base+0(FP), DI - MOVQ lit_base+24(FP), R10 - MOVQ lit_len+32(FP), AX - MOVQ AX, DX - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT oneByte - CMPL BX, $256 - JLT twoBytes - -threeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - ADDQ $3, DX - JMP memmove - -twoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - ADDQ $2, DX - JMP memmove - -oneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - ADDQ $1, DX - -memmove: - MOVQ DX, ret+48(FP) - - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - CALL runtime·memmove(SB) - RET - -// ---------------------------------------------------------------------------- - -// func emitCopy(dst []byte, offset, length int) int -// -// All local variables fit into registers. The register allocation: -// - AX length -// - SI &dst[0] -// - DI &dst[i] -// - R11 offset -// -// The unusual register allocation of local variables, such as R11 for the -// offset, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - MOVQ dst_base+0(FP), DI - MOVQ DI, SI - MOVQ offset+24(FP), R11 - MOVQ length+32(FP), AX - -loop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT step1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP loop0 - -step1: - // if length > 64 { etc } - CMPL AX, $64 - JLE step2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -step2: - // if length >= 12 || offset >= 2048 { goto step3 } - CMPL AX, $12 - JGE step3 - CMPL R11, $2048 - JGE step3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -step3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func extendMatch(src []byte, i, j int) int -// -// All local variables fit into registers. The register allocation: -// - DX &src[0] -// - SI &src[j] -// - R13 &src[len(src) - 8] -// - R14 &src[len(src)] -// - R15 &src[i] -// -// The unusual register allocation of local variables, such as R15 for a source -// pointer, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·extendMatch(SB), NOSPLIT, $0-48 - MOVQ src_base+0(FP), DX - MOVQ src_len+8(FP), R14 - MOVQ i+24(FP), R15 - MOVQ j+32(FP), SI - ADDQ DX, R14 - ADDQ DX, R15 - ADDQ DX, SI - MOVQ R14, R13 - SUBQ $8, R13 - -cmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ SI, R13 - JA cmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, R15 - ADDQ $8, SI - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -cmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE extendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE extendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP cmp1 - -extendMatchEnd: - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func encodeBlock(dst, src []byte) (d int) -// -// All local variables fit into registers, other than "var table". The register -// allocation: -// - AX . . -// - BX . . -// - CX 56 shift (note that amd64 shifts by non-immediates must use CX). -// - DX 64 &src[0], tableSize -// - SI 72 &src[s] -// - DI 80 &dst[d] -// - R9 88 sLimit -// - R10 . &src[nextEmit] -// - R11 96 prevHash, currHash, nextHash, offset -// - R12 104 &src[base], skip -// - R13 . &src[nextS], &src[len(src) - 8] -// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x -// - R15 112 candidate -// -// The second column (56, 64, etc) is the stack offset to spill the registers -// when calling other functions. We could pack this slightly tighter, but it's -// simpler to have a dedicated spill map independent of the function called. -// -// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An -// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill -// local variables (registers) during calls gives 32768 + 56 + 64 = 32888. -TEXT ·encodeBlock(SB), 0, $32888-56 - MOVQ dst_base+0(FP), DI - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R14 - - // shift, tableSize := uint32(32-8), 1<<8 - MOVQ $24, CX - MOVQ $256, DX - -calcShift: - // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - // shift-- - // } - CMPQ DX, $16384 - JGE varTable - CMPQ DX, R14 - JGE varTable - SUBQ $1, CX - SHLQ $1, DX - JMP calcShift - -varTable: - // var table [maxTableSize]uint16 - // - // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU - // writes 16 bytes, so we can do only tableSize/8 writes instead of the - // 2048 writes that would zero-initialize all of table's 32768 bytes. - SHRQ $3, DX - LEAQ table-32768(SP), BX - PXOR X0, X0 - -memclr: - MOVOU X0, 0(BX) - ADDQ $16, BX - SUBQ $1, DX - JNZ memclr - - // !!! DX = &src[0] - MOVQ SI, DX - - // sLimit := len(src) - inputMargin - MOVQ R14, R9 - SUBQ $15, R9 - - // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't - // change for the rest of the function. - MOVQ CX, 56(SP) - MOVQ DX, 64(SP) - MOVQ R9, 88(SP) - - // nextEmit := 0 - MOVQ DX, R10 - - // s := 1 - ADDQ $1, SI - - // nextHash := hash(load32(src, s), shift) - MOVL 0(SI), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - -outer: - // for { etc } - - // skip := 32 - MOVQ $32, R12 - - // nextS := s - MOVQ SI, R13 - - // candidate := 0 - MOVQ $0, R15 - -inner0: - // for { etc } - - // s := nextS - MOVQ R13, SI - - // bytesBetweenHashLookups := skip >> 5 - MOVQ R12, R14 - SHRQ $5, R14 - - // nextS = s + bytesBetweenHashLookups - ADDQ R14, R13 - - // skip += bytesBetweenHashLookups - ADDQ R14, R12 - - // if nextS > sLimit { goto emitRemainder } - MOVQ R13, AX - SUBQ DX, AX - CMPQ AX, R9 - JA emitRemainder - - // candidate = int(table[nextHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[nextHash] = uint16(s) - MOVQ SI, AX - SUBQ DX, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // nextHash = hash(load32(src, nextS), shift) - MOVL 0(R13), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // if load32(src, s) != load32(src, candidate) { continue } break - MOVL 0(SI), AX - MOVL (DX)(R15*1), BX - CMPL AX, BX - JNE inner0 - -fourByteMatch: - // As per the encode_other.go code: - // - // A 4-byte match has been found. We'll later see etc. - - // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment - // on inputMargin in encode.go. - MOVQ SI, AX - SUBQ R10, AX - CMPQ AX, $16 - JLE emitLiteralFastPath - - // ---------------------------------------- - // Begin inline of the emitLiteral call. - // - // d += emitLiteral(dst[d:], src[nextEmit:s]) - - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT inlineEmitLiteralOneByte - CMPL BX, $256 - JLT inlineEmitLiteralTwoBytes - -inlineEmitLiteralThreeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralTwoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralOneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - -inlineEmitLiteralMemmove: - // Spill local variables (registers) onto the stack; call; unspill. - // - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)". - MOVQ SI, 72(SP) - MOVQ DI, 80(SP) - MOVQ R15, 112(SP) - CALL runtime·memmove(SB) - MOVQ 56(SP), CX - MOVQ 64(SP), DX - MOVQ 72(SP), SI - MOVQ 80(SP), DI - MOVQ 88(SP), R9 - MOVQ 112(SP), R15 - JMP inner1 - -inlineEmitLiteralEnd: - // End inline of the emitLiteral call. - // ---------------------------------------- - -emitLiteralFastPath: - // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". - MOVB AX, BX - SUBB $1, BX - SHLB $2, BX - MOVB BX, (DI) - ADDQ $1, DI - - // !!! Implement the copy from lit to dst as a 16-byte load and store. - // (Encode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only len(lit) bytes, but that's - // OK. Subsequent iterations will fix up the overrun. - // - // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - MOVOU 0(R10), X0 - MOVOU X0, 0(DI) - ADDQ AX, DI - -inner1: - // for { etc } - - // base := s - MOVQ SI, R12 - - // !!! offset := base - candidate - MOVQ R12, R11 - SUBQ R15, R11 - SUBQ DX, R11 - - // ---------------------------------------- - // Begin inline of the extendMatch call. - // - // s = extendMatch(src, candidate+4, s+4) - - // !!! R14 = &src[len(src)] - MOVQ src_len+32(FP), R14 - ADDQ DX, R14 - - // !!! R13 = &src[len(src) - 8] - MOVQ R14, R13 - SUBQ $8, R13 - - // !!! R15 = &src[candidate + 4] - ADDQ $4, R15 - ADDQ DX, R15 - - // !!! s += 4 - ADDQ $4, SI - -inlineExtendMatchCmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ SI, R13 - JA inlineExtendMatchCmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE inlineExtendMatchBSF - ADDQ $8, R15 - ADDQ $8, SI - JMP inlineExtendMatchCmp8 - -inlineExtendMatchBSF: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - JMP inlineExtendMatchEnd - -inlineExtendMatchCmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE inlineExtendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE inlineExtendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP inlineExtendMatchCmp1 - -inlineExtendMatchEnd: - // End inline of the extendMatch call. - // ---------------------------------------- - - // ---------------------------------------- - // Begin inline of the emitCopy call. - // - // d += emitCopy(dst[d:], base-candidate, s-base) - - // !!! length := s - base - MOVQ SI, AX - SUBQ R12, AX - -inlineEmitCopyLoop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT inlineEmitCopyStep1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP inlineEmitCopyLoop0 - -inlineEmitCopyStep1: - // if length > 64 { etc } - CMPL AX, $64 - JLE inlineEmitCopyStep2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -inlineEmitCopyStep2: - // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } - CMPL AX, $12 - JGE inlineEmitCopyStep3 - CMPL R11, $2048 - JGE inlineEmitCopyStep3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - JMP inlineEmitCopyEnd - -inlineEmitCopyStep3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - -inlineEmitCopyEnd: - // End inline of the emitCopy call. - // ---------------------------------------- - - // nextEmit = s - MOVQ SI, R10 - - // if s >= sLimit { goto emitRemainder } - MOVQ SI, AX - SUBQ DX, AX - CMPQ AX, R9 - JAE emitRemainder - - // As per the encode_other.go code: - // - // We could immediately etc. - - // x := load64(src, s-1) - MOVQ -1(SI), R14 - - // prevHash := hash(uint32(x>>0), shift) - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // table[prevHash] = uint16(s-1) - MOVQ SI, AX - SUBQ DX, AX - SUBQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // currHash := hash(uint32(x>>8), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // candidate = int(table[currHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[currHash] = uint16(s) - ADDQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // if uint32(x>>8) == load32(src, candidate) { continue } - MOVL (DX)(R15*1), BX - CMPL R14, BX - JEQ inner1 - - // nextHash = hash(uint32(x>>16), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // s++ - ADDQ $1, SI - - // break out of the inner1 for loop, i.e. continue the outer loop. - JMP outer - -emitRemainder: - // if nextEmit < len(src) { etc } - MOVQ src_len+32(FP), AX - ADDQ DX, AX - CMPQ R10, AX - JEQ encodeBlockEnd - - // d += emitLiteral(dst[d:], src[nextEmit:]) - // - // Push args. - MOVQ DI, 0(SP) - MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ R10, 24(SP) - SUBQ R10, AX - MOVQ AX, 32(SP) - MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative. - - // Spill local variables (registers) onto the stack; call; unspill. - MOVQ DI, 80(SP) - CALL ·emitLiteral(SB) - MOVQ 80(SP), DI - - // Finish the "d +=" part of "d += emitLiteral(etc)". - ADDQ 48(SP), DI - -encodeBlockEnd: - MOVQ dst_base+0(FP), AX - SUBQ AX, DI - MOVQ DI, d+48(FP) - RET |