diff options
Diffstat (limited to 'vendor/github.com/etcd-io/bbolt/node.go')
-rw-r--r-- | vendor/github.com/etcd-io/bbolt/node.go | 604 |
1 files changed, 0 insertions, 604 deletions
diff --git a/vendor/github.com/etcd-io/bbolt/node.go b/vendor/github.com/etcd-io/bbolt/node.go deleted file mode 100644 index 6c3fa553e..000000000 --- a/vendor/github.com/etcd-io/bbolt/node.go +++ /dev/null @@ -1,604 +0,0 @@ -package bbolt - -import ( - "bytes" - "fmt" - "sort" - "unsafe" -) - -// node represents an in-memory, deserialized page. -type node struct { - bucket *Bucket - isLeaf bool - unbalanced bool - spilled bool - key []byte - pgid pgid - parent *node - children nodes - inodes inodes -} - -// root returns the top-level node this node is attached to. -func (n *node) root() *node { - if n.parent == nil { - return n - } - return n.parent.root() -} - -// minKeys returns the minimum number of inodes this node should have. -func (n *node) minKeys() int { - if n.isLeaf { - return 1 - } - return 2 -} - -// size returns the size of the node after serialization. -func (n *node) size() int { - sz, elsz := pageHeaderSize, n.pageElementSize() - for i := 0; i < len(n.inodes); i++ { - item := &n.inodes[i] - sz += elsz + len(item.key) + len(item.value) - } - return sz -} - -// sizeLessThan returns true if the node is less than a given size. -// This is an optimization to avoid calculating a large node when we only need -// to know if it fits inside a certain page size. -func (n *node) sizeLessThan(v int) bool { - sz, elsz := pageHeaderSize, n.pageElementSize() - for i := 0; i < len(n.inodes); i++ { - item := &n.inodes[i] - sz += elsz + len(item.key) + len(item.value) - if sz >= v { - return false - } - } - return true -} - -// pageElementSize returns the size of each page element based on the type of node. -func (n *node) pageElementSize() int { - if n.isLeaf { - return leafPageElementSize - } - return branchPageElementSize -} - -// childAt returns the child node at a given index. -func (n *node) childAt(index int) *node { - if n.isLeaf { - panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) - } - return n.bucket.node(n.inodes[index].pgid, n) -} - -// childIndex returns the index of a given child node. -func (n *node) childIndex(child *node) int { - index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) - return index -} - -// numChildren returns the number of children. -func (n *node) numChildren() int { - return len(n.inodes) -} - -// nextSibling returns the next node with the same parent. -func (n *node) nextSibling() *node { - if n.parent == nil { - return nil - } - index := n.parent.childIndex(n) - if index >= n.parent.numChildren()-1 { - return nil - } - return n.parent.childAt(index + 1) -} - -// prevSibling returns the previous node with the same parent. -func (n *node) prevSibling() *node { - if n.parent == nil { - return nil - } - index := n.parent.childIndex(n) - if index == 0 { - return nil - } - return n.parent.childAt(index - 1) -} - -// put inserts a key/value. -func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { - if pgid >= n.bucket.tx.meta.pgid { - panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) - } else if len(oldKey) <= 0 { - panic("put: zero-length old key") - } else if len(newKey) <= 0 { - panic("put: zero-length new key") - } - - // Find insertion index. - index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) - - // Add capacity and shift nodes if we don't have an exact match and need to insert. - exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) - if !exact { - n.inodes = append(n.inodes, inode{}) - copy(n.inodes[index+1:], n.inodes[index:]) - } - - inode := &n.inodes[index] - inode.flags = flags - inode.key = newKey - inode.value = value - inode.pgid = pgid - _assert(len(inode.key) > 0, "put: zero-length inode key") -} - -// del removes a key from the node. -func (n *node) del(key []byte) { - // Find index of key. - index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) - - // Exit if the key isn't found. - if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { - return - } - - // Delete inode from the node. - n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) - - // Mark the node as needing rebalancing. - n.unbalanced = true -} - -// read initializes the node from a page. -func (n *node) read(p *page) { - n.pgid = p.id - n.isLeaf = ((p.flags & leafPageFlag) != 0) - n.inodes = make(inodes, int(p.count)) - - for i := 0; i < int(p.count); i++ { - inode := &n.inodes[i] - if n.isLeaf { - elem := p.leafPageElement(uint16(i)) - inode.flags = elem.flags - inode.key = elem.key() - inode.value = elem.value() - } else { - elem := p.branchPageElement(uint16(i)) - inode.pgid = elem.pgid - inode.key = elem.key() - } - _assert(len(inode.key) > 0, "read: zero-length inode key") - } - - // Save first key so we can find the node in the parent when we spill. - if len(n.inodes) > 0 { - n.key = n.inodes[0].key - _assert(len(n.key) > 0, "read: zero-length node key") - } else { - n.key = nil - } -} - -// write writes the items onto one or more pages. -func (n *node) write(p *page) { - // Initialize page. - if n.isLeaf { - p.flags |= leafPageFlag - } else { - p.flags |= branchPageFlag - } - - if len(n.inodes) >= 0xFFFF { - panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) - } - p.count = uint16(len(n.inodes)) - - // Stop here if there are no items to write. - if p.count == 0 { - return - } - - // Loop over each item and write it to the page. - b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] - for i, item := range n.inodes { - _assert(len(item.key) > 0, "write: zero-length inode key") - - // Write the page element. - if n.isLeaf { - elem := p.leafPageElement(uint16(i)) - elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) - elem.flags = item.flags - elem.ksize = uint32(len(item.key)) - elem.vsize = uint32(len(item.value)) - } else { - elem := p.branchPageElement(uint16(i)) - elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) - elem.ksize = uint32(len(item.key)) - elem.pgid = item.pgid - _assert(elem.pgid != p.id, "write: circular dependency occurred") - } - - // If the length of key+value is larger than the max allocation size - // then we need to reallocate the byte array pointer. - // - // See: https://github.com/boltdb/bolt/pull/335 - klen, vlen := len(item.key), len(item.value) - if len(b) < klen+vlen { - b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:] - } - - // Write data for the element to the end of the page. - copy(b[0:], item.key) - b = b[klen:] - copy(b[0:], item.value) - b = b[vlen:] - } - - // DEBUG ONLY: n.dump() -} - -// split breaks up a node into multiple smaller nodes, if appropriate. -// This should only be called from the spill() function. -func (n *node) split(pageSize int) []*node { - var nodes []*node - - node := n - for { - // Split node into two. - a, b := node.splitTwo(pageSize) - nodes = append(nodes, a) - - // If we can't split then exit the loop. - if b == nil { - break - } - - // Set node to b so it gets split on the next iteration. - node = b - } - - return nodes -} - -// splitTwo breaks up a node into two smaller nodes, if appropriate. -// This should only be called from the split() function. -func (n *node) splitTwo(pageSize int) (*node, *node) { - // Ignore the split if the page doesn't have at least enough nodes for - // two pages or if the nodes can fit in a single page. - if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { - return n, nil - } - - // Determine the threshold before starting a new node. - var fillPercent = n.bucket.FillPercent - if fillPercent < minFillPercent { - fillPercent = minFillPercent - } else if fillPercent > maxFillPercent { - fillPercent = maxFillPercent - } - threshold := int(float64(pageSize) * fillPercent) - - // Determine split position and sizes of the two pages. - splitIndex, _ := n.splitIndex(threshold) - - // Split node into two separate nodes. - // If there's no parent then we'll need to create one. - if n.parent == nil { - n.parent = &node{bucket: n.bucket, children: []*node{n}} - } - - // Create a new node and add it to the parent. - next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} - n.parent.children = append(n.parent.children, next) - - // Split inodes across two nodes. - next.inodes = n.inodes[splitIndex:] - n.inodes = n.inodes[:splitIndex] - - // Update the statistics. - n.bucket.tx.stats.Split++ - - return n, next -} - -// splitIndex finds the position where a page will fill a given threshold. -// It returns the index as well as the size of the first page. -// This is only be called from split(). -func (n *node) splitIndex(threshold int) (index, sz int) { - sz = pageHeaderSize - - // Loop until we only have the minimum number of keys required for the second page. - for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { - index = i - inode := n.inodes[i] - elsize := n.pageElementSize() + len(inode.key) + len(inode.value) - - // If we have at least the minimum number of keys and adding another - // node would put us over the threshold then exit and return. - if i >= minKeysPerPage && sz+elsize > threshold { - break - } - - // Add the element size to the total size. - sz += elsize - } - - return -} - -// spill writes the nodes to dirty pages and splits nodes as it goes. -// Returns an error if dirty pages cannot be allocated. -func (n *node) spill() error { - var tx = n.bucket.tx - if n.spilled { - return nil - } - - // Spill child nodes first. Child nodes can materialize sibling nodes in - // the case of split-merge so we cannot use a range loop. We have to check - // the children size on every loop iteration. - sort.Sort(n.children) - for i := 0; i < len(n.children); i++ { - if err := n.children[i].spill(); err != nil { - return err - } - } - - // We no longer need the child list because it's only used for spill tracking. - n.children = nil - - // Split nodes into appropriate sizes. The first node will always be n. - var nodes = n.split(tx.db.pageSize) - for _, node := range nodes { - // Add node's page to the freelist if it's not new. - if node.pgid > 0 { - tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) - node.pgid = 0 - } - - // Allocate contiguous space for the node. - p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) - if err != nil { - return err - } - - // Write the node. - if p.id >= tx.meta.pgid { - panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) - } - node.pgid = p.id - node.write(p) - node.spilled = true - - // Insert into parent inodes. - if node.parent != nil { - var key = node.key - if key == nil { - key = node.inodes[0].key - } - - node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) - node.key = node.inodes[0].key - _assert(len(node.key) > 0, "spill: zero-length node key") - } - - // Update the statistics. - tx.stats.Spill++ - } - - // If the root node split and created a new root then we need to spill that - // as well. We'll clear out the children to make sure it doesn't try to respill. - if n.parent != nil && n.parent.pgid == 0 { - n.children = nil - return n.parent.spill() - } - - return nil -} - -// rebalance attempts to combine the node with sibling nodes if the node fill -// size is below a threshold or if there are not enough keys. -func (n *node) rebalance() { - if !n.unbalanced { - return - } - n.unbalanced = false - - // Update statistics. - n.bucket.tx.stats.Rebalance++ - - // Ignore if node is above threshold (25%) and has enough keys. - var threshold = n.bucket.tx.db.pageSize / 4 - if n.size() > threshold && len(n.inodes) > n.minKeys() { - return - } - - // Root node has special handling. - if n.parent == nil { - // If root node is a branch and only has one node then collapse it. - if !n.isLeaf && len(n.inodes) == 1 { - // Move root's child up. - child := n.bucket.node(n.inodes[0].pgid, n) - n.isLeaf = child.isLeaf - n.inodes = child.inodes[:] - n.children = child.children - - // Reparent all child nodes being moved. - for _, inode := range n.inodes { - if child, ok := n.bucket.nodes[inode.pgid]; ok { - child.parent = n - } - } - - // Remove old child. - child.parent = nil - delete(n.bucket.nodes, child.pgid) - child.free() - } - - return - } - - // If node has no keys then just remove it. - if n.numChildren() == 0 { - n.parent.del(n.key) - n.parent.removeChild(n) - delete(n.bucket.nodes, n.pgid) - n.free() - n.parent.rebalance() - return - } - - _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") - - // Destination node is right sibling if idx == 0, otherwise left sibling. - var target *node - var useNextSibling = (n.parent.childIndex(n) == 0) - if useNextSibling { - target = n.nextSibling() - } else { - target = n.prevSibling() - } - - // If both this node and the target node are too small then merge them. - if useNextSibling { - // Reparent all child nodes being moved. - for _, inode := range target.inodes { - if child, ok := n.bucket.nodes[inode.pgid]; ok { - child.parent.removeChild(child) - child.parent = n - child.parent.children = append(child.parent.children, child) - } - } - - // Copy over inodes from target and remove target. - n.inodes = append(n.inodes, target.inodes...) - n.parent.del(target.key) - n.parent.removeChild(target) - delete(n.bucket.nodes, target.pgid) - target.free() - } else { - // Reparent all child nodes being moved. - for _, inode := range n.inodes { - if child, ok := n.bucket.nodes[inode.pgid]; ok { - child.parent.removeChild(child) - child.parent = target - child.parent.children = append(child.parent.children, child) - } - } - - // Copy over inodes to target and remove node. - target.inodes = append(target.inodes, n.inodes...) - n.parent.del(n.key) - n.parent.removeChild(n) - delete(n.bucket.nodes, n.pgid) - n.free() - } - - // Either this node or the target node was deleted from the parent so rebalance it. - n.parent.rebalance() -} - -// removes a node from the list of in-memory children. -// This does not affect the inodes. -func (n *node) removeChild(target *node) { - for i, child := range n.children { - if child == target { - n.children = append(n.children[:i], n.children[i+1:]...) - return - } - } -} - -// dereference causes the node to copy all its inode key/value references to heap memory. -// This is required when the mmap is reallocated so inodes are not pointing to stale data. -func (n *node) dereference() { - if n.key != nil { - key := make([]byte, len(n.key)) - copy(key, n.key) - n.key = key - _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") - } - - for i := range n.inodes { - inode := &n.inodes[i] - - key := make([]byte, len(inode.key)) - copy(key, inode.key) - inode.key = key - _assert(len(inode.key) > 0, "dereference: zero-length inode key") - - value := make([]byte, len(inode.value)) - copy(value, inode.value) - inode.value = value - } - - // Recursively dereference children. - for _, child := range n.children { - child.dereference() - } - - // Update statistics. - n.bucket.tx.stats.NodeDeref++ -} - -// free adds the node's underlying page to the freelist. -func (n *node) free() { - if n.pgid != 0 { - n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) - n.pgid = 0 - } -} - -// dump writes the contents of the node to STDERR for debugging purposes. -/* -func (n *node) dump() { - // Write node header. - var typ = "branch" - if n.isLeaf { - typ = "leaf" - } - warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) - - // Write out abbreviated version of each item. - for _, item := range n.inodes { - if n.isLeaf { - if item.flags&bucketLeafFlag != 0 { - bucket := (*bucket)(unsafe.Pointer(&item.value[0])) - warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) - } else { - warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) - } - } else { - warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) - } - } - warn("") -} -*/ - -type nodes []*node - -func (s nodes) Len() int { return len(s) } -func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } -func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 } - -// inode represents an internal node inside of a node. -// It can be used to point to elements in a page or point -// to an element which hasn't been added to a page yet. -type inode struct { - flags uint32 - pgid pgid - key []byte - value []byte -} - -type inodes []inode |