summaryrefslogtreecommitdiff
path: root/vendor/github.com/etcd-io/bbolt/node.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/etcd-io/bbolt/node.go')
-rw-r--r--vendor/github.com/etcd-io/bbolt/node.go604
1 files changed, 604 insertions, 0 deletions
diff --git a/vendor/github.com/etcd-io/bbolt/node.go b/vendor/github.com/etcd-io/bbolt/node.go
new file mode 100644
index 000000000..6c3fa553e
--- /dev/null
+++ b/vendor/github.com/etcd-io/bbolt/node.go
@@ -0,0 +1,604 @@
+package bbolt
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "unsafe"
+)
+
+// node represents an in-memory, deserialized page.
+type node struct {
+ bucket *Bucket
+ isLeaf bool
+ unbalanced bool
+ spilled bool
+ key []byte
+ pgid pgid
+ parent *node
+ children nodes
+ inodes inodes
+}
+
+// root returns the top-level node this node is attached to.
+func (n *node) root() *node {
+ if n.parent == nil {
+ return n
+ }
+ return n.parent.root()
+}
+
+// minKeys returns the minimum number of inodes this node should have.
+func (n *node) minKeys() int {
+ if n.isLeaf {
+ return 1
+ }
+ return 2
+}
+
+// size returns the size of the node after serialization.
+func (n *node) size() int {
+ sz, elsz := pageHeaderSize, n.pageElementSize()
+ for i := 0; i < len(n.inodes); i++ {
+ item := &n.inodes[i]
+ sz += elsz + len(item.key) + len(item.value)
+ }
+ return sz
+}
+
+// sizeLessThan returns true if the node is less than a given size.
+// This is an optimization to avoid calculating a large node when we only need
+// to know if it fits inside a certain page size.
+func (n *node) sizeLessThan(v int) bool {
+ sz, elsz := pageHeaderSize, n.pageElementSize()
+ for i := 0; i < len(n.inodes); i++ {
+ item := &n.inodes[i]
+ sz += elsz + len(item.key) + len(item.value)
+ if sz >= v {
+ return false
+ }
+ }
+ return true
+}
+
+// pageElementSize returns the size of each page element based on the type of node.
+func (n *node) pageElementSize() int {
+ if n.isLeaf {
+ return leafPageElementSize
+ }
+ return branchPageElementSize
+}
+
+// childAt returns the child node at a given index.
+func (n *node) childAt(index int) *node {
+ if n.isLeaf {
+ panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
+ }
+ return n.bucket.node(n.inodes[index].pgid, n)
+}
+
+// childIndex returns the index of a given child node.
+func (n *node) childIndex(child *node) int {
+ index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
+ return index
+}
+
+// numChildren returns the number of children.
+func (n *node) numChildren() int {
+ return len(n.inodes)
+}
+
+// nextSibling returns the next node with the same parent.
+func (n *node) nextSibling() *node {
+ if n.parent == nil {
+ return nil
+ }
+ index := n.parent.childIndex(n)
+ if index >= n.parent.numChildren()-1 {
+ return nil
+ }
+ return n.parent.childAt(index + 1)
+}
+
+// prevSibling returns the previous node with the same parent.
+func (n *node) prevSibling() *node {
+ if n.parent == nil {
+ return nil
+ }
+ index := n.parent.childIndex(n)
+ if index == 0 {
+ return nil
+ }
+ return n.parent.childAt(index - 1)
+}
+
+// put inserts a key/value.
+func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
+ if pgid >= n.bucket.tx.meta.pgid {
+ panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
+ } else if len(oldKey) <= 0 {
+ panic("put: zero-length old key")
+ } else if len(newKey) <= 0 {
+ panic("put: zero-length new key")
+ }
+
+ // Find insertion index.
+ index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
+
+ // Add capacity and shift nodes if we don't have an exact match and need to insert.
+ exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
+ if !exact {
+ n.inodes = append(n.inodes, inode{})
+ copy(n.inodes[index+1:], n.inodes[index:])
+ }
+
+ inode := &n.inodes[index]
+ inode.flags = flags
+ inode.key = newKey
+ inode.value = value
+ inode.pgid = pgid
+ _assert(len(inode.key) > 0, "put: zero-length inode key")
+}
+
+// del removes a key from the node.
+func (n *node) del(key []byte) {
+ // Find index of key.
+ index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
+
+ // Exit if the key isn't found.
+ if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
+ return
+ }
+
+ // Delete inode from the node.
+ n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
+
+ // Mark the node as needing rebalancing.
+ n.unbalanced = true
+}
+
+// read initializes the node from a page.
+func (n *node) read(p *page) {
+ n.pgid = p.id
+ n.isLeaf = ((p.flags & leafPageFlag) != 0)
+ n.inodes = make(inodes, int(p.count))
+
+ for i := 0; i < int(p.count); i++ {
+ inode := &n.inodes[i]
+ if n.isLeaf {
+ elem := p.leafPageElement(uint16(i))
+ inode.flags = elem.flags
+ inode.key = elem.key()
+ inode.value = elem.value()
+ } else {
+ elem := p.branchPageElement(uint16(i))
+ inode.pgid = elem.pgid
+ inode.key = elem.key()
+ }
+ _assert(len(inode.key) > 0, "read: zero-length inode key")
+ }
+
+ // Save first key so we can find the node in the parent when we spill.
+ if len(n.inodes) > 0 {
+ n.key = n.inodes[0].key
+ _assert(len(n.key) > 0, "read: zero-length node key")
+ } else {
+ n.key = nil
+ }
+}
+
+// write writes the items onto one or more pages.
+func (n *node) write(p *page) {
+ // Initialize page.
+ if n.isLeaf {
+ p.flags |= leafPageFlag
+ } else {
+ p.flags |= branchPageFlag
+ }
+
+ if len(n.inodes) >= 0xFFFF {
+ panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
+ }
+ p.count = uint16(len(n.inodes))
+
+ // Stop here if there are no items to write.
+ if p.count == 0 {
+ return
+ }
+
+ // Loop over each item and write it to the page.
+ b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
+ for i, item := range n.inodes {
+ _assert(len(item.key) > 0, "write: zero-length inode key")
+
+ // Write the page element.
+ if n.isLeaf {
+ elem := p.leafPageElement(uint16(i))
+ elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
+ elem.flags = item.flags
+ elem.ksize = uint32(len(item.key))
+ elem.vsize = uint32(len(item.value))
+ } else {
+ elem := p.branchPageElement(uint16(i))
+ elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
+ elem.ksize = uint32(len(item.key))
+ elem.pgid = item.pgid
+ _assert(elem.pgid != p.id, "write: circular dependency occurred")
+ }
+
+ // If the length of key+value is larger than the max allocation size
+ // then we need to reallocate the byte array pointer.
+ //
+ // See: https://github.com/boltdb/bolt/pull/335
+ klen, vlen := len(item.key), len(item.value)
+ if len(b) < klen+vlen {
+ b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
+ }
+
+ // Write data for the element to the end of the page.
+ copy(b[0:], item.key)
+ b = b[klen:]
+ copy(b[0:], item.value)
+ b = b[vlen:]
+ }
+
+ // DEBUG ONLY: n.dump()
+}
+
+// split breaks up a node into multiple smaller nodes, if appropriate.
+// This should only be called from the spill() function.
+func (n *node) split(pageSize int) []*node {
+ var nodes []*node
+
+ node := n
+ for {
+ // Split node into two.
+ a, b := node.splitTwo(pageSize)
+ nodes = append(nodes, a)
+
+ // If we can't split then exit the loop.
+ if b == nil {
+ break
+ }
+
+ // Set node to b so it gets split on the next iteration.
+ node = b
+ }
+
+ return nodes
+}
+
+// splitTwo breaks up a node into two smaller nodes, if appropriate.
+// This should only be called from the split() function.
+func (n *node) splitTwo(pageSize int) (*node, *node) {
+ // Ignore the split if the page doesn't have at least enough nodes for
+ // two pages or if the nodes can fit in a single page.
+ if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
+ return n, nil
+ }
+
+ // Determine the threshold before starting a new node.
+ var fillPercent = n.bucket.FillPercent
+ if fillPercent < minFillPercent {
+ fillPercent = minFillPercent
+ } else if fillPercent > maxFillPercent {
+ fillPercent = maxFillPercent
+ }
+ threshold := int(float64(pageSize) * fillPercent)
+
+ // Determine split position and sizes of the two pages.
+ splitIndex, _ := n.splitIndex(threshold)
+
+ // Split node into two separate nodes.
+ // If there's no parent then we'll need to create one.
+ if n.parent == nil {
+ n.parent = &node{bucket: n.bucket, children: []*node{n}}
+ }
+
+ // Create a new node and add it to the parent.
+ next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+ n.parent.children = append(n.parent.children, next)
+
+ // Split inodes across two nodes.
+ next.inodes = n.inodes[splitIndex:]
+ n.inodes = n.inodes[:splitIndex]
+
+ // Update the statistics.
+ n.bucket.tx.stats.Split++
+
+ return n, next
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This is only be called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+ sz = pageHeaderSize
+
+ // Loop until we only have the minimum number of keys required for the second page.
+ for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+ index = i
+ inode := n.inodes[i]
+ elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+ // If we have at least the minimum number of keys and adding another
+ // node would put us over the threshold then exit and return.
+ if i >= minKeysPerPage && sz+elsize > threshold {
+ break
+ }
+
+ // Add the element size to the total size.
+ sz += elsize
+ }
+
+ return
+}
+
+// spill writes the nodes to dirty pages and splits nodes as it goes.
+// Returns an error if dirty pages cannot be allocated.
+func (n *node) spill() error {
+ var tx = n.bucket.tx
+ if n.spilled {
+ return nil
+ }
+
+ // Spill child nodes first. Child nodes can materialize sibling nodes in
+ // the case of split-merge so we cannot use a range loop. We have to check
+ // the children size on every loop iteration.
+ sort.Sort(n.children)
+ for i := 0; i < len(n.children); i++ {
+ if err := n.children[i].spill(); err != nil {
+ return err
+ }
+ }
+
+ // We no longer need the child list because it's only used for spill tracking.
+ n.children = nil
+
+ // Split nodes into appropriate sizes. The first node will always be n.
+ var nodes = n.split(tx.db.pageSize)
+ for _, node := range nodes {
+ // Add node's page to the freelist if it's not new.
+ if node.pgid > 0 {
+ tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
+ node.pgid = 0
+ }
+
+ // Allocate contiguous space for the node.
+ p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
+ if err != nil {
+ return err
+ }
+
+ // Write the node.
+ if p.id >= tx.meta.pgid {
+ panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
+ }
+ node.pgid = p.id
+ node.write(p)
+ node.spilled = true
+
+ // Insert into parent inodes.
+ if node.parent != nil {
+ var key = node.key
+ if key == nil {
+ key = node.inodes[0].key
+ }
+
+ node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
+ node.key = node.inodes[0].key
+ _assert(len(node.key) > 0, "spill: zero-length node key")
+ }
+
+ // Update the statistics.
+ tx.stats.Spill++
+ }
+
+ // If the root node split and created a new root then we need to spill that
+ // as well. We'll clear out the children to make sure it doesn't try to respill.
+ if n.parent != nil && n.parent.pgid == 0 {
+ n.children = nil
+ return n.parent.spill()
+ }
+
+ return nil
+}
+
+// rebalance attempts to combine the node with sibling nodes if the node fill
+// size is below a threshold or if there are not enough keys.
+func (n *node) rebalance() {
+ if !n.unbalanced {
+ return
+ }
+ n.unbalanced = false
+
+ // Update statistics.
+ n.bucket.tx.stats.Rebalance++
+
+ // Ignore if node is above threshold (25%) and has enough keys.
+ var threshold = n.bucket.tx.db.pageSize / 4
+ if n.size() > threshold && len(n.inodes) > n.minKeys() {
+ return
+ }
+
+ // Root node has special handling.
+ if n.parent == nil {
+ // If root node is a branch and only has one node then collapse it.
+ if !n.isLeaf && len(n.inodes) == 1 {
+ // Move root's child up.
+ child := n.bucket.node(n.inodes[0].pgid, n)
+ n.isLeaf = child.isLeaf
+ n.inodes = child.inodes[:]
+ n.children = child.children
+
+ // Reparent all child nodes being moved.
+ for _, inode := range n.inodes {
+ if child, ok := n.bucket.nodes[inode.pgid]; ok {
+ child.parent = n
+ }
+ }
+
+ // Remove old child.
+ child.parent = nil
+ delete(n.bucket.nodes, child.pgid)
+ child.free()
+ }
+
+ return
+ }
+
+ // If node has no keys then just remove it.
+ if n.numChildren() == 0 {
+ n.parent.del(n.key)
+ n.parent.removeChild(n)
+ delete(n.bucket.nodes, n.pgid)
+ n.free()
+ n.parent.rebalance()
+ return
+ }
+
+ _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
+
+ // Destination node is right sibling if idx == 0, otherwise left sibling.
+ var target *node
+ var useNextSibling = (n.parent.childIndex(n) == 0)
+ if useNextSibling {
+ target = n.nextSibling()
+ } else {
+ target = n.prevSibling()
+ }
+
+ // If both this node and the target node are too small then merge them.
+ if useNextSibling {
+ // Reparent all child nodes being moved.
+ for _, inode := range target.inodes {
+ if child, ok := n.bucket.nodes[inode.pgid]; ok {
+ child.parent.removeChild(child)
+ child.parent = n
+ child.parent.children = append(child.parent.children, child)
+ }
+ }
+
+ // Copy over inodes from target and remove target.
+ n.inodes = append(n.inodes, target.inodes...)
+ n.parent.del(target.key)
+ n.parent.removeChild(target)
+ delete(n.bucket.nodes, target.pgid)
+ target.free()
+ } else {
+ // Reparent all child nodes being moved.
+ for _, inode := range n.inodes {
+ if child, ok := n.bucket.nodes[inode.pgid]; ok {
+ child.parent.removeChild(child)
+ child.parent = target
+ child.parent.children = append(child.parent.children, child)
+ }
+ }
+
+ // Copy over inodes to target and remove node.
+ target.inodes = append(target.inodes, n.inodes...)
+ n.parent.del(n.key)
+ n.parent.removeChild(n)
+ delete(n.bucket.nodes, n.pgid)
+ n.free()
+ }
+
+ // Either this node or the target node was deleted from the parent so rebalance it.
+ n.parent.rebalance()
+}
+
+// removes a node from the list of in-memory children.
+// This does not affect the inodes.
+func (n *node) removeChild(target *node) {
+ for i, child := range n.children {
+ if child == target {
+ n.children = append(n.children[:i], n.children[i+1:]...)
+ return
+ }
+ }
+}
+
+// dereference causes the node to copy all its inode key/value references to heap memory.
+// This is required when the mmap is reallocated so inodes are not pointing to stale data.
+func (n *node) dereference() {
+ if n.key != nil {
+ key := make([]byte, len(n.key))
+ copy(key, n.key)
+ n.key = key
+ _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
+ }
+
+ for i := range n.inodes {
+ inode := &n.inodes[i]
+
+ key := make([]byte, len(inode.key))
+ copy(key, inode.key)
+ inode.key = key
+ _assert(len(inode.key) > 0, "dereference: zero-length inode key")
+
+ value := make([]byte, len(inode.value))
+ copy(value, inode.value)
+ inode.value = value
+ }
+
+ // Recursively dereference children.
+ for _, child := range n.children {
+ child.dereference()
+ }
+
+ // Update statistics.
+ n.bucket.tx.stats.NodeDeref++
+}
+
+// free adds the node's underlying page to the freelist.
+func (n *node) free() {
+ if n.pgid != 0 {
+ n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
+ n.pgid = 0
+ }
+}
+
+// dump writes the contents of the node to STDERR for debugging purposes.
+/*
+func (n *node) dump() {
+ // Write node header.
+ var typ = "branch"
+ if n.isLeaf {
+ typ = "leaf"
+ }
+ warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
+
+ // Write out abbreviated version of each item.
+ for _, item := range n.inodes {
+ if n.isLeaf {
+ if item.flags&bucketLeafFlag != 0 {
+ bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
+ warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
+ } else {
+ warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
+ }
+ } else {
+ warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
+ }
+ }
+ warn("")
+}
+*/
+
+type nodes []*node
+
+func (s nodes) Len() int { return len(s) }
+func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
+// inode represents an internal node inside of a node.
+// It can be used to point to elements in a page or point
+// to an element which hasn't been added to a page yet.
+type inode struct {
+ flags uint32
+ pgid pgid
+ key []byte
+ value []byte
+}
+
+type inodes []inode