diff options
Diffstat (limited to 'vendor/github.com/etcd-io/bbolt/node.go')
-rw-r--r-- | vendor/github.com/etcd-io/bbolt/node.go | 604 |
1 files changed, 604 insertions, 0 deletions
diff --git a/vendor/github.com/etcd-io/bbolt/node.go b/vendor/github.com/etcd-io/bbolt/node.go new file mode 100644 index 000000000..6c3fa553e --- /dev/null +++ b/vendor/github.com/etcd-io/bbolt/node.go @@ -0,0 +1,604 @@ +package bbolt + +import ( + "bytes" + "fmt" + "sort" + "unsafe" +) + +// node represents an in-memory, deserialized page. +type node struct { + bucket *Bucket + isLeaf bool + unbalanced bool + spilled bool + key []byte + pgid pgid + parent *node + children nodes + inodes inodes +} + +// root returns the top-level node this node is attached to. +func (n *node) root() *node { + if n.parent == nil { + return n + } + return n.parent.root() +} + +// minKeys returns the minimum number of inodes this node should have. +func (n *node) minKeys() int { + if n.isLeaf { + return 1 + } + return 2 +} + +// size returns the size of the node after serialization. +func (n *node) size() int { + sz, elsz := pageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + len(item.key) + len(item.value) + } + return sz +} + +// sizeLessThan returns true if the node is less than a given size. +// This is an optimization to avoid calculating a large node when we only need +// to know if it fits inside a certain page size. +func (n *node) sizeLessThan(v int) bool { + sz, elsz := pageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + len(item.key) + len(item.value) + if sz >= v { + return false + } + } + return true +} + +// pageElementSize returns the size of each page element based on the type of node. +func (n *node) pageElementSize() int { + if n.isLeaf { + return leafPageElementSize + } + return branchPageElementSize +} + +// childAt returns the child node at a given index. +func (n *node) childAt(index int) *node { + if n.isLeaf { + panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) + } + return n.bucket.node(n.inodes[index].pgid, n) +} + +// childIndex returns the index of a given child node. +func (n *node) childIndex(child *node) int { + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) + return index +} + +// numChildren returns the number of children. +func (n *node) numChildren() int { + return len(n.inodes) +} + +// nextSibling returns the next node with the same parent. +func (n *node) nextSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index >= n.parent.numChildren()-1 { + return nil + } + return n.parent.childAt(index + 1) +} + +// prevSibling returns the previous node with the same parent. +func (n *node) prevSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index == 0 { + return nil + } + return n.parent.childAt(index - 1) +} + +// put inserts a key/value. +func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { + if pgid >= n.bucket.tx.meta.pgid { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) + } else if len(oldKey) <= 0 { + panic("put: zero-length old key") + } else if len(newKey) <= 0 { + panic("put: zero-length new key") + } + + // Find insertion index. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) + + // Add capacity and shift nodes if we don't have an exact match and need to insert. + exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) + if !exact { + n.inodes = append(n.inodes, inode{}) + copy(n.inodes[index+1:], n.inodes[index:]) + } + + inode := &n.inodes[index] + inode.flags = flags + inode.key = newKey + inode.value = value + inode.pgid = pgid + _assert(len(inode.key) > 0, "put: zero-length inode key") +} + +// del removes a key from the node. +func (n *node) del(key []byte) { + // Find index of key. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) + + // Exit if the key isn't found. + if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { + return + } + + // Delete inode from the node. + n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) + + // Mark the node as needing rebalancing. + n.unbalanced = true +} + +// read initializes the node from a page. +func (n *node) read(p *page) { + n.pgid = p.id + n.isLeaf = ((p.flags & leafPageFlag) != 0) + n.inodes = make(inodes, int(p.count)) + + for i := 0; i < int(p.count); i++ { + inode := &n.inodes[i] + if n.isLeaf { + elem := p.leafPageElement(uint16(i)) + inode.flags = elem.flags + inode.key = elem.key() + inode.value = elem.value() + } else { + elem := p.branchPageElement(uint16(i)) + inode.pgid = elem.pgid + inode.key = elem.key() + } + _assert(len(inode.key) > 0, "read: zero-length inode key") + } + + // Save first key so we can find the node in the parent when we spill. + if len(n.inodes) > 0 { + n.key = n.inodes[0].key + _assert(len(n.key) > 0, "read: zero-length node key") + } else { + n.key = nil + } +} + +// write writes the items onto one or more pages. +func (n *node) write(p *page) { + // Initialize page. + if n.isLeaf { + p.flags |= leafPageFlag + } else { + p.flags |= branchPageFlag + } + + if len(n.inodes) >= 0xFFFF { + panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) + } + p.count = uint16(len(n.inodes)) + + // Stop here if there are no items to write. + if p.count == 0 { + return + } + + // Loop over each item and write it to the page. + b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] + for i, item := range n.inodes { + _assert(len(item.key) > 0, "write: zero-length inode key") + + // Write the page element. + if n.isLeaf { + elem := p.leafPageElement(uint16(i)) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) + elem.flags = item.flags + elem.ksize = uint32(len(item.key)) + elem.vsize = uint32(len(item.value)) + } else { + elem := p.branchPageElement(uint16(i)) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) + elem.ksize = uint32(len(item.key)) + elem.pgid = item.pgid + _assert(elem.pgid != p.id, "write: circular dependency occurred") + } + + // If the length of key+value is larger than the max allocation size + // then we need to reallocate the byte array pointer. + // + // See: https://github.com/boltdb/bolt/pull/335 + klen, vlen := len(item.key), len(item.value) + if len(b) < klen+vlen { + b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:] + } + + // Write data for the element to the end of the page. + copy(b[0:], item.key) + b = b[klen:] + copy(b[0:], item.value) + b = b[vlen:] + } + + // DEBUG ONLY: n.dump() +} + +// split breaks up a node into multiple smaller nodes, if appropriate. +// This should only be called from the spill() function. +func (n *node) split(pageSize int) []*node { + var nodes []*node + + node := n + for { + // Split node into two. + a, b := node.splitTwo(pageSize) + nodes = append(nodes, a) + + // If we can't split then exit the loop. + if b == nil { + break + } + + // Set node to b so it gets split on the next iteration. + node = b + } + + return nodes +} + +// splitTwo breaks up a node into two smaller nodes, if appropriate. +// This should only be called from the split() function. +func (n *node) splitTwo(pageSize int) (*node, *node) { + // Ignore the split if the page doesn't have at least enough nodes for + // two pages or if the nodes can fit in a single page. + if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { + return n, nil + } + + // Determine the threshold before starting a new node. + var fillPercent = n.bucket.FillPercent + if fillPercent < minFillPercent { + fillPercent = minFillPercent + } else if fillPercent > maxFillPercent { + fillPercent = maxFillPercent + } + threshold := int(float64(pageSize) * fillPercent) + + // Determine split position and sizes of the two pages. + splitIndex, _ := n.splitIndex(threshold) + + // Split node into two separate nodes. + // If there's no parent then we'll need to create one. + if n.parent == nil { + n.parent = &node{bucket: n.bucket, children: []*node{n}} + } + + // Create a new node and add it to the parent. + next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} + n.parent.children = append(n.parent.children, next) + + // Split inodes across two nodes. + next.inodes = n.inodes[splitIndex:] + n.inodes = n.inodes[:splitIndex] + + // Update the statistics. + n.bucket.tx.stats.Split++ + + return n, next +} + +// splitIndex finds the position where a page will fill a given threshold. +// It returns the index as well as the size of the first page. +// This is only be called from split(). +func (n *node) splitIndex(threshold int) (index, sz int) { + sz = pageHeaderSize + + // Loop until we only have the minimum number of keys required for the second page. + for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { + index = i + inode := n.inodes[i] + elsize := n.pageElementSize() + len(inode.key) + len(inode.value) + + // If we have at least the minimum number of keys and adding another + // node would put us over the threshold then exit and return. + if i >= minKeysPerPage && sz+elsize > threshold { + break + } + + // Add the element size to the total size. + sz += elsize + } + + return +} + +// spill writes the nodes to dirty pages and splits nodes as it goes. +// Returns an error if dirty pages cannot be allocated. +func (n *node) spill() error { + var tx = n.bucket.tx + if n.spilled { + return nil + } + + // Spill child nodes first. Child nodes can materialize sibling nodes in + // the case of split-merge so we cannot use a range loop. We have to check + // the children size on every loop iteration. + sort.Sort(n.children) + for i := 0; i < len(n.children); i++ { + if err := n.children[i].spill(); err != nil { + return err + } + } + + // We no longer need the child list because it's only used for spill tracking. + n.children = nil + + // Split nodes into appropriate sizes. The first node will always be n. + var nodes = n.split(tx.db.pageSize) + for _, node := range nodes { + // Add node's page to the freelist if it's not new. + if node.pgid > 0 { + tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) + node.pgid = 0 + } + + // Allocate contiguous space for the node. + p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) + if err != nil { + return err + } + + // Write the node. + if p.id >= tx.meta.pgid { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) + } + node.pgid = p.id + node.write(p) + node.spilled = true + + // Insert into parent inodes. + if node.parent != nil { + var key = node.key + if key == nil { + key = node.inodes[0].key + } + + node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) + node.key = node.inodes[0].key + _assert(len(node.key) > 0, "spill: zero-length node key") + } + + // Update the statistics. + tx.stats.Spill++ + } + + // If the root node split and created a new root then we need to spill that + // as well. We'll clear out the children to make sure it doesn't try to respill. + if n.parent != nil && n.parent.pgid == 0 { + n.children = nil + return n.parent.spill() + } + + return nil +} + +// rebalance attempts to combine the node with sibling nodes if the node fill +// size is below a threshold or if there are not enough keys. +func (n *node) rebalance() { + if !n.unbalanced { + return + } + n.unbalanced = false + + // Update statistics. + n.bucket.tx.stats.Rebalance++ + + // Ignore if node is above threshold (25%) and has enough keys. + var threshold = n.bucket.tx.db.pageSize / 4 + if n.size() > threshold && len(n.inodes) > n.minKeys() { + return + } + + // Root node has special handling. + if n.parent == nil { + // If root node is a branch and only has one node then collapse it. + if !n.isLeaf && len(n.inodes) == 1 { + // Move root's child up. + child := n.bucket.node(n.inodes[0].pgid, n) + n.isLeaf = child.isLeaf + n.inodes = child.inodes[:] + n.children = child.children + + // Reparent all child nodes being moved. + for _, inode := range n.inodes { + if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent = n + } + } + + // Remove old child. + child.parent = nil + delete(n.bucket.nodes, child.pgid) + child.free() + } + + return + } + + // If node has no keys then just remove it. + if n.numChildren() == 0 { + n.parent.del(n.key) + n.parent.removeChild(n) + delete(n.bucket.nodes, n.pgid) + n.free() + n.parent.rebalance() + return + } + + _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") + + // Destination node is right sibling if idx == 0, otherwise left sibling. + var target *node + var useNextSibling = (n.parent.childIndex(n) == 0) + if useNextSibling { + target = n.nextSibling() + } else { + target = n.prevSibling() + } + + // If both this node and the target node are too small then merge them. + if useNextSibling { + // Reparent all child nodes being moved. + for _, inode := range target.inodes { + if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent.removeChild(child) + child.parent = n + child.parent.children = append(child.parent.children, child) + } + } + + // Copy over inodes from target and remove target. + n.inodes = append(n.inodes, target.inodes...) + n.parent.del(target.key) + n.parent.removeChild(target) + delete(n.bucket.nodes, target.pgid) + target.free() + } else { + // Reparent all child nodes being moved. + for _, inode := range n.inodes { + if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent.removeChild(child) + child.parent = target + child.parent.children = append(child.parent.children, child) + } + } + + // Copy over inodes to target and remove node. + target.inodes = append(target.inodes, n.inodes...) + n.parent.del(n.key) + n.parent.removeChild(n) + delete(n.bucket.nodes, n.pgid) + n.free() + } + + // Either this node or the target node was deleted from the parent so rebalance it. + n.parent.rebalance() +} + +// removes a node from the list of in-memory children. +// This does not affect the inodes. +func (n *node) removeChild(target *node) { + for i, child := range n.children { + if child == target { + n.children = append(n.children[:i], n.children[i+1:]...) + return + } + } +} + +// dereference causes the node to copy all its inode key/value references to heap memory. +// This is required when the mmap is reallocated so inodes are not pointing to stale data. +func (n *node) dereference() { + if n.key != nil { + key := make([]byte, len(n.key)) + copy(key, n.key) + n.key = key + _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") + } + + for i := range n.inodes { + inode := &n.inodes[i] + + key := make([]byte, len(inode.key)) + copy(key, inode.key) + inode.key = key + _assert(len(inode.key) > 0, "dereference: zero-length inode key") + + value := make([]byte, len(inode.value)) + copy(value, inode.value) + inode.value = value + } + + // Recursively dereference children. + for _, child := range n.children { + child.dereference() + } + + // Update statistics. + n.bucket.tx.stats.NodeDeref++ +} + +// free adds the node's underlying page to the freelist. +func (n *node) free() { + if n.pgid != 0 { + n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) + n.pgid = 0 + } +} + +// dump writes the contents of the node to STDERR for debugging purposes. +/* +func (n *node) dump() { + // Write node header. + var typ = "branch" + if n.isLeaf { + typ = "leaf" + } + warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) + + // Write out abbreviated version of each item. + for _, item := range n.inodes { + if n.isLeaf { + if item.flags&bucketLeafFlag != 0 { + bucket := (*bucket)(unsafe.Pointer(&item.value[0])) + warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) + } else { + warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) + } + } else { + warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) + } + } + warn("") +} +*/ + +type nodes []*node + +func (s nodes) Len() int { return len(s) } +func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 } + +// inode represents an internal node inside of a node. +// It can be used to point to elements in a page or point +// to an element which hasn't been added to a page yet. +type inode struct { + flags uint32 + pgid pgid + key []byte + value []byte +} + +type inodes []inode |