From a031b83a09a8628435317a03f199cdc18b78262f Mon Sep 17 00:00:00 2001 From: Matthew Heon Date: Wed, 1 Nov 2017 11:24:59 -0400 Subject: Initial checkin from CRI-O repo Signed-off-by: Matthew Heon --- vendor/github.com/vbatts/tar-split/LICENSE | 28 + vendor/github.com/vbatts/tar-split/README.md | 137 +++ .../vbatts/tar-split/archive/tar/common.go | 340 +++++++ .../vbatts/tar-split/archive/tar/reader.go | 1064 ++++++++++++++++++++ .../vbatts/tar-split/archive/tar/stat_atim.go | 20 + .../vbatts/tar-split/archive/tar/stat_atimespec.go | 20 + .../vbatts/tar-split/archive/tar/stat_unix.go | 32 + .../vbatts/tar-split/archive/tar/writer.go | 416 ++++++++ .../github.com/vbatts/tar-split/tar/asm/README.md | 44 + .../vbatts/tar-split/tar/asm/assemble.go | 130 +++ .../vbatts/tar-split/tar/asm/disassemble.go | 141 +++ vendor/github.com/vbatts/tar-split/tar/asm/doc.go | 9 + .../github.com/vbatts/tar-split/tar/storage/doc.go | 12 + .../vbatts/tar-split/tar/storage/entry.go | 78 ++ .../vbatts/tar-split/tar/storage/getter.go | 104 ++ .../vbatts/tar-split/tar/storage/packer.go | 127 +++ 16 files changed, 2702 insertions(+) create mode 100644 vendor/github.com/vbatts/tar-split/LICENSE create mode 100644 vendor/github.com/vbatts/tar-split/README.md create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/common.go create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/reader.go create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/stat_atim.go create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/stat_atimespec.go create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/stat_unix.go create mode 100644 vendor/github.com/vbatts/tar-split/archive/tar/writer.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/asm/README.md create mode 100644 vendor/github.com/vbatts/tar-split/tar/asm/assemble.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/asm/doc.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/storage/doc.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/storage/entry.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/storage/getter.go create mode 100644 vendor/github.com/vbatts/tar-split/tar/storage/packer.go (limited to 'vendor/github.com/vbatts/tar-split') diff --git a/vendor/github.com/vbatts/tar-split/LICENSE b/vendor/github.com/vbatts/tar-split/LICENSE new file mode 100644 index 000000000..ca03685b1 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2015 Vincent Batts, Raleigh, NC, USA + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/vbatts/tar-split/README.md b/vendor/github.com/vbatts/tar-split/README.md new file mode 100644 index 000000000..4c544d823 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/README.md @@ -0,0 +1,137 @@ +# tar-split + +[![Build Status](https://travis-ci.org/vbatts/tar-split.svg?branch=master)](https://travis-ci.org/vbatts/tar-split) + +Pristinely disassembling a tar archive, and stashing needed raw bytes and offsets to reassemble a validating original archive. + +## Docs + +Code API for libraries provided by `tar-split`: + +* https://godoc.org/github.com/vbatts/tar-split/tar/asm +* https://godoc.org/github.com/vbatts/tar-split/tar/storage +* https://godoc.org/github.com/vbatts/tar-split/archive/tar + +## Install + +The command line utilitiy is installable via: + +```bash +go get github.com/vbatts/tar-split/cmd/tar-split +``` + +## Usage + +For cli usage, see its [README.md](cmd/tar-split/README.md). +For the library see the [docs](#docs) + +## Demo + +### Basic disassembly and assembly + +This demonstrates the `tar-split` command and how to assemble a tar archive from the `tar-data.json.gz` + + +![basic cmd demo thumbnail](https://i.ytimg.com/vi/vh5wyjIOBtc/2.jpg?time=1445027151805) +[youtube video of basic command demo](https://youtu.be/vh5wyjIOBtc) + +### Docker layer preservation + +This demonstrates the tar-split integration for docker-1.8. Providing consistent tar archives for the image layer content. + +![docker tar-split demo](https://i.ytimg.com/vi_webp/vh5wyjIOBtc/default.webp) +[youtube vide of docker layer checksums](https://youtu.be/tV_Dia8E8xw) + +## Caveat + +Eventually this should detect TARs that this is not possible with. + +For example stored sparse files that have "holes" in them, will be read as a +contiguous file, though the archive contents may be recorded in sparse format. +Therefore when adding the file payload to a reassembled tar, to achieve +identical output, the file payload would need be precisely re-sparsified. This +is not something I seek to fix imediately, but would rather have an alert that +precise reassembly is not possible. +(see more http://www.gnu.org/software/tar/manual/html_node/Sparse-Formats.html) + + +Other caveat, while tar archives support having multiple file entries for the +same path, we will not support this feature. If there are more than one entries +with the same path, expect an err (like `ErrDuplicatePath`) or a resulting tar +stream that does not validate your original checksum/signature. + +## Contract + +Do not break the API of stdlib `archive/tar` in our fork (ideally find an upstream mergeable solution). + +## Std Version + +The version of golang stdlib `archive/tar` is from go1.6 +It is minimally extended to expose the raw bytes of the TAR, rather than just the marshalled headers and file stream. + + +## Design + +See the [design](concept/DESIGN.md). + +## Stored Metadata + +Since the raw bytes of the headers and padding are stored, you may be wondering +what the size implications are. The headers are at least 512 bytes per +file (sometimes more), at least 1024 null bytes on the end, and then various +padding. This makes for a constant linear growth in the stored metadata, with a +naive storage implementation. + +First we'll get an archive to work with. For repeatability, we'll make an +archive from what you've just cloned: + +```bash +git archive --format=tar -o tar-split.tar HEAD . +``` + +```bash +$ go get github.com/vbatts/tar-split/cmd/tar-split +$ tar-split checksize ./tar-split.tar +inspecting "tar-split.tar" (size 210k) + -- number of files: 50 + -- size of metadata uncompressed: 53k + -- size of gzip compressed metadata: 3k +``` + +So assuming you've managed the extraction of the archive yourself, for reuse of +the file payloads from a relative path, then the only additional storage +implications are as little as 3kb. + +But let's look at a larger archive, with many files. + +```bash +$ ls -sh ./d.tar +1.4G ./d.tar +$ tar-split checksize ~/d.tar +inspecting "/home/vbatts/d.tar" (size 1420749k) + -- number of files: 38718 + -- size of metadata uncompressed: 43261k + -- size of gzip compressed metadata: 2251k +``` + +Here, an archive with 38,718 files has a compressed footprint of about 2mb. + +Rolling the null bytes on the end of the archive, we will assume a +bytes-per-file rate for the storage implications. + +| uncompressed | compressed | +| :----------: | :--------: | +| ~ 1kb per/file | 0.06kb per/file | + + +## What's Next? + +* More implementations of storage Packer and Unpacker +* More implementations of FileGetter and FilePutter +* would be interesting to have an assembler stream that implements `io.Seeker` + + +## License + +See [LICENSE](LICENSE) + diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/common.go b/vendor/github.com/vbatts/tar-split/archive/tar/common.go new file mode 100644 index 000000000..36f4e2398 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/common.go @@ -0,0 +1,340 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package tar implements access to tar archives. +// It aims to cover most of the variations, including those produced +// by GNU and BSD tars. +// +// References: +// http://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5 +// http://www.gnu.org/software/tar/manual/html_node/Standard.html +// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html +package tar + +import ( + "bytes" + "errors" + "fmt" + "os" + "path" + "time" +) + +const ( + blockSize = 512 + + // Types + TypeReg = '0' // regular file + TypeRegA = '\x00' // regular file + TypeLink = '1' // hard link + TypeSymlink = '2' // symbolic link + TypeChar = '3' // character device node + TypeBlock = '4' // block device node + TypeDir = '5' // directory + TypeFifo = '6' // fifo node + TypeCont = '7' // reserved + TypeXHeader = 'x' // extended header + TypeXGlobalHeader = 'g' // global extended header + TypeGNULongName = 'L' // Next file has a long name + TypeGNULongLink = 'K' // Next file symlinks to a file w/ a long name + TypeGNUSparse = 'S' // sparse file +) + +// A Header represents a single header in a tar archive. +// Some fields may not be populated. +type Header struct { + Name string // name of header file entry + Mode int64 // permission and mode bits + Uid int // user id of owner + Gid int // group id of owner + Size int64 // length in bytes + ModTime time.Time // modified time + Typeflag byte // type of header entry + Linkname string // target name of link + Uname string // user name of owner + Gname string // group name of owner + Devmajor int64 // major number of character or block device + Devminor int64 // minor number of character or block device + AccessTime time.Time // access time + ChangeTime time.Time // status change time + Xattrs map[string]string +} + +// File name constants from the tar spec. +const ( + fileNameSize = 100 // Maximum number of bytes in a standard tar name. + fileNamePrefixSize = 155 // Maximum number of ustar extension bytes. +) + +// FileInfo returns an os.FileInfo for the Header. +func (h *Header) FileInfo() os.FileInfo { + return headerFileInfo{h} +} + +// headerFileInfo implements os.FileInfo. +type headerFileInfo struct { + h *Header +} + +func (fi headerFileInfo) Size() int64 { return fi.h.Size } +func (fi headerFileInfo) IsDir() bool { return fi.Mode().IsDir() } +func (fi headerFileInfo) ModTime() time.Time { return fi.h.ModTime } +func (fi headerFileInfo) Sys() interface{} { return fi.h } + +// Name returns the base name of the file. +func (fi headerFileInfo) Name() string { + if fi.IsDir() { + return path.Base(path.Clean(fi.h.Name)) + } + return path.Base(fi.h.Name) +} + +// Mode returns the permission and mode bits for the headerFileInfo. +func (fi headerFileInfo) Mode() (mode os.FileMode) { + // Set file permission bits. + mode = os.FileMode(fi.h.Mode).Perm() + + // Set setuid, setgid and sticky bits. + if fi.h.Mode&c_ISUID != 0 { + // setuid + mode |= os.ModeSetuid + } + if fi.h.Mode&c_ISGID != 0 { + // setgid + mode |= os.ModeSetgid + } + if fi.h.Mode&c_ISVTX != 0 { + // sticky + mode |= os.ModeSticky + } + + // Set file mode bits. + // clear perm, setuid, setgid and sticky bits. + m := os.FileMode(fi.h.Mode) &^ 07777 + if m == c_ISDIR { + // directory + mode |= os.ModeDir + } + if m == c_ISFIFO { + // named pipe (FIFO) + mode |= os.ModeNamedPipe + } + if m == c_ISLNK { + // symbolic link + mode |= os.ModeSymlink + } + if m == c_ISBLK { + // device file + mode |= os.ModeDevice + } + if m == c_ISCHR { + // Unix character device + mode |= os.ModeDevice + mode |= os.ModeCharDevice + } + if m == c_ISSOCK { + // Unix domain socket + mode |= os.ModeSocket + } + + switch fi.h.Typeflag { + case TypeSymlink: + // symbolic link + mode |= os.ModeSymlink + case TypeChar: + // character device node + mode |= os.ModeDevice + mode |= os.ModeCharDevice + case TypeBlock: + // block device node + mode |= os.ModeDevice + case TypeDir: + // directory + mode |= os.ModeDir + case TypeFifo: + // fifo node + mode |= os.ModeNamedPipe + } + + return mode +} + +// sysStat, if non-nil, populates h from system-dependent fields of fi. +var sysStat func(fi os.FileInfo, h *Header) error + +// Mode constants from the tar spec. +const ( + c_ISUID = 04000 // Set uid + c_ISGID = 02000 // Set gid + c_ISVTX = 01000 // Save text (sticky bit) + c_ISDIR = 040000 // Directory + c_ISFIFO = 010000 // FIFO + c_ISREG = 0100000 // Regular file + c_ISLNK = 0120000 // Symbolic link + c_ISBLK = 060000 // Block special file + c_ISCHR = 020000 // Character special file + c_ISSOCK = 0140000 // Socket +) + +// Keywords for the PAX Extended Header +const ( + paxAtime = "atime" + paxCharset = "charset" + paxComment = "comment" + paxCtime = "ctime" // please note that ctime is not a valid pax header. + paxGid = "gid" + paxGname = "gname" + paxLinkpath = "linkpath" + paxMtime = "mtime" + paxPath = "path" + paxSize = "size" + paxUid = "uid" + paxUname = "uname" + paxXattr = "SCHILY.xattr." + paxNone = "" +) + +// FileInfoHeader creates a partially-populated Header from fi. +// If fi describes a symlink, FileInfoHeader records link as the link target. +// If fi describes a directory, a slash is appended to the name. +// Because os.FileInfo's Name method returns only the base name of +// the file it describes, it may be necessary to modify the Name field +// of the returned header to provide the full path name of the file. +func FileInfoHeader(fi os.FileInfo, link string) (*Header, error) { + if fi == nil { + return nil, errors.New("tar: FileInfo is nil") + } + fm := fi.Mode() + h := &Header{ + Name: fi.Name(), + ModTime: fi.ModTime(), + Mode: int64(fm.Perm()), // or'd with c_IS* constants later + } + switch { + case fm.IsRegular(): + h.Mode |= c_ISREG + h.Typeflag = TypeReg + h.Size = fi.Size() + case fi.IsDir(): + h.Typeflag = TypeDir + h.Mode |= c_ISDIR + h.Name += "/" + case fm&os.ModeSymlink != 0: + h.Typeflag = TypeSymlink + h.Mode |= c_ISLNK + h.Linkname = link + case fm&os.ModeDevice != 0: + if fm&os.ModeCharDevice != 0 { + h.Mode |= c_ISCHR + h.Typeflag = TypeChar + } else { + h.Mode |= c_ISBLK + h.Typeflag = TypeBlock + } + case fm&os.ModeNamedPipe != 0: + h.Typeflag = TypeFifo + h.Mode |= c_ISFIFO + case fm&os.ModeSocket != 0: + h.Mode |= c_ISSOCK + default: + return nil, fmt.Errorf("archive/tar: unknown file mode %v", fm) + } + if fm&os.ModeSetuid != 0 { + h.Mode |= c_ISUID + } + if fm&os.ModeSetgid != 0 { + h.Mode |= c_ISGID + } + if fm&os.ModeSticky != 0 { + h.Mode |= c_ISVTX + } + // If possible, populate additional fields from OS-specific + // FileInfo fields. + if sys, ok := fi.Sys().(*Header); ok { + // This FileInfo came from a Header (not the OS). Use the + // original Header to populate all remaining fields. + h.Uid = sys.Uid + h.Gid = sys.Gid + h.Uname = sys.Uname + h.Gname = sys.Gname + h.AccessTime = sys.AccessTime + h.ChangeTime = sys.ChangeTime + if sys.Xattrs != nil { + h.Xattrs = make(map[string]string) + for k, v := range sys.Xattrs { + h.Xattrs[k] = v + } + } + if sys.Typeflag == TypeLink { + // hard link + h.Typeflag = TypeLink + h.Size = 0 + h.Linkname = sys.Linkname + } + } + if sysStat != nil { + return h, sysStat(fi, h) + } + return h, nil +} + +var zeroBlock = make([]byte, blockSize) + +// POSIX specifies a sum of the unsigned byte values, but the Sun tar uses signed byte values. +// We compute and return both. +func checksum(header []byte) (unsigned int64, signed int64) { + for i := 0; i < len(header); i++ { + if i == 148 { + // The chksum field (header[148:156]) is special: it should be treated as space bytes. + unsigned += ' ' * 8 + signed += ' ' * 8 + i += 7 + continue + } + unsigned += int64(header[i]) + signed += int64(int8(header[i])) + } + return +} + +type slicer []byte + +func (sp *slicer) next(n int) (b []byte) { + s := *sp + b, *sp = s[0:n], s[n:] + return +} + +func isASCII(s string) bool { + for _, c := range s { + if c >= 0x80 { + return false + } + } + return true +} + +func toASCII(s string) string { + if isASCII(s) { + return s + } + var buf bytes.Buffer + for _, c := range s { + if c < 0x80 { + buf.WriteByte(byte(c)) + } + } + return buf.String() +} + +// isHeaderOnlyType checks if the given type flag is of the type that has no +// data section even if a size is specified. +func isHeaderOnlyType(flag byte) bool { + switch flag { + case TypeLink, TypeSymlink, TypeChar, TypeBlock, TypeDir, TypeFifo: + return true + default: + return false + } +} diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/reader.go b/vendor/github.com/vbatts/tar-split/archive/tar/reader.go new file mode 100644 index 000000000..adf32122e --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/reader.go @@ -0,0 +1,1064 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tar + +// TODO(dsymonds): +// - pax extensions + +import ( + "bytes" + "errors" + "io" + "io/ioutil" + "math" + "os" + "strconv" + "strings" + "time" +) + +var ( + ErrHeader = errors.New("archive/tar: invalid tar header") +) + +const maxNanoSecondIntSize = 9 + +// A Reader provides sequential access to the contents of a tar archive. +// A tar archive consists of a sequence of files. +// The Next method advances to the next file in the archive (including the first), +// and then it can be treated as an io.Reader to access the file's data. +type Reader struct { + r io.Reader + err error + pad int64 // amount of padding (ignored) after current file entry + curr numBytesReader // reader for current file entry + hdrBuff [blockSize]byte // buffer to use in readHeader + + RawAccounting bool // Whether to enable the access needed to reassemble the tar from raw bytes. Some performance/memory hit for this. + rawBytes *bytes.Buffer // last raw bits +} + +type parser struct { + err error // Last error seen +} + +// RawBytes accesses the raw bytes of the archive, apart from the file payload itself. +// This includes the header and padding. +// +// This call resets the current rawbytes buffer +// +// Only when RawAccounting is enabled, otherwise this returns nil +func (tr *Reader) RawBytes() []byte { + if !tr.RawAccounting { + return nil + } + if tr.rawBytes == nil { + tr.rawBytes = bytes.NewBuffer(nil) + } + // if we've read them, then flush them. + defer tr.rawBytes.Reset() + return tr.rawBytes.Bytes() +} + +// A numBytesReader is an io.Reader with a numBytes method, returning the number +// of bytes remaining in the underlying encoded data. +type numBytesReader interface { + io.Reader + numBytes() int64 +} + +// A regFileReader is a numBytesReader for reading file data from a tar archive. +type regFileReader struct { + r io.Reader // underlying reader + nb int64 // number of unread bytes for current file entry +} + +// A sparseFileReader is a numBytesReader for reading sparse file data from a +// tar archive. +type sparseFileReader struct { + rfr numBytesReader // Reads the sparse-encoded file data + sp []sparseEntry // The sparse map for the file + pos int64 // Keeps track of file position + total int64 // Total size of the file +} + +// A sparseEntry holds a single entry in a sparse file's sparse map. +// +// Sparse files are represented using a series of sparseEntrys. +// Despite the name, a sparseEntry represents an actual data fragment that +// references data found in the underlying archive stream. All regions not +// covered by a sparseEntry are logically filled with zeros. +// +// For example, if the underlying raw file contains the 10-byte data: +// var compactData = "abcdefgh" +// +// And the sparse map has the following entries: +// var sp = []sparseEntry{ +// {offset: 2, numBytes: 5} // Data fragment for [2..7] +// {offset: 18, numBytes: 3} // Data fragment for [18..21] +// } +// +// Then the content of the resulting sparse file with a "real" size of 25 is: +// var sparseData = "\x00"*2 + "abcde" + "\x00"*11 + "fgh" + "\x00"*4 +type sparseEntry struct { + offset int64 // Starting position of the fragment + numBytes int64 // Length of the fragment +} + +// Keywords for GNU sparse files in a PAX extended header +const ( + paxGNUSparseNumBlocks = "GNU.sparse.numblocks" + paxGNUSparseOffset = "GNU.sparse.offset" + paxGNUSparseNumBytes = "GNU.sparse.numbytes" + paxGNUSparseMap = "GNU.sparse.map" + paxGNUSparseName = "GNU.sparse.name" + paxGNUSparseMajor = "GNU.sparse.major" + paxGNUSparseMinor = "GNU.sparse.minor" + paxGNUSparseSize = "GNU.sparse.size" + paxGNUSparseRealSize = "GNU.sparse.realsize" +) + +// Keywords for old GNU sparse headers +const ( + oldGNUSparseMainHeaderOffset = 386 + oldGNUSparseMainHeaderIsExtendedOffset = 482 + oldGNUSparseMainHeaderNumEntries = 4 + oldGNUSparseExtendedHeaderIsExtendedOffset = 504 + oldGNUSparseExtendedHeaderNumEntries = 21 + oldGNUSparseOffsetSize = 12 + oldGNUSparseNumBytesSize = 12 +) + +// NewReader creates a new Reader reading from r. +func NewReader(r io.Reader) *Reader { return &Reader{r: r} } + +// Next advances to the next entry in the tar archive. +// +// io.EOF is returned at the end of the input. +func (tr *Reader) Next() (*Header, error) { + if tr.RawAccounting { + if tr.rawBytes == nil { + tr.rawBytes = bytes.NewBuffer(nil) + } else { + tr.rawBytes.Reset() + } + } + + if tr.err != nil { + return nil, tr.err + } + + var hdr *Header + var extHdrs map[string]string + + // Externally, Next iterates through the tar archive as if it is a series of + // files. Internally, the tar format often uses fake "files" to add meta + // data that describes the next file. These meta data "files" should not + // normally be visible to the outside. As such, this loop iterates through + // one or more "header files" until it finds a "normal file". +loop: + for { + tr.err = tr.skipUnread() + if tr.err != nil { + return nil, tr.err + } + + hdr = tr.readHeader() + if tr.err != nil { + return nil, tr.err + } + // Check for PAX/GNU special headers and files. + switch hdr.Typeflag { + case TypeXHeader: + extHdrs, tr.err = parsePAX(tr) + if tr.err != nil { + return nil, tr.err + } + continue loop // This is a meta header affecting the next header + case TypeGNULongName, TypeGNULongLink: + var realname []byte + realname, tr.err = ioutil.ReadAll(tr) + if tr.err != nil { + return nil, tr.err + } + + if tr.RawAccounting { + if _, tr.err = tr.rawBytes.Write(realname); tr.err != nil { + return nil, tr.err + } + } + + // Convert GNU extensions to use PAX headers. + if extHdrs == nil { + extHdrs = make(map[string]string) + } + var p parser + switch hdr.Typeflag { + case TypeGNULongName: + extHdrs[paxPath] = p.parseString(realname) + case TypeGNULongLink: + extHdrs[paxLinkpath] = p.parseString(realname) + } + if p.err != nil { + tr.err = p.err + return nil, tr.err + } + continue loop // This is a meta header affecting the next header + default: + mergePAX(hdr, extHdrs) + + // Check for a PAX format sparse file + sp, err := tr.checkForGNUSparsePAXHeaders(hdr, extHdrs) + if err != nil { + tr.err = err + return nil, err + } + if sp != nil { + // Current file is a PAX format GNU sparse file. + // Set the current file reader to a sparse file reader. + tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size) + if tr.err != nil { + return nil, tr.err + } + } + break loop // This is a file, so stop + } + } + return hdr, nil +} + +// checkForGNUSparsePAXHeaders checks the PAX headers for GNU sparse headers. If they are found, then +// this function reads the sparse map and returns it. Unknown sparse formats are ignored, causing the file to +// be treated as a regular file. +func (tr *Reader) checkForGNUSparsePAXHeaders(hdr *Header, headers map[string]string) ([]sparseEntry, error) { + var sparseFormat string + + // Check for sparse format indicators + major, majorOk := headers[paxGNUSparseMajor] + minor, minorOk := headers[paxGNUSparseMinor] + sparseName, sparseNameOk := headers[paxGNUSparseName] + _, sparseMapOk := headers[paxGNUSparseMap] + sparseSize, sparseSizeOk := headers[paxGNUSparseSize] + sparseRealSize, sparseRealSizeOk := headers[paxGNUSparseRealSize] + + // Identify which, if any, sparse format applies from which PAX headers are set + if majorOk && minorOk { + sparseFormat = major + "." + minor + } else if sparseNameOk && sparseMapOk { + sparseFormat = "0.1" + } else if sparseSizeOk { + sparseFormat = "0.0" + } else { + // Not a PAX format GNU sparse file. + return nil, nil + } + + // Check for unknown sparse format + if sparseFormat != "0.0" && sparseFormat != "0.1" && sparseFormat != "1.0" { + return nil, nil + } + + // Update hdr from GNU sparse PAX headers + if sparseNameOk { + hdr.Name = sparseName + } + if sparseSizeOk { + realSize, err := strconv.ParseInt(sparseSize, 10, 0) + if err != nil { + return nil, ErrHeader + } + hdr.Size = realSize + } else if sparseRealSizeOk { + realSize, err := strconv.ParseInt(sparseRealSize, 10, 0) + if err != nil { + return nil, ErrHeader + } + hdr.Size = realSize + } + + // Set up the sparse map, according to the particular sparse format in use + var sp []sparseEntry + var err error + switch sparseFormat { + case "0.0", "0.1": + sp, err = readGNUSparseMap0x1(headers) + case "1.0": + sp, err = readGNUSparseMap1x0(tr.curr) + } + return sp, err +} + +// mergePAX merges well known headers according to PAX standard. +// In general headers with the same name as those found +// in the header struct overwrite those found in the header +// struct with higher precision or longer values. Esp. useful +// for name and linkname fields. +func mergePAX(hdr *Header, headers map[string]string) error { + for k, v := range headers { + switch k { + case paxPath: + hdr.Name = v + case paxLinkpath: + hdr.Linkname = v + case paxGname: + hdr.Gname = v + case paxUname: + hdr.Uname = v + case paxUid: + uid, err := strconv.ParseInt(v, 10, 0) + if err != nil { + return err + } + hdr.Uid = int(uid) + case paxGid: + gid, err := strconv.ParseInt(v, 10, 0) + if err != nil { + return err + } + hdr.Gid = int(gid) + case paxAtime: + t, err := parsePAXTime(v) + if err != nil { + return err + } + hdr.AccessTime = t + case paxMtime: + t, err := parsePAXTime(v) + if err != nil { + return err + } + hdr.ModTime = t + case paxCtime: + t, err := parsePAXTime(v) + if err != nil { + return err + } + hdr.ChangeTime = t + case paxSize: + size, err := strconv.ParseInt(v, 10, 0) + if err != nil { + return err + } + hdr.Size = int64(size) + default: + if strings.HasPrefix(k, paxXattr) { + if hdr.Xattrs == nil { + hdr.Xattrs = make(map[string]string) + } + hdr.Xattrs[k[len(paxXattr):]] = v + } + } + } + return nil +} + +// parsePAXTime takes a string of the form %d.%d as described in +// the PAX specification. +func parsePAXTime(t string) (time.Time, error) { + buf := []byte(t) + pos := bytes.IndexByte(buf, '.') + var seconds, nanoseconds int64 + var err error + if pos == -1 { + seconds, err = strconv.ParseInt(t, 10, 0) + if err != nil { + return time.Time{}, err + } + } else { + seconds, err = strconv.ParseInt(string(buf[:pos]), 10, 0) + if err != nil { + return time.Time{}, err + } + nano_buf := string(buf[pos+1:]) + // Pad as needed before converting to a decimal. + // For example .030 -> .030000000 -> 30000000 nanoseconds + if len(nano_buf) < maxNanoSecondIntSize { + // Right pad + nano_buf += strings.Repeat("0", maxNanoSecondIntSize-len(nano_buf)) + } else if len(nano_buf) > maxNanoSecondIntSize { + // Right truncate + nano_buf = nano_buf[:maxNanoSecondIntSize] + } + nanoseconds, err = strconv.ParseInt(string(nano_buf), 10, 0) + if err != nil { + return time.Time{}, err + } + } + ts := time.Unix(seconds, nanoseconds) + return ts, nil +} + +// parsePAX parses PAX headers. +// If an extended header (type 'x') is invalid, ErrHeader is returned +func parsePAX(r io.Reader) (map[string]string, error) { + buf, err := ioutil.ReadAll(r) + if err != nil { + return nil, err + } + // leaving this function for io.Reader makes it more testable + if tr, ok := r.(*Reader); ok && tr.RawAccounting { + if _, err = tr.rawBytes.Write(buf); err != nil { + return nil, err + } + } + sbuf := string(buf) + + // For GNU PAX sparse format 0.0 support. + // This function transforms the sparse format 0.0 headers into sparse format 0.1 headers. + var sparseMap bytes.Buffer + + headers := make(map[string]string) + // Each record is constructed as + // "%d %s=%s\n", length, keyword, value + for len(sbuf) > 0 { + key, value, residual, err := parsePAXRecord(sbuf) + if err != nil { + return nil, ErrHeader + } + sbuf = residual + + keyStr := string(key) + if keyStr == paxGNUSparseOffset || keyStr == paxGNUSparseNumBytes { + // GNU sparse format 0.0 special key. Write to sparseMap instead of using the headers map. + sparseMap.WriteString(value) + sparseMap.Write([]byte{','}) + } else { + // Normal key. Set the value in the headers map. + headers[keyStr] = string(value) + } + } + if sparseMap.Len() != 0 { + // Add sparse info to headers, chopping off the extra comma + sparseMap.Truncate(sparseMap.Len() - 1) + headers[paxGNUSparseMap] = sparseMap.String() + } + return headers, nil +} + +// parsePAXRecord parses the input PAX record string into a key-value pair. +// If parsing is successful, it will slice off the currently read record and +// return the remainder as r. +// +// A PAX record is of the following form: +// "%d %s=%s\n" % (size, key, value) +func parsePAXRecord(s string) (k, v, r string, err error) { + // The size field ends at the first space. + sp := strings.IndexByte(s, ' ') + if sp == -1 { + return "", "", s, ErrHeader + } + + // Parse the first token as a decimal integer. + n, perr := strconv.ParseInt(s[:sp], 10, 0) // Intentionally parse as native int + if perr != nil || n < 5 || int64(len(s)) < n { + return "", "", s, ErrHeader + } + + // Extract everything between the space and the final newline. + rec, nl, rem := s[sp+1:n-1], s[n-1:n], s[n:] + if nl != "\n" { + return "", "", s, ErrHeader + } + + // The first equals separates the key from the value. + eq := strings.IndexByte(rec, '=') + if eq == -1 { + return "", "", s, ErrHeader + } + return rec[:eq], rec[eq+1:], rem, nil +} + +// parseString parses bytes as a NUL-terminated C-style string. +// If a NUL byte is not found then the whole slice is returned as a string. +func (*parser) parseString(b []byte) string { + n := 0 + for n < len(b) && b[n] != 0 { + n++ + } + return string(b[0:n]) +} + +// parseNumeric parses the input as being encoded in either base-256 or octal. +// This function may return negative numbers. +// If parsing fails or an integer overflow occurs, err will be set. +func (p *parser) parseNumeric(b []byte) int64 { + // Check for base-256 (binary) format first. + // If the first bit is set, then all following bits constitute a two's + // complement encoded number in big-endian byte order. + if len(b) > 0 && b[0]&0x80 != 0 { + // Handling negative numbers relies on the following identity: + // -a-1 == ^a + // + // If the number is negative, we use an inversion mask to invert the + // data bytes and treat the value as an unsigned number. + var inv byte // 0x00 if positive or zero, 0xff if negative + if b[0]&0x40 != 0 { + inv = 0xff + } + + var x uint64 + for i, c := range b { + c ^= inv // Inverts c only if inv is 0xff, otherwise does nothing + if i == 0 { + c &= 0x7f // Ignore signal bit in first byte + } + if (x >> 56) > 0 { + p.err = ErrHeader // Integer overflow + return 0 + } + x = x<<8 | uint64(c) + } + if (x >> 63) > 0 { + p.err = ErrHeader // Integer overflow + return 0 + } + if inv == 0xff { + return ^int64(x) + } + return int64(x) + } + + // Normal case is base-8 (octal) format. + return p.parseOctal(b) +} + +func (p *parser) parseOctal(b []byte) int64 { + // Because unused fields are filled with NULs, we need + // to skip leading NULs. Fields may also be padded with + // spaces or NULs. + // So we remove leading and trailing NULs and spaces to + // be sure. + b = bytes.Trim(b, " \x00") + + if len(b) == 0 { + return 0 + } + x, perr := strconv.ParseUint(p.parseString(b), 8, 64) + if perr != nil { + p.err = ErrHeader + } + return int64(x) +} + +// skipUnread skips any unread bytes in the existing file entry, as well as any +// alignment padding. It returns io.ErrUnexpectedEOF if any io.EOF is +// encountered in the data portion; it is okay to hit io.EOF in the padding. +// +// Note that this function still works properly even when sparse files are being +// used since numBytes returns the bytes remaining in the underlying io.Reader. +func (tr *Reader) skipUnread() error { + dataSkip := tr.numBytes() // Number of data bytes to skip + totalSkip := dataSkip + tr.pad // Total number of bytes to skip + tr.curr, tr.pad = nil, 0 + if tr.RawAccounting { + _, tr.err = io.CopyN(tr.rawBytes, tr.r, totalSkip) + return tr.err + } + // If possible, Seek to the last byte before the end of the data section. + // Do this because Seek is often lazy about reporting errors; this will mask + // the fact that the tar stream may be truncated. We can rely on the + // io.CopyN done shortly afterwards to trigger any IO errors. + var seekSkipped int64 // Number of bytes skipped via Seek + if sr, ok := tr.r.(io.Seeker); ok && dataSkip > 1 { + // Not all io.Seeker can actually Seek. For example, os.Stdin implements + // io.Seeker, but calling Seek always returns an error and performs + // no action. Thus, we try an innocent seek to the current position + // to see if Seek is really supported. + pos1, err := sr.Seek(0, os.SEEK_CUR) + if err == nil { + // Seek seems supported, so perform the real Seek. + pos2, err := sr.Seek(dataSkip-1, os.SEEK_CUR) + if err != nil { + tr.err = err + return tr.err + } + seekSkipped = pos2 - pos1 + } + } + + var copySkipped int64 // Number of bytes skipped via CopyN + copySkipped, tr.err = io.CopyN(ioutil.Discard, tr.r, totalSkip-seekSkipped) + if tr.err == io.EOF && seekSkipped+copySkipped < dataSkip { + tr.err = io.ErrUnexpectedEOF + } + return tr.err +} + +func (tr *Reader) verifyChecksum(header []byte) bool { + if tr.err != nil { + return false + } + + var p parser + given := p.parseOctal(header[148:156]) + unsigned, signed := checksum(header) + return p.err == nil && (given == unsigned || given == signed) +} + +// readHeader reads the next block header and assumes that the underlying reader +// is already aligned to a block boundary. +// +// The err will be set to io.EOF only when one of the following occurs: +// * Exactly 0 bytes are read and EOF is hit. +// * Exactly 1 block of zeros is read and EOF is hit. +// * At least 2 blocks of zeros are read. +func (tr *Reader) readHeader() *Header { + header := tr.hdrBuff[:] + copy(header, zeroBlock) + + if n, err := io.ReadFull(tr.r, header); err != nil { + tr.err = err + // because it could read some of the block, but reach EOF first + if tr.err == io.EOF && tr.RawAccounting { + if _, err := tr.rawBytes.Write(header[:n]); err != nil { + tr.err = err + } + } + return nil // io.EOF is okay here + } + if tr.RawAccounting { + if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { + return nil + } + } + + // Two blocks of zero bytes marks the end of the archive. + if bytes.Equal(header, zeroBlock[0:blockSize]) { + if n, err := io.ReadFull(tr.r, header); err != nil { + tr.err = err + // because it could read some of the block, but reach EOF first + if tr.err == io.EOF && tr.RawAccounting { + if _, err := tr.rawBytes.Write(header[:n]); err != nil { + tr.err = err + } + } + return nil // io.EOF is okay here + } + if tr.RawAccounting { + if _, tr.err = tr.rawBytes.Write(header); tr.err != nil { + return nil + } + } + if bytes.Equal(header, zeroBlock[0:blockSize]) { + tr.err = io.EOF + } else { + tr.err = ErrHeader // zero block and then non-zero block + } + return nil + } + + if !tr.verifyChecksum(header) { + tr.err = ErrHeader + return nil + } + + // Unpack + var p parser + hdr := new(Header) + s := slicer(header) + + hdr.Name = p.parseString(s.next(100)) + hdr.Mode = p.parseNumeric(s.next(8)) + hdr.Uid = int(p.parseNumeric(s.next(8))) + hdr.Gid = int(p.parseNumeric(s.next(8))) + hdr.Size = p.parseNumeric(s.next(12)) + hdr.ModTime = time.Unix(p.parseNumeric(s.next(12)), 0) + s.next(8) // chksum + hdr.Typeflag = s.next(1)[0] + hdr.Linkname = p.parseString(s.next(100)) + + // The remainder of the header depends on the value of magic. + // The original (v7) version of tar had no explicit magic field, + // so its magic bytes, like the rest of the block, are NULs. + magic := string(s.next(8)) // contains version field as well. + var format string + switch { + case magic[:6] == "ustar\x00": // POSIX tar (1003.1-1988) + if string(header[508:512]) == "tar\x00" { + format = "star" + } else { + format = "posix" + } + case magic == "ustar \x00": // old GNU tar + format = "gnu" + } + + switch format { + case "posix", "gnu", "star": + hdr.Uname = p.parseString(s.next(32)) + hdr.Gname = p.parseString(s.next(32)) + devmajor := s.next(8) + devminor := s.next(8) + if hdr.Typeflag == TypeChar || hdr.Typeflag == TypeBlock { + hdr.Devmajor = p.parseNumeric(devmajor) + hdr.Devminor = p.parseNumeric(devminor) + } + var prefix string + switch format { + case "posix", "gnu": + prefix = p.parseString(s.next(155)) + case "star": + prefix = p.parseString(s.next(131)) + hdr.AccessTime = time.Unix(p.parseNumeric(s.next(12)), 0) + hdr.ChangeTime = time.Unix(p.parseNumeric(s.next(12)), 0) + } + if len(prefix) > 0 { + hdr.Name = prefix + "/" + hdr.Name + } + } + + if p.err != nil { + tr.err = p.err + return nil + } + + nb := hdr.Size + if isHeaderOnlyType(hdr.Typeflag) { + nb = 0 + } + if nb < 0 { + tr.err = ErrHeader + return nil + } + + // Set the current file reader. + tr.pad = -nb & (blockSize - 1) // blockSize is a power of two + tr.curr = ®FileReader{r: tr.r, nb: nb} + + // Check for old GNU sparse format entry. + if hdr.Typeflag == TypeGNUSparse { + // Get the real size of the file. + hdr.Size = p.parseNumeric(header[483:495]) + if p.err != nil { + tr.err = p.err + return nil + } + + // Read the sparse map. + sp := tr.readOldGNUSparseMap(header) + if tr.err != nil { + return nil + } + + // Current file is a GNU sparse file. Update the current file reader. + tr.curr, tr.err = newSparseFileReader(tr.curr, sp, hdr.Size) + if tr.err != nil { + return nil + } + } + + return hdr +} + +// readOldGNUSparseMap reads the sparse map as stored in the old GNU sparse format. +// The sparse map is stored in the tar header if it's small enough. If it's larger than four entries, +// then one or more extension headers are used to store the rest of the sparse map. +func (tr *Reader) readOldGNUSparseMap(header []byte) []sparseEntry { + var p parser + isExtended := header[oldGNUSparseMainHeaderIsExtendedOffset] != 0 + spCap := oldGNUSparseMainHeaderNumEntries + if isExtended { + spCap += oldGNUSparseExtendedHeaderNumEntries + } + sp := make([]sparseEntry, 0, spCap) + s := slicer(header[oldGNUSparseMainHeaderOffset:]) + + // Read the four entries from the main tar header + for i := 0; i < oldGNUSparseMainHeaderNumEntries; i++ { + offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize)) + numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize)) + if p.err != nil { + tr.err = p.err + return nil + } + if offset == 0 && numBytes == 0 { + break + } + sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) + } + + for isExtended { + // There are more entries. Read an extension header and parse its entries. + sparseHeader := make([]byte, blockSize) + if _, tr.err = io.ReadFull(tr.r, sparseHeader); tr.err != nil { + return nil + } + if tr.RawAccounting { + if _, tr.err = tr.rawBytes.Write(sparseHeader); tr.err != nil { + return nil + } + } + + isExtended = sparseHeader[oldGNUSparseExtendedHeaderIsExtendedOffset] != 0 + s = slicer(sparseHeader) + for i := 0; i < oldGNUSparseExtendedHeaderNumEntries; i++ { + offset := p.parseNumeric(s.next(oldGNUSparseOffsetSize)) + numBytes := p.parseNumeric(s.next(oldGNUSparseNumBytesSize)) + if p.err != nil { + tr.err = p.err + return nil + } + if offset == 0 && numBytes == 0 { + break + } + sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) + } + } + return sp +} + +// readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format +// version 1.0. The format of the sparse map consists of a series of +// newline-terminated numeric fields. The first field is the number of entries +// and is always present. Following this are the entries, consisting of two +// fields (offset, numBytes). This function must stop reading at the end +// boundary of the block containing the last newline. +// +// Note that the GNU manual says that numeric values should be encoded in octal +// format. However, the GNU tar utility itself outputs these values in decimal. +// As such, this library treats values as being encoded in decimal. +func readGNUSparseMap1x0(r io.Reader) ([]sparseEntry, error) { + var cntNewline int64 + var buf bytes.Buffer + var blk = make([]byte, blockSize) + + // feedTokens copies data in numBlock chunks from r into buf until there are + // at least cnt newlines in buf. It will not read more blocks than needed. + var feedTokens = func(cnt int64) error { + for cntNewline < cnt { + if _, err := io.ReadFull(r, blk); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return err + } + buf.Write(blk) + for _, c := range blk { + if c == '\n' { + cntNewline++ + } + } + } + return nil + } + + // nextToken gets the next token delimited by a newline. This assumes that + // at least one newline exists in the buffer. + var nextToken = func() string { + cntNewline-- + tok, _ := buf.ReadString('\n') + return tok[:len(tok)-1] // Cut off newline + } + + // Parse for the number of entries. + // Use integer overflow resistant math to check this. + if err := feedTokens(1); err != nil { + return nil, err + } + numEntries, err := strconv.ParseInt(nextToken(), 10, 0) // Intentionally parse as native int + if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) { + return nil, ErrHeader + } + + // Parse for all member entries. + // numEntries is trusted after this since a potential attacker must have + // committed resources proportional to what this library used. + if err := feedTokens(2 * numEntries); err != nil { + return nil, err + } + sp := make([]sparseEntry, 0, numEntries) + for i := int64(0); i < numEntries; i++ { + offset, err := strconv.ParseInt(nextToken(), 10, 64) + if err != nil { + return nil, ErrHeader + } + numBytes, err := strconv.ParseInt(nextToken(), 10, 64) + if err != nil { + return nil, ErrHeader + } + sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) + } + return sp, nil +} + +// readGNUSparseMap0x1 reads the sparse map as stored in GNU's PAX sparse format +// version 0.1. The sparse map is stored in the PAX headers. +func readGNUSparseMap0x1(extHdrs map[string]string) ([]sparseEntry, error) { + // Get number of entries. + // Use integer overflow resistant math to check this. + numEntriesStr := extHdrs[paxGNUSparseNumBlocks] + numEntries, err := strconv.ParseInt(numEntriesStr, 10, 0) // Intentionally parse as native int + if err != nil || numEntries < 0 || int(2*numEntries) < int(numEntries) { + return nil, ErrHeader + } + + // There should be two numbers in sparseMap for each entry. + sparseMap := strings.Split(extHdrs[paxGNUSparseMap], ",") + if int64(len(sparseMap)) != 2*numEntries { + return nil, ErrHeader + } + + // Loop through the entries in the sparse map. + // numEntries is trusted now. + sp := make([]sparseEntry, 0, numEntries) + for i := int64(0); i < numEntries; i++ { + offset, err := strconv.ParseInt(sparseMap[2*i], 10, 64) + if err != nil { + return nil, ErrHeader + } + numBytes, err := strconv.ParseInt(sparseMap[2*i+1], 10, 64) + if err != nil { + return nil, ErrHeader + } + sp = append(sp, sparseEntry{offset: offset, numBytes: numBytes}) + } + return sp, nil +} + +// numBytes returns the number of bytes left to read in the current file's entry +// in the tar archive, or 0 if there is no current file. +func (tr *Reader) numBytes() int64 { + if tr.curr == nil { + // No current file, so no bytes + return 0 + } + return tr.curr.numBytes() +} + +// Read reads from the current entry in the tar archive. +// It returns 0, io.EOF when it reaches the end of that entry, +// until Next is called to advance to the next entry. +// +// Calling Read on special types like TypeLink, TypeSymLink, TypeChar, +// TypeBlock, TypeDir, and TypeFifo returns 0, io.EOF regardless of what +// the Header.Size claims. +func (tr *Reader) Read(b []byte) (n int, err error) { + if tr.err != nil { + return 0, tr.err + } + if tr.curr == nil { + return 0, io.EOF + } + + n, err = tr.curr.Read(b) + if err != nil && err != io.EOF { + tr.err = err + } + return +} + +func (rfr *regFileReader) Read(b []byte) (n int, err error) { + if rfr.nb == 0 { + // file consumed + return 0, io.EOF + } + if int64(len(b)) > rfr.nb { + b = b[0:rfr.nb] + } + n, err = rfr.r.Read(b) + rfr.nb -= int64(n) + + if err == io.EOF && rfr.nb > 0 { + err = io.ErrUnexpectedEOF + } + return +} + +// numBytes returns the number of bytes left to read in the file's data in the tar archive. +func (rfr *regFileReader) numBytes() int64 { + return rfr.nb +} + +// newSparseFileReader creates a new sparseFileReader, but validates all of the +// sparse entries before doing so. +func newSparseFileReader(rfr numBytesReader, sp []sparseEntry, total int64) (*sparseFileReader, error) { + if total < 0 { + return nil, ErrHeader // Total size cannot be negative + } + + // Validate all sparse entries. These are the same checks as performed by + // the BSD tar utility. + for i, s := range sp { + switch { + case s.offset < 0 || s.numBytes < 0: + return nil, ErrHeader // Negative values are never okay + case s.offset > math.MaxInt64-s.numBytes: + return nil, ErrHeader // Integer overflow with large length + case s.offset+s.numBytes > total: + return nil, ErrHeader // Region extends beyond the "real" size + case i > 0 && sp[i-1].offset+sp[i-1].numBytes > s.offset: + return nil, ErrHeader // Regions can't overlap and must be in order + } + } + return &sparseFileReader{rfr: rfr, sp: sp, total: total}, nil +} + +// readHole reads a sparse hole ending at endOffset. +func (sfr *sparseFileReader) readHole(b []byte, endOffset int64) int { + n64 := endOffset - sfr.pos + if n64 > int64(len(b)) { + n64 = int64(len(b)) + } + n := int(n64) + for i := 0; i < n; i++ { + b[i] = 0 + } + sfr.pos += n64 + return n +} + +// Read reads the sparse file data in expanded form. +func (sfr *sparseFileReader) Read(b []byte) (n int, err error) { + // Skip past all empty fragments. + for len(sfr.sp) > 0 && sfr.sp[0].numBytes == 0 { + sfr.sp = sfr.sp[1:] + } + + // If there are no more fragments, then it is possible that there + // is one last sparse hole. + if len(sfr.sp) == 0 { + // This behavior matches the BSD tar utility. + // However, GNU tar stops returning data even if sfr.total is unmet. + if sfr.pos < sfr.total { + return sfr.readHole(b, sfr.total), nil + } + return 0, io.EOF + } + + // In front of a data fragment, so read a hole. + if sfr.pos < sfr.sp[0].offset { + return sfr.readHole(b, sfr.sp[0].offset), nil + } + + // In a data fragment, so read from it. + // This math is overflow free since we verify that offset and numBytes can + // be safely added when creating the sparseFileReader. + endPos := sfr.sp[0].offset + sfr.sp[0].numBytes // End offset of fragment + bytesLeft := endPos - sfr.pos // Bytes left in fragment + if int64(len(b)) > bytesLeft { + b = b[:bytesLeft] + } + + n, err = sfr.rfr.Read(b) + sfr.pos += int64(n) + if err == io.EOF { + if sfr.pos < endPos { + err = io.ErrUnexpectedEOF // There was supposed to be more data + } else if sfr.pos < sfr.total { + err = nil // There is still an implicit sparse hole at the end + } + } + + if sfr.pos == endPos { + sfr.sp = sfr.sp[1:] // We are done with this fragment, so pop it + } + return n, err +} + +// numBytes returns the number of bytes left to read in the sparse file's +// sparse-encoded data in the tar archive. +func (sfr *sparseFileReader) numBytes() int64 { + return sfr.rfr.numBytes() +} diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/stat_atim.go b/vendor/github.com/vbatts/tar-split/archive/tar/stat_atim.go new file mode 100644 index 000000000..cf9cc79c5 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/stat_atim.go @@ -0,0 +1,20 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux dragonfly openbsd solaris + +package tar + +import ( + "syscall" + "time" +) + +func statAtime(st *syscall.Stat_t) time.Time { + return time.Unix(st.Atim.Unix()) +} + +func statCtime(st *syscall.Stat_t) time.Time { + return time.Unix(st.Ctim.Unix()) +} diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/stat_atimespec.go b/vendor/github.com/vbatts/tar-split/archive/tar/stat_atimespec.go new file mode 100644 index 000000000..6f17dbe30 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/stat_atimespec.go @@ -0,0 +1,20 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin freebsd netbsd + +package tar + +import ( + "syscall" + "time" +) + +func statAtime(st *syscall.Stat_t) time.Time { + return time.Unix(st.Atimespec.Unix()) +} + +func statCtime(st *syscall.Stat_t) time.Time { + return time.Unix(st.Ctimespec.Unix()) +} diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/stat_unix.go b/vendor/github.com/vbatts/tar-split/archive/tar/stat_unix.go new file mode 100644 index 000000000..cb843db4c --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/stat_unix.go @@ -0,0 +1,32 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux darwin dragonfly freebsd openbsd netbsd solaris + +package tar + +import ( + "os" + "syscall" +) + +func init() { + sysStat = statUnix +} + +func statUnix(fi os.FileInfo, h *Header) error { + sys, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return nil + } + h.Uid = int(sys.Uid) + h.Gid = int(sys.Gid) + // TODO(bradfitz): populate username & group. os/user + // doesn't cache LookupId lookups, and lacks group + // lookup functions. + h.AccessTime = statAtime(sys) + h.ChangeTime = statCtime(sys) + // TODO(bradfitz): major/minor device numbers? + return nil +} diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/writer.go b/vendor/github.com/vbatts/tar-split/archive/tar/writer.go new file mode 100644 index 000000000..042638175 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/archive/tar/writer.go @@ -0,0 +1,416 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tar + +// TODO(dsymonds): +// - catch more errors (no first header, etc.) + +import ( + "bytes" + "errors" + "fmt" + "io" + "path" + "sort" + "strconv" + "strings" + "time" +) + +var ( + ErrWriteTooLong = errors.New("archive/tar: write too long") + ErrFieldTooLong = errors.New("archive/tar: header field too long") + ErrWriteAfterClose = errors.New("archive/tar: write after close") + errInvalidHeader = errors.New("archive/tar: header field too long or contains invalid values") +) + +// A Writer provides sequential writing of a tar archive in POSIX.1 format. +// A tar archive consists of a sequence of files. +// Call WriteHeader to begin a new file, and then call Write to supply that file's data, +// writing at most hdr.Size bytes in total. +type Writer struct { + w io.Writer + err error + nb int64 // number of unwritten bytes for current file entry + pad int64 // amount of padding to write after current file entry + closed bool + usedBinary bool // whether the binary numeric field extension was used + preferPax bool // use pax header instead of binary numeric header + hdrBuff [blockSize]byte // buffer to use in writeHeader when writing a regular header + paxHdrBuff [blockSize]byte // buffer to use in writeHeader when writing a pax header +} + +type formatter struct { + err error // Last error seen +} + +// NewWriter creates a new Writer writing to w. +func NewWriter(w io.Writer) *Writer { return &Writer{w: w} } + +// Flush finishes writing the current file (optional). +func (tw *Writer) Flush() error { + if tw.nb > 0 { + tw.err = fmt.Errorf("archive/tar: missed writing %d bytes", tw.nb) + return tw.err + } + + n := tw.nb + tw.pad + for n > 0 && tw.err == nil { + nr := n + if nr > blockSize { + nr = blockSize + } + var nw int + nw, tw.err = tw.w.Write(zeroBlock[0:nr]) + n -= int64(nw) + } + tw.nb = 0 + tw.pad = 0 + return tw.err +} + +// Write s into b, terminating it with a NUL if there is room. +func (f *formatter) formatString(b []byte, s string) { + if len(s) > len(b) { + f.err = ErrFieldTooLong + return + } + ascii := toASCII(s) + copy(b, ascii) + if len(ascii) < len(b) { + b[len(ascii)] = 0 + } +} + +// Encode x as an octal ASCII string and write it into b with leading zeros. +func (f *formatter) formatOctal(b []byte, x int64) { + s := strconv.FormatInt(x, 8) + // leading zeros, but leave room for a NUL. + for len(s)+1 < len(b) { + s = "0" + s + } + f.formatString(b, s) +} + +// fitsInBase256 reports whether x can be encoded into n bytes using base-256 +// encoding. Unlike octal encoding, base-256 encoding does not require that the +// string ends with a NUL character. Thus, all n bytes are available for output. +// +// If operating in binary mode, this assumes strict GNU binary mode; which means +// that the first byte can only be either 0x80 or 0xff. Thus, the first byte is +// equivalent to the sign bit in two's complement form. +func fitsInBase256(n int, x int64) bool { + var binBits = uint(n-1) * 8 + return n >= 9 || (x >= -1<= 0; i-- { + b[i] = byte(x) + x >>= 8 + } + b[0] |= 0x80 // Highest bit indicates binary format + return + } + + f.formatOctal(b, 0) // Last resort, just write zero + f.err = ErrFieldTooLong +} + +var ( + minTime = time.Unix(0, 0) + // There is room for 11 octal digits (33 bits) of mtime. + maxTime = minTime.Add((1<<33 - 1) * time.Second) +) + +// WriteHeader writes hdr and prepares to accept the file's contents. +// WriteHeader calls Flush if it is not the first header. +// Calling after a Close will return ErrWriteAfterClose. +func (tw *Writer) WriteHeader(hdr *Header) error { + return tw.writeHeader(hdr, true) +} + +// WriteHeader writes hdr and prepares to accept the file's contents. +// WriteHeader calls Flush if it is not the first header. +// Calling after a Close will return ErrWriteAfterClose. +// As this method is called internally by writePax header to allow it to +// suppress writing the pax header. +func (tw *Writer) writeHeader(hdr *Header, allowPax bool) error { + if tw.closed { + return ErrWriteAfterClose + } + if tw.err == nil { + tw.Flush() + } + if tw.err != nil { + return tw.err + } + + // a map to hold pax header records, if any are needed + paxHeaders := make(map[string]string) + + // TODO(shanemhansen): we might want to use PAX headers for + // subsecond time resolution, but for now let's just capture + // too long fields or non ascii characters + + var f formatter + var header []byte + + // We need to select which scratch buffer to use carefully, + // since this method is called recursively to write PAX headers. + // If allowPax is true, this is the non-recursive call, and we will use hdrBuff. + // If allowPax is false, we are being called by writePAXHeader, and hdrBuff is + // already being used by the non-recursive call, so we must use paxHdrBuff. + header = tw.hdrBuff[:] + if !allowPax { + header = tw.paxHdrBuff[:] + } + copy(header, zeroBlock) + s := slicer(header) + + // Wrappers around formatter that automatically sets paxHeaders if the + // argument extends beyond the capacity of the input byte slice. + var formatString = func(b []byte, s string, paxKeyword string) { + needsPaxHeader := paxKeyword != paxNone && len(s) > len(b) || !isASCII(s) + if needsPaxHeader { + paxHeaders[paxKeyword] = s + return + } + f.formatString(b, s) + } + var formatNumeric = func(b []byte, x int64, paxKeyword string) { + // Try octal first. + s := strconv.FormatInt(x, 8) + if len(s) < len(b) { + f.formatOctal(b, x) + return + } + + // If it is too long for octal, and PAX is preferred, use a PAX header. + if paxKeyword != paxNone && tw.preferPax { + f.formatOctal(b, 0) + s := strconv.FormatInt(x, 10) + paxHeaders[paxKeyword] = s + return + } + + tw.usedBinary = true + f.formatNumeric(b, x) + } + + // keep a reference to the filename to allow to overwrite it later if we detect that we can use ustar longnames instead of pax + pathHeaderBytes := s.next(fileNameSize) + + formatString(pathHeaderBytes, hdr.Name, paxPath) + + // Handle out of range ModTime carefully. + var modTime int64 + if !hdr.ModTime.Before(minTime) && !hdr.ModTime.After(maxTime) { + modTime = hdr.ModTime.Unix() + } + + f.formatOctal(s.next(8), hdr.Mode) // 100:108 + formatNumeric(s.next(8), int64(hdr.Uid), paxUid) // 108:116 + formatNumeric(s.next(8), int64(hdr.Gid), paxGid) // 116:124 + formatNumeric(s.next(12), hdr.Size, paxSize) // 124:136 + formatNumeric(s.next(12), modTime, paxNone) // 136:148 --- consider using pax for finer granularity + s.next(8) // chksum (148:156) + s.next(1)[0] = hdr.Typeflag // 156:157 + + formatString(s.next(100), hdr.Linkname, paxLinkpath) + + copy(s.next(8), []byte("ustar\x0000")) // 257:265 + formatString(s.next(32), hdr.Uname, paxUname) // 265:297 + formatString(s.next(32), hdr.Gname, paxGname) // 297:329 + formatNumeric(s.next(8), hdr.Devmajor, paxNone) // 329:337 + formatNumeric(s.next(8), hdr.Devminor, paxNone) // 337:345 + + // keep a reference to the prefix to allow to overwrite it later if we detect that we can use ustar longnames instead of pax + prefixHeaderBytes := s.next(155) + formatString(prefixHeaderBytes, "", paxNone) // 345:500 prefix + + // Use the GNU magic instead of POSIX magic if we used any GNU extensions. + if tw.usedBinary { + copy(header[257:265], []byte("ustar \x00")) + } + + _, paxPathUsed := paxHeaders[paxPath] + // try to use a ustar header when only the name is too long + if !tw.preferPax && len(paxHeaders) == 1 && paxPathUsed { + prefix, suffix, ok := splitUSTARPath(hdr.Name) + if ok { + // Since we can encode in USTAR format, disable PAX header. + delete(paxHeaders, paxPath) + + // Update the path fields + formatString(pathHeaderBytes, suffix, paxNone) + formatString(prefixHeaderBytes, prefix, paxNone) + } + } + + // The chksum field is terminated by a NUL and a space. + // This is different from the other octal fields. + chksum, _ := checksum(header) + f.formatOctal(header[148:155], chksum) // Never fails + header[155] = ' ' + + // Check if there were any formatting errors. + if f.err != nil { + tw.err = f.err + return tw.err + } + + if allowPax { + for k, v := range hdr.Xattrs { + paxHeaders[paxXattr+k] = v + } + } + + if len(paxHeaders) > 0 { + if !allowPax { + return errInvalidHeader + } + if err := tw.writePAXHeader(hdr, paxHeaders); err != nil { + return err + } + } + tw.nb = int64(hdr.Size) + tw.pad = (blockSize - (tw.nb % blockSize)) % blockSize + + _, tw.err = tw.w.Write(header) + return tw.err +} + +// splitUSTARPath splits a path according to USTAR prefix and suffix rules. +// If the path is not splittable, then it will return ("", "", false). +func splitUSTARPath(name string) (prefix, suffix string, ok bool) { + length := len(name) + if length <= fileNameSize || !isASCII(name) { + return "", "", false + } else if length > fileNamePrefixSize+1 { + length = fileNamePrefixSize + 1 + } else if name[length-1] == '/' { + length-- + } + + i := strings.LastIndex(name[:length], "/") + nlen := len(name) - i - 1 // nlen is length of suffix + plen := i // plen is length of prefix + if i <= 0 || nlen > fileNameSize || nlen == 0 || plen > fileNamePrefixSize { + return "", "", false + } + return name[:i], name[i+1:], true +} + +// writePaxHeader writes an extended pax header to the +// archive. +func (tw *Writer) writePAXHeader(hdr *Header, paxHeaders map[string]string) error { + // Prepare extended header + ext := new(Header) + ext.Typeflag = TypeXHeader + // Setting ModTime is required for reader parsing to + // succeed, and seems harmless enough. + ext.ModTime = hdr.ModTime + // The spec asks that we namespace our pseudo files + // with the current pid. However, this results in differing outputs + // for identical inputs. As such, the constant 0 is now used instead. + // golang.org/issue/12358 + dir, file := path.Split(hdr.Name) + fullName := path.Join(dir, "PaxHeaders.0", file) + + ascii := toASCII(fullName) + if len(ascii) > 100 { + ascii = ascii[:100] + } + ext.Name = ascii + // Construct the body + var buf bytes.Buffer + + // Keys are sorted before writing to body to allow deterministic output. + var keys []string + for k := range paxHeaders { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprint(&buf, formatPAXRecord(k, paxHeaders[k])) + } + + ext.Size = int64(len(buf.Bytes())) + if err := tw.writeHeader(ext, false); err != nil { + return err + } + if _, err := tw.Write(buf.Bytes()); err != nil { + return err + } + if err := tw.Flush(); err != nil { + return err + } + return nil +} + +// formatPAXRecord formats a single PAX record, prefixing it with the +// appropriate length. +func formatPAXRecord(k, v string) string { + const padding = 3 // Extra padding for ' ', '=', and '\n' + size := len(k) + len(v) + padding + size += len(strconv.Itoa(size)) + record := fmt.Sprintf("%d %s=%s\n", size, k, v) + + // Final adjustment if adding size field increased the record size. + if len(record) != size { + size = len(record) + record = fmt.Sprintf("%d %s=%s\n", size, k, v) + } + return record +} + +// Write writes to the current entry in the tar archive. +// Write returns the error ErrWriteTooLong if more than +// hdr.Size bytes are written after WriteHeader. +func (tw *Writer) Write(b []byte) (n int, err error) { + if tw.closed { + err = ErrWriteAfterClose + return + } + overwrite := false + if int64(len(b)) > tw.nb { + b = b[0:tw.nb] + overwrite = true + } + n, err = tw.w.Write(b) + tw.nb -= int64(n) + if err == nil && overwrite { + err = ErrWriteTooLong + return + } + tw.err = err + return +} + +// Close closes the tar archive, flushing any unwritten +// data to the underlying writer. +func (tw *Writer) Close() error { + if tw.err != nil || tw.closed { + return tw.err + } + tw.Flush() + tw.closed = true + if tw.err != nil { + return tw.err + } + + // trailer: two zero blocks + for i := 0; i < 2; i++ { + _, tw.err = tw.w.Write(zeroBlock) + if tw.err != nil { + break + } + } + return tw.err +} diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/README.md b/vendor/github.com/vbatts/tar-split/tar/asm/README.md new file mode 100644 index 000000000..2a3a5b56a --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/README.md @@ -0,0 +1,44 @@ +asm +=== + +This library for assembly and disassembly of tar archives, facilitated by +`github.com/vbatts/tar-split/tar/storage`. + + +Concerns +-------- + +For completely safe assembly/disassembly, there will need to be a Content +Addressable Storage (CAS) directory, that maps to a checksum in the +`storage.Entity` of `storage.FileType`. + +This is due to the fact that tar archives _can_ allow multiple records for the +same path, but the last one effectively wins. Even if the prior records had a +different payload. + +In this way, when assembling an archive from relative paths, if the archive has +multiple entries for the same path, then all payloads read in from a relative +path would be identical. + + +Thoughts +-------- + +Have a look-aside directory or storage. This way when a clobbering record is +encountered from the tar stream, then the payload of the prior/existing file is +stored to the CAS. This way the clobbering record's file payload can be +extracted, but we'll have preserved the payload needed to reassemble a precise +tar archive. + +clobbered/path/to/file.[0-N] + +*alternatively* + +We could just _not_ support tar streams that have clobbering file paths. +Appending records to the archive is not incredibly common, and doesn't happen +by default for most implementations. Not supporting them wouldn't be a +security concern either, as if it did occur, we would reassemble an archive +that doesn't validate signature/checksum, so it shouldn't be trusted anyway. + +Otherwise, this will allow us to defer support for appended files as a FUTURE FEATURE. + diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go new file mode 100644 index 000000000..d624450ab --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go @@ -0,0 +1,130 @@ +package asm + +import ( + "bytes" + "fmt" + "hash" + "hash/crc64" + "io" + "sync" + + "github.com/vbatts/tar-split/tar/storage" +) + +// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive +// stream. +// +// It takes a storage.FileGetter, for mapping the file payloads that are to be read in, +// and a storage.Unpacker, which has access to the rawbytes and file order +// metadata. With the combination of these two items, a precise assembled Tar +// archive is possible. +func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadCloser { + // ... Since these are interfaces, this is possible, so let's not have a nil pointer + if fg == nil || up == nil { + return nil + } + pr, pw := io.Pipe() + go func() { + err := WriteOutputTarStream(fg, up, pw) + if err != nil { + pw.CloseWithError(err) + } else { + pw.Close() + } + }() + return pr +} + +// WriteOutputTarStream writes assembled tar archive to a writer. +func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error { + // ... Since these are interfaces, this is possible, so let's not have a nil pointer + if fg == nil || up == nil { + return nil + } + var copyBuffer []byte + var crcHash hash.Hash + var crcSum []byte + var multiWriter io.Writer + for { + entry, err := up.Next() + if err != nil { + if err == io.EOF { + return nil + } + return err + } + switch entry.Type { + case storage.SegmentType: + if _, err := w.Write(entry.Payload); err != nil { + return err + } + case storage.FileType: + if entry.Size == 0 { + continue + } + fh, err := fg.Get(entry.GetName()) + if err != nil { + return err + } + if crcHash == nil { + crcHash = crc64.New(storage.CRCTable) + crcSum = make([]byte, 8) + multiWriter = io.MultiWriter(w, crcHash) + copyBuffer = byteBufferPool.Get().([]byte) + defer byteBufferPool.Put(copyBuffer) + } else { + crcHash.Reset() + } + + if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil { + fh.Close() + return err + } + + if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) { + // I would rather this be a comparable ErrInvalidChecksum or such, + // but since it's coming through the PipeReader, the context of + // _which_ file would be lost... + fh.Close() + return fmt.Errorf("file integrity checksum failed for %q", entry.GetName()) + } + fh.Close() + } + } +} + +var byteBufferPool = &sync.Pool{ + New: func() interface{} { + return make([]byte, 32*1024) + }, +} + +// copyWithBuffer is taken from stdlib io.Copy implementation +// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367 +func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) { + for { + nr, er := src.Read(buf) + if nr > 0 { + nw, ew := dst.Write(buf[0:nr]) + if nw > 0 { + written += int64(nw) + } + if ew != nil { + err = ew + break + } + if nr != nw { + err = io.ErrShortWrite + break + } + } + if er == io.EOF { + break + } + if er != nil { + err = er + break + } + } + return written, err +} diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go new file mode 100644 index 000000000..54ef23aed --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go @@ -0,0 +1,141 @@ +package asm + +import ( + "io" + "io/ioutil" + + "github.com/vbatts/tar-split/archive/tar" + "github.com/vbatts/tar-split/tar/storage" +) + +// NewInputTarStream wraps the Reader stream of a tar archive and provides a +// Reader stream of the same. +// +// In the middle it will pack the segments and file metadata to storage.Packer +// `p`. +// +// The the storage.FilePutter is where payload of files in the stream are +// stashed. If this stashing is not needed, you can provide a nil +// storage.FilePutter. Since the checksumming is still needed, then a default +// of NewDiscardFilePutter will be used internally +func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) { + // What to do here... folks will want their own access to the Reader that is + // their tar archive stream, but we'll need that same stream to use our + // forked 'archive/tar'. + // Perhaps do an io.TeeReader that hands back an io.Reader for them to read + // from, and we'll MITM the stream to store metadata. + // We'll need a storage.FilePutter too ... + + // Another concern, whether to do any storage.FilePutter operations, such that we + // don't extract any amount of the archive. But then again, we're not making + // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter. + // Perhaps we have a DiscardFilePutter that is a bit bucket. + + // we'll return the pipe reader, since TeeReader does not buffer and will + // only read what the outputRdr Read's. Since Tar archives have padding on + // the end, we want to be the one reading the padding, even if the user's + // `archive/tar` doesn't care. + pR, pW := io.Pipe() + outputRdr := io.TeeReader(r, pW) + + // we need a putter that will generate the crc64 sums of file payloads + if fp == nil { + fp = storage.NewDiscardFilePutter() + } + + go func() { + tr := tar.NewReader(outputRdr) + tr.RawAccounting = true + for { + hdr, err := tr.Next() + if err != nil { + if err != io.EOF { + pW.CloseWithError(err) + return + } + // even when an EOF is reached, there is often 1024 null bytes on + // the end of an archive. Collect them too. + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + break // not return. We need the end of the reader. + } + if hdr == nil { + break // not return. We need the end of the reader. + } + + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + + var csum []byte + if hdr.Size > 0 { + var err error + _, csum, err = fp.Put(hdr.Name, tr) + if err != nil { + pW.CloseWithError(err) + return + } + } + + entry := storage.Entry{ + Type: storage.FileType, + Size: hdr.Size, + Payload: csum, + } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) + + // File entries added, regardless of size + _, err = p.AddEntry(entry) + if err != nil { + pW.CloseWithError(err) + return + } + + if b := tr.RawBytes(); len(b) > 0 { + _, err = p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + } + + // it is allowable, and not uncommon that there is further padding on the + // end of an archive, apart from the expected 1024 null bytes. + remainder, err := ioutil.ReadAll(outputRdr) + if err != nil && err != io.EOF { + pW.CloseWithError(err) + return + } + _, err = p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: remainder, + }) + if err != nil { + pW.CloseWithError(err) + return + } + pW.Close() + }() + + return pR, nil +} diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/doc.go b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go new file mode 100644 index 000000000..4367b9022 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go @@ -0,0 +1,9 @@ +/* +Package asm provides the API for streaming assembly and disassembly of tar +archives. + +Using the `github.com/vbatts/tar-split/tar/storage` for Packing/Unpacking the +metadata for a stream, as well as an implementation of Getting/Putting the file +entries' payload. +*/ +package asm diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/doc.go b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go new file mode 100644 index 000000000..83f7089ff --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go @@ -0,0 +1,12 @@ +/* +Package storage is for metadata of a tar archive. + +Packing and unpacking the Entries of the stream. The types of streams are +either segments of raw bytes (for the raw headers and various padding) and for +an entry marking a file payload. + +The raw bytes are stored precisely in the packed (marshalled) Entry, whereas +the file payload marker include the name of the file, size, and crc64 checksum +(for basic file integrity). +*/ +package storage diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/entry.go b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go new file mode 100644 index 000000000..c91e7ea1e --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go @@ -0,0 +1,78 @@ +package storage + +import "unicode/utf8" + +// Entries is for sorting by Position +type Entries []Entry + +func (e Entries) Len() int { return len(e) } +func (e Entries) Swap(i, j int) { e[i], e[j] = e[j], e[i] } +func (e Entries) Less(i, j int) bool { return e[i].Position < e[j].Position } + +// Type of Entry +type Type int + +const ( + // FileType represents a file payload from the tar stream. + // + // This will be used to map to relative paths on disk. Only Size > 0 will get + // read into a resulting output stream (due to hardlinks). + FileType Type = 1 + iota + // SegmentType represents a raw bytes segment from the archive stream. These raw + // byte segments consist of the raw headers and various padding. + // + // Its payload is to be marshalled base64 encoded. + SegmentType +) + +// Entry is the structure for packing and unpacking the information read from +// the Tar archive. +// +// FileType Payload checksum is using `hash/crc64` for basic file integrity, +// _not_ for cryptography. +// From http://www.backplane.com/matt/crc64.html, CRC32 has almost 40,000 +// collisions in a sample of 18.2 million, CRC64 had none. +type Entry struct { + Type Type `json:"type"` + Name string `json:"name,omitempty"` + NameRaw []byte `json:"name_raw,omitempty"` + Size int64 `json:"size,omitempty"` + Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here; + Position int `json:"position"` +} + +// SetName will check name for valid UTF-8 string, and set the appropriate +// field. See https://github.com/vbatts/tar-split/issues/17 +func (e *Entry) SetName(name string) { + if utf8.ValidString(name) { + e.Name = name + } else { + e.NameRaw = []byte(name) + } +} + +// SetNameBytes will check name for valid UTF-8 string, and set the appropriate +// field +func (e *Entry) SetNameBytes(name []byte) { + if utf8.Valid(name) { + e.Name = string(name) + } else { + e.NameRaw = name + } +} + +// GetName returns the string for the entry's name, regardless of the field stored in +func (e *Entry) GetName() string { + if len(e.NameRaw) > 0 { + return string(e.NameRaw) + } + return e.Name +} + +// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in +func (e *Entry) GetNameBytes() []byte { + if len(e.NameRaw) > 0 { + return e.NameRaw + } + return []byte(e.Name) +} diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/getter.go b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go new file mode 100644 index 000000000..ae11f8ffd --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go @@ -0,0 +1,104 @@ +package storage + +import ( + "bytes" + "errors" + "hash/crc64" + "io" + "os" + "path/filepath" +) + +// FileGetter is the interface for getting a stream of a file payload, +// addressed by name/filename. Presumably, the names will be scoped to relative +// file paths. +type FileGetter interface { + // Get returns a stream for the provided file path + Get(filename string) (output io.ReadCloser, err error) +} + +// FilePutter is the interface for storing a stream of a file payload, +// addressed by name/filename. +type FilePutter interface { + // Put returns the size of the stream received, and the crc64 checksum for + // the provided stream + Put(filename string, input io.Reader) (size int64, checksum []byte, err error) +} + +// FileGetPutter is the interface that groups both Getting and Putting file +// payloads. +type FileGetPutter interface { + FileGetter + FilePutter +} + +// NewPathFileGetter returns a FileGetter that is for files relative to path +// relpath. +func NewPathFileGetter(relpath string) FileGetter { + return &pathFileGetter{root: relpath} +} + +type pathFileGetter struct { + root string +} + +func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) { + return os.Open(filepath.Join(pfg.root, filename)) +} + +type bufferFileGetPutter struct { + files map[string][]byte +} + +func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) { + if _, ok := bfgp.files[name]; !ok { + return nil, errors.New("no such file") + } + b := bytes.NewBuffer(bfgp.files[name]) + return &readCloserWrapper{b}, nil +} + +func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) { + crc := crc64.New(CRCTable) + buf := bytes.NewBuffer(nil) + cw := io.MultiWriter(crc, buf) + i, err := io.Copy(cw, r) + if err != nil { + return 0, nil, err + } + bfgp.files[name] = buf.Bytes() + return i, crc.Sum(nil), nil +} + +type readCloserWrapper struct { + io.Reader +} + +func (w *readCloserWrapper) Close() error { return nil } + +// NewBufferFileGetPutter is a simple in-memory FileGetPutter +// +// Implication is this is memory intensive... +// Probably best for testing or light weight cases. +func NewBufferFileGetPutter() FileGetPutter { + return &bufferFileGetPutter{ + files: map[string][]byte{}, + } +} + +// NewDiscardFilePutter is a bit bucket FilePutter +func NewDiscardFilePutter() FilePutter { + return &bitBucketFilePutter{} +} + +type bitBucketFilePutter struct { +} + +func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) { + c := crc64.New(CRCTable) + i, err := io.Copy(c, r) + return i, c.Sum(nil), err +} + +// CRCTable is the default table used for crc64 sum calculations +var CRCTable = crc64.MakeTable(crc64.ISO) diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/packer.go b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go new file mode 100644 index 000000000..aba694818 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go @@ -0,0 +1,127 @@ +package storage + +import ( + "encoding/json" + "errors" + "io" + "path/filepath" + "unicode/utf8" +) + +// ErrDuplicatePath occurs when a tar archive has more than one entry for the +// same file path +var ErrDuplicatePath = errors.New("duplicates of file paths not supported") + +// Packer describes the methods to pack Entries to a storage destination +type Packer interface { + // AddEntry packs the Entry and returns its position + AddEntry(e Entry) (int, error) +} + +// Unpacker describes the methods to read Entries from a source +type Unpacker interface { + // Next returns the next Entry being unpacked, or error, until io.EOF + Next() (*Entry, error) +} + +/* TODO(vbatts) figure out a good model for this +type PackUnpacker interface { + Packer + Unpacker +} +*/ + +type jsonUnpacker struct { + seen seenNames + dec *json.Decoder +} + +func (jup *jsonUnpacker) Next() (*Entry, error) { + var e Entry + err := jup.dec.Decode(&e) + if err != nil { + return nil, err + } + + // check for dup name + if e.Type == FileType { + cName := filepath.Clean(e.GetName()) + if _, ok := jup.seen[cName]; ok { + return nil, ErrDuplicatePath + } + jup.seen[cName] = struct{}{} + } + + return &e, err +} + +// NewJSONUnpacker provides an Unpacker that reads Entries (SegmentType and +// FileType) as a json document. +// +// Each Entry read are expected to be delimited by new line. +func NewJSONUnpacker(r io.Reader) Unpacker { + return &jsonUnpacker{ + dec: json.NewDecoder(r), + seen: seenNames{}, + } +} + +type jsonPacker struct { + w io.Writer + e *json.Encoder + pos int + seen seenNames +} + +type seenNames map[string]struct{} + +func (jp *jsonPacker) AddEntry(e Entry) (int, error) { + // if Name is not valid utf8, switch it to raw first. + if e.Name != "" { + if !utf8.ValidString(e.Name) { + e.NameRaw = []byte(e.Name) + e.Name = "" + } + } + + // check early for dup name + if e.Type == FileType { + cName := filepath.Clean(e.GetName()) + if _, ok := jp.seen[cName]; ok { + return -1, ErrDuplicatePath + } + jp.seen[cName] = struct{}{} + } + + e.Position = jp.pos + err := jp.e.Encode(e) + if err != nil { + return -1, err + } + + // made it this far, increment now + jp.pos++ + return e.Position, nil +} + +// NewJSONPacker provides a Packer that writes each Entry (SegmentType and +// FileType) as a json document. +// +// The Entries are delimited by new line. +func NewJSONPacker(w io.Writer) Packer { + return &jsonPacker{ + w: w, + e: json.NewEncoder(w), + seen: seenNames{}, + } +} + +/* +TODO(vbatts) perhaps have a more compact packer/unpacker, maybe using msgapck +(https://github.com/ugorji/go) + + +Even though, since our jsonUnpacker and jsonPacker just take +io.Reader/io.Writer, then we can get away with passing them a +gzip.Reader/gzip.Writer +*/ -- cgit v1.2.3-54-g00ecf