diff options
Diffstat (limited to 'vendor/github.com/vbatts/tar-split/tar')
8 files changed, 645 insertions, 0 deletions
diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/README.md b/vendor/github.com/vbatts/tar-split/tar/asm/README.md new file mode 100644 index 000000000..2a3a5b56a --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/README.md @@ -0,0 +1,44 @@ +asm +=== + +This library for assembly and disassembly of tar archives, facilitated by +`github.com/vbatts/tar-split/tar/storage`. + + +Concerns +-------- + +For completely safe assembly/disassembly, there will need to be a Content +Addressable Storage (CAS) directory, that maps to a checksum in the +`storage.Entity` of `storage.FileType`. + +This is due to the fact that tar archives _can_ allow multiple records for the +same path, but the last one effectively wins. Even if the prior records had a +different payload. + +In this way, when assembling an archive from relative paths, if the archive has +multiple entries for the same path, then all payloads read in from a relative +path would be identical. + + +Thoughts +-------- + +Have a look-aside directory or storage. This way when a clobbering record is +encountered from the tar stream, then the payload of the prior/existing file is +stored to the CAS. This way the clobbering record's file payload can be +extracted, but we'll have preserved the payload needed to reassemble a precise +tar archive. + +clobbered/path/to/file.[0-N] + +*alternatively* + +We could just _not_ support tar streams that have clobbering file paths. +Appending records to the archive is not incredibly common, and doesn't happen +by default for most implementations. Not supporting them wouldn't be a +security concern either, as if it did occur, we would reassemble an archive +that doesn't validate signature/checksum, so it shouldn't be trusted anyway. + +Otherwise, this will allow us to defer support for appended files as a FUTURE FEATURE. + diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go new file mode 100644 index 000000000..d624450ab --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go @@ -0,0 +1,130 @@ +package asm + +import ( + "bytes" + "fmt" + "hash" + "hash/crc64" + "io" + "sync" + + "github.com/vbatts/tar-split/tar/storage" +) + +// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive +// stream. +// +// It takes a storage.FileGetter, for mapping the file payloads that are to be read in, +// and a storage.Unpacker, which has access to the rawbytes and file order +// metadata. With the combination of these two items, a precise assembled Tar +// archive is possible. +func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadCloser { + // ... Since these are interfaces, this is possible, so let's not have a nil pointer + if fg == nil || up == nil { + return nil + } + pr, pw := io.Pipe() + go func() { + err := WriteOutputTarStream(fg, up, pw) + if err != nil { + pw.CloseWithError(err) + } else { + pw.Close() + } + }() + return pr +} + +// WriteOutputTarStream writes assembled tar archive to a writer. +func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error { + // ... Since these are interfaces, this is possible, so let's not have a nil pointer + if fg == nil || up == nil { + return nil + } + var copyBuffer []byte + var crcHash hash.Hash + var crcSum []byte + var multiWriter io.Writer + for { + entry, err := up.Next() + if err != nil { + if err == io.EOF { + return nil + } + return err + } + switch entry.Type { + case storage.SegmentType: + if _, err := w.Write(entry.Payload); err != nil { + return err + } + case storage.FileType: + if entry.Size == 0 { + continue + } + fh, err := fg.Get(entry.GetName()) + if err != nil { + return err + } + if crcHash == nil { + crcHash = crc64.New(storage.CRCTable) + crcSum = make([]byte, 8) + multiWriter = io.MultiWriter(w, crcHash) + copyBuffer = byteBufferPool.Get().([]byte) + defer byteBufferPool.Put(copyBuffer) + } else { + crcHash.Reset() + } + + if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil { + fh.Close() + return err + } + + if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) { + // I would rather this be a comparable ErrInvalidChecksum or such, + // but since it's coming through the PipeReader, the context of + // _which_ file would be lost... + fh.Close() + return fmt.Errorf("file integrity checksum failed for %q", entry.GetName()) + } + fh.Close() + } + } +} + +var byteBufferPool = &sync.Pool{ + New: func() interface{} { + return make([]byte, 32*1024) + }, +} + +// copyWithBuffer is taken from stdlib io.Copy implementation +// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367 +func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) { + for { + nr, er := src.Read(buf) + if nr > 0 { + nw, ew := dst.Write(buf[0:nr]) + if nw > 0 { + written += int64(nw) + } + if ew != nil { + err = ew + break + } + if nr != nw { + err = io.ErrShortWrite + break + } + } + if er == io.EOF { + break + } + if er != nil { + err = er + break + } + } + return written, err +} diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go new file mode 100644 index 000000000..54ef23aed --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go @@ -0,0 +1,141 @@ +package asm + +import ( + "io" + "io/ioutil" + + "github.com/vbatts/tar-split/archive/tar" + "github.com/vbatts/tar-split/tar/storage" +) + +// NewInputTarStream wraps the Reader stream of a tar archive and provides a +// Reader stream of the same. +// +// In the middle it will pack the segments and file metadata to storage.Packer +// `p`. +// +// The the storage.FilePutter is where payload of files in the stream are +// stashed. If this stashing is not needed, you can provide a nil +// storage.FilePutter. Since the checksumming is still needed, then a default +// of NewDiscardFilePutter will be used internally +func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) { + // What to do here... folks will want their own access to the Reader that is + // their tar archive stream, but we'll need that same stream to use our + // forked 'archive/tar'. + // Perhaps do an io.TeeReader that hands back an io.Reader for them to read + // from, and we'll MITM the stream to store metadata. + // We'll need a storage.FilePutter too ... + + // Another concern, whether to do any storage.FilePutter operations, such that we + // don't extract any amount of the archive. But then again, we're not making + // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter. + // Perhaps we have a DiscardFilePutter that is a bit bucket. + + // we'll return the pipe reader, since TeeReader does not buffer and will + // only read what the outputRdr Read's. Since Tar archives have padding on + // the end, we want to be the one reading the padding, even if the user's + // `archive/tar` doesn't care. + pR, pW := io.Pipe() + outputRdr := io.TeeReader(r, pW) + + // we need a putter that will generate the crc64 sums of file payloads + if fp == nil { + fp = storage.NewDiscardFilePutter() + } + + go func() { + tr := tar.NewReader(outputRdr) + tr.RawAccounting = true + for { + hdr, err := tr.Next() + if err != nil { + if err != io.EOF { + pW.CloseWithError(err) + return + } + // even when an EOF is reached, there is often 1024 null bytes on + // the end of an archive. Collect them too. + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + break // not return. We need the end of the reader. + } + if hdr == nil { + break // not return. We need the end of the reader. + } + + if b := tr.RawBytes(); len(b) > 0 { + _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + + var csum []byte + if hdr.Size > 0 { + var err error + _, csum, err = fp.Put(hdr.Name, tr) + if err != nil { + pW.CloseWithError(err) + return + } + } + + entry := storage.Entry{ + Type: storage.FileType, + Size: hdr.Size, + Payload: csum, + } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) + + // File entries added, regardless of size + _, err = p.AddEntry(entry) + if err != nil { + pW.CloseWithError(err) + return + } + + if b := tr.RawBytes(); len(b) > 0 { + _, err = p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }) + if err != nil { + pW.CloseWithError(err) + return + } + } + } + + // it is allowable, and not uncommon that there is further padding on the + // end of an archive, apart from the expected 1024 null bytes. + remainder, err := ioutil.ReadAll(outputRdr) + if err != nil && err != io.EOF { + pW.CloseWithError(err) + return + } + _, err = p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: remainder, + }) + if err != nil { + pW.CloseWithError(err) + return + } + pW.Close() + }() + + return pR, nil +} diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/doc.go b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go new file mode 100644 index 000000000..4367b9022 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go @@ -0,0 +1,9 @@ +/* +Package asm provides the API for streaming assembly and disassembly of tar +archives. + +Using the `github.com/vbatts/tar-split/tar/storage` for Packing/Unpacking the +metadata for a stream, as well as an implementation of Getting/Putting the file +entries' payload. +*/ +package asm diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/doc.go b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go new file mode 100644 index 000000000..83f7089ff --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go @@ -0,0 +1,12 @@ +/* +Package storage is for metadata of a tar archive. + +Packing and unpacking the Entries of the stream. The types of streams are +either segments of raw bytes (for the raw headers and various padding) and for +an entry marking a file payload. + +The raw bytes are stored precisely in the packed (marshalled) Entry, whereas +the file payload marker include the name of the file, size, and crc64 checksum +(for basic file integrity). +*/ +package storage diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/entry.go b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go new file mode 100644 index 000000000..c91e7ea1e --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go @@ -0,0 +1,78 @@ +package storage + +import "unicode/utf8" + +// Entries is for sorting by Position +type Entries []Entry + +func (e Entries) Len() int { return len(e) } +func (e Entries) Swap(i, j int) { e[i], e[j] = e[j], e[i] } +func (e Entries) Less(i, j int) bool { return e[i].Position < e[j].Position } + +// Type of Entry +type Type int + +const ( + // FileType represents a file payload from the tar stream. + // + // This will be used to map to relative paths on disk. Only Size > 0 will get + // read into a resulting output stream (due to hardlinks). + FileType Type = 1 + iota + // SegmentType represents a raw bytes segment from the archive stream. These raw + // byte segments consist of the raw headers and various padding. + // + // Its payload is to be marshalled base64 encoded. + SegmentType +) + +// Entry is the structure for packing and unpacking the information read from +// the Tar archive. +// +// FileType Payload checksum is using `hash/crc64` for basic file integrity, +// _not_ for cryptography. +// From http://www.backplane.com/matt/crc64.html, CRC32 has almost 40,000 +// collisions in a sample of 18.2 million, CRC64 had none. +type Entry struct { + Type Type `json:"type"` + Name string `json:"name,omitempty"` + NameRaw []byte `json:"name_raw,omitempty"` + Size int64 `json:"size,omitempty"` + Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here; + Position int `json:"position"` +} + +// SetName will check name for valid UTF-8 string, and set the appropriate +// field. See https://github.com/vbatts/tar-split/issues/17 +func (e *Entry) SetName(name string) { + if utf8.ValidString(name) { + e.Name = name + } else { + e.NameRaw = []byte(name) + } +} + +// SetNameBytes will check name for valid UTF-8 string, and set the appropriate +// field +func (e *Entry) SetNameBytes(name []byte) { + if utf8.Valid(name) { + e.Name = string(name) + } else { + e.NameRaw = name + } +} + +// GetName returns the string for the entry's name, regardless of the field stored in +func (e *Entry) GetName() string { + if len(e.NameRaw) > 0 { + return string(e.NameRaw) + } + return e.Name +} + +// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in +func (e *Entry) GetNameBytes() []byte { + if len(e.NameRaw) > 0 { + return e.NameRaw + } + return []byte(e.Name) +} diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/getter.go b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go new file mode 100644 index 000000000..ae11f8ffd --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go @@ -0,0 +1,104 @@ +package storage + +import ( + "bytes" + "errors" + "hash/crc64" + "io" + "os" + "path/filepath" +) + +// FileGetter is the interface for getting a stream of a file payload, +// addressed by name/filename. Presumably, the names will be scoped to relative +// file paths. +type FileGetter interface { + // Get returns a stream for the provided file path + Get(filename string) (output io.ReadCloser, err error) +} + +// FilePutter is the interface for storing a stream of a file payload, +// addressed by name/filename. +type FilePutter interface { + // Put returns the size of the stream received, and the crc64 checksum for + // the provided stream + Put(filename string, input io.Reader) (size int64, checksum []byte, err error) +} + +// FileGetPutter is the interface that groups both Getting and Putting file +// payloads. +type FileGetPutter interface { + FileGetter + FilePutter +} + +// NewPathFileGetter returns a FileGetter that is for files relative to path +// relpath. +func NewPathFileGetter(relpath string) FileGetter { + return &pathFileGetter{root: relpath} +} + +type pathFileGetter struct { + root string +} + +func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) { + return os.Open(filepath.Join(pfg.root, filename)) +} + +type bufferFileGetPutter struct { + files map[string][]byte +} + +func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) { + if _, ok := bfgp.files[name]; !ok { + return nil, errors.New("no such file") + } + b := bytes.NewBuffer(bfgp.files[name]) + return &readCloserWrapper{b}, nil +} + +func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) { + crc := crc64.New(CRCTable) + buf := bytes.NewBuffer(nil) + cw := io.MultiWriter(crc, buf) + i, err := io.Copy(cw, r) + if err != nil { + return 0, nil, err + } + bfgp.files[name] = buf.Bytes() + return i, crc.Sum(nil), nil +} + +type readCloserWrapper struct { + io.Reader +} + +func (w *readCloserWrapper) Close() error { return nil } + +// NewBufferFileGetPutter is a simple in-memory FileGetPutter +// +// Implication is this is memory intensive... +// Probably best for testing or light weight cases. +func NewBufferFileGetPutter() FileGetPutter { + return &bufferFileGetPutter{ + files: map[string][]byte{}, + } +} + +// NewDiscardFilePutter is a bit bucket FilePutter +func NewDiscardFilePutter() FilePutter { + return &bitBucketFilePutter{} +} + +type bitBucketFilePutter struct { +} + +func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) { + c := crc64.New(CRCTable) + i, err := io.Copy(c, r) + return i, c.Sum(nil), err +} + +// CRCTable is the default table used for crc64 sum calculations +var CRCTable = crc64.MakeTable(crc64.ISO) diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/packer.go b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go new file mode 100644 index 000000000..aba694818 --- /dev/null +++ b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go @@ -0,0 +1,127 @@ +package storage + +import ( + "encoding/json" + "errors" + "io" + "path/filepath" + "unicode/utf8" +) + +// ErrDuplicatePath occurs when a tar archive has more than one entry for the +// same file path +var ErrDuplicatePath = errors.New("duplicates of file paths not supported") + +// Packer describes the methods to pack Entries to a storage destination +type Packer interface { + // AddEntry packs the Entry and returns its position + AddEntry(e Entry) (int, error) +} + +// Unpacker describes the methods to read Entries from a source +type Unpacker interface { + // Next returns the next Entry being unpacked, or error, until io.EOF + Next() (*Entry, error) +} + +/* TODO(vbatts) figure out a good model for this +type PackUnpacker interface { + Packer + Unpacker +} +*/ + +type jsonUnpacker struct { + seen seenNames + dec *json.Decoder +} + +func (jup *jsonUnpacker) Next() (*Entry, error) { + var e Entry + err := jup.dec.Decode(&e) + if err != nil { + return nil, err + } + + // check for dup name + if e.Type == FileType { + cName := filepath.Clean(e.GetName()) + if _, ok := jup.seen[cName]; ok { + return nil, ErrDuplicatePath + } + jup.seen[cName] = struct{}{} + } + + return &e, err +} + +// NewJSONUnpacker provides an Unpacker that reads Entries (SegmentType and +// FileType) as a json document. +// +// Each Entry read are expected to be delimited by new line. +func NewJSONUnpacker(r io.Reader) Unpacker { + return &jsonUnpacker{ + dec: json.NewDecoder(r), + seen: seenNames{}, + } +} + +type jsonPacker struct { + w io.Writer + e *json.Encoder + pos int + seen seenNames +} + +type seenNames map[string]struct{} + +func (jp *jsonPacker) AddEntry(e Entry) (int, error) { + // if Name is not valid utf8, switch it to raw first. + if e.Name != "" { + if !utf8.ValidString(e.Name) { + e.NameRaw = []byte(e.Name) + e.Name = "" + } + } + + // check early for dup name + if e.Type == FileType { + cName := filepath.Clean(e.GetName()) + if _, ok := jp.seen[cName]; ok { + return -1, ErrDuplicatePath + } + jp.seen[cName] = struct{}{} + } + + e.Position = jp.pos + err := jp.e.Encode(e) + if err != nil { + return -1, err + } + + // made it this far, increment now + jp.pos++ + return e.Position, nil +} + +// NewJSONPacker provides a Packer that writes each Entry (SegmentType and +// FileType) as a json document. +// +// The Entries are delimited by new line. +func NewJSONPacker(w io.Writer) Packer { + return &jsonPacker{ + w: w, + e: json.NewEncoder(w), + seen: seenNames{}, + } +} + +/* +TODO(vbatts) perhaps have a more compact packer/unpacker, maybe using msgapck +(https://github.com/ugorji/go) + + +Even though, since our jsonUnpacker and jsonPacker just take +io.Reader/io.Writer, then we can get away with passing them a +gzip.Reader/gzip.Writer +*/ |