aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/vbatts/tar-split/tar
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/vbatts/tar-split/tar')
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/asm/README.md44
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/asm/assemble.go130
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go141
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/asm/doc.go9
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/storage/doc.go12
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/storage/entry.go78
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/storage/getter.go104
-rw-r--r--vendor/github.com/vbatts/tar-split/tar/storage/packer.go127
8 files changed, 645 insertions, 0 deletions
diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/README.md b/vendor/github.com/vbatts/tar-split/tar/asm/README.md
new file mode 100644
index 000000000..2a3a5b56a
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/asm/README.md
@@ -0,0 +1,44 @@
+asm
+===
+
+This library is for assembly and disassembly of tar archives, facilitated by
+`github.com/vbatts/tar-split/tar/storage`.
+
+
+Concerns
+--------
+
+For completely safe assembly/disassembly, there will need to be a Content
+Addressable Storage (CAS) directory, that maps to a checksum in the
+`storage.Entry` of `storage.FileType`.
+
+This is due to the fact that tar archives _can_ allow multiple records for the
+same path, but the last one effectively wins, even if the prior records had a
+different payload.
+
+In this way, when assembling an archive from relative paths, if the archive has
+multiple entries for the same path, then all payloads read in from a relative
+path would be identical.
+
+
+Thoughts
+--------
+
+Have a look-aside directory or storage. This way when a clobbering record is
+encountered from the tar stream, then the payload of the prior/existing file is
+stored to the CAS. This way the clobbering record's file payload can be
+extracted, but we'll have preserved the payload needed to reassemble a precise
+tar archive.
+
+clobbered/path/to/file.[0-N]
+
+*alternatively*
+
+We could just _not_ support tar streams that have clobbering file paths.
+Appending records to the archive is not incredibly common, and doesn't happen
+by default for most implementations. Not supporting them wouldn't be a
+security concern either, as if it did occur, we would reassemble an archive
+that doesn't validate signature/checksum, so it shouldn't be trusted anyway.
+
+Otherwise, this will allow us to defer support for appended files as a FUTURE FEATURE.
+
diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go
new file mode 100644
index 000000000..d624450ab
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/asm/assemble.go
@@ -0,0 +1,130 @@
+package asm
+
+import (
+ "bytes"
+ "fmt"
+ "hash"
+ "hash/crc64"
+ "io"
+ "sync"
+
+ "github.com/vbatts/tar-split/tar/storage"
+)
+
+// NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive
+// stream.
+//
+// It takes a storage.FileGetter, for mapping the file payloads that are to be read in,
+// and a storage.Unpacker, which has access to the rawbytes and file order
+// metadata. With the combination of these two items, a precise assembled Tar
+// archive is possible.
+func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadCloser {
+ // ... Since these are interfaces, this is possible, so let's not have a nil pointer
+ if fg == nil || up == nil {
+ return nil
+ }
+ pr, pw := io.Pipe()
+ go func() {
+ err := WriteOutputTarStream(fg, up, pw)
+ if err != nil {
+ pw.CloseWithError(err)
+ } else {
+ pw.Close()
+ }
+ }()
+ return pr
+}
+
+// WriteOutputTarStream writes assembled tar archive to a writer.
+func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error {
+ // ... Since these are interfaces, this is possible, so let's not have a nil pointer
+ if fg == nil || up == nil {
+ return nil
+ }
+ var copyBuffer []byte
+ var crcHash hash.Hash
+ var crcSum []byte
+ var multiWriter io.Writer
+ for {
+ entry, err := up.Next()
+ if err != nil {
+ if err == io.EOF {
+ return nil
+ }
+ return err
+ }
+ switch entry.Type {
+ case storage.SegmentType:
+ if _, err := w.Write(entry.Payload); err != nil {
+ return err
+ }
+ case storage.FileType:
+ if entry.Size == 0 {
+ continue
+ }
+ fh, err := fg.Get(entry.GetName())
+ if err != nil {
+ return err
+ }
+ if crcHash == nil {
+ crcHash = crc64.New(storage.CRCTable)
+ crcSum = make([]byte, 8)
+ multiWriter = io.MultiWriter(w, crcHash)
+ copyBuffer = byteBufferPool.Get().([]byte)
+ defer byteBufferPool.Put(copyBuffer)
+ } else {
+ crcHash.Reset()
+ }
+
+ if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil {
+ fh.Close()
+ return err
+ }
+
+ if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) {
+ // I would rather this be a comparable ErrInvalidChecksum or such,
+ // but since it's coming through the PipeReader, the context of
+ // _which_ file would be lost...
+ fh.Close()
+ return fmt.Errorf("file integrity checksum failed for %q", entry.GetName())
+ }
+ fh.Close()
+ }
+ }
+}
+
+var byteBufferPool = &sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 32*1024)
+ },
+}
+
+// copyWithBuffer is taken from stdlib io.Copy implementation
+// https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367
+func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) {
+ for {
+ nr, er := src.Read(buf)
+ if nr > 0 {
+ nw, ew := dst.Write(buf[0:nr])
+ if nw > 0 {
+ written += int64(nw)
+ }
+ if ew != nil {
+ err = ew
+ break
+ }
+ if nr != nw {
+ err = io.ErrShortWrite
+ break
+ }
+ }
+ if er == io.EOF {
+ break
+ }
+ if er != nil {
+ err = er
+ break
+ }
+ }
+ return written, err
+}
diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go
new file mode 100644
index 000000000..54ef23aed
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go
@@ -0,0 +1,141 @@
+package asm
+
+import (
+ "io"
+ "io/ioutil"
+
+ "github.com/vbatts/tar-split/archive/tar"
+ "github.com/vbatts/tar-split/tar/storage"
+)
+
+// NewInputTarStream wraps the Reader stream of a tar archive and provides a
+// Reader stream of the same.
+//
+// In the middle it will pack the segments and file metadata to storage.Packer
+// `p`.
+//
+// The storage.FilePutter is where the payloads of files in the stream are
+// stashed. If this stashing is not needed, you can provide a nil
+// storage.FilePutter. Since the checksumming is still needed, a default
+// of NewDiscardFilePutter will be used internally.
+func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) {
+ // What to do here... folks will want their own access to the Reader that is
+ // their tar archive stream, but we'll need that same stream to use our
+ // forked 'archive/tar'.
+ // Perhaps do an io.TeeReader that hands back an io.Reader for them to read
+ // from, and we'll MITM the stream to store metadata.
+ // We'll need a storage.FilePutter too ...
+
+ // Another concern, whether to do any storage.FilePutter operations, such that we
+ // don't extract any amount of the archive. But then again, we're not making
+ // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter.
+ // Perhaps we have a DiscardFilePutter that is a bit bucket.
+
+ // we'll return the pipe reader, since TeeReader does not buffer and will
+ // only read what the outputRdr Read's. Since Tar archives have padding on
+ // the end, we want to be the one reading the padding, even if the user's
+ // `archive/tar` doesn't care.
+ pR, pW := io.Pipe()
+ outputRdr := io.TeeReader(r, pW)
+
+ // we need a putter that will generate the crc64 sums of file payloads
+ if fp == nil {
+ fp = storage.NewDiscardFilePutter()
+ }
+
+ go func() {
+ tr := tar.NewReader(outputRdr)
+ tr.RawAccounting = true
+ for {
+ hdr, err := tr.Next()
+ if err != nil {
+ if err != io.EOF {
+ pW.CloseWithError(err)
+ return
+ }
+ // even when an EOF is reached, there are often 1024 null bytes on
+ // the end of an archive. Collect them too.
+ if b := tr.RawBytes(); len(b) > 0 {
+ _, err := p.AddEntry(storage.Entry{
+ Type: storage.SegmentType,
+ Payload: b,
+ })
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+ }
+ break // not return. We need the end of the reader.
+ }
+ if hdr == nil {
+ break // not return. We need the end of the reader.
+ }
+
+ if b := tr.RawBytes(); len(b) > 0 {
+ _, err := p.AddEntry(storage.Entry{
+ Type: storage.SegmentType,
+ Payload: b,
+ })
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+ }
+
+ var csum []byte
+ if hdr.Size > 0 {
+ var err error
+ _, csum, err = fp.Put(hdr.Name, tr)
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+ }
+
+ entry := storage.Entry{
+ Type: storage.FileType,
+ Size: hdr.Size,
+ Payload: csum,
+ }
+ // For proper marshalling of non-utf8 characters
+ entry.SetName(hdr.Name)
+
+ // File entries added, regardless of size
+ _, err = p.AddEntry(entry)
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+
+ if b := tr.RawBytes(); len(b) > 0 {
+ _, err = p.AddEntry(storage.Entry{
+ Type: storage.SegmentType,
+ Payload: b,
+ })
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+ }
+ }
+
+ // it is allowable, and not uncommon that there is further padding on the
+ // end of an archive, apart from the expected 1024 null bytes.
+ remainder, err := ioutil.ReadAll(outputRdr)
+ if err != nil && err != io.EOF {
+ pW.CloseWithError(err)
+ return
+ }
+ _, err = p.AddEntry(storage.Entry{
+ Type: storage.SegmentType,
+ Payload: remainder,
+ })
+ if err != nil {
+ pW.CloseWithError(err)
+ return
+ }
+ pW.Close()
+ }()
+
+ return pR, nil
+}
diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/doc.go b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go
new file mode 100644
index 000000000..4367b9022
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/asm/doc.go
@@ -0,0 +1,9 @@
+/*
+Package asm provides the API for streaming assembly and disassembly of tar
+archives.
+
+Using the `github.com/vbatts/tar-split/tar/storage` for Packing/Unpacking the
+metadata for a stream, as well as an implementation of Getting/Putting the file
+entries' payload.
+*/
+package asm
diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/doc.go b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go
new file mode 100644
index 000000000..83f7089ff
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/storage/doc.go
@@ -0,0 +1,12 @@
+/*
+Package storage is for metadata of a tar archive.
+
+Packing and unpacking the Entries of the stream. The types of Entries are
+either segments of raw bytes (for the raw headers and various padding) or
+an entry marking a file payload.
+
+The raw bytes are stored precisely in the packed (marshalled) Entry, whereas
+the file payload marker includes the name of the file, size, and crc64 checksum
+(for basic file integrity).
+*/
+package storage
diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/entry.go b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go
new file mode 100644
index 000000000..c91e7ea1e
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/storage/entry.go
@@ -0,0 +1,78 @@
+package storage
+
+import "unicode/utf8"
+
+// Entries is for sorting by Position
+type Entries []Entry
+
+func (e Entries) Len() int { return len(e) }
+func (e Entries) Swap(i, j int) { e[i], e[j] = e[j], e[i] }
+func (e Entries) Less(i, j int) bool { return e[i].Position < e[j].Position }
+
+// Type of Entry
+type Type int
+
+const (
+ // FileType represents a file payload from the tar stream.
+ //
+ // This will be used to map to relative paths on disk. Only Size > 0 will get
+ // read into a resulting output stream (due to hardlinks).
+ FileType Type = 1 + iota
+ // SegmentType represents a raw bytes segment from the archive stream. These raw
+ // byte segments consist of the raw headers and various padding.
+ //
+ // Its payload is to be marshalled base64 encoded.
+ SegmentType
+)
+
+// Entry is the structure for packing and unpacking the information read from
+// the Tar archive.
+//
+// FileType Payload checksum is using `hash/crc64` for basic file integrity,
+// _not_ for cryptography.
+// From http://www.backplane.com/matt/crc64.html, CRC32 has almost 40,000
+// collisions in a sample of 18.2 million, CRC64 had none.
+type Entry struct {
+ Type Type `json:"type"`
+ Name string `json:"name,omitempty"`
+ NameRaw []byte `json:"name_raw,omitempty"`
+ Size int64 `json:"size,omitempty"`
+ Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here;
+ Position int `json:"position"`
+}
+
+// SetName will check name for valid UTF-8 string, and set the appropriate
+// field. See https://github.com/vbatts/tar-split/issues/17
+func (e *Entry) SetName(name string) {
+ if utf8.ValidString(name) {
+ e.Name = name
+ } else {
+ e.NameRaw = []byte(name)
+ }
+}
+
+// SetNameBytes will check name for valid UTF-8 string, and set the appropriate
+// field
+func (e *Entry) SetNameBytes(name []byte) {
+ if utf8.Valid(name) {
+ e.Name = string(name)
+ } else {
+ e.NameRaw = name
+ }
+}
+
+// GetName returns the string for the entry's name, regardless of which field it is stored in
+func (e *Entry) GetName() string {
+ if len(e.NameRaw) > 0 {
+ return string(e.NameRaw)
+ }
+ return e.Name
+}
+
+// GetNameBytes returns the bytes for the entry's name, regardless of which field it is stored in
+func (e *Entry) GetNameBytes() []byte {
+ if len(e.NameRaw) > 0 {
+ return e.NameRaw
+ }
+ return []byte(e.Name)
+}
diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/getter.go b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go
new file mode 100644
index 000000000..ae11f8ffd
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/storage/getter.go
@@ -0,0 +1,104 @@
+package storage
+
+import (
+ "bytes"
+ "errors"
+ "hash/crc64"
+ "io"
+ "os"
+ "path/filepath"
+)
+
+// FileGetter is the interface for getting a stream of a file payload,
+// addressed by name/filename. Presumably, the names will be scoped to relative
+// file paths.
+type FileGetter interface {
+ // Get returns a stream for the provided file path
+ Get(filename string) (output io.ReadCloser, err error)
+}
+
+// FilePutter is the interface for storing a stream of a file payload,
+// addressed by name/filename.
+type FilePutter interface {
+ // Put returns the size of the stream received, and the crc64 checksum for
+ // the provided stream
+ Put(filename string, input io.Reader) (size int64, checksum []byte, err error)
+}
+
+// FileGetPutter is the interface that groups both Getting and Putting file
+// payloads.
+type FileGetPutter interface {
+ FileGetter
+ FilePutter
+}
+
+// NewPathFileGetter returns a FileGetter that is for files relative to path
+// relpath.
+func NewPathFileGetter(relpath string) FileGetter {
+ return &pathFileGetter{root: relpath}
+}
+
+type pathFileGetter struct {
+ root string
+}
+
+func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) {
+ return os.Open(filepath.Join(pfg.root, filename))
+}
+
+type bufferFileGetPutter struct {
+ files map[string][]byte
+}
+
+func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) {
+ if _, ok := bfgp.files[name]; !ok {
+ return nil, errors.New("no such file")
+ }
+ b := bytes.NewBuffer(bfgp.files[name])
+ return &readCloserWrapper{b}, nil
+}
+
+func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) {
+ crc := crc64.New(CRCTable)
+ buf := bytes.NewBuffer(nil)
+ cw := io.MultiWriter(crc, buf)
+ i, err := io.Copy(cw, r)
+ if err != nil {
+ return 0, nil, err
+ }
+ bfgp.files[name] = buf.Bytes()
+ return i, crc.Sum(nil), nil
+}
+
+type readCloserWrapper struct {
+ io.Reader
+}
+
+func (w *readCloserWrapper) Close() error { return nil }
+
+// NewBufferFileGetPutter is a simple in-memory FileGetPutter
+//
+// Implication is this is memory intensive...
+// Probably best for testing or light weight cases.
+func NewBufferFileGetPutter() FileGetPutter {
+ return &bufferFileGetPutter{
+ files: map[string][]byte{},
+ }
+}
+
+// NewDiscardFilePutter is a bit bucket FilePutter
+func NewDiscardFilePutter() FilePutter {
+ return &bitBucketFilePutter{}
+}
+
+type bitBucketFilePutter struct {
+}
+
+func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) {
+ c := crc64.New(CRCTable)
+ i, err := io.Copy(c, r)
+ return i, c.Sum(nil), err
+}
+
+// CRCTable is the default table used for crc64 sum calculations
+var CRCTable = crc64.MakeTable(crc64.ISO)
diff --git a/vendor/github.com/vbatts/tar-split/tar/storage/packer.go b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go
new file mode 100644
index 000000000..aba694818
--- /dev/null
+++ b/vendor/github.com/vbatts/tar-split/tar/storage/packer.go
@@ -0,0 +1,127 @@
+package storage
+
+import (
+ "encoding/json"
+ "errors"
+ "io"
+ "path/filepath"
+ "unicode/utf8"
+)
+
+// ErrDuplicatePath occurs when a tar archive has more than one entry for the
+// same file path
+var ErrDuplicatePath = errors.New("duplicates of file paths not supported")
+
+// Packer describes the methods to pack Entries to a storage destination
+type Packer interface {
+ // AddEntry packs the Entry and returns its position
+ AddEntry(e Entry) (int, error)
+}
+
+// Unpacker describes the methods to read Entries from a source
+type Unpacker interface {
+ // Next returns the next Entry being unpacked, or error, until io.EOF
+ Next() (*Entry, error)
+}
+
+/* TODO(vbatts) figure out a good model for this
+type PackUnpacker interface {
+ Packer
+ Unpacker
+}
+*/
+
+type jsonUnpacker struct {
+ seen seenNames
+ dec *json.Decoder
+}
+
+func (jup *jsonUnpacker) Next() (*Entry, error) {
+ var e Entry
+ err := jup.dec.Decode(&e)
+ if err != nil {
+ return nil, err
+ }
+
+ // check for dup name
+ if e.Type == FileType {
+ cName := filepath.Clean(e.GetName())
+ if _, ok := jup.seen[cName]; ok {
+ return nil, ErrDuplicatePath
+ }
+ jup.seen[cName] = struct{}{}
+ }
+
+ return &e, err
+}
+
+// NewJSONUnpacker provides an Unpacker that reads Entries (SegmentType and
+// FileType) as a json document.
+//
+// Each Entry read is expected to be delimited by a new line.
+func NewJSONUnpacker(r io.Reader) Unpacker {
+ return &jsonUnpacker{
+ dec: json.NewDecoder(r),
+ seen: seenNames{},
+ }
+}
+
+type jsonPacker struct {
+ w io.Writer
+ e *json.Encoder
+ pos int
+ seen seenNames
+}
+
+type seenNames map[string]struct{}
+
+func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
+ // if Name is not valid utf8, switch it to raw first.
+ if e.Name != "" {
+ if !utf8.ValidString(e.Name) {
+ e.NameRaw = []byte(e.Name)
+ e.Name = ""
+ }
+ }
+
+ // check early for dup name
+ if e.Type == FileType {
+ cName := filepath.Clean(e.GetName())
+ if _, ok := jp.seen[cName]; ok {
+ return -1, ErrDuplicatePath
+ }
+ jp.seen[cName] = struct{}{}
+ }
+
+ e.Position = jp.pos
+ err := jp.e.Encode(e)
+ if err != nil {
+ return -1, err
+ }
+
+ // made it this far, increment now
+ jp.pos++
+ return e.Position, nil
+}
+
+// NewJSONPacker provides a Packer that writes each Entry (SegmentType and
+// FileType) as a json document.
+//
+// The Entries are delimited by a new line.
+func NewJSONPacker(w io.Writer) Packer {
+ return &jsonPacker{
+ w: w,
+ e: json.NewEncoder(w),
+ seen: seenNames{},
+ }
+}
+
+/*
+TODO(vbatts) perhaps have a more compact packer/unpacker, maybe using msgpack
+(https://github.com/ugorji/go)
+
+
+Even though, since our jsonUnpacker and jsonPacker just take
+io.Reader/io.Writer, then we can get away with passing them a
+gzip.Reader/gzip.Writer
+*/