summaryrefslogtreecommitdiff
path: root/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go
blob: 009b3f5d8124e2255b4803ea9007dd8517c38907 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
package asm

import (
	"io"

	"github.com/vbatts/tar-split/archive/tar"
	"github.com/vbatts/tar-split/tar/storage"
)

// NewInputTarStream wraps the Reader stream of a tar archive and provides a
// Reader stream of the same.
//
// In the middle it will pack the segments and file metadata to storage.Packer
// `p`.
//
// The the storage.FilePutter is where payload of files in the stream are
// stashed. If this stashing is not needed, you can provide a nil
// storage.FilePutter. Since the checksumming is still needed, then a default
// of NewDiscardFilePutter will be used internally
func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) {
	// What to do here... folks will want their own access to the Reader that is
	// their tar archive stream, but we'll need that same stream to use our
	// forked 'archive/tar'.
	// Perhaps do an io.TeeReader that hands back an io.Reader for them to read
	// from, and we'll MITM the stream to store metadata.
	// We'll need a storage.FilePutter too ...

	// Another concern, whether to do any storage.FilePutter operations, such that we
	// don't extract any amount of the archive. But then again, we're not making
	// files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter.
	// Perhaps we have a DiscardFilePutter that is a bit bucket.

	// we'll return the pipe reader, since TeeReader does not buffer and will
	// only read what the outputRdr Read's. Since Tar archives have padding on
	// the end, we want to be the one reading the padding, even if the user's
	// `archive/tar` doesn't care.
	pR, pW := io.Pipe()
	outputRdr := io.TeeReader(r, pW)

	// we need a putter that will generate the crc64 sums of file payloads
	if fp == nil {
		fp = storage.NewDiscardFilePutter()
	}

	go func() {
		tr := tar.NewReader(outputRdr)
		tr.RawAccounting = true
		for {
			hdr, err := tr.Next()
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				// even when an EOF is reached, there is often 1024 null bytes on
				// the end of an archive. Collect them too.
				if b := tr.RawBytes(); len(b) > 0 {
					_, err := p.AddEntry(storage.Entry{
						Type:    storage.SegmentType,
						Payload: b,
					})
					if err != nil {
						pW.CloseWithError(err)
						return
					}
				}
				break // not return. We need the end of the reader.
			}
			if hdr == nil {
				break // not return. We need the end of the reader.
			}

			if b := tr.RawBytes(); len(b) > 0 {
				_, err := p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			var csum []byte
			if hdr.Size > 0 {
				var err error
				_, csum, err = fp.Put(hdr.Name, tr)
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			entry := storage.Entry{
				Type:    storage.FileType,
				Size:    hdr.Size,
				Payload: csum,
			}
			// For proper marshalling of non-utf8 characters
			entry.SetName(hdr.Name)

			// File entries added, regardless of size
			_, err = p.AddEntry(entry)
			if err != nil {
				pW.CloseWithError(err)
				return
			}

			if b := tr.RawBytes(); len(b) > 0 {
				_, err = p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}
		}

		// It is allowable, and not uncommon that there is further padding on
		// the end of an archive, apart from the expected 1024 null bytes. We
		// do this in chunks rather than in one go to avoid cases where a
		// maliciously crafted tar file tries to trick us into reading many GBs
		// into memory.
		const paddingChunkSize = 1024 * 1024
		var paddingChunk [paddingChunkSize]byte
		for {
			var isEOF bool
			n, err := outputRdr.Read(paddingChunk[:])
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				isEOF = true
			}
			_, err = p.AddEntry(storage.Entry{
				Type:    storage.SegmentType,
				Payload: paddingChunk[:n],
			})
			if err != nil {
				pW.CloseWithError(err)
				return
			}
			if isEOF {
				break
			}
		}
		pW.Close()
	}()

	return pR, nil
}