diff options
author | Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> | 2019-11-28 23:33:42 +0900 |
---|---|---|
committer | Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> | 2020-01-08 19:35:17 +0900 |
commit | da7595a69fc15d131c9d8123d0a165bdde4232b6 (patch) | |
tree | 57985f4d9fbc903610f31f3076011cd413d82fdf /vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent | |
parent | c41fd09a8da3a96bc0e58f9f29f87b9bdf30264d (diff) | |
download | podman-da7595a69fc15d131c9d8123d0a165bdde4232b6.tar.gz podman-da7595a69fc15d131c9d8123d0a165bdde4232b6.tar.bz2 podman-da7595a69fc15d131c9d8123d0a165bdde4232b6.zip |
rootless: use RootlessKit port forwarder
RootlessKit port forwarder has a lot of advantages over the slirp4netns port forwarder:
* Very high throughput.
Benchmark result on Travis: socat: 5.2 Gbps, slirp4netns: 8.3 Gbps, RootlessKit: 27.3 Gbps
(https://travis-ci.org/rootless-containers/rootlesskit/builds/597056377)
* Connections from the host are treated as 127.0.0.1 rather than 10.0.2.2 in the namespace.
No UDP issue (#4586)
* No tcp_rmem issue (#4537)
* Probably works with IPv6. Even if not, it is trivial to support IPv6. (#4311)
* Easily extensible for future support of SCTP
* Easily extensible for future support of `lxc-user-nic` SUID network
RootlessKit port forwarder has been already adopted as the default port forwarder by Rootless Docker/Moby,
and no issue has been reported AFAIK.
As the port forwarder is imported as a Go package, no `rootlesskit` binary is required for Podman.
Fix #4586
May-fix #4559
Fix #4537
May-fix #4311
See https://github.com/rootless-containers/rootlesskit/blob/v0.7.0/pkg/port/builtin/builtin.go
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
Diffstat (limited to 'vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent')
4 files changed, 459 insertions, 0 deletions
diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/parent.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/parent.go new file mode 100644 index 000000000..893bf1da9 --- /dev/null +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/parent.go @@ -0,0 +1,145 @@ +package parent + +import ( + "context" + "io" + "io/ioutil" + "net" + "os" + "path/filepath" + "sync" + "syscall" + + "github.com/pkg/errors" + + "github.com/rootless-containers/rootlesskit/pkg/port" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/msg" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/opaque" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/tcp" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp" + "github.com/rootless-containers/rootlesskit/pkg/port/portutil" +) + +// NewDriver for builtin driver. +func NewDriver(logWriter io.Writer, stateDir string) (port.ParentDriver, error) { + // TODO: consider using socketpair FD instead of socket file + socketPath := filepath.Join(stateDir, ".bp.sock") + childReadyPipePath := filepath.Join(stateDir, ".bp-ready.pipe") + // remove the path just in case the previous rootlesskit instance crashed + if err := os.RemoveAll(childReadyPipePath); err != nil { + return nil, errors.Wrapf(err, "cannot remove %s", childReadyPipePath) + } + if err := syscall.Mkfifo(childReadyPipePath, 0600); err != nil { + return nil, errors.Wrapf(err, "cannot mkfifo %s", childReadyPipePath) + } + d := driver{ + logWriter: logWriter, + socketPath: socketPath, + childReadyPipePath: childReadyPipePath, + ports: make(map[int]*port.Status, 0), + stoppers: make(map[int]func() error, 0), + nextID: 1, + } + return &d, nil +} + +type driver struct { + logWriter io.Writer + socketPath string + childReadyPipePath string + mu sync.Mutex + ports map[int]*port.Status + stoppers map[int]func() error + nextID int +} + +func (d *driver) OpaqueForChild() map[string]string { + return map[string]string{ + opaque.SocketPath: d.socketPath, + opaque.ChildReadyPipePath: d.childReadyPipePath, + } +} + +func (d *driver) RunParentDriver(initComplete chan struct{}, quit <-chan struct{}, _ *port.ChildContext) error { + childReadyPipeR, err := os.OpenFile(d.childReadyPipePath, os.O_RDONLY, os.ModeNamedPipe) + if err != nil { + return err + } + if _, err = ioutil.ReadAll(childReadyPipeR); err != nil { + return err + } + childReadyPipeR.Close() + var dialer net.Dialer + conn, err := dialer.Dial("unix", d.socketPath) + if err != nil { + return err + } + err = msg.Initiate(conn.(*net.UnixConn)) + conn.Close() + if err != nil { + return err + } + initComplete <- struct{}{} + <-quit + return nil +} + +func (d *driver) AddPort(ctx context.Context, spec port.Spec) (*port.Status, error) { + d.mu.Lock() + err := portutil.ValidatePortSpec(spec, d.ports) + d.mu.Unlock() + if err != nil { + return nil, err + } + routineStopCh := make(chan struct{}) + routineStop := func() error { + close(routineStopCh) + return nil // FIXME + } + switch spec.Proto { + case "tcp": + err = tcp.Run(d.socketPath, spec, routineStopCh, d.logWriter) + case "udp": + err = udp.Run(d.socketPath, spec, routineStopCh, d.logWriter) + default: + // NOTREACHED + return nil, errors.New("spec was not validated?") + } + if err != nil { + return nil, err + } + d.mu.Lock() + id := d.nextID + st := port.Status{ + ID: id, + Spec: spec, + } + d.ports[id] = &st + d.stoppers[id] = routineStop + d.nextID++ + d.mu.Unlock() + return &st, nil +} + +func (d *driver) ListPorts(ctx context.Context) ([]port.Status, error) { + var ports []port.Status + d.mu.Lock() + for _, p := range d.ports { + ports = append(ports, *p) + } + d.mu.Unlock() + return ports, nil +} + +func (d *driver) RemovePort(ctx context.Context, id int) error { + d.mu.Lock() + defer d.mu.Unlock() + stop, ok := d.stoppers[id] + if !ok { + return errors.Errorf("unknown id: %d", id) + } + err := stop() + delete(d.stoppers, id) + delete(d.ports, id) + return err +} diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/tcp/tcp.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/tcp/tcp.go new file mode 100644 index 000000000..b9f2d1802 --- /dev/null +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/tcp/tcp.go @@ -0,0 +1,104 @@ +package tcp + +import ( + "fmt" + "io" + "net" + "os" + "sync" + + "github.com/rootless-containers/rootlesskit/pkg/port" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/msg" +) + +func Run(socketPath string, spec port.Spec, stopCh <-chan struct{}, logWriter io.Writer) error { + ln, err := net.Listen("tcp", fmt.Sprintf("%s:%d", spec.ParentIP, spec.ParentPort)) + if err != nil { + fmt.Fprintf(logWriter, "listen: %v\n", err) + return err + } + newConns := make(chan net.Conn) + go func() { + for { + c, err := ln.Accept() + if err != nil { + fmt.Fprintf(logWriter, "accept: %v\n", err) + close(newConns) + return + } + newConns <- c + } + }() + go func() { + defer ln.Close() + for { + select { + case c, ok := <-newConns: + if !ok { + return + } + go func() { + if err := copyConnToChild(c, socketPath, spec, stopCh); err != nil { + fmt.Fprintf(logWriter, "copyConnToChild: %v\n", err) + return + } + }() + case <-stopCh: + return + } + } + }() + // no wait + return nil +} + +func copyConnToChild(c net.Conn, socketPath string, spec port.Spec, stopCh <-chan struct{}) error { + defer c.Close() + // get fd from the child as an SCM_RIGHTS cmsg + fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10) + if err != nil { + return err + } + f := os.NewFile(uintptr(fd), "") + defer f.Close() + fc, err := net.FileConn(f) + if err != nil { + return err + } + defer fc.Close() + bicopy(c, fc, stopCh) + return nil +} + +// bicopy is based on libnetwork/cmd/proxy/tcp_proxy.go . +// NOTE: sendfile(2) cannot be used for sockets +func bicopy(x, y net.Conn, quit <-chan struct{}) { + var wg sync.WaitGroup + var broker = func(to, from net.Conn) { + io.Copy(to, from) + if fromTCP, ok := from.(*net.TCPConn); ok { + fromTCP.CloseRead() + } + if toTCP, ok := to.(*net.TCPConn); ok { + toTCP.CloseWrite() + } + wg.Done() + } + + wg.Add(2) + go broker(x, y) + go broker(y, x) + finish := make(chan struct{}) + go func() { + wg.Wait() + close(finish) + }() + + select { + case <-quit: + case <-finish: + } + x.Close() + y.Close() + <-finish +} diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udp.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udp.go new file mode 100644 index 000000000..d8f646b5d --- /dev/null +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udp.go @@ -0,0 +1,60 @@ +package udp + +import ( + "fmt" + "io" + "net" + "os" + + "github.com/pkg/errors" + + "github.com/rootless-containers/rootlesskit/pkg/port" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/msg" + "github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udpproxy" +) + +func Run(socketPath string, spec port.Spec, stopCh <-chan struct{}, logWriter io.Writer) error { + addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", spec.ParentIP, spec.ParentPort)) + if err != nil { + return err + } + c, err := net.ListenUDP("udp", addr) + if err != nil { + return err + } + udpp := &udpproxy.UDPProxy{ + LogWriter: logWriter, + Listener: c, + BackendDial: func() (*net.UDPConn, error) { + // get fd from the child as an SCM_RIGHTS cmsg + fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10) + if err != nil { + return nil, err + } + f := os.NewFile(uintptr(fd), "") + defer f.Close() + fc, err := net.FileConn(f) + if err != nil { + return nil, err + } + uc, ok := fc.(*net.UDPConn) + if !ok { + return nil, errors.Errorf("file conn doesn't implement *net.UDPConn: %+v", fc) + } + return uc, nil + }, + } + go udpp.Run() + go func() { + for { + select { + case <-stopCh: + // udpp.Close closes ln as well + udpp.Close() + return + } + } + }() + // no wait + return nil +} diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udpproxy/udp_proxy.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udpproxy/udp_proxy.go new file mode 100644 index 000000000..af7b7d5d9 --- /dev/null +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udpproxy/udp_proxy.go @@ -0,0 +1,150 @@ +// Package udpproxy is from https://raw.githubusercontent.com/docker/libnetwork/fec6476dfa21380bf8ee4d74048515d968c1ee63/cmd/proxy/udp_proxy.go +package udpproxy + +import ( + "encoding/binary" + "fmt" + "io" + "net" + "strings" + "sync" + "syscall" + "time" +) + +const ( + // UDPConnTrackTimeout is the timeout used for UDP connection tracking + UDPConnTrackTimeout = 90 * time.Second + // UDPBufSize is the buffer size for the UDP proxy + UDPBufSize = 65507 +) + +// A net.Addr where the IP is split into two fields so you can use it as a key +// in a map: +type connTrackKey struct { + IPHigh uint64 + IPLow uint64 + Port int +} + +func newConnTrackKey(addr *net.UDPAddr) *connTrackKey { + if len(addr.IP) == net.IPv4len { + return &connTrackKey{ + IPHigh: 0, + IPLow: uint64(binary.BigEndian.Uint32(addr.IP)), + Port: addr.Port, + } + } + return &connTrackKey{ + IPHigh: binary.BigEndian.Uint64(addr.IP[:8]), + IPLow: binary.BigEndian.Uint64(addr.IP[8:]), + Port: addr.Port, + } +} + +type connTrackMap map[connTrackKey]*net.UDPConn + +// UDPProxy is proxy for which handles UDP datagrams. +// From libnetwork udp_proxy.go . +type UDPProxy struct { + LogWriter io.Writer + Listener *net.UDPConn + BackendDial func() (*net.UDPConn, error) + connTrackTable connTrackMap + connTrackLock sync.Mutex +} + +func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr, clientKey *connTrackKey) { + defer func() { + proxy.connTrackLock.Lock() + delete(proxy.connTrackTable, *clientKey) + proxy.connTrackLock.Unlock() + proxyConn.Close() + }() + + readBuf := make([]byte, UDPBufSize) + for { + proxyConn.SetReadDeadline(time.Now().Add(UDPConnTrackTimeout)) + again: + read, err := proxyConn.Read(readBuf) + if err != nil { + if err, ok := err.(*net.OpError); ok && err.Err == syscall.ECONNREFUSED { + // This will happen if the last write failed + // (e.g: nothing is actually listening on the + // proxied port on the container), ignore it + // and continue until UDPConnTrackTimeout + // expires: + goto again + } + return + } + for i := 0; i != read; { + written, err := proxy.Listener.WriteToUDP(readBuf[i:read], clientAddr) + if err != nil { + return + } + i += written + } + } +} + +// Run starts forwarding the traffic using UDP. +func (proxy *UDPProxy) Run() { + proxy.connTrackTable = make(connTrackMap) + readBuf := make([]byte, UDPBufSize) + for { + read, from, err := proxy.Listener.ReadFromUDP(readBuf) + if err != nil { + // NOTE: Apparently ReadFrom doesn't return + // ECONNREFUSED like Read do (see comment in + // UDPProxy.replyLoop) + if !isClosedError(err) { + fmt.Fprintf(proxy.LogWriter, "Stopping proxy on udp: %v\n", err) + } + break + } + + fromKey := newConnTrackKey(from) + proxy.connTrackLock.Lock() + proxyConn, hit := proxy.connTrackTable[*fromKey] + if !hit { + proxyConn, err = proxy.BackendDial() + if err != nil { + fmt.Fprintf(proxy.LogWriter, "Can't proxy a datagram to udp: %v\n", err) + proxy.connTrackLock.Unlock() + continue + } + proxy.connTrackTable[*fromKey] = proxyConn + go proxy.replyLoop(proxyConn, from, fromKey) + } + proxy.connTrackLock.Unlock() + for i := 0; i != read; { + written, err := proxyConn.Write(readBuf[i:read]) + if err != nil { + fmt.Fprintf(proxy.LogWriter, "Can't proxy a datagram to udp: %v\n", err) + break + } + i += written + } + } +} + +// Close stops forwarding the traffic. +func (proxy *UDPProxy) Close() { + proxy.Listener.Close() + proxy.connTrackLock.Lock() + defer proxy.connTrackLock.Unlock() + for _, conn := range proxy.connTrackTable { + conn.Close() + } +} + +func isClosedError(err error) bool { + /* This comparison is ugly, but unfortunately, net.go doesn't export errClosing. + * See: + * http://golang.org/src/pkg/net/net.go + * https://code.google.com/p/go/issues/detail?id=4337 + * https://groups.google.com/forum/#!msg/golang-nuts/0_aaCvBmOcM/SptmDyX1XJMJ + */ + return strings.HasSuffix(err.Error(), "use of closed network connection") +} |