Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ jobs:
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin --detach-netns
- name: "Benchmark: TCP Ports (network driver=slirp4netns, port driver=builtin) with source-ip-transparent=false"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin --source-ip-transparent=false
- name: "Benchmark: TCP Ports (network driver=pasta, port driver=implicit)"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
Expand Down
7 changes: 6 additions & 1 deletion cmd/rootlesskit/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ See https://rootlesscontaine.rs/getting-started/common/ .
Aliases: []string{"p"},
Usage: "publish ports. e.g. \"127.0.0.1:8080:80/tcp\"",
}, CategoryPort),
Categorize(&cli.BoolFlag{
Name: "source-ip-transparent",
Usage: "preserve real client source IP using IP_TRANSPARENT (builtin port driver)",
Value: true,
}, CategoryPort),
Categorize(&cli.BoolFlag{
Name: "pidns",
Usage: "create a PID namespace",
Expand Down Expand Up @@ -620,7 +625,7 @@ func createParentOpt(clicontext *cli.Context) (parent.Opt, error) {
if opt.NetworkDriver == nil {
return opt, errors.New("port driver requires non-host network")
}
opt.PortDriver, err = builtin.NewParentDriver(&logrusDebugWriter{label: "port/builtin"}, opt.StateDir)
opt.PortDriver, err = builtin.NewParentDriver(&logrusDebugWriter{label: "port/builtin"}, opt.StateDir, clicontext.Bool("source-ip-transparent"))
if err != nil {
return opt, err
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/port/builtin/builtin.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (
)

var (
NewParentDriver func(logWriter io.Writer, stateDir string) (port.ParentDriver, error) = parent.NewDriver
NewChildDriver func(logWriter io.Writer) port.ChildDriver = child.NewDriver
NewParentDriver func(logWriter io.Writer, stateDir string, sourceIPTransparent bool) (port.ParentDriver, error) = parent.NewDriver
NewChildDriver func(logWriter io.Writer) port.ChildDriver = child.NewDriver
)

// Available indicates whether this port driver is compiled in (used for generating help text)
Expand Down
3 changes: 2 additions & 1 deletion pkg/port/builtin/builtin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ func TestBuiltIn(t *testing.T) {
t.Fatal(err)
}
defer os.RemoveAll(tmpDir)
d, err := NewParentDriver(os.Stderr, tmpDir)
d, err := NewParentDriver(os.Stderr, tmpDir, true)
if err != nil {
t.Fatal(err)
}
pf := func() port.ParentDriver {
return d
}
testsuite.Run(t, pf)
testsuite.RunTCPTransparent(t, pf)
}
117 changes: 112 additions & 5 deletions pkg/port/builtin/child/child.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ import (
"io"
"net"
"os"
"os/exec"
"strconv"
"strings"
"sync"
"syscall"
"time"

"golang.org/x/sys/unix"

Expand All @@ -25,10 +29,15 @@ func NewDriver(logWriter io.Writer) port.ChildDriver {
}

type childDriver struct {
logWriter io.Writer
logWriter io.Writer
sourceIPTransparent bool
routingSetup sync.Once
routingReady bool
routingWarn sync.Once
}

func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}, detachedNetNSPath string) error {
d.sourceIPTransparent = opaque[opaquepkg.SourceIPTransparent] == "true"
socketPath := opaque[opaquepkg.SocketPath]
if socketPath == "" {
return errors.New("socket path not set")
Expand Down Expand Up @@ -119,7 +128,6 @@ func (d *childDriver) handleConnectRequest(c *net.UnixConn, req *msg.Request) er
}
// dialProto does not need "4", "6" suffix
dialProto := strings.TrimSuffix(strings.TrimSuffix(req.Proto, "6"), "4")
var dialer net.Dialer
ip := req.IP
if ip == "" {
ip = "127.0.0.1"
Expand All @@ -135,9 +143,31 @@ func (d *childDriver) handleConnectRequest(c *net.UnixConn, req *msg.Request) er
}
ip = p.String()
}
targetConn, err := dialer.Dial(dialProto, net.JoinHostPort(ip, strconv.Itoa(req.Port)))
if err != nil {
return err
targetAddr := net.JoinHostPort(ip, strconv.Itoa(req.Port))

var targetConn net.Conn
var err error
if d.sourceIPTransparent && req.SourceIP != "" && req.SourcePort != 0 && dialProto == "tcp" && !net.ParseIP(req.SourceIP).IsLoopback() {
d.routingSetup.Do(func() { d.routingReady = d.setupTransparentRouting() })
if !d.routingReady {
d.routingWarn.Do(func() {
fmt.Fprintf(d.logWriter, "source IP transparent: falling back to non-transparent mode, client source IPs will not be preserved\n")
})
goto fallback
}
targetConn, err = transparentDial(dialProto, targetAddr, req.SourceIP, req.SourcePort)
if err != nil {
fmt.Fprintf(d.logWriter, "transparent dial failed, falling back: %v\n", err)
targetConn, err = nil, nil
}
}
fallback:
if targetConn == nil {
var dialer net.Dialer
targetConn, err = dialer.Dial(dialProto, targetAddr)
if err != nil {
return err
}
}
defer targetConn.Close() // no effect on duplicated FD
targetConnFiler, ok := targetConn.(filer)
Expand All @@ -164,6 +194,83 @@ func (d *childDriver) handleConnectRequest(c *net.UnixConn, req *msg.Request) er
return err
}

// setupTransparentRouting sets up policy routing so that response packets
// destined to transparent-bound source IPs are delivered locally.
//
// Transparent sockets (IP_TRANSPARENT) bind to non-local addresses (the real
// client IP). Response packets to these addresses must be routed locally instead
// of being sent out through the TAP device (slirp4netns).
//
// The transparent SYN goes through OUTPUT (where we tag it with CONNMARK) and
// then either:
//
//  1. Gets DNAT'd to the container (nerdctl/CNI): the SYN-ACK arrives via the
//     bridge in PREROUTING, where we restore connmark to fwmark.
//
//  2. Goes through loopback to a userspace proxy like docker-proxy: the SYN
//     enters PREROUTING on loopback with connmark, which sets fwmark. With
//     tcp_fwmark_accept=1, the accepted socket inherits the fwmark. The proxy's
//     SYN-ACK is then routed via the fwmark table (local delivery) instead of
//     the default route (TAP), allowing it to reach the transparent socket.
//
// It returns false, after writing the reason to d.logWriter, when a required
// tool ("iptables" or "ip") is not found in PATH or when the iptables mangle
// table cannot be listed. Once those checks pass, the individual setup commands
// are run on a best-effort basis: a failing command is logged together with its
// command line and the remaining commands are still executed. A true result
// therefore means that setup was attempted, not that every command succeeded.
func (d *childDriver) setupTransparentRouting() bool {
	// Check that the required tools are available before proceeding. Both are
	// mandatory: iptables provides the marks and "ip" provides the policy
	// routing that consumes them.
	for _, tool := range []string{"iptables", "ip"} {
		if _, err := exec.LookPath(tool); err != nil {
			fmt.Fprintf(d.logWriter, "source IP transparent: %s not found, disabling: %v\n", tool, err)
			return false
		}
	}
	// Verify that the mangle table can be listed at all (the kernel module might
	// not be loaded). This does not exercise the connmark match itself.
	if out, err := exec.Command("iptables", "-t", "mangle", "-L", "-n").CombinedOutput(); err != nil {
		fmt.Fprintf(d.logWriter, "source IP transparent: iptables mangle table not available, disabling: %v: %s\n", err, out)
		return false
	}
	cmds := [][]string{
		// Table 100: treat all addresses as local (for delivery to transparent sockets)
		{"ip", "route", "add", "local", "default", "dev", "lo", "table", "100"},
		{"ip", "-6", "route", "add", "local", "default", "dev", "lo", "table", "100"},
		// Route fwmark-100 packets via table 100
		{"ip", "rule", "add", "fwmark", "100", "lookup", "100", "priority", "100"},
		{"ip", "-6", "rule", "add", "fwmark", "100", "lookup", "100", "priority", "100"},
		// Inherit fwmark from SYN to accepted socket (needed for userspace proxies
		// like docker-proxy, so that SYN-ACK routing uses table 100)
		{"sysctl", "-w", "net.ipv4.tcp_fwmark_accept=1"},
		// In OUTPUT: tag transparent connections (non-local source) with CONNMARK
		{"iptables", "-t", "mangle", "-A", "OUTPUT", "-p", "tcp", "-m", "addrtype", "!", "--src-type", "LOCAL", "-j", "CONNMARK", "--set-mark", "100"},
		{"ip6tables", "-t", "mangle", "-A", "OUTPUT", "-p", "tcp", "-m", "addrtype", "!", "--src-type", "LOCAL", "-j", "CONNMARK", "--set-mark", "100"},
		// In PREROUTING: restore connmark to fwmark for routing
		{"iptables", "-t", "mangle", "-A", "PREROUTING", "-p", "tcp", "-m", "connmark", "--mark", "100", "-j", "MARK", "--set-mark", "100"},
		{"ip6tables", "-t", "mangle", "-A", "PREROUTING", "-p", "tcp", "-m", "connmark", "--mark", "100", "-j", "MARK", "--set-mark", "100"},
	}
	for _, args := range cmds {
		if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil {
			// Best effort: keep going, but name the failing command so the log
			// entry can be acted on.
			fmt.Fprintf(d.logWriter, "source IP transparent routing setup: %q: %v: %s\n", strings.Join(args, " "), err, out)
		}
	}
	return true
}

// transparentDial dials targetAddr using IP_TRANSPARENT, binding to the given
// source IP and port so the backend service sees the real client address.
//
// dialProto and targetAddr are passed to net.Dialer.Dial unchanged. sourceIP
// must be a textual IPv4 or IPv6 address; if it cannot be parsed an error is
// returned without dialing. Without this check a nil IP in LocalAddr would make
// the dialer bind to the wildcard address, so the connection could succeed
// without preserving the client address.
//
// The dial is limited to one second.
func transparentDial(dialProto, targetAddr, sourceIP string, sourcePort int) (net.Conn, error) {
	srcIP := net.ParseIP(sourceIP)
	if srcIP == nil {
		return nil, fmt.Errorf("invalid source IP %q", sourceIP)
	}
	dialer := net.Dialer{
		Timeout:   time.Second,
		LocalAddr: &net.TCPAddr{IP: srcIP, Port: sourcePort},
		// Control runs on the raw socket before bind/connect, which is where the
		// transparent option has to be set for a non-local bind to be accepted.
		Control: func(network, address string, c syscall.RawConn) error {
			var sockErr error
			if err := c.Control(func(fd uintptr) {
				// network is the resolved network name (e.g. "tcp4" or "tcp6");
				// pick the socket option of the matching address family.
				if strings.Contains(network, "6") {
					sockErr = unix.SetsockoptInt(int(fd), unix.SOL_IPV6, unix.IPV6_TRANSPARENT, 1)
				} else {
					sockErr = unix.SetsockoptInt(int(fd), unix.SOL_IP, unix.IP_TRANSPARENT, 1)
				}
			}); err != nil {
				return err
			}
			return sockErr
		},
	}
	return dialer.Dial(dialProto, targetAddr)
}

// filer is implemented by *net.TCPConn and *net.UDPConn
type filer interface {
File() (f *os.File, err error)
Expand Down
18 changes: 13 additions & 5 deletions pkg/port/builtin/msg/msg.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ type Request struct {
Port int
ParentIP string
HostGatewayIP string
SourceIP string `json:",omitempty"` // real client IP for IP_TRANSPARENT
SourcePort int `json:",omitempty"` // real client port for IP_TRANSPARENT
}

// Reply may contain FD as OOB
Expand Down Expand Up @@ -69,7 +71,9 @@ func hostGatewayIP() string {

// ConnectToChild connects to the child UNIX socket, and obtains TCP or UDP socket FD
// that corresponds to the port spec.
func ConnectToChild(c *net.UnixConn, spec port.Spec) (int, error) {
// sourceAddr is the real client address (e.g., from net.Conn.RemoteAddr()) for IP_TRANSPARENT support.
// Pass nil to skip source IP preservation.
func ConnectToChild(c *net.UnixConn, spec port.Spec, sourceAddr net.Addr) (int, error) {
req := Request{
Type: RequestTypeConnect,
Proto: spec.Proto,
Expand All @@ -78,6 +82,10 @@ func ConnectToChild(c *net.UnixConn, spec port.Spec) (int, error) {
ParentIP: spec.ParentIP,
HostGatewayIP: hostGatewayIP(),
}
if tcpAddr, ok := sourceAddr.(*net.TCPAddr); ok && tcpAddr != nil {
req.SourceIP = tcpAddr.IP.String()
req.SourcePort = tcpAddr.Port
}
if _, err := lowlevelmsgutil.MarshalToWriter(c, &req); err != nil {
return 0, err
}
Expand Down Expand Up @@ -114,21 +122,21 @@ func ConnectToChild(c *net.UnixConn, spec port.Spec) (int, error) {
}

// ConnectToChildWithSocketPath wraps ConnectToChild
func ConnectToChildWithSocketPath(socketPath string, spec port.Spec) (int, error) {
func ConnectToChildWithSocketPath(socketPath string, spec port.Spec, sourceAddr net.Addr) (int, error) {
var dialer net.Dialer
conn, err := dialer.Dial("unix", socketPath)
if err != nil {
return 0, err
}
defer conn.Close()
c := conn.(*net.UnixConn)
return ConnectToChild(c, spec)
return ConnectToChild(c, spec, sourceAddr)
}

// ConnectToChildWithRetry retries ConnectToChild every (i*5) milliseconds.
func ConnectToChildWithRetry(socketPath string, spec port.Spec, retries int) (int, error) {
func ConnectToChildWithRetry(socketPath string, spec port.Spec, retries int, sourceAddr net.Addr) (int, error) {
for i := 0; i < retries; i++ {
fd, err := ConnectToChildWithSocketPath(socketPath, spec)
fd, err := ConnectToChildWithSocketPath(socketPath, spec, sourceAddr)
if i == retries-1 && err != nil {
return 0, err
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/port/builtin/opaque/opaque.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package opaque

const (
SocketPath = "builtin.socketpath"
ChildReadyPipePath = "builtin.readypipepath"
SocketPath = "builtin.socketpath"
ChildReadyPipePath = "builtin.readypipepath"
SourceIPTransparent = "builtin.source-ip-transparent"
)
36 changes: 21 additions & 15 deletions pkg/port/builtin/parent/parent.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
)

// NewDriver for builtin driver.
func NewDriver(logWriter io.Writer, stateDir string) (port.ParentDriver, error) {
func NewDriver(logWriter io.Writer, stateDir string, sourceIPTransparent bool) (port.ParentDriver, error) {
// TODO: consider using socketpair FD instead of socket file
socketPath := filepath.Join(stateDir, ".bp.sock")
childReadyPipePath := filepath.Join(stateDir, ".bp-ready.pipe")
Expand All @@ -36,24 +36,26 @@ func NewDriver(logWriter io.Writer, stateDir string) (port.ParentDriver, error)
return nil, fmt.Errorf("cannot mkfifo %s: %w", childReadyPipePath, err)
}
d := driver{
logWriter: logWriter,
socketPath: socketPath,
childReadyPipePath: childReadyPipePath,
ports: make(map[int]*port.Status, 0),
stoppers: make(map[int]func(context.Context) error, 0),
nextID: 1,
logWriter: logWriter,
socketPath: socketPath,
childReadyPipePath: childReadyPipePath,
sourceIPTransparent: sourceIPTransparent,
ports: make(map[int]*port.Status, 0),
stoppers: make(map[int]func(context.Context) error, 0),
nextID: 1,
}
return &d, nil
}

type driver struct {
logWriter io.Writer
socketPath string
childReadyPipePath string
mu sync.Mutex
ports map[int]*port.Status
stoppers map[int]func(context.Context) error
nextID int
logWriter io.Writer
socketPath string
childReadyPipePath string
sourceIPTransparent bool
mu sync.Mutex
ports map[int]*port.Status
stoppers map[int]func(context.Context) error
nextID int
}

func (d *driver) Info(ctx context.Context) (*api.PortDriverInfo, error) {
Expand All @@ -66,10 +68,14 @@ func (d *driver) Info(ctx context.Context) (*api.PortDriverInfo, error) {
}

func (d *driver) OpaqueForChild() map[string]string {
return map[string]string{
m := map[string]string{
opaque.SocketPath: d.socketPath,
opaque.ChildReadyPipePath: d.childReadyPipePath,
}
if d.sourceIPTransparent {
m[opaque.SourceIPTransparent] = "true"
}
return m
}

func (d *driver) RunParentDriver(initComplete chan struct{}, quit <-chan struct{}, _ *port.ChildContext) error {
Expand Down
2 changes: 1 addition & 1 deletion pkg/port/builtin/parent/tcp/tcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func Run(socketPath string, spec port.Spec, stopCh <-chan struct{}, stoppedCh ch
func copyConnToChild(c net.Conn, socketPath string, spec port.Spec, stopCh <-chan struct{}) error {
defer c.Close()
// get fd from the child as an SCM_RIGHTS cmsg
fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10)
fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10, c.RemoteAddr())
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/port/builtin/parent/udp/udp.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func Run(socketPath string, spec port.Spec, stopCh <-chan struct{}, stoppedCh ch
Listener: c,
BackendDial: func() (*net.UDPConn, error) {
// get fd from the child as an SCM_RIGHTS cmsg
fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10)
fd, err := msg.ConnectToChildWithRetry(socketPath, spec, 10, nil)
if err != nil {
return nil, err
}
Expand Down
Loading