From b66dd64fa690bcd7606182ae928e9ec4e393e34e Mon Sep 17 00:00:00 2001 From: Danny Canter Date: Thu, 19 Mar 2026 16:00:06 -0700 Subject: [PATCH] Add Seccomp support This change is rather large, but I think it's simpler to get in as one unit. It: - Adds a new ContainerizationSeccomp target/product that is a cBPF compiler specifically for seccomp. Its main use is to take in an OCI seccomp description and spit out a filter we can apply. - Adds a new friendly SeccompProfile API to Containerization to specify what filters you'd like applied. This will (as is the case for basically everything else) get translated to OCI behind the scenes. - Adds a small bit of logic in vmexec to apply the filters. And unit and integration tests for everything. Unit testing is interesting. I've added a small simulator so we actually have some semblance of testing outside of just integration tests and seeing if the syscall is blocked/returns whatever. --- Package.swift | 14 + Sources/Containerization/LinuxContainer.swift | 5 + Sources/Containerization/LinuxPod.swift | 3 + Sources/Containerization/SeccompProfile.swift | 808 ++++++++++++++++++ .../SeccompCompiler+Aarch64.swift | 328 +++++++ .../SeccompCompiler.swift | 421 +++++++++ Sources/Integration/ContainerTests.swift | 306 +++++++ Sources/Integration/Suite.swift | 8 + .../BPFSimulator.swift | 139 +++ .../SeccompCompilerTests.swift | 369 ++++++++ vminitd/Package.swift | 1 + vminitd/Sources/LCShim/include/syscall.h | 14 + vminitd/Sources/LCShim/syscall.c | 12 + vminitd/Sources/vmexec/ExecCommand.swift | 32 +- vminitd/Sources/vmexec/RunCommand.swift | 17 +- vminitd/Sources/vmexec/vmexec.swift | 53 +- vminitd/Sources/vminitd/ManagedProcess.swift | 2 + 17 files changed, 2514 insertions(+), 18 deletions(-) create mode 100644 Sources/Containerization/SeccompProfile.swift create mode 100644 Sources/ContainerizationSeccomp/SeccompCompiler+Aarch64.swift create mode 100644 Sources/ContainerizationSeccomp/SeccompCompiler.swift create mode 100644 
Tests/ContainerizationSeccompTests/BPFSimulator.swift create mode 100644 Tests/ContainerizationSeccompTests/SeccompCompilerTests.swift diff --git a/Package.swift b/Package.swift index 40997523..99e28965 100644 --- a/Package.swift +++ b/Package.swift @@ -33,6 +33,7 @@ let package = Package( .library(name: "ContainerizationOS", targets: ["ContainerizationOS"]), .library(name: "ContainerizationExtras", targets: ["ContainerizationExtras"]), .library(name: "ContainerizationArchive", targets: ["ContainerizationArchive"]), + .library(name: "ContainerizationSeccomp", targets: ["ContainerizationSeccomp"]), .executable(name: "cctl", targets: ["cctl"]), ], dependencies: [ @@ -264,5 +265,18 @@ let package = Package( .target( name: "CShim" ), + .target( + name: "ContainerizationSeccomp", + dependencies: [ + "ContainerizationOCI" + ] + ), + .testTarget( + name: "ContainerizationSeccompTests", + dependencies: [ + "ContainerizationSeccomp", + "ContainerizationOCI", + ] + ), ] ) diff --git a/Sources/Containerization/LinuxContainer.swift b/Sources/Containerization/LinuxContainer.swift index 9033a206..d412e532 100644 --- a/Sources/Containerization/LinuxContainer.swift +++ b/Sources/Containerization/LinuxContainer.swift @@ -64,6 +64,8 @@ public final class LinuxContainer: Container, Sendable { public var sockets: [UnixSocketConfiguration] = [] /// The mounts for the container. public var mounts: [Mount] = LinuxContainer.defaultMounts() + /// Seccomp profile for system call filtering. + public var seccomp: SeccompProfile? /// The DNS configuration for the container. public var dns: DNS? /// The hosts to add to /etc/hosts for the container. @@ -90,6 +92,7 @@ public final class LinuxContainer: Container, Sendable { interfaces: [any Interface] = [], sockets: [UnixSocketConfiguration] = [], mounts: [Mount] = LinuxContainer.defaultMounts(), + seccomp: SeccompProfile? = nil, dns: DNS? = nil, hosts: Hosts? 
= nil, virtualization: Bool = false, @@ -105,6 +108,7 @@ public final class LinuxContainer: Container, Sendable { self.interfaces = interfaces self.sockets = sockets self.mounts = mounts + self.seccomp = seccomp self.dns = dns self.hosts = hosts self.virtualization = virtualization @@ -356,6 +360,7 @@ public final class LinuxContainer: Container, Sendable { // Linux toggles. spec.linux?.sysctl = config.sysctl + spec.linux?.seccomp = config.seccomp?.toOCI(effectiveCapabilities: config.process.capabilities.effective) // If the rootfs was requested as read-only, set it in the OCI spec. // We let the OCI runtime remount as ro, instead of doing it originally. diff --git a/Sources/Containerization/LinuxPod.swift b/Sources/Containerization/LinuxPod.swift index 7574d697..619b8991 100644 --- a/Sources/Containerization/LinuxPod.swift +++ b/Sources/Containerization/LinuxPod.swift @@ -78,6 +78,8 @@ public final class LinuxPod: Sendable { public var sysctl: [String: String] = [:] /// The mounts for the container. public var mounts: [Mount] = LinuxContainer.defaultMounts() + /// Seccomp profile for system call filtering. + public var seccomp: SeccompProfile? /// The Unix domain socket relays to setup for the container. public var sockets: [UnixSocketConfiguration] = [] /// The DNS configuration for the container. @@ -230,6 +232,7 @@ public final class LinuxPod: Sendable { // Linux toggles spec.linux?.sysctl = config.sysctl + spec.linux?.seccomp = config.seccomp?.toOCI(effectiveCapabilities: config.process.capabilities.effective) // If the rootfs was requested as read-only, set it in the OCI spec. // We let the OCI runtime remount as ro, instead of doing it originally. 
diff --git a/Sources/Containerization/SeccompProfile.swift b/Sources/Containerization/SeccompProfile.swift new file mode 100644 index 00000000..a89c1962 --- /dev/null +++ b/Sources/Containerization/SeccompProfile.swift @@ -0,0 +1,808 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerizationOCI +import ContainerizationOS + +/// Seccomp configuration. +public struct SeccompProfile: Sendable { + /// The action to take for syscalls not matched by any rule. + public var defaultAction: Action + /// Architecture constraints. + public var architectures: [Architecture] + /// Seccomp filter flags. + public var flags: [Flag] + /// Per-syscall rules. + public var syscalls: [Rule] + + /// The action to take for a matching syscall. + /// + /// Note: `SECCOMP_RET_TRACE` and `SECCOMP_RET_USER_NOTIF` are not + /// supported. Both require a host-side (in our case guest) process + /// (ptrace tracer or seccomp notify listener). + public enum Action: Sendable { + case allow + case kill + case killProcess + case trap + case errno(UInt) + case log + } + + /// Supported architectures. + public enum Architecture: Sendable { + case aarch64 + } + + /// Seccomp filter flags. 
+ public enum Flag: Sendable { + case log + case specAllow + case waitKillableRecv + } + + /// Identifies a syscall by name or raw number. + public enum Syscall: Sendable, ExpressibleByStringLiteral { + /// A syscall name (e.g. "mkdirat"). Must match an entry in the + /// aarch64 syscall table; unknown names are silently skipped. + case name(String) + /// A raw syscall number. Useful for newly added syscalls. + case number(UInt32) + + public init(stringLiteral value: String) { + self = .name(value) + } + } + + /// A rule that matches one or more syscalls. + public struct Rule: Sendable { + /// The syscalls this rule matches. + public var syscalls: [Syscall] + /// The action to take when matched. + public var action: Action + /// Optional argument conditions (all must match). + public var args: [ArgCondition] + /// Capabilities required for this rule to be included in the filter. + /// If non-empty, the rule is only emitted when the container has all + /// of these capabilities in its effective set. + public var requiredCapabilities: [CapabilityName] + + public init( + syscalls: [Syscall], + action: Action, + args: [ArgCondition] = [], + requiredCapabilities: [CapabilityName] = [] + ) { + self.syscalls = syscalls + self.action = action + self.args = args + self.requiredCapabilities = requiredCapabilities + } + } + + /// A condition on a syscall argument. + public struct ArgCondition: Sendable { + /// The argument index (0-5). + public var index: UInt + /// The comparison operator. + public var op: Operator + /// The value to compare against; for `maskedEqual`, the bit mask applied to the argument. + public var value: UInt64 + /// Second value (used with `maskedEqual`): the condition holds when `(arg & value) == valueTwo`. + public var valueTwo: UInt64 + + /// Comparison operators for argument conditions. 
+ public enum Operator: Sendable { + case equalTo + case notEqual + case lessThan + case lessEqual + case greaterThan + case greaterEqual + case maskedEqual + } + + public init(index: UInt, op: Operator, value: UInt64, valueTwo: UInt64 = 0) { + self.index = index + self.op = op + self.value = value + self.valueTwo = valueTwo + } + } + + /// A profile that allows all syscalls. + public static let allowAll = SeccompProfile(defaultAction: .allow) + + /// Default seccomp profile for unprivileged containers. + public static let defaultProfile: SeccompProfile = { + var profile = SeccompProfile(defaultAction: .errno(1)) + profile.syscalls = [ + // Unconditional allowlist. + Rule( + syscalls: [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "bind", + "brk", + "cachestat", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "close_range", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_pwait2", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchmodat2", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_requeue", + "futex_time64", + "futex_wait", + "futex_waitv", + "futex_wake", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + 
"geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "getxattrat", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + "ioprio_set", + "io_setup", + "io_submit", + "ipc", + "add_key", + "keyctl", + "kill", + "landlock_add_rule", + "landlock_create_ruleset", + "landlock_restrict_self", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listmount", + "listxattr", + "listxattrat", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "map_shadow_stack", + "membarrier", + "memfd_create", + "memfd_secret", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "mseal", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "name_to_handle_at", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "pkey_alloc", + "pkey_free", + "pkey_mprotect", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "process_mrelease", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + 
"readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "removexattrat", + "rename", + "renameat", + "renameat2", + "request_key", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "setxattrat", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statmount", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + 
"timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "uretprobe", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev", + + // arm/arm64 specific + "arm_fadvise64_64", + "arm_sync_file_range", + "sync_file_range2", + "breakpoint", + "cacheflush", + "set_tls", + ], action: .allow), + + // socket: allow all address families except AF_VSOCK (40). + Rule( + syscalls: ["socket"], + action: .allow, + args: [ArgCondition(index: 0, op: .notEqual, value: 40)] + ), + + // personality: only allow specific execution domains. + // 0x0 = PER_LINUX (default) + // 0x8 = ADDR_NO_RANDOMIZE + // 0x20000 = UNAME26 (report kernel as 2.6.x) + // 0x20008 = UNAME26 | ADDR_NO_RANDOMIZE + // 0xFFFFFFFF = query current personality + Rule( + syscalls: ["personality"], + action: .allow, + args: [ArgCondition(index: 0, op: .equalTo, value: 0x0)] + ), + Rule( + syscalls: ["personality"], + action: .allow, + args: [ArgCondition(index: 0, op: .equalTo, value: 0x8)] + ), + Rule( + syscalls: ["personality"], + action: .allow, + args: [ArgCondition(index: 0, op: .equalTo, value: 0x20000)] + ), + Rule( + syscalls: ["personality"], + action: .allow, + args: [ArgCondition(index: 0, op: .equalTo, value: 0x20008)] + ), + Rule( + syscalls: ["personality"], + action: .allow, + args: [ArgCondition(index: 0, op: .equalTo, value: 0xFFFF_FFFF)] + ), + + // clone: allow only if no namespace creation flags are set. + // The mask 0x7E020000 covers CLONE_NEWNS | CLONE_NEWCGROUP | + // CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | + // CLONE_NEWNET. If (flags & mask) == 0, no namespaces are being + // created and the clone is safe. 
+ Rule( + syscalls: ["clone"], + action: .allow, + args: [ArgCondition(index: 0, op: .maskedEqual, value: 2_114_060_288, valueTwo: 0)] + ), + + // clone3: return ENOSYS (38) to force glibc/musl to fall back to + // clone, where we can inspect the flags via the arg filter above. + // clone3 passes flags in a struct rather than a register, so BPF + // cannot inspect them directly. + Rule(syscalls: ["clone3"], action: .errno(38)), + + // Capability-gated rules. These are only included when the + // container has the required capability in its effective set. + + // CAP_DAC_READ_SEARCH + Rule( + syscalls: ["open_by_handle_at"], + action: .allow, + requiredCapabilities: [.dacReadSearch] + ), + + // CAP_SYS_ADMIN: allow clone/clone3 without namespace flag + // restrictions, plus mount/namespace/admin syscalls. + Rule( + syscalls: [ + "bpf", "clone", "clone3", "fanotify_init", + "fsconfig", "fsmount", "fsopen", "fspick", + "lookup_dcookie", + "mount", "mount_setattr", "move_mount", "open_tree", + "perf_event_open", + "quotactl", "quotactl_fd", + "setdomainname", "sethostname", "setns", + "syslog", + "umount", "umount2", "unshare", + ], + action: .allow, + requiredCapabilities: [.sysAdmin] + ), + + // CAP_SYS_BOOT + Rule( + syscalls: ["reboot"], + action: .allow, + requiredCapabilities: [.sysBoot] + ), + + // CAP_SYS_CHROOT + Rule( + syscalls: ["chroot"], + action: .allow, + requiredCapabilities: [.sysChroot] + ), + + // CAP_SYS_MODULE + Rule( + syscalls: ["delete_module", "init_module", "finit_module"], + action: .allow, + requiredCapabilities: [.sysModule] + ), + + // CAP_SYS_PACCT + Rule( + syscalls: ["acct"], + action: .allow, + requiredCapabilities: [.sysPacct] + ), + + // CAP_SYS_PTRACE + Rule( + syscalls: [ + "kcmp", "pidfd_getfd", "process_madvise", + "process_vm_readv", "process_vm_writev", "ptrace", + ], + action: .allow, + requiredCapabilities: [.sysPtrace] + ), + + // CAP_SYS_RAWIO + Rule( + syscalls: ["iopl", "ioperm"], + action: .allow, + requiredCapabilities: 
[.sysRawio] + ), + + // CAP_SYS_TIME + Rule( + syscalls: ["settimeofday", "stime", "clock_settime", "clock_settime64"], + action: .allow, + requiredCapabilities: [.sysTime] + ), + + // CAP_SYS_TTY_CONFIG + Rule( + syscalls: ["vhangup"], + action: .allow, + requiredCapabilities: [.sysTtyConfig] + ), + + // CAP_SYS_NICE + Rule( + syscalls: ["get_mempolicy", "mbind", "set_mempolicy", "set_mempolicy_home_node"], + action: .allow, + requiredCapabilities: [.sysNice] + ), + + // CAP_SYSLOG + Rule( + syscalls: ["syslog"], + action: .allow, + requiredCapabilities: [.syslog] + ), + + // CAP_BPF + Rule( + syscalls: ["bpf"], + action: .allow, + requiredCapabilities: [.bpf] + ), + + // CAP_PERFMON + Rule( + syscalls: ["perf_event_open"], + action: .allow, + requiredCapabilities: [.perfmon] + ), + ] + return profile + }() + + public init(defaultAction: Action) { + self.defaultAction = defaultAction + self.architectures = [.aarch64] + self.flags = [] + self.syscalls = [] + } + + /// Add a rule that allows the specified syscalls. + public mutating func allow(_ names: String...) { + syscalls.append(Rule(syscalls: names.map { .name($0) }, action: .allow)) + } + + /// Add a rule that returns the specified errno for the given syscalls. + public mutating func errno(_ errnoVal: UInt, _ names: String...) { + syscalls.append(Rule(syscalls: names.map { .name($0) }, action: .errno(errnoVal))) + } + + /// Convert to OCI type for transport. + /// + /// Rules with `requiredCapabilities` are only included when the provided + /// effective capabilities contain all required capabilities for that rule. 
+ public func toOCI(effectiveCapabilities: [CapabilityName] = []) -> ContainerizationOCI.LinuxSeccomp { + let capSet = Set(effectiveCapabilities) + let filteredSyscalls = syscalls.filter { rule in + rule.requiredCapabilities.isEmpty || rule.requiredCapabilities.allSatisfy { capSet.contains($0) } + } + + return ContainerizationOCI.LinuxSeccomp( + defaultAction: defaultAction.toOCI(), + defaultErrnoRet: defaultAction.ociErrnoRet, + architectures: architectures.map { $0.toOCI() }, + flags: flags.map { $0.toOCI() }, + listenerPath: "", + listenerMetadata: "", + syscalls: filteredSyscalls.map { $0.toOCI() } + ) + } +} + +extension SeccompProfile.Action { + func toOCI() -> LinuxSeccompAction { + switch self { + case .allow: return .actAllow + case .kill: return .actKill + case .killProcess: return .actKillProcess + case .trap: return .actTrap + case .errno: return .actErrno + case .log: return .actLog + } + } + + var ociErrnoRet: UInt? { + switch self { + case .errno(let val): return val + default: return nil + } + } +} + +extension SeccompProfile.Architecture { + func toOCI() -> Arch { + switch self { + case .aarch64: return .archAARCH64 + } + } +} + +extension SeccompProfile.Flag { + func toOCI() -> LinuxSeccompFlag { + switch self { + case .log: return .flagLog + case .specAllow: return .flagSpecAllow + case .waitKillableRecv: return .flagWaitKillableRecv + } + } +} + +extension SeccompProfile.Rule { + func toOCI() -> LinuxSyscall { + let errnoRet: UInt? 
+ switch action { + case .errno(let val): errnoRet = val + default: errnoRet = nil + } + + let names = syscalls.map { syscall -> String in + switch syscall { + case .name(let n): return n + case .number(let nr): return String(nr) + } + } + + return LinuxSyscall( + names: names, + action: action.toOCI(), + errnoRet: errnoRet, + args: args.map { $0.toOCI() } + ) + } +} + +extension SeccompProfile.ArgCondition { + func toOCI() -> LinuxSeccompArg { + LinuxSeccompArg( + index: index, + value: value, + valueTwo: valueTwo, + op: op.toOCI() + ) + } +} + +extension SeccompProfile.ArgCondition.Operator { + func toOCI() -> LinuxSeccompOperator { + switch self { + case .equalTo: return .opEqualTo + case .notEqual: return .opNotEqual + case .lessThan: return .opLessThan + case .lessEqual: return .opLessEqual + case .greaterThan: return .opGreaterThan + case .greaterEqual: return .opGreaterEqual + case .maskedEqual: return .opMaskedEqual + } + } +} diff --git a/Sources/ContainerizationSeccomp/SeccompCompiler+Aarch64.swift b/Sources/ContainerizationSeccomp/SeccompCompiler+Aarch64.swift new file mode 100644 index 00000000..49cb7a1b --- /dev/null +++ b/Sources/ContainerizationSeccomp/SeccompCompiler+Aarch64.swift @@ -0,0 +1,328 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//===----------------------------------------------------------------------===// + +extension SeccompCompiler { + /// Linux aarch64 syscall table. + public static let aarch64SyscallTable: [String: UInt32] = [ + "io_setup": 0, + "io_destroy": 1, + "io_submit": 2, + "io_cancel": 3, + "io_getevents": 4, + "setxattr": 5, + "lsetxattr": 6, + "fsetxattr": 7, + "getxattr": 8, + "lgetxattr": 9, + "fgetxattr": 10, + "listxattr": 11, + "llistxattr": 12, + "flistxattr": 13, + "removexattr": 14, + "lremovexattr": 15, + "fremovexattr": 16, + "getcwd": 17, + "lookup_dcookie": 18, + "eventfd2": 19, + "epoll_create1": 20, + "epoll_ctl": 21, + "epoll_pwait": 22, + "dup": 23, + "dup3": 24, + "fcntl": 25, + "inotify_init1": 26, + "inotify_add_watch": 27, + "inotify_rm_watch": 28, + "ioctl": 29, + "ioprio_set": 30, + "ioprio_get": 31, + "flock": 32, + "mknodat": 33, + "mkdirat": 34, + "unlinkat": 35, + "symlinkat": 36, + "linkat": 37, + "renameat": 38, + "umount2": 39, + "mount": 40, + "pivot_root": 41, + "nfsservctl": 42, + "statfs": 43, + "fstatfs": 44, + "truncate": 45, + "ftruncate": 46, + "fallocate": 47, + "faccessat": 48, + "chdir": 49, + "fchdir": 50, + "chroot": 51, + "fchmod": 52, + "fchmodat": 53, + "fchownat": 54, + "fchown": 55, + "openat": 56, + "close": 57, + "vhangup": 58, + "pipe2": 59, + "quotactl": 60, + "getdents64": 61, + "lseek": 62, + "read": 63, + "write": 64, + "readv": 65, + "writev": 66, + "pread64": 67, + "pwrite64": 68, + "preadv": 69, + "pwritev": 70, + "sendfile": 71, + "pselect6": 72, + "ppoll": 73, + "signalfd4": 74, + "vmsplice": 75, + "splice": 76, + "tee": 77, + "readlinkat": 78, + "fstatat": 79, + "fstat": 80, + "sync": 81, + "fsync": 82, + "fdatasync": 83, + "sync_file_range": 84, + "timerfd_create": 85, + "timerfd_settime": 86, + "timerfd_gettime": 87, + "utimensat": 88, + "acct": 89, + "capget": 90, + "capset": 91, + "personality": 92, + "exit": 93, + "exit_group": 94, + "waitid": 95, + "set_tid_address": 96, + "unshare": 97, + "futex": 98, + 
"set_robust_list": 99, + "get_robust_list": 100, + "nanosleep": 101, + "getitimer": 102, + "setitimer": 103, + "kexec_load": 104, + "init_module": 105, + "delete_module": 106, + "timer_create": 107, + "timer_gettime": 108, + "timer_getoverrun": 109, + "timer_settime": 110, + "timer_delete": 111, + "clock_settime": 112, + "clock_gettime": 113, + "clock_getres": 114, + "clock_nanosleep": 115, + "syslog": 116, + "ptrace": 117, + "sched_setparam": 118, + "sched_setscheduler": 119, + "sched_getscheduler": 120, + "sched_getparam": 121, + "sched_setaffinity": 122, + "sched_getaffinity": 123, + "sched_yield": 124, + "sched_get_priority_max": 125, + "sched_get_priority_min": 126, + "sched_rr_get_interval": 127, + "restart_syscall": 128, + "kill": 129, + "tkill": 130, + "tgkill": 131, + "sigaltstack": 132, + "rt_sigsuspend": 133, + "rt_sigaction": 134, + "rt_sigprocmask": 135, + "rt_sigpending": 136, + "rt_sigtimedwait": 137, + "rt_sigqueueinfo": 138, + "rt_sigreturn": 139, + "setpriority": 140, + "getpriority": 141, + "reboot": 142, + "setregid": 143, + "setgid": 144, + "setreuid": 145, + "setuid": 146, + "setresuid": 147, + "getresuid": 148, + "setresgid": 149, + "getresgid": 150, + "setfsuid": 151, + "setfsgid": 152, + "times": 153, + "setpgid": 154, + "getpgid": 155, + "getsid": 156, + "setsid": 157, + "getgroups": 158, + "setgroups": 159, + "uname": 160, + "sethostname": 161, + "setdomainname": 162, + "getrlimit": 163, + "setrlimit": 164, + "getrusage": 165, + "umask": 166, + "prctl": 167, + "getcpu": 168, + "gettimeofday": 169, + "settimeofday": 170, + "adjtimex": 171, + "getpid": 172, + "getppid": 173, + "getuid": 174, + "geteuid": 175, + "getgid": 176, + "getegid": 177, + "gettid": 178, + "sysinfo": 179, + "mq_open": 180, + "mq_unlink": 181, + "mq_timedsend": 182, + "mq_timedreceive": 183, + "mq_notify": 184, + "mq_getsetattr": 185, + "msgget": 186, + "msgctl": 187, + "msgrcv": 188, + "msgsnd": 189, + "semget": 190, + "semctl": 191, + "semtimedop": 192, + "semop": 
193, + "shmget": 194, + "shmctl": 195, + "shmat": 196, + "shmdt": 197, + "socket": 198, + "socketpair": 199, + "bind": 200, + "listen": 201, + "accept": 202, + "connect": 203, + "getsockname": 204, + "getpeername": 205, + "sendto": 206, + "recvfrom": 207, + "setsockopt": 208, + "getsockopt": 209, + "shutdown": 210, + "sendmsg": 211, + "recvmsg": 212, + "readahead": 213, + "brk": 214, + "munmap": 215, + "mremap": 216, + "add_key": 217, + "request_key": 218, + "keyctl": 219, + "clone": 220, + "execve": 221, + "mmap": 222, + "fadvise64": 223, + "swapon": 224, + "swapoff": 225, + "mprotect": 226, + "msync": 227, + "mlock": 228, + "munlock": 229, + "mlockall": 230, + "munlockall": 231, + "mincore": 232, + "madvise": 233, + "remap_file_pages": 234, + "mbind": 235, + "get_mempolicy": 236, + "set_mempolicy": 237, + "migrate_pages": 238, + "move_pages": 239, + "rt_tgsigqueueinfo": 240, + "perf_event_open": 241, + "accept4": 242, + "recvmmsg": 243, + "arch_specific_syscall": 244, + "wait4": 260, + "prlimit64": 261, + "fanotify_init": 262, + "fanotify_mark": 263, + "name_to_handle_at": 264, + "open_by_handle_at": 265, + "clock_adjtime": 266, + "syncfs": 267, + "setns": 268, + "sendmmsg": 269, + "process_vm_readv": 270, + "process_vm_writev": 271, + "kcmp": 272, + "finit_module": 273, + "sched_setattr": 274, + "sched_getattr": 275, + "renameat2": 276, + "seccomp": 277, + "getrandom": 278, + "memfd_create": 279, + "bpf": 280, + "execveat": 281, + "userfaultfd": 282, + "membarrier": 283, + "mlock2": 284, + "copy_file_range": 285, + "preadv2": 286, + "pwritev2": 287, + "pkey_mprotect": 288, + "pkey_alloc": 289, + "pkey_free": 290, + "statx": 291, + "io_pgetevents": 292, + "rseq": 293, + "kexec_file_load": 294, + "pidfd_send_signal": 424, + "io_uring_setup": 425, + "io_uring_enter": 426, + "io_uring_register": 427, + "open_tree": 428, + "move_mount": 429, + "fsopen": 430, + "fsconfig": 431, + "fsmount": 432, + "fspick": 433, + "pidfd_open": 434, + "clone3": 435, + "close_range": 
436, + "openat2": 437, + "pidfd_getfd": 438, + "faccessat2": 439, + "process_madvise": 440, + "epoll_pwait2": 441, + "mount_setattr": 442, + "quotactl_fd": 443, + "landlock_create_ruleset": 444, + "landlock_add_rule": 445, + "landlock_restrict_self": 446, + "memfd_secret": 447, + "process_mrelease": 448, + "futex_waitv": 449, + "set_mempolicy_home_node": 450, + ] +} diff --git a/Sources/ContainerizationSeccomp/SeccompCompiler.swift b/Sources/ContainerizationSeccomp/SeccompCompiler.swift new file mode 100644 index 00000000..fa5ab7ce --- /dev/null +++ b/Sources/ContainerizationSeccomp/SeccompCompiler.swift @@ -0,0 +1,421 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//===----------------------------------------------------------------------===//
+
+// https://man7.org/linux/man-pages/man2/seccomp.2.html
+
+import ContainerizationOCI
+
+public struct BPFInstruction: Equatable, Sendable {
+    public var code: UInt16
+    public var jt: UInt8
+    public var jf: UInt8
+    public var k: UInt32
+
+    public init(code: UInt16, jt: UInt8, jf: UInt8, k: UInt32) {
+        self.code = code
+        self.jt = jt
+        self.jf = jf
+        self.k = k
+    }
+}
+
+enum BPF {
+    enum InstructionClass {
+        static let ld: UInt16 = 0x00
+        static let jmp: UInt16 = 0x05
+        static let ret: UInt16 = 0x06
+        static let alu: UInt16 = 0x04
+    }
+
+    enum Size {
+        static let w: UInt16 = 0x00
+    }
+
+    enum Mode {
+        static let abs: UInt16 = 0x20
+    }
+
+    enum Source {
+        static let k: UInt16 = 0x00
+    }
+
+    enum JumpTest {
+        static let eq: UInt16 = 0x10
+        static let gt: UInt16 = 0x20
+        static let ge: UInt16 = 0x30
+        static let set: UInt16 = 0x40
+        static let always: UInt16 = 0x00
+    }
+
+    enum ALUOp {
+        static let and: UInt16 = 0x50
+    }
+}
+
+enum SeccompData {
+    static let nr: UInt32 = 0
+    static let arch: UInt32 = 4
+    // args start at offset 16, each is 8 bytes (two 32-bit halves: lo at offset, hi at offset+4)
+    static let args: UInt32 = 16
+}
+
+enum SeccompReturn {
+    static let killThread: UInt32 = 0x0000_0000
+    static let killProcess: UInt32 = 0x8000_0000
+    static let trap: UInt32 = 0x0003_0000
+    static let errno: UInt32 = 0x0005_0000
+    static let trace: UInt32 = 0x7FF0_0000
+    static let log: UInt32 = 0x7FFC_0000
+    static let allow: UInt32 = 0x7FFF_0000
+    static let notify: UInt32 = 0x7FC0_0000
+}
+
+enum AuditArch {
+    static let aarch64: UInt32 = 0xC000_00B7
+}
+
+extension LinuxSeccompFlag {
+    public var kernelFlag: UInt32 {
+        switch self {
+        case .flagLog:
+            return 1 << 1
+        case .flagSpecAllow:
+            return 1 << 2
+        case .flagWaitKillableRecv:
+            return 1 << 5
+        }
+    }
+}
+
+public enum SeccompCompiler {
+    public enum Error: Swift.Error, CustomStringConvertible {
+        case unknownSyscall(String)
+        case invalidArgIndex(UInt)
+
+        public var description: String {
+            switch self {
+            case .unknownSyscall(let name):
+                return "unknown syscall: \(name)"
+            case .invalidArgIndex(let idx):
+                return "invalid syscall arg index: \(idx), must be 0-5"
+            }
+        }
+    }
+
+    /// Compiles an OCI `LinuxSeccomp` configuration into a classic BPF (cBPF)
+    /// filter program.
+    ///
+    /// The kernel evaluates the BPF program on every syscall. The program inspects
+    /// a read-only `seccomp_data` struct:
+    ///
+    ///     struct seccomp_data {
+    ///         int nr;          // offset 0: syscall number
+    ///         __u32 arch;      // offset 4: AUDIT_ARCH_* value
+    ///         __u64 ip;        // offset 8: instruction pointer (unused)
+    ///         __u64 args[6];   // offset 16: syscall arguments, 8 bytes each
+    ///     };
+    ///
+    /// Each BPF instruction is 8 bytes: a 16-bit opcode, two 8-bit jump offsets
+    /// (jt = jump-true, jf = jump-false, relative to the next instruction), and
+    /// a 32-bit immediate value (k). The program terminates when it executes a
+    /// RET instruction whose k value is the seccomp action (e.g. ALLOW, ERRNO).
+    ///
+    /// EXAMPLE: blocking `mkdirat` with ERRNO(EPERM), default ALLOW
+    ///
+    /// Given this OCI config:
+    ///
+    ///     defaultAction: SCMP_ACT_ALLOW
+    ///     syscalls: [{ names: ["mkdirat"], action: SCMP_ACT_ERRNO, errnoRet: 1 }]
+    ///
+    /// The compiler produces:
+    ///
+    ///     [0] LD_ABS [seccomp_data.arch]       // load arch into accumulator
+    ///     [1] JEQ AUDIT_ARCH_AARCH64 1, 0      // match → skip 1 to [3]; miss → fall to [2]
+    ///     [2] RET KILL_PROCESS                 // wrong arch → kill
+    ///     [3] LD_ABS [seccomp_data.nr]         // load syscall number into accumulator
+    ///     [4] JEQ 34 (mkdirat) 0, 1            // match → fall to [5]; miss → skip 1 to [6]
+    ///     [5] RET ERRNO|EPERM                  // blocked syscall → return -EPERM
+    ///     [6] RET ALLOW                        // no rule matched → allow
+    ///
+    /// Jump offsets (jt, jf) are relative to the *next* instruction. A JEQ with
+    /// jt=1,jf=0 means: if equal, skip 1 instruction forward; if not equal, fall
+    /// through to the very next instruction (skip 0).
+    ///
+    /// EXAMPLE: blocking multiple syscalls
+    ///
+    /// Each syscall in the rule list becomes a JEQ+RET pair. The syscall number
+    /// stays in the accumulator across rules, so no reload is needed:
+    ///
+    ///     syscalls: [
+    ///         { names: ["mkdirat"], action: SCMP_ACT_ERRNO, errnoRet: 1 },
+    ///         { names: ["unlinkat"], action: SCMP_ACT_KILL_PROCESS },
+    ///     ]
+    ///
+    ///     [0] LD_ABS [seccomp_data.arch]
+    ///     [1] JEQ AUDIT_ARCH_AARCH64 1, 0
+    ///     [2] RET KILL_PROCESS
+    ///     [3] LD_ABS [seccomp_data.nr]
+    ///     [4] JEQ 34 (mkdirat) 0, 1            // miss → skip to [6]
+    ///     [5] RET ERRNO|EPERM
+    ///     [6] JEQ 35 (unlinkat) 0, 1           // miss → skip to [8]
+    ///     [7] RET KILL_PROCESS
+    ///     [8] RET ALLOW
+    ///
+    /// For rules with argument filters, the compiler inserts additional LD/JEQ
+    /// sequences between the syscall number match and the action RET. Because BPF
+    /// is a 32-bit machine, 64-bit arguments are compared as two 32-bit halves
+    /// (lo at the base offset, hi at base+4, little-endian).
+    public static func compileFromOCI(config: ContainerizationOCI.LinuxSeccomp) throws -> [BPFInstruction] {
+        var prog: [BPFInstruction] = []
+
+        // 1. Check architecture: load seccomp_data.arch
+        prog.append(BPFInstruction(code: BPF.InstructionClass.ld | BPF.Size.w | BPF.Mode.abs, jt: 0, jf: 0, k: SeccompData.arch))
+
+        // We only support AARCH64 today, so if arch != AARCH64 kill the process.
+        prog.append(BPFInstruction(code: BPF.InstructionClass.jmp | BPF.JumpTest.eq | BPF.Source.k, jt: 1, jf: 0, k: AuditArch.aarch64))
+        prog.append(BPFInstruction(code: BPF.InstructionClass.ret | BPF.Source.k, jt: 0, jf: 0, k: SeccompReturn.killProcess))
+
+        // 2. Load syscall number
+        prog.append(BPFInstruction(code: BPF.InstructionClass.ld | BPF.Size.w | BPF.Mode.abs, jt: 0, jf: 0, k: SeccompData.nr))
+
+        // 3. Per-rule matching
+        for syscall in config.syscalls {
+            let action = mapAction(syscall.action, errnoRet: syscall.errnoRet)
+
+            for name in syscall.names {
+                guard let nr = Self.aarch64SyscallTable[name] ?? UInt32(name) else {
+                    // Skip unknown syscall names that aren't valid numbers
+                    continue
+                }
+
+                if syscall.args.isEmpty {
+                    // Simple case: JEQ nr -> return action, else fall through
+                    // We need: JEQ nr, 0, 1 (if equal, execute next which is RET; else skip RET)
+                    prog.append(BPFInstruction(code: BPF.InstructionClass.jmp | BPF.JumpTest.eq | BPF.Source.k, jt: 0, jf: 1, k: nr))
+                    prog.append(BPFInstruction(code: BPF.InstructionClass.ret | BPF.Source.k, jt: 0, jf: 0, k: action))
+                } else {
+                    // With arg filters: JEQ nr -> arg checks -> return action
+                    // First, build the arg check instructions
+                    let argBlock = try buildArgBlock(args: syscall.args, action: action)
+                    // JEQ nr, 0, skip_arg_block
+                    let skipCount = UInt8(argBlock.count)
+                    prog.append(BPFInstruction(code: BPF.InstructionClass.jmp | BPF.JumpTest.eq | BPF.Source.k, jt: 0, jf: skipCount, k: nr))
+                    prog.append(contentsOf: argBlock)
+                }
+            }
+        }
+
+        // 4. Default action
+        let defaultAction = mapAction(config.defaultAction, errnoRet: config.defaultErrnoRet)
+        prog.append(BPFInstruction(code: BPF.InstructionClass.ret | BPF.Source.k, jt: 0, jf: 0, k: defaultAction))
+
+        return prog
+    }
+
+    /// Map kernel flags from OCI config flags.
+    public static func mapFlags(_ flags: [LinuxSeccompFlag]) -> UInt32 {
+        var result: UInt32 = 0
+        for flag in flags {
+            result |= flag.kernelFlag
+        }
+        return result
+    }
+
+    static func mapAction(_ action: LinuxSeccompAction, errnoRet: UInt?) -> UInt32 {
+        switch action {
+        case .actKill, .actKillThread:
+            return SeccompReturn.killThread
+        case .actKillProcess:
+            return SeccompReturn.killProcess
+        case .actTrap:
+            return SeccompReturn.trap
+        case .actErrno:
+            let errno = UInt32(truncatingIfNeeded: errnoRet ?? 1) & 0xFFFF  // unset → EPERM (1) per OCI spec; never traps on wide values
+            return SeccompReturn.errno | errno
+        case .actTrace:
+            return SeccompReturn.trace
+        case .actLog:
+            return SeccompReturn.log
+        case .actAllow:
+            return SeccompReturn.allow
+        case .actNotify:
+            return SeccompReturn.notify
+        }
+    }
+
+    enum JumpField { case jt, jf, k }
+
+    /// Build BPF instructions for argument comparison.
+    ///
+    /// Each argument is 64-bit but BPF operates on 32-bit values, so we compare
+    /// the low and high 32-bit halves separately. All arg conditions must match
+    /// (AND semantics) for the action to be taken.
+    static func buildArgBlock(args: [LinuxSeccompArg], action: UInt32) throws -> [BPFInstruction] {
+        // We build the arg checks. If any check fails, we need to jump past
+        // the remaining checks and the final RET action instruction.
+        // We'll build everything first, then fix up the failure jumps.
+
+        // Each failure jump records the instruction index and which field
+        // holds the jump offset. Most use jf (conditional false branch),
+        // but LT/LE use jt (the "true" branch of JGT/JGE is the fail path)
+        // and NE uses k (unconditional JA jump).
+        var checks: [(instructions: [BPFInstruction], failureJumps: [(index: Int, field: JumpField)])] = []
+
+        for arg in args {
+            guard arg.index <= 5 else {
+                throw Error.invalidArgIndex(arg.index)
+            }
+
+            let check = try buildSingleArgCheck(arg: arg)
+            checks.append(check)
+        }
+
+        // Flatten all checks and add the final RET.
+        var flat: [BPFInstruction] = []
+        var failureJumps: [(index: Int, field: JumpField)] = []
+
+        for check in checks {
+            let baseIndex = flat.count
+            flat.append(contentsOf: check.instructions)
+            for (idx, field) in check.failureJumps {
+                failureJumps.append((baseIndex + idx, field))
+            }
+        }
+
+        // Append the success RET
+        flat.append(BPFInstruction(code: BPF.InstructionClass.ret | BPF.Source.k, jt: 0, jf: 0, k: action))
+
+        // Fix up failure jumps to point past the RET.
+        // BPF jump offsets are relative to the next instruction.
+        // jump offset = target - (current + 1) = flat.count - idx - 1
+        for (idx, field) in failureJumps {
+            let jumpOffset = flat.count - idx - 1
+            switch field {
+            case .jt: flat[idx].jt = UInt8(jumpOffset)
+            case .jf: flat[idx].jf = UInt8(jumpOffset)
+            case .k: flat[idx].k = UInt32(jumpOffset)
+            }
+        }
+
+        // Reload the syscall number after arg checks for the next rule
+        flat.append(BPFInstruction(code: BPF.InstructionClass.ld | BPF.Size.w | BPF.Mode.abs, jt: 0, jf: 0, k: SeccompData.nr))
+
+        return flat
+    }
+
+    /// Build instructions for a single argument comparison.
+    /// Returns instructions and failure jumps (index + which field to patch).
+    static func buildSingleArgCheck(arg: LinuxSeccompArg) throws -> (instructions: [BPFInstruction], failureJumps: [(index: Int, field: JumpField)]) {
+        let argOffset = SeccompData.args + UInt32(arg.index) * 8
+        let loOffset = argOffset  // low 32 bits
+        let hiOffset = argOffset + 4  // high 32 bits
+
+        let valueLo = UInt32(arg.value & 0xFFFF_FFFF)
+        let valueHi = UInt32(arg.value >> 32)
+        let valueTwoLo = UInt32(arg.valueTwo & 0xFFFF_FFFF)
+        let valueTwoHi = UInt32(arg.valueTwo >> 32)
+
+        let ldAbs = BPF.InstructionClass.ld | BPF.Size.w | BPF.Mode.abs
+        let jmpEq = BPF.InstructionClass.jmp | BPF.JumpTest.eq | BPF.Source.k
+        let jmpGt = BPF.InstructionClass.jmp | BPF.JumpTest.gt | BPF.Source.k
+        let jmpGe = BPF.InstructionClass.jmp | BPF.JumpTest.ge | BPF.Source.k
+        let jmpAlways = BPF.InstructionClass.jmp | BPF.JumpTest.always | BPF.Source.k
+        let aluAnd = BPF.InstructionClass.alu | BPF.ALUOp.and | BPF.Source.k
+
+        var insts: [BPFInstruction] = []
+        var fails: [(index: Int, field: JumpField)] = []
+
+        switch arg.op {
+        case .opEqualTo:
+            // EQ: both halves must match. Fail (jf) if either doesn't.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueHi))
+            fails.append((insts.count - 1, .jf))
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueLo))
+            fails.append((insts.count - 1, .jf))
+
+        case .opNotEqual:
+            // NE: succeed if either half differs. Fail if both match.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            // hi differs → success (skip 3 past lo check)
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 3, k: valueHi))
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            // lo differs → success (skip 1 past fail jump)
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 1, k: valueLo))
+            // Both matched → NE fails. JA uses k for the offset.
+            insts.append(BPFInstruction(code: jmpAlways, jt: 0, jf: 0, k: 0))
+            fails.append((insts.count - 1, .k))
+
+        case .opGreaterThan:
+            // GT: hi > v_hi → success; hi == v_hi → check lo > v_lo; else fail.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 3, jf: 0, k: valueHi))
+            // hi not greater; check if equal
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueHi))
+            fails.append((insts.count - 1, .jf))  // hi < v_hi → fail
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 0, jf: 0, k: valueLo))
+            fails.append((insts.count - 1, .jf))  // lo <= v_lo → fail
+
+        case .opGreaterEqual:
+            // GE: hi > v_hi → success; hi == v_hi → check lo >= v_lo; else fail.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 3, jf: 0, k: valueHi))
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueHi))
+            fails.append((insts.count - 1, .jf))
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: jmpGe, jt: 0, jf: 0, k: valueLo))
+            fails.append((insts.count - 1, .jf))
+
+        case .opLessThan:
+            // LT: hi < v_hi → success; hi == v_hi → check lo < v_lo; else fail.
+            // JGT true means hi > v_hi → fail (on jt).
+            // JGT false + JEQ false means hi < v_hi → success (skip past lo check).
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 0, jf: 0, k: valueHi))
+            fails.append((insts.count - 1, .jt))  // hi > v_hi → fail
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 2, k: valueHi))
+            // hi < v_hi → skip the 2 lo-check instructions (LD lo, JGE) to reach success
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: jmpGe, jt: 0, jf: 0, k: valueLo))
+            fails.append((insts.count - 1, .jt))  // lo >= v_lo → fail
+
+        case .opLessEqual:
+            // LE: hi < v_hi → success; hi == v_hi → check lo <= v_lo; else fail.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 0, jf: 0, k: valueHi))
+            fails.append((insts.count - 1, .jt))  // hi > v_hi → fail
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 2, k: valueHi))  // hi < v_hi → skip the 2 lo-check instructions
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: jmpGt, jt: 0, jf: 0, k: valueLo))
+            fails.append((insts.count - 1, .jt))  // lo > v_lo → fail
+
+        case .opMaskedEqual:
+            // MASKED_EQ: (data & value) == valueTwo, checked per-half.
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: hiOffset))
+            insts.append(BPFInstruction(code: aluAnd, jt: 0, jf: 0, k: UInt32(valueHi)))
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueTwoHi))
+            fails.append((insts.count - 1, .jf))
+            insts.append(BPFInstruction(code: ldAbs, jt: 0, jf: 0, k: loOffset))
+            insts.append(BPFInstruction(code: aluAnd, jt: 0, jf: 0, k: UInt32(valueLo)))
+            insts.append(BPFInstruction(code: jmpEq, jt: 0, jf: 0, k: valueTwoLo))
+            fails.append((insts.count - 1, .jf))
+        }
+
+        return (insts, fails)
+    }
+}
diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift
index 97f5866e..e8b0a763 100644
--- a/Sources/Integration/ContainerTests.swift
+++ b/Sources/Integration/ContainerTests.swift
@@ -4344,4 +4344,310 @@ extension IntegrationSuite {
             throw error
         }
     }
+
+    func testSeccompBlockSyscall() async throws {
+        let id = "test-seccomp-block-syscall"
+
+        let bs = try await bootstrap(id)
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            // Block mkdirat with EPERM.
+            config.process.arguments = ["mkdir", "/tmp/seccomp-test"]
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: ["mkdirat"], action: .errno(1))]
+            config.seccomp = seccomp
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        guard status.exitCode != 0 else {
+            throw IntegrationError.assert(msg: "expected non-zero exit code when mkdirat is blocked by seccomp")
+        }
+    }
+
+    func testSeccompBlockSyscallExplicitErrno() async throws {
+        let id = "test-seccomp-block-explicit-errno"
+
+        let bs = try await bootstrap(id)
+        let errBuffer = BufferWriter()
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            // Block mkdirat with errnoRet=100 (ENETDOWN). Verify the specific
+            // errno value passes through by checking for "Network is down".
+            config.process.arguments = ["sh", "-c", "mkdir /tmp/seccomp-test"]
+            config.process.noNewPrivileges = true
+            config.process.stderr = errBuffer
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: ["mkdirat"], action: .errno(100))]
+            config.seccomp = seccomp
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        guard status.exitCode != 0 else {
+            throw IntegrationError.assert(msg: "expected non-zero exit code when mkdirat is blocked by seccomp")
+        }
+
+        let stderr = String(data: errBuffer.data, encoding: .utf8) ?? ""
+        guard stderr.contains("Network is down") else {
+            throw IntegrationError.assert(msg: "expected 'Network is down' (ENETDOWN) in stderr, got: \(stderr)")
+        }
+    }
+
+    func testSeccompAllowList() async throws {
+        let id = "test-seccomp-allow-list"
+
+        let bs = try await bootstrap(id)
+        let buffer = BufferWriter()
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["echo", "seccomp-ok"]
+            config.process.noNewPrivileges = true
+            config.process.stdout = buffer
+            config.bootLog = bs.bootLog
+            config.seccomp = SeccompProfile(defaultAction: .allow)
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        guard status.exitCode == 0 else {
+            throw IntegrationError.assert(msg: "process status \(status) != 0")
+        }
+
+        guard let output = String(data: buffer.data, encoding: .utf8) else {
+            throw IntegrationError.assert(msg: "failed to convert stdout to UTF8")
+        }
+
+        guard output.contains("seccomp-ok") else {
+            throw IntegrationError.assert(msg: "expected 'seccomp-ok' in output, got: \(output)")
+        }
+    }
+
+    func testSeccompKillProcess() async throws {
+        let id = "test-seccomp-kill-process"
+
+        let bs = try await bootstrap(id)
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            // Block mkdirat with KILL_PROCESS. The kernel sends SIGSYS (31),
+            // which the runtime reports as exit code 128 + 31 = 159.
+            config.process.arguments = ["mkdir", "/tmp/seccomp-test"]
+            config.process.noNewPrivileges = true
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: ["mkdirat"], action: .killProcess)]
+            config.seccomp = seccomp
+        }
+
+        try await container.create()
+        try await container.start()
+
+        let status = try await container.wait()
+        try await container.stop()
+
+        // SECCOMP_RET_KILL_PROCESS sends SIGSYS (31). Exit code = 128 + 31 = 159.
+        let expectedExitCode: Int32 = 128 + 31
+        guard status.exitCode == expectedExitCode else {
+            throw IntegrationError.assert(msg: "expected exit code \(expectedExitCode) (SIGSYS), got \(status.exitCode)")
+        }
+    }
+
+    func testSeccompExec() async throws {
+        let id = "test-seccomp-exec"
+
+        let bs = try await bootstrap(id)
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["sleep", "30"]
+            config.process.noNewPrivileges = true
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: ["mkdirat"], action: .errno(1))]
+            config.seccomp = seccomp
+        }
+
+        try await container.create()
+        try await container.start()
+
+        do {
+            let exec = try await container.exec("exec-seccomp") { process in
+                process.arguments = ["mkdir", "/tmp/seccomp-test"]
+            }
+            try await exec.start()
+            let execStatus = try await exec.wait()
+            try await exec.delete()
+
+            guard execStatus.exitCode != 0 else {
+                throw IntegrationError.assert(msg: "expected non-zero exit code for exec when mkdirat is blocked")
+            }
+
+            try await container.kill(SIGKILL)
+            try await container.wait()
+            try await container.stop()
+        } catch {
+            try? await container.stop()
+            throw error
+        }
+    }
+
+    func testSeccompOCIDefaultCapabilityGatedRules() async throws {
+        let id = "test-seccomp-ocidefault-caps"
+
+        let bs = try await bootstrap(id)
+
+        // Without CAP_SYS_ADMIN: mount is not in the seccomp allowlist.
+        // Run mount directly — seccomp returns EPERM, mount exits non-zero.
+        let denied = try LinuxContainer("\(id)-denied", rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["mount", "-t", "tmpfs", "tmpfs", "/tmp"]
+            config.process.noNewPrivileges = true
+            config.process.capabilities = .defaultOCICapabilities
+            config.bootLog = bs.bootLog
+            config.seccomp = .defaultProfile
+        }
+
+        try await denied.create()
+        try await denied.start()
+        var status = try await denied.wait()
+        try await denied.stop()
+
+        guard status.exitCode != 0 else {
+            throw IntegrationError.assert(msg: "expected mount to fail without CAP_SYS_ADMIN")
+        }
+
+        // With CAP_SYS_ADMIN: mount is added to the seccomp allowlist
+        // and the process has the kernel capability to perform it.
+        let allowed = try LinuxContainer("\(id)-allowed", rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["mount", "-t", "tmpfs", "tmpfs", "/tmp"]
+            config.process.noNewPrivileges = true
+            config.process.capabilities = LinuxCapabilities(capabilities: [.sysAdmin])
+            config.bootLog = bs.bootLog
+            config.seccomp = .defaultProfile
+        }
+
+        try await allowed.create()
+        try await allowed.start()
+        status = try await allowed.wait()
+        try await allowed.stop()
+
+        guard status.exitCode == 0 else {
+            throw IntegrationError.assert(msg: "expected mount to succeed with CAP_SYS_ADMIN, got exit code \(status.exitCode)")
+        }
+    }
+
+    func testSeccompLogAction() async throws {
+        let id = "test-seccomp-log"
+
+        let bs = try await bootstrap(id)
+        let buffer = BufferWriter()
+        let container = try LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            // Log mkdirat via SECCOMP_RET_LOG — the syscall succeeds but
+            // the kernel writes an audit entry to the ring buffer.
+            // Run mkdir to trigger it, then dmesg to read the log.
+            config.process.arguments = ["sleep", "30"]
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: ["mkdirat"], action: .log)]
+            config.seccomp = seccomp
+        }
+
+        try await container.create()
+        try await container.start()
+
+        do {
+            // Trigger the logged syscall.
+            let mkdirExec = try await container.exec("exec-mkdir") { process in
+                process.arguments = ["mkdir", "/tmp/seccomp-log-test"]
+            }
+            try await mkdirExec.start()
+            let mkdirStatus = try await mkdirExec.wait()
+            try await mkdirExec.delete()
+
+            // mkdir should succeed. SECCOMP_RET_LOG allows the syscall.
+            guard mkdirStatus.exitCode == 0 else {
+                throw IntegrationError.assert(msg: "expected mkdir to succeed with log action, got exit code \(mkdirStatus.exitCode)")
+            }
+
+            let dmesgExec = try await container.exec("exec-dmesg") { process in
+                process.arguments = ["dmesg"]
+                process.capabilities = LinuxCapabilities(capabilities: [.syslog])
+                process.stdout = buffer
+            }
+            try await dmesgExec.start()
+            let dmesgStatus = try await dmesgExec.wait()
+            try await dmesgExec.delete()
+
+            guard dmesgStatus.exitCode == 0 else {
+                throw IntegrationError.assert(msg: "dmesg failed with exit code \(dmesgStatus.exitCode)")
+            }
+
+            guard let output = String(data: buffer.data, encoding: .utf8) else {
+                throw IntegrationError.assert(msg: "failed to convert dmesg output to UTF8")
+            }
+
+            // type=1326 is AUDIT_SECCOMP, syscall=34 is mkdirat (specifically on aarch64).
+            guard output.contains("type=1326") && output.contains("syscall=34") else {
+                throw IntegrationError.assert(msg: "expected seccomp audit log for mkdirat (type=1326 syscall=34), got: \(output.suffix(500))")
+            }
+
+            try await container.kill(SIGKILL)
+            try await container.wait()
+            try await container.stop()
+        } catch {
+            try? await container.stop()
+            throw error
+        }
+    }
+
+    func testSeccompSyscallByNumber() async throws {
+        let id = "test-seccomp-syscall-number"
+
+        let bs = try await bootstrap(id)
+
+        // Block mkdirat by name.
+        let byName = try LinuxContainer("\(id)-name", rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["mkdir", "/tmp/seccomp-test"]
+            config.process.noNewPrivileges = true
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: [.name("mkdirat")], action: .errno(1))]
+            config.seccomp = seccomp
+        }
+
+        try await byName.create()
+        try await byName.start()
+        var status = try await byName.wait()
+        try await byName.stop()
+
+        guard status.exitCode != 0 else {
+            throw IntegrationError.assert(msg: "expected mkdir to fail when blocked by name")
+        }
+
+        // Block mkdirat by raw number (34).
+        let byNumber = try LinuxContainer("\(id)-number", rootfs: bs.rootfs, vmm: bs.vmm) { config in
+            config.process.arguments = ["mkdir", "/tmp/seccomp-test"]
+            config.process.noNewPrivileges = true
+            config.bootLog = bs.bootLog
+            var seccomp = SeccompProfile(defaultAction: .allow)
+            seccomp.syscalls = [SeccompProfile.Rule(syscalls: [.number(34)], action: .errno(1))]
+            config.seccomp = seccomp
+        }
+
+        try await byNumber.create()
+        try await byNumber.start()
+        status = try await byNumber.wait()
+        try await byNumber.stop()
+
+        guard status.exitCode != 0 else {
+            throw IntegrationError.assert(msg: "expected mkdir to fail when blocked by number (34)")
+        }
+    }
 }
diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift
index 2f0a079d..5049ef31 100644
--- a/Sources/Integration/Suite.swift
+++ b/Sources/Integration/Suite.swift
@@ -376,6 +376,14 @@ struct IntegrationSuite: AsyncParsableCommand {
             Test("container noNewPrivileges exec", testNoNewPrivilegesExec),
             Test("container workingDir created", testWorkingDirCreated),
             Test("container workingDir exec created", testWorkingDirExecCreated),
+            Test("container seccomp block syscall", testSeccompBlockSyscall),
+            Test("container seccomp block syscall explicit errno", testSeccompBlockSyscallExplicitErrno),
+            Test("container seccomp allow list", testSeccompAllowList),
+            Test("container seccomp kill process", testSeccompKillProcess),
+            Test("container seccomp exec", testSeccompExec),
+            Test("container seccomp defaultProfile capability gated rules", testSeccompOCIDefaultCapabilityGatedRules),
+            Test("container seccomp log action", testSeccompLogAction),
+            Test("container seccomp syscall by number", testSeccompSyscallByNumber),
 
             // Pods
             Test("pod single container", testPodSingleContainer),
diff --git a/Tests/ContainerizationSeccompTests/BPFSimulator.swift b/Tests/ContainerizationSeccompTests/BPFSimulator.swift
new file mode 100644
index 00000000..d67e654f
--- /dev/null
+++ b/Tests/ContainerizationSeccompTests/BPFSimulator.swift
@@ -0,0 +1,139 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the Containerization project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerizationSeccomp
+
+/// A classic BPF (cBPF) interpreter for seccomp filters.
+///
+/// Executes a compiled BPF program against a simulated `seccomp_data` struct,
+/// returning the seccomp return value (action + data). This allows testing
+/// filter semantics on any OS.
+enum BPFSimulator {
+    /// Simulates a seccomp BPF program against the given inputs.
+    ///
+    /// The simulated `seccomp_data` layout (little-endian):
+    ///   - offset 0: nr (syscall number, 32-bit)
+    ///   - offset 4: arch (AUDIT_ARCH_*, 32-bit)
+    ///   - offset 8: instruction_pointer (64-bit, unused)
+    ///   - offset 16: args[0] (64-bit)
+    ///   - offset 24: args[1] (64-bit)
+    ///   - ... up to args[5] at offset 56
+    ///
+    /// Returns the 32-bit seccomp return value, or nil if the program is invalid.
+    static func run(
+        _ program: [BPFInstruction],
+        syscallNr: UInt32,
+        arch: UInt32,
+        args: [UInt64] = []
+    ) -> UInt32? {
+        // Build seccomp_data as a byte buffer (64 bytes)
+        var data = [UInt8](repeating: 0, count: 64)
+
+        // nr at offset 0 (little-endian)
+        writeU32(&data, offset: 0, value: syscallNr)
+        // arch at offset 4
+        writeU32(&data, offset: 4, value: arch)
+        // instruction_pointer at offset 8 (leave as 0)
+        // args at offset 16, each 8 bytes
+        for (i, arg) in args.prefix(6).enumerated() {
+            let offset = 16 + i * 8
+            writeU32(&data, offset: offset, value: UInt32(arg & 0xFFFF_FFFF))
+            writeU32(&data, offset: offset + 4, value: UInt32(arg >> 32))
+        }
+
+        var accumulator: UInt32 = 0
+        var pc = 0
+
+        while pc < program.count {
+            let inst = program[pc]
+            let cls = inst.code & 0x07
+
+            switch cls {
+            case 0x00:  // BPF_LD
+                let mode = inst.code & 0xE0
+                guard mode == 0x20 else {
+                    return nil
+                }
+                let offset = Int(inst.k)
+                guard offset + 4 <= data.count else { return nil }
+                accumulator = readU32(data, offset: offset)
+                pc += 1
+
+            case 0x04:  // BPF_ALU
+                let op = inst.code & 0xF0
+                let src = inst.code & 0x08
+                let operand: UInt32 = src == 0 ? inst.k : 0  // BPF_K vs BPF_X (we only support K)
+                switch op {
+                case 0x00: accumulator &+= operand  // ADD
+                case 0x10: accumulator &-= operand  // SUB
+                case 0x20: accumulator &*= operand  // MUL
+                case 0x30:
+                    guard operand != 0 else { return nil }
+                    accumulator /= operand  // DIV
+                case 0x40: accumulator |= operand  // OR
+                case 0x50: accumulator &= operand  // AND
+                case 0x60: accumulator <<= operand  // LSH
+                case 0x70: accumulator >>= operand  // RSH
+                case 0x80: accumulator = 0 &- accumulator  // NEG: arithmetic negation (A = -A), not bitwise NOT
+                default: return nil
+                }
+                pc += 1
+
+            case 0x05:  // BPF_JMP
+                let op = inst.code & 0xF0
+                let src = inst.code & 0x08
+                let operand: UInt32 = src == 0 ? inst.k : 0
+
+                switch op {
+                case 0x00:  // JA (unconditional)
+                    pc += 1 + Int(inst.k)
+                case 0x10:  // JEQ
+                    pc += 1 + Int(accumulator == operand ? inst.jt : inst.jf)
+                case 0x20:  // JGT
+                    pc += 1 + Int(accumulator > operand ? inst.jt : inst.jf)
+                case 0x30:  // JGE
+                    pc += 1 + Int(accumulator >= operand ? inst.jt : inst.jf)
+                case 0x40:  // JSET
+                    pc += 1 + Int((accumulator & operand) != 0 ? inst.jt : inst.jf)
+                default:
+                    return nil
+                }
+
+            case 0x06:  // BPF_RET
+                return inst.k
+
+            default:
+                return nil
+            }
+        }
+
+        return nil
+    }
+
+    private static func writeU32(_ data: inout [UInt8], offset: Int, value: UInt32) {
+        data[offset] = UInt8(value & 0xFF)
+        data[offset + 1] = UInt8((value >> 8) & 0xFF)
+        data[offset + 2] = UInt8((value >> 16) & 0xFF)
+        data[offset + 3] = UInt8((value >> 24) & 0xFF)
+    }
+
+    private static func readU32(_ data: [UInt8], offset: Int) -> UInt32 {
+        UInt32(data[offset])
+            | (UInt32(data[offset + 1]) << 8)
+            | (UInt32(data[offset + 2]) << 16)
+            | (UInt32(data[offset + 3]) << 24)
+    }
+}
diff --git a/Tests/ContainerizationSeccompTests/SeccompCompilerTests.swift b/Tests/ContainerizationSeccompTests/SeccompCompilerTests.swift
new file mode 100644
index 00000000..102caa13
--- /dev/null
+++ b/Tests/ContainerizationSeccompTests/SeccompCompilerTests.swift
@@ -0,0 +1,369 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the Containerization project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===// + +import ContainerizationOCI +import ContainerizationSeccomp +import Testing + +private let aarch64: UInt32 = 0xC000_00B7 +private let retAllow: UInt32 = 0x7FFF_0000 +private let retKillProcess: UInt32 = 0x8000_0000 + +private func retErrno(_ errno: UInt32) -> UInt32 { 0x0005_0000 | errno } + +private func makeConfig( + defaultAction: LinuxSeccompAction = .actAllow, + defaultErrnoRet: UInt? = nil, + syscalls: [LinuxSyscall] = [] +) -> LinuxSeccomp { + LinuxSeccomp( + defaultAction: defaultAction, + defaultErrnoRet: defaultErrnoRet, + architectures: [], + flags: [], + listenerPath: "", + listenerMetadata: "", + syscalls: syscalls + ) +} + +@Suite("SeccompCompiler") +struct SeccompCompilerTests { + // MARK: - Architecture check + + @Test("kills process on wrong architecture") + func wrongArchKills() throws { + let prog = try SeccompCompiler.compileFromOCI(config: makeConfig()) + let result = BPFSimulator.run(prog, syscallNr: 0, arch: 0x1234_5678) + #expect(result == retKillProcess) + } + + @Test("allows syscall on aarch64") + func correctArchAllows() throws { + let prog = try SeccompCompiler.compileFromOCI(config: makeConfig()) + let result = BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) // openat + #expect(result == retAllow) + } + + // MARK: - Default action + + @Test("returns default allow when no rules match") + func defaultAllow() throws { + let prog = try SeccompCompiler.compileFromOCI(config: makeConfig(defaultAction: .actAllow)) + let result = BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) + #expect(result == retAllow) + } + + @Test("returns default errno when no rules match") + func defaultErrno() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1 + )) + let result = BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) + #expect(result == retErrno(1)) + } + + // MARK: - Simple syscall 
blocking + + @Test("blocks specific syscall with errno, allows others") + func blockSyscallErrno() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["mkdirat"], action: .actErrno, errnoRet: 1, args: []) + ] + )) + + // mkdirat (34) → ERRNO|1 + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retErrno(1)) + // openat (56) → ALLOW (default) + #expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) + // read (63) → ALLOW (default) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64) == retAllow) + } + + @Test("blocks specific syscall with kill process") + func blockSyscallKill() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["mkdirat"], action: .actKillProcess, errnoRet: nil, args: []) + ] + )) + + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retKillProcess) + #expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) + } + + @Test("errno passes through explicit errno value") + func errnoExplicitValue() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["mkdirat"], action: .actErrno, errnoRet: 100, args: []) + ] + )) + + // ENETDOWN = 100 + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retErrno(100)) + } + + // MARK: - Multiple rules and syscalls + + @Test("multiple rules apply independently") + func multipleRules() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["mkdirat"], action: .actErrno, errnoRet: 1, args: []), + LinuxSyscall(names: ["unlinkat"], action: .actKillProcess, errnoRet: nil, args: []), + ] + )) + + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retErrno(1)) // mkdirat + #expect(BPFSimulator.run(prog, syscallNr: 35, arch: aarch64) == retKillProcess) // unlinkat + 
#expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) // openat + } + + @Test("multiple names in one rule all match") + func multipleNamesOneRule() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["mkdirat", "unlinkat", "symlinkat"], action: .actErrno, errnoRet: 1, args: []) + ] + )) + + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retErrno(1)) // mkdirat + #expect(BPFSimulator.run(prog, syscallNr: 35, arch: aarch64) == retErrno(1)) // unlinkat + #expect(BPFSimulator.run(prog, syscallNr: 36, arch: aarch64) == retErrno(1)) // symlinkat + #expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) // openat + } + + // MARK: - Unknown syscall names + + @Test("unknown syscall names are silently skipped") + func unknownSyscallSkipped() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["open"], action: .actErrno, errnoRet: 1, args: []) + ] + )) + + // "open" doesn't exist on aarch64, no rule emitted, everything allowed + #expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) + } + + @Test("mixed known/unknown names: known names still match") + func mixedKnownUnknown() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + syscalls: [ + LinuxSyscall(names: ["open", "mkdirat"], action: .actErrno, errnoRet: 1, args: []) + ] + )) + + #expect(BPFSimulator.run(prog, syscallNr: 34, arch: aarch64) == retErrno(1)) // mkdirat matched + #expect(BPFSimulator.run(prog, syscallNr: 56, arch: aarch64) == retAllow) // openat unaffected + } + + // MARK: - Argument filtering + + @Test("equalTo arg filter matches exact value") + func argEqualTo() throws { + // Allow personality(0) only, block everything else via default + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + 
LinuxSyscall( + names: ["personality"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 0, value: 0, valueTwo: 0, op: .opEqualTo)] + ) + ] + )) + + // personality(0) → ALLOW + #expect(BPFSimulator.run(prog, syscallNr: 92, arch: aarch64, args: [0]) == retAllow) + // personality(8) → ERRNO (arg doesn't match) + #expect(BPFSimulator.run(prog, syscallNr: 92, arch: aarch64, args: [8]) == retErrno(1)) + // personality(0xFFFFFFFF) → ERRNO + #expect(BPFSimulator.run(prog, syscallNr: 92, arch: aarch64, args: [0xFFFF_FFFF]) == retErrno(1)) + } + + @Test("notEqual arg filter blocks exact value") + func argNotEqual() throws { + // Allow socket() unless arg0 == 40 (AF_VSOCK) + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + LinuxSyscall( + names: ["socket"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 0, value: 40, valueTwo: 0, op: .opNotEqual)] + ) + ] + )) + + // socket(AF_INET=2) → ALLOW (not 40) + #expect(BPFSimulator.run(prog, syscallNr: 198, arch: aarch64, args: [2]) == retAllow) + // socket(AF_VSOCK=40) → ERRNO (arg == 40, NE fails) + #expect(BPFSimulator.run(prog, syscallNr: 198, arch: aarch64, args: [40]) == retErrno(1)) + // socket(AF_UNIX=1) → ALLOW + #expect(BPFSimulator.run(prog, syscallNr: 198, arch: aarch64, args: [1]) == retAllow) + } + + @Test("maskedEqual arg filter checks flag mask") + func argMaskedEqual() throws { + // allow clone if (flags & 0x7E020000) == 0 + let cloneMask: UInt64 = 2_114_060_288 // 0x7E020000 + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + LinuxSyscall( + names: ["clone"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 0, value: cloneMask, valueTwo: 0, op: .opMaskedEqual)] + ) + ] + )) + + // clone(SIGCHLD=17) → ALLOW (no namespace flags set) + #expect(BPFSimulator.run(prog, syscallNr: 220, 
arch: aarch64, args: [17]) == retAllow) + // clone(CLONE_NEWUSER=0x10000000) → ERRNO (namespace flag set) + #expect(BPFSimulator.run(prog, syscallNr: 220, arch: aarch64, args: [0x1000_0000]) == retErrno(1)) + // clone(CLONE_NEWPID=0x20000000) → ERRNO + #expect(BPFSimulator.run(prog, syscallNr: 220, arch: aarch64, args: [0x2000_0000]) == retErrno(1)) + // clone(SIGCHLD | CLONE_THREAD=0x10000) → ALLOW (CLONE_THREAD not in mask) + #expect(BPFSimulator.run(prog, syscallNr: 220, arch: aarch64, args: [UInt64(17 | 0x10000)]) == retAllow) + } + + @Test("greaterThan arg filter") + func argGreaterThan() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + LinuxSyscall( + names: ["read"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 2, value: 0, valueTwo: 0, op: .opGreaterThan)] + ) + ] + )) + + // read(fd, buf, count=1) → ALLOW (1 > 0) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0, 0, 1]) == retAllow) + // read(fd, buf, count=0) → ERRNO (0 is not > 0) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0, 0, 0]) == retErrno(1)) + } + + @Test("lessEqual arg filter") + func argLessEqual() throws { + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + LinuxSyscall( + names: ["read"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 2, value: 4096, valueTwo: 0, op: .opLessEqual)] + ) + ] + )) + + // count=4096 → ALLOW (4096 <= 4096) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0, 0, 4096]) == retAllow) + // count=4097 → ERRNO (4097 > 4096) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0, 0, 4097]) == retErrno(1)) + // count=0 → ALLOW (0 <= 4096) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0, 0, 0]) == retAllow) + } + + // MARK: - 64-bit 
argument handling + + @Test("handles 64-bit argument values correctly") + func arg64Bit() throws { + let largeValue: UInt64 = 0x1_0000_0001 // Requires both hi and lo halves + let prog = try SeccompCompiler.compileFromOCI( + config: makeConfig( + defaultAction: .actErrno, + defaultErrnoRet: 1, + syscalls: [ + LinuxSyscall( + names: ["read"], + action: .actAllow, + errnoRet: nil, + args: [LinuxSeccompArg(index: 0, value: largeValue, valueTwo: 0, op: .opEqualTo)] + ) + ] + )) + + // Exact match → ALLOW + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [largeValue]) == retAllow) + // Just the low half → ERRNO (high half doesn't match) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [1]) == retErrno(1)) + // Just the high half → ERRNO (low half doesn't match) + #expect(BPFSimulator.run(prog, syscallNr: 63, arch: aarch64, args: [0x1_0000_0000]) == retErrno(1)) + } + + // MARK: - Error handling + + @Test("rejects arg index > 5") + func invalidArgIndex() { + let config = makeConfig( + syscalls: [ + LinuxSyscall( + names: ["read"], + action: .actErrno, + errnoRet: 1, + args: [LinuxSeccompArg(index: 6, value: 0, valueTwo: 0, op: .opEqualTo)] + ) + ] + ) + + #expect(throws: SeccompCompiler.Error.self) { + try SeccompCompiler.compileFromOCI(config: config) + } + } + + // MARK: - Flag mapping + + @Test("maps seccomp flags correctly") + func flagMapping() { + #expect(SeccompCompiler.mapFlags([]) == 0) + #expect(SeccompCompiler.mapFlags([.flagLog]) == 2) + #expect(SeccompCompiler.mapFlags([.flagSpecAllow]) == 4) + #expect(SeccompCompiler.mapFlags([.flagLog, .flagSpecAllow]) == 6) + #expect(SeccompCompiler.mapFlags([.flagWaitKillableRecv]) == 32) + } +} diff --git a/vminitd/Package.swift b/vminitd/Package.swift index 11039778..fcb050a4 100644 --- a/vminitd/Package.swift +++ b/vminitd/Package.swift @@ -68,6 +68,7 @@ let package = Package( .product(name: "SystemPackage", package: "swift-system"), .product(name: "Containerization", package: 
"containerization"), .product(name: "ContainerizationOS", package: "containerization"), + .product(name: "ContainerizationSeccomp", package: "containerization"), "LCShim", "Cgroup", ] diff --git a/vminitd/Sources/LCShim/include/syscall.h b/vminitd/Sources/LCShim/include/syscall.h index f30e3ab2..dfb7095b 100644 --- a/vminitd/Sources/LCShim/include/syscall.h +++ b/vminitd/Sources/LCShim/include/syscall.h @@ -37,4 +37,18 @@ int CZ_pidfd_getfd(int pidfd, int targetfd, unsigned int flags); int CZ_prctl_set_no_new_privs(); +struct CZ_sock_filter { + unsigned short code; + unsigned char jt; + unsigned char jf; + unsigned int k; +}; + +struct CZ_sock_fprog { + unsigned short len; + struct CZ_sock_filter *filter; +}; + +int CZ_seccomp_set_mode_filter(unsigned int flags, struct CZ_sock_fprog *prog); + #endif diff --git a/vminitd/Sources/LCShim/syscall.c b/vminitd/Sources/LCShim/syscall.c index 4070196c..6af98738 100644 --- a/vminitd/Sources/LCShim/syscall.c +++ b/vminitd/Sources/LCShim/syscall.c @@ -39,3 +39,15 @@ int CZ_pidfd_getfd(int pidfd, int targetfd, unsigned int flags) { int CZ_prctl_set_no_new_privs() { return prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); } + +#ifndef SYS_seccomp +#define SYS_seccomp 277 +#endif + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +int CZ_seccomp_set_mode_filter(unsigned int flags, struct CZ_sock_fprog *prog) { + return syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, flags, prog); +} diff --git a/vminitd/Sources/vmexec/ExecCommand.swift b/vminitd/Sources/vmexec/ExecCommand.swift index a3f0dd23..e3b40972 100644 --- a/vminitd/Sources/vmexec/ExecCommand.swift +++ b/vminitd/Sources/vmexec/ExecCommand.swift @@ -35,6 +35,9 @@ struct ExecCommand: ParsableCommand { @Option(name: .long, help: "pid of the init process for the container") var parentPid: Int + @Option(name: .long, help: "path to the OCI bundle for the container") + var bundlePath: String? 
+ func run() throws { do { let src = URL(fileURLWithPath: processPath) @@ -43,7 +46,15 @@ struct ExecCommand: ParsableCommand { ContainerizationOCI.Process.self, from: processBytes ) - try execInNamespaces(process: process) + + var seccomp: ContainerizationOCI.LinuxSeccomp? + if let bundlePath { + let bundle = try ContainerizationOCI.Bundle.load(path: URL(filePath: bundlePath)) + let spec = try bundle.loadConfig() + seccomp = spec.linux?.seccomp + } + + try execInNamespaces(process: process, seccomp: seccomp) } catch { App.writeError(error) throw error @@ -56,7 +67,7 @@ struct ExecCommand: ParsableCommand { } } - private func execInNamespaces(process: ContainerizationOCI.Process) throws { + private func execInNamespaces(process: ContainerizationOCI.Process, seccomp: ContainerizationOCI.LinuxSeccomp?) throws { let syncPipe = FileDescriptor(rawValue: 3) let ackPipe = FileDescriptor(rawValue: 4) @@ -142,13 +153,28 @@ struct ExecCommand: ParsableCommand { // Set uid, gid, and supplementary groups try App.setPermissions(user: process.user) + // Resolve the executable path before seccomp is applied. + let resolvedExecutable = try App.resolveExecutable(process: process, currentEnv: process.env) + + // Without noNewPrivileges, seccomp is a privileged operation that + // requires CAP_SYS_ADMIN. Install it before dropping capabilities. + // With noNewPrivileges, install it as late as possible (right before + // exec) to minimize the syscalls that need to be in the profile. + if let seccomp, !process.noNewPrivileges { + try App.setSeccomp(seccomp: seccomp) + } + // Finish capabilities (after user change) try App.finishCapabilities(preparedCaps) // Set no_new_privs if requested by the OCI spec. 
try App.setNoNewPrivileges(process: process) - try App.exec(process: process, currentEnv: process.env) + if let seccomp, process.noNewPrivileges { + try App.setSeccomp(seccomp: seccomp) + } + + try App.exec(process: process, resolvedExecutable: resolvedExecutable) } else { // parent process // Send our child's pid to our parent before we exit. var childPid = processID diff --git a/vminitd/Sources/vmexec/RunCommand.swift b/vminitd/Sources/vmexec/RunCommand.swift index ba070340..5951f657 100644 --- a/vminitd/Sources/vmexec/RunCommand.swift +++ b/vminitd/Sources/vmexec/RunCommand.swift @@ -193,14 +193,29 @@ struct RunCommand: ParsableCommand { // Set uid, gid, and supplementary groups. try App.setPermissions(user: process.user) + // Resolve the executable path before seccomp is applied. + let resolvedExecutable = try App.resolveExecutable(process: process, currentEnv: process.env) + + // Without noNewPrivileges, seccomp is a privileged operation that + // requires CAP_SYS_ADMIN. Install it before dropping capabilities. + // With noNewPrivileges, install it as late as possible (right before + // exec) to minimize the syscalls that need to be in the profile. + if let seccomp = spec.linux?.seccomp, !process.noNewPrivileges { + try App.setSeccomp(seccomp: seccomp) + } + // Finish capabilities (after user change) try App.finishCapabilities(preparedCaps) // Set no_new_privs if requested by the OCI spec. try App.setNoNewPrivileges(process: process) + if let seccomp = spec.linux?.seccomp, process.noNewPrivileges { + try App.setSeccomp(seccomp: seccomp) + } + // Finally execve the container process. - try App.exec(process: process, currentEnv: process.env) + try App.exec(process: process, resolvedExecutable: resolvedExecutable) } private func setupNamespaces(namespaces: [ContainerizationOCI.LinuxNamespace]?) 
throws -> Int32 { diff --git a/vminitd/Sources/vmexec/vmexec.swift b/vminitd/Sources/vmexec/vmexec.swift index b1778c37..593f133d 100644 --- a/vminitd/Sources/vmexec/vmexec.swift +++ b/vminitd/Sources/vmexec/vmexec.swift @@ -23,6 +23,7 @@ import ArgumentParser import ContainerizationError import ContainerizationOCI import ContainerizationOS +import ContainerizationSeccomp import FoundationEssentials import LCShim import Logging @@ -64,32 +65,34 @@ extension App { } } - static func exec(process: ContainerizationOCI.Process, currentEnv: [String]? = nil) throws { + static func resolveExecutable(process: ContainerizationOCI.Process, currentEnv: [String]? = nil) throws -> URL { guard !process.args.isEmpty else { throw App.Errno(stage: "exec", info: "process args cannot be empty") } let executableArg = process.args[0] - let resolvedExecutable: URL - if executableArg.contains("/") { - if executableArg.hasPrefix("/") { - resolvedExecutable = URL(fileURLWithPath: executableArg) - } else { - resolvedExecutable = URL(fileURLWithPath: process.cwd).appendingPathComponent(executableArg).standardized - } - - guard FileManager.default.fileExists(atPath: resolvedExecutable.path) else { - throw App.Failure(message: "failed to find target executable \(executableArg)") - } - } else { + guard executableArg.contains("/") else { let path = Path.findPath(currentEnv) ?? 
Path.getCurrentPath() guard let found = Path.lookPath(executableArg, path: path) else { throw App.Failure(message: "failed to find target executable \(executableArg)") } - resolvedExecutable = found + return found } + let resolved: URL + if executableArg.hasPrefix("/") { + resolved = URL(fileURLWithPath: executableArg) + } else { + resolved = URL(fileURLWithPath: process.cwd).appendingPathComponent(executableArg).standardized + } + + guard FileManager.default.fileExists(atPath: resolved.path) else { + throw App.Failure(message: "failed to find target executable \(executableArg)") + } + return resolved + } + static func exec(process: ContainerizationOCI.Process, resolvedExecutable: URL) throws { let executable = strdup(resolvedExecutable.path) var argv = process.args.map { strdup($0) } argv += [nil] @@ -250,6 +253,28 @@ extension App { } } + static func setSeccomp(seccomp: ContainerizationOCI.LinuxSeccomp) throws { + + let bpfInstructions = try SeccompCompiler.compileFromOCI(config: seccomp) + guard !bpfInstructions.isEmpty else { return } + + let flags = SeccompCompiler.mapFlags(seccomp.flags) + + let filters = bpfInstructions.map { inst in + CZ_sock_filter(code: inst.code, jt: inst.jt, jf: inst.jf, k: inst.k) + } + + try filters.withUnsafeBufferPointer { buffer in + var prog = CZ_sock_fprog( + len: UInt16(buffer.count), + filter: UnsafeMutablePointer(mutating: buffer.baseAddress!) 
+ ) + guard CZ_seccomp_set_mode_filter(flags, &prog) == 0 else { + throw App.Errno(stage: "seccomp(SET_MODE_FILTER)", info: "failed to apply seccomp filter") + } + } + } + static func Errno(stage: String, info: String = "") -> ContainerizationError { let posix = POSIXError(.init(rawValue: errno)!, userInfo: ["stage": stage]) return ContainerizationError(.internalError, message: "\(info) \(String(describing: posix))") diff --git a/vminitd/Sources/vminitd/ManagedProcess.swift b/vminitd/Sources/vminitd/ManagedProcess.swift index 4ace4045..7a409f80 100644 --- a/vminitd/Sources/vminitd/ManagedProcess.swift +++ b/vminitd/Sources/vminitd/ManagedProcess.swift @@ -103,6 +103,8 @@ final class ManagedProcess: ContainerProcess, Sendable { "\(owningPid)", "--process-path", bundle.getExecSpecPath(id: id).path, + "--bundle-path", + bundle.path.path, ] } else { args = ["run", "--bundle-path", bundle.path.path]