diff --git a/api/src/syscall/mod.rs b/api/src/syscall/mod.rs
index aea95737..3f841006 100644
--- a/api/src/syscall/mod.rs
+++ b/api/src/syscall/mod.rs
@@ -427,6 +427,11 @@ pub fn handle_syscall(uctx: &mut UserContext) {
             uctx.arg3(),
             uctx.arg4(),
         ),
+        Sysno::clone3 => sys_clone3(
+            uctx,
+            uctx.arg0() as _, // args_ptr
+            uctx.arg1() as _, // args_size
+        ),
         #[cfg(target_arch = "x86_64")]
         Sysno::fork => sys_fork(uctx),
         Sysno::exit => sys_exit(uctx.arg0() as _),
diff --git a/api/src/syscall/task/clone.rs b/api/src/syscall/task/clone.rs
index 8a567993..afebdc3d 100644
--- a/api/src/syscall/task/clone.rs
+++ b/api/src/syscall/task/clone.rs
@@ -21,134 +21,234 @@ use crate::{
 };
 
 bitflags! {
-    /// Options for use with [`sys_clone`].
+    /// Clone flags for process/thread creation.
     #[derive(Debug, Clone, Copy, Default)]
-    struct CloneFlags: u32 {
+    pub struct CloneFlags: u64 {
         /// The calling process and the child process run in the same
         /// memory space.
-        const VM = CLONE_VM;
-        /// The caller and the child process share the same  filesystem
+        const VM = CLONE_VM as u64;
+        /// The caller and the child process share the same filesystem
         /// information.
-        const FS = CLONE_FS;
+        const FS = CLONE_FS as u64;
         /// The calling process and the child process share the same file
         /// descriptor table.
-        const FILES = CLONE_FILES;
+        const FILES = CLONE_FILES as u64;
         /// The calling process and the child process share the same table
         /// of signal handlers.
-        const SIGHAND = CLONE_SIGHAND;
+        const SIGHAND = CLONE_SIGHAND as u64;
         /// Sets pidfd to the child process's PID file descriptor.
-        const PIDFD = CLONE_PIDFD;
+        const PIDFD = CLONE_PIDFD as u64;
         /// If the calling process is being traced, then trace the child
         /// also.
-        const PTRACE = CLONE_PTRACE;
+        const PTRACE = CLONE_PTRACE as u64;
         /// The execution of the calling process is suspended until the
         /// child releases its virtual memory resources via a call to
         /// execve(2) or _exit(2) (as with vfork(2)).
-        const VFORK = CLONE_VFORK;
-        /// The parent of the new child  (as returned by getppid(2))
+        const VFORK = CLONE_VFORK as u64;
+        /// The parent of the new child (as returned by getppid(2))
         /// will be the same as that of the calling process.
-        const PARENT = CLONE_PARENT;
+        const PARENT = CLONE_PARENT as u64;
         /// The child is placed in the same thread group as the calling
         /// process.
-        const THREAD = CLONE_THREAD;
+        const THREAD = CLONE_THREAD as u64;
         /// The cloned child is started in a new mount namespace.
-        const NEWNS = CLONE_NEWNS;
+        const NEWNS = CLONE_NEWNS as u64;
         /// The child and the calling process share a single list of System
         /// V semaphore adjustment values
-        const SYSVSEM = CLONE_SYSVSEM;
+        const SYSVSEM = CLONE_SYSVSEM as u64;
         /// The TLS (Thread Local Storage) descriptor is set to tls.
-        const SETTLS = CLONE_SETTLS;
+        const SETTLS = CLONE_SETTLS as u64;
         /// Store the child thread ID in the parent's memory.
-        const PARENT_SETTID = CLONE_PARENT_SETTID;
+        const PARENT_SETTID = CLONE_PARENT_SETTID as u64;
         /// Clear (zero) the child thread ID in child memory when the child
         /// exits, and do a wakeup on the futex at that address.
-        const CHILD_CLEARTID = CLONE_CHILD_CLEARTID;
+        const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64;
         /// A tracing process cannot force `CLONE_PTRACE` on this child
         /// process.
-        const UNTRACED = CLONE_UNTRACED;
+        const UNTRACED = CLONE_UNTRACED as u64;
         /// Store the child thread ID in the child's memory.
-        const CHILD_SETTID = CLONE_CHILD_SETTID;
+        const CHILD_SETTID = CLONE_CHILD_SETTID as u64;
         /// Create the process in a new cgroup namespace.
-        const NEWCGROUP = CLONE_NEWCGROUP;
+        const NEWCGROUP = CLONE_NEWCGROUP as u64;
         /// Create the process in a new UTS namespace.
-        const NEWUTS = CLONE_NEWUTS;
+        const NEWUTS = CLONE_NEWUTS as u64;
         /// Create the process in a new IPC namespace.
-        const NEWIPC = CLONE_NEWIPC;
+        const NEWIPC = CLONE_NEWIPC as u64;
         /// Create the process in a new user namespace.
-        const NEWUSER = CLONE_NEWUSER;
+        const NEWUSER = CLONE_NEWUSER as u64;
         /// Create the process in a new PID namespace.
-        const NEWPID = CLONE_NEWPID;
+        const NEWPID = CLONE_NEWPID as u64;
         /// Create the process in a new network namespace.
-        const NEWNET = CLONE_NEWNET;
+        const NEWNET = CLONE_NEWNET as u64;
         /// The new process shares an I/O context with the calling process.
-        const IO = CLONE_IO;
+        const IO = CLONE_IO as u64;
+        /// Clear signal handlers on clone (since Linux 5.5)
+        const CLEAR_SIGHAND = 0x100000000u64;
+        /// Clone into specific cgroup (since Linux 5.7)
+        const INTO_CGROUP = 0x200000000u64;
     }
 }
 
-pub fn sys_clone(
-    uctx: &UserContext,
-    flags: u32,
-    stack: usize,
-    parent_tid: usize,
-    #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize,
-    tls: usize,
-    #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize,
-) -> AxResult<isize> {
-    const FLAG_MASK: u32 = 0xff;
-    let exit_signal = flags & FLAG_MASK;
-    let mut flags = CloneFlags::from_bits_truncate(flags & !FLAG_MASK);
-    if flags.contains(CloneFlags::VFORK) {
-        debug!("sys_clone: CLONE_VFORK slow path");
-        flags.remove(CloneFlags::VM);
-    }
+/// Trait for providing clone parameters in a flexible way.
+///
+/// This allows clone() and clone3() to have different parameter semantics
+/// while sharing the core implementation logic.
+pub trait CloneParamProvider {
+    /// Get clone flags
+    fn flags(&self) -> CloneFlags;
 
-    debug!(
-        "sys_clone <= flags: {flags:?}, exit_signal: {exit_signal}, stack: {stack:#x}, ptid: \
-         {parent_tid:#x}, ctid: {child_tid:#x}, tls: {tls:#x}"
-    );
+    /// Get exit signal (0 means no signal)
+    fn exit_signal(&self) -> u64;
 
-    if exit_signal != 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) {
+    /// Get new stack pointer (0 means inherit parent's)
+    fn stack_pointer(&self) -> usize;
+
+    /// Get TLS value
+    fn tls(&self) -> usize;
+
+    /// Get child_tid pointer for CHILD_SETTID
+    fn child_settid_ptr(&self) -> usize;
+    fn child_cleartid_ptr(&self) -> usize;
+
+    /// Get parent_tid pointer for PARENT_SETTID (used by both clone and clone3)
+    fn parent_tid_ptr(&self) -> usize;
+
+    /// Get pidfd pointer (0 if not used)
+    /// - For clone(): returns 0 (uses parent_tid_ptr instead)
+    /// - For clone3(): returns the pidfd field
+    fn pidfd_ptr(&self) -> usize;
+
+    /// Validate parameters (different rules for clone vs clone3)
+    fn validate(&self) -> AxResult<()>;
+}
+
+/// Common validation logic shared by all clone variants
+fn validate_common(flags: CloneFlags, exit_signal: u64) -> AxResult<()> {
+    // Check for invalid flag combinations
+    // The original logic is retained here for the time being.
+    // In the future, it can be ignored and set to 0 simultaneously without reporting an error in some cases.
+    if exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) {
         return Err(AxError::InvalidInput);
     }
+
     if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) {
         return Err(AxError::InvalidInput);
     }
-    if flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) {
+
+    // https://man7.org/linux/man-pages/man2/clone.2.html
+    // CLONE_SIGHAND
+    // Since Linux 2.6.0, the flags mask must also include CLONE_VM if CLONE_SIGHAND is specified.
+    if flags.contains(CloneFlags::SIGHAND) && !flags.contains(CloneFlags::VM) {
+        return Err(AxError::InvalidInput);
+    }
+
+    if flags.contains(CloneFlags::VFORK) && flags.contains(CloneFlags::THREAD) {
+        return Err(AxError::InvalidInput);
+    }
+
+    // Validate exit signal range
+    if exit_signal >= 64 {
         return Err(AxError::InvalidInput);
     }
-    let exit_signal = Signo::from_repr(exit_signal as u8);
 
+    // Namespace flags warning
+    let namespace_flags = CloneFlags::NEWNS
+        | CloneFlags::NEWIPC
+        | CloneFlags::NEWNET
+        | CloneFlags::NEWPID
+        | CloneFlags::NEWUSER
+        | CloneFlags::NEWUTS
+        | CloneFlags::NEWCGROUP;
+
+    if flags.intersects(namespace_flags) {
+        warn!(
+            "sys_clone/sys_clone3: namespace flags detected ({:?}), stub support only",
+            flags & namespace_flags
+        );
+    }
+
+    Ok(())
+}
+
+/// Core implementation of clone/clone3/fork/vfork.
+///
+/// This function contains the shared logic for creating new tasks.
+/// Different parameter semantics are handled through the `CloneParamProvider` trait.
+pub fn do_clone<P: CloneParamProvider>(uctx: &UserContext, params: &P) -> AxResult<isize> {
+    // Validate parameters
+    params.validate()?;
+
+    let mut flags = params.flags();
+    let exit_signal = params.exit_signal();
+
+    // Common validation
+    validate_common(flags, exit_signal)?;
+
+    // Handle VFORK special case
+    // NOTE:
+    // CLONE_VFORK is emulated by clearing CLONE_VM, so the child gets its
+    // own address space and the parent is NOT suspended.
+    // This is a partial implementation.
+    if flags.contains(CloneFlags::VFORK) {
+        debug!("do_clone: CLONE_VFORK slow path");
+        flags.remove(CloneFlags::VM);
+    }
+
+    debug!(
+        "do_clone: flags={flags:?}, exit_signal={exit_signal}, stack={:#x}, tls={:#x}",
+        params.stack_pointer(),
+        params.tls()
+    );
+
+    let exit_signal = if exit_signal > 0 {
+        Signo::from_repr(exit_signal as u8)
+    } else {
+        None
+    };
+
+    // Prepare new user context
     let mut new_uctx = *uctx;
-    if stack != 0 {
-        new_uctx.set_sp(stack);
+    let stack_ptr = params.stack_pointer();
+    if stack_ptr != 0 {
+        new_uctx.set_sp(stack_ptr);
     }
     if flags.contains(CloneFlags::SETTLS) {
-        new_uctx.set_tls(tls);
+        new_uctx.set_tls(params.tls());
     }
     new_uctx.set_retval(0);
 
-    let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) {
-        Some(UserPtr::<u32>::from(child_tid).get_as_mut()?)
-    } else {
-        None
+    // Prepare child_tid pointer if needed
+    let set_child_tid = {
+        let p = params.child_settid_ptr();
+        if flags.contains(CloneFlags::CHILD_SETTID) && p != 0 {
+            Some(UserPtr::<u32>::from(p).get_as_mut()?)
+        } else {
+            None
+        }
     };
 
     let curr = current();
     let old_proc_data = &curr.as_thread().proc_data;
 
+    // Create new task
     let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid);
-
     let tid = new_task.id().as_u64() as Pid;
-    if flags.contains(CloneFlags::PARENT_SETTID) {
-        *UserPtr::<Pid>::from(parent_tid).get_as_mut()? = tid;
+
+    // Write parent TID if PARENT_SETTID is set
+    let parent_tid_ptr = params.parent_tid_ptr();
+    if flags.contains(CloneFlags::PARENT_SETTID) && parent_tid_ptr != 0 {
+        *UserPtr::<Pid>::from(parent_tid_ptr).get_as_mut()? = tid;
     }
 
+    // Create process data based on flags (keep original inline logic)
     let new_proc_data = if flags.contains(CloneFlags::THREAD) {
+        // Thread creation: share address space
         new_task
             .ctx_mut()
             .set_page_table_root(old_proc_data.aspace.lock().page_table_root());
         old_proc_data.clone()
     } else {
+        // Process creation
         let proc = if flags.contains(CloneFlags::PARENT) {
             old_proc_data.proc.parent().ok_or(AxError::InvalidInput)?
         } else {
@@ -156,6 +256,7 @@ pub fn sys_clone(
         }
         .fork(tid);
 
+        // Handle address space
         let aspace = if flags.contains(CloneFlags::VM) {
             old_proc_data.aspace.clone()
         } else {
@@ -164,15 +265,22 @@ pub fn sys_clone(
             copy_from_kernel(&mut aspace.lock())?;
             aspace
         };
+
         new_task
             .ctx_mut()
             .set_page_table_root(aspace.lock().page_table_root());
 
+        // Handle signal handlers
         let signal_actions = if flags.contains(CloneFlags::SIGHAND) {
             old_proc_data.signal.actions.clone()
+        } else if flags.contains(CloneFlags::CLEAR_SIGHAND) {
+            // CLONE_CLEAR_SIGHAND: reset to default handlers
+            Arc::new(SpinNoIrq::new(Default::default()))
         } else {
+            // Normal fork: copy signal handlers
             Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone()))
         };
+
         let proc_data = ProcessData::new(
             proc,
             old_proc_data.exe_path.read().clone(),
@@ -183,8 +291,10 @@ pub fn sys_clone(
         );
         proc_data.set_umask(old_proc_data.umask());
 
+        // Handle file descriptors and filesystem context
         {
             let mut scope = proc_data.scope.write();
+
             if flags.contains(CloneFlags::FILES) {
                 FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE);
             } else {
@@ -207,25 +317,142 @@ pub fn sys_clone(
         proc_data
     };
 
+    // Add thread to process
     new_proc_data.proc.add_thread(tid);
 
+    // Handle PIDFD if requested
+    // Different behavior for clone() vs clone3()
     if flags.contains(CloneFlags::PIDFD) {
         let pidfd = PidFd::new(&new_proc_data);
-        *UserPtr::<i32>::from(parent_tid).get_as_mut()? = pidfd.add_to_fd_table(true)?;
+        let fd = pidfd.add_to_fd_table(true)?;
+
+        // Get the correct pointer based on clone variant
+        let pidfd_target_ptr = params.pidfd_ptr();
+        if pidfd_target_ptr != 0 {
+            // clone3: write to pidfd field
+            *UserPtr::<i32>::from(pidfd_target_ptr).get_as_mut()? = fd;
+        } else if parent_tid_ptr != 0 {
+            // clone: write to parent_tid (historical behavior)
+            *UserPtr::<i32>::from(parent_tid_ptr).get_as_mut()? = fd;
+        }
     }
 
+    // Create thread object
     let thr = Thread::new(tid, new_proc_data);
-    if flags.contains(CloneFlags::CHILD_CLEARTID) {
-        thr.set_clear_child_tid(child_tid);
+
+    // Set clear_child_tid if requested
+    let clear_child_tid_ptr = params.child_cleartid_ptr();
+    if flags.contains(CloneFlags::CHILD_CLEARTID) && clear_child_tid_ptr != 0 {
+        thr.set_clear_child_tid(clear_child_tid_ptr);
     }
+
     *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) });
 
+    // Spawn the task
     let task = spawn_task(new_task);
     add_task_to_table(&task);
 
     Ok(tid as _)
 }
 
+// ================================
+// Clone (legacy) parameters
+// ================================
+
+/// Parameters for the clone() system call.
+///
+/// Note: In clone(), the parent_tid parameter serves dual purpose:
+/// - If CLONE_PIDFD: receives the pidfd
+/// - If CLONE_PARENT_SETTID: receives the child TID
+/// These two flags are mutually exclusive in clone().
+pub struct CloneParams {
+    flags: u32,
+    stack: usize,
+    parent_tid: usize,
+    child_tid: usize,
+    tls: usize,
+}
+
+impl CloneParams {
+    pub fn new(flags: u32, stack: usize, parent_tid: usize, child_tid: usize, tls: usize) -> Self {
+        Self {
+            flags,
+            stack,
+            parent_tid,
+            child_tid,
+            tls,
+        }
+    }
+}
+
+impl CloneParamProvider for CloneParams {
+    fn flags(&self) -> CloneFlags {
+        const FLAG_MASK: u32 = 0xff;
+        CloneFlags::from_bits_truncate((self.flags & !FLAG_MASK) as u64)
+    }
+
+    fn exit_signal(&self) -> u64 {
+        const FLAG_MASK: u32 = 0xff;
+        (self.flags & FLAG_MASK) as u64
+    }
+
+    fn stack_pointer(&self) -> usize {
+        // For clone(), stack directly specifies the new SP
+        self.stack
+    }
+
+    fn tls(&self) -> usize {
+        self.tls
+    }
+
+    fn child_settid_ptr(&self) -> usize {
+        self.child_tid
+    }
+
+    fn child_cleartid_ptr(&self) -> usize {
+        self.child_tid
+    }
+
+    fn parent_tid_ptr(&self) -> usize {
+        self.parent_tid
+    }
+
+    fn pidfd_ptr(&self) -> usize {
+        // For clone(), PIDFD uses parent_tid, so return 0 here
+        // The core logic will use parent_tid_ptr() instead
+        0
+    }
+
+    fn validate(&self) -> AxResult<()> {
+        let flags = self.flags();
+
+        // In clone(), PIDFD and PARENT_SETTID are mutually exclusive
+        // because they share the parent_tid parameter
+        if flags.contains(CloneFlags::PIDFD) && flags.contains(CloneFlags::PARENT_SETTID) {
+            return Err(AxError::InvalidInput);
+        }
+
+        Ok(())
+    }
+}
+
+// ================================
+// System call wrappers
+// ================================
+
+pub fn sys_clone(
+    uctx: &UserContext,
+    flags: u32,
+    stack: usize,
+    parent_tid: usize,
+    #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize,
+    tls: usize,
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize,
+) -> AxResult<isize> {
+    let params = CloneParams::new(flags, stack, parent_tid, child_tid, tls);
+    do_clone(uctx, &params)
+}
+
 #[cfg(target_arch = "x86_64")]
 pub fn sys_fork(uctx: &UserContext) -> AxResult<isize> {
     sys_clone(uctx, SIGCHLD, 0, 0, 0, 0)
diff --git a/api/src/syscall/task/clone3.rs b/api/src/syscall/task/clone3.rs
new file mode 100644
index 00000000..6183df55
--- /dev/null
+++ b/api/src/syscall/task/clone3.rs
@@ -0,0 +1,188 @@
+//! `clone3()` system call: decodes `clone_args` and defers to the shared clone path.
+
+use axerrno::{AxError, AxResult};
+use axhal::uspace::UserContext;
+use starry_vm::VmPtr;
+
+use super::clone::{CloneFlags, CloneParamProvider, do_clone};
+
+/// Structure passed to clone3() system call.
+///
+/// Laid out as eleven consecutive `u64` fields; user space may provide only a
+/// leading part of it (see [`sys_clone3`]).
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct Clone3Args {
+    /// Clone flag bits, decoded into [`CloneFlags`].
+    pub flags: u64,
+    /// User address that receives the pidfd.
+    pub pidfd: u64,
+    /// User address for the child TID (set and clear).
+    pub child_tid: u64,
+    /// User address for the parent-side TID.
+    pub parent_tid: u64,
+    /// Exit signal of the child (0 means no signal).
+    pub exit_signal: u64,
+    /// Base of the child stack (0 means inherit the parent's).
+    pub stack: u64,
+    /// Size of the child stack in bytes.
+    pub stack_size: u64,
+    /// TLS value.
+    pub tls: u64,
+    /// Not supported: a warning is logged and the value is ignored.
+    pub set_tid: u64,
+    /// Not supported: a warning is logged and the value is ignored.
+    pub set_tid_size: u64,
+    /// Not supported: a warning is logged and the value is ignored.
+    pub cgroup: u64,
+}
+
+/// Size in bytes of one [`Clone3Args`] field.
+const FIELD_SIZE: usize = core::mem::size_of::<u64>();
+
+/// Number of fields in [`Clone3Args`].
+const FIELD_COUNT: usize = core::mem::size_of::<Clone3Args>() / FIELD_SIZE;
+
+/// Smallest accepted `args_size`: the first eight fields (`flags` through `tls`).
+const MIN_CLONE_ARGS_SIZE: usize = FIELD_SIZE * 8;
+
+/// Low `flags` bits that legacy clone() uses for the exit signal.
+///
+/// clone3() carries the exit signal in its own field, so these bits are rejected
+/// instead of being silently dropped by `from_bits_truncate`.
+const EXIT_SIGNAL_FLAG_BITS: u64 = 0x7f;
+
+impl CloneParamProvider for Clone3Args {
+    fn flags(&self) -> CloneFlags {
+        CloneFlags::from_bits_truncate(self.flags)
+    }
+
+    fn exit_signal(&self) -> u64 {
+        self.exit_signal
+    }
+
+    fn stack_pointer(&self) -> usize {
+        if self.stack == 0 {
+            return 0;
+        }
+        // Stack grows downward, so SP = base + size. With `stack_size == 0` the
+        // sum is `stack` itself, i.e. `stack` is treated as the SP directly.
+        // `validate()` rejects sums that overflow; `wrapping_add` merely keeps
+        // this getter panic-free on its own.
+        self.stack.wrapping_add(self.stack_size) as usize
+    }
+
+    fn tls(&self) -> usize {
+        self.tls as usize
+    }
+
+    fn child_settid_ptr(&self) -> usize {
+        self.child_tid as usize
+    }
+
+    fn child_cleartid_ptr(&self) -> usize {
+        self.child_tid as usize // for glibc compatibility
+    }
+
+    fn parent_tid_ptr(&self) -> usize {
+        self.parent_tid as usize
+    }
+
+    fn pidfd_ptr(&self) -> usize {
+        // For clone3(), pidfd is a separate field
+        self.pidfd as usize
+    }
+
+    fn validate(&self) -> AxResult<()> {
+        if self.flags & EXIT_SIGNAL_FLAG_BITS != 0 {
+            warn!("sys_clone3: exit signal bits set in flags {:#x}", self.flags);
+            return Err(AxError::InvalidInput);
+        }
+
+        // `stack` and `stack_size` come from user space; their sum must not wrap.
+        if self.stack.checked_add(self.stack_size).is_none() {
+            warn!("sys_clone3: stack + stack_size overflows");
+            return Err(AxError::InvalidInput);
+        }
+
+        // Warn about unsupported features
+        if self.set_tid != 0 || self.set_tid_size != 0 {
+            warn!("sys_clone3: set_tid/set_tid_size not supported, ignoring");
+        }
+        if self.cgroup != 0 {
+            warn!("sys_clone3: cgroup parameter not supported, ignoring");
+        }
+
+        // In clone3(), PIDFD and PARENT_SETTID can coexist
+        // because they use separate fields (no validation needed)
+
+        Ok(())
+    }
+}
+
+/// Copies the caller's `clone_args` from user space.
+///
+/// Only the fields completely covered by `args_size` are read; the remaining
+/// ones stay zero. Nothing beyond `args_size`, or beyond the fields known to
+/// [`Clone3Args`], is read from user memory.
+fn read_clone3_args(args_ptr: usize, args_size: usize) -> AxResult<Clone3Args> {
+    let base = args_ptr as *const u64;
+    let provided = (args_size / FIELD_SIZE).min(FIELD_COUNT);
+
+    let mut fields = [0u64; FIELD_COUNT];
+    for (index, field) in fields.iter_mut().enumerate().take(provided) {
+        let raw = base.wrapping_add(index).vm_read_uninit()?;
+        // SAFETY: every bit pattern is a valid `u64`; this relies on a successful
+        // `vm_read_uninit` having filled the value, as the previous whole-struct
+        // read did.
+        *field = unsafe { raw.assume_init() };
+    }
+
+    Ok(Clone3Args {
+        flags: fields[0],
+        pidfd: fields[1],
+        child_tid: fields[2],
+        parent_tid: fields[3],
+        exit_signal: fields[4],
+        stack: fields[5],
+        stack_size: fields[6],
+        tls: fields[7],
+        set_tid: fields[8],
+        set_tid_size: fields[9],
+        cgroup: fields[10],
+    })
+}
+
+/// Handles the `clone3()` system call.
+///
+/// `args_ptr` is the user address of a `clone_args` structure and `args_size`
+/// the size the caller declared for it.
+///
+/// # Errors
+///
+/// Returns `AxError::InvalidInput` when `args_size` is below
+/// `MIN_CLONE_ARGS_SIZE`, and propagates failures from reading user memory
+/// and from `do_clone`.
+pub fn sys_clone3(uctx: &UserContext, args_ptr: usize, args_size: usize) -> AxResult<isize> {
+    debug!("sys_clone3 <= args_ptr: {args_ptr:#x}, args_size: {args_size}");
+
+    // Validate size
+    if args_size < MIN_CLONE_ARGS_SIZE {
+        warn!("sys_clone3: args_size {args_size} too small, minimum is {MIN_CLONE_ARGS_SIZE}");
+        return Err(AxError::InvalidInput);
+    }
+
+    if args_size > core::mem::size_of::<Clone3Args>() {
+        debug!("sys_clone3: args_size {args_size} larger than expected, using known fields only");
+    }
+
+    // Copy arguments from user space
+    let args = read_clone3_args(args_ptr, args_size)?;
+    debug!("sys_clone3: args = {args:?}");
+
+    // Use common implementation
+    let result = do_clone(uctx, &args)?;
+    debug!("sys_clone3 => child tid: {result}");
+
+    Ok(result)
+}
diff --git a/api/src/syscall/task/mod.rs b/api/src/syscall/task/mod.rs
index a6e77afc..2143a0e8 100644
--- a/api/src/syscall/task/mod.rs
+++ b/api/src/syscall/task/mod.rs
@@ -1,4 +1,5 @@
 mod clone;
+mod clone3;
 mod ctl;
 mod execve;
 mod exit;
@@ -7,4 +8,6 @@ mod schedule;
 mod thread;
 mod wait;
 
-pub use self::{clone::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*};
+pub use self::{
+    clone::*, clone3::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*,
+};