From 2b823feea624065e931ea0ada52410f4dc60ddf4 Mon Sep 17 00:00:00 2001 From: WyZ0125 Date: Tue, 9 Dec 2025 16:28:05 +0800 Subject: [PATCH 1/3] feat(syscall): add clone3 support --- api/src/syscall/mod.rs | 5 + api/src/syscall/task/clone3.rs | 375 +++++++++++++++++++++++++++++++++ api/src/syscall/task/mod.rs | 5 +- 3 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 api/src/syscall/task/clone3.rs diff --git a/api/src/syscall/mod.rs b/api/src/syscall/mod.rs index aea95737..3f841006 100644 --- a/api/src/syscall/mod.rs +++ b/api/src/syscall/mod.rs @@ -427,6 +427,11 @@ pub fn handle_syscall(uctx: &mut UserContext) { uctx.arg3(), uctx.arg4(), ), + Sysno::clone3 => sys_clone3( + uctx, + uctx.arg0() as _, // args_ptr + uctx.arg1() as _, // args_size + ), #[cfg(target_arch = "x86_64")] Sysno::fork => sys_fork(uctx), Sysno::exit => sys_exit(uctx.arg0() as _), diff --git a/api/src/syscall/task/clone3.rs b/api/src/syscall/task/clone3.rs new file mode 100644 index 00000000..d584d9e1 --- /dev/null +++ b/api/src/syscall/task/clone3.rs @@ -0,0 +1,375 @@ +use alloc::sync::Arc; + +use axerrno::{AxError, AxResult}; +use axfs::FS_CONTEXT; +use axhal::uspace::UserContext; +use axtask::{AxTaskExt, current, spawn_task}; +use bitflags::bitflags; +use kspin::SpinNoIrq; +use linux_raw_sys::general::*; +use starry_core::{ + mm::copy_from_kernel, + task::{AsThread, ProcessData, Thread, add_task_to_table}, +}; +use starry_process::Pid; +use starry_signal::Signo; + +use crate::{ + file::{FD_TABLE, FileLike, PidFd}, + mm::UserPtr, + task::new_user_task, +}; + +bitflags! { + /// Options for use with [`sys_clone3`]. + #[derive(Debug, Clone, Copy, Default)] + struct CloneFlags: u64 { + /// The calling process and the child process run in the same + /// memory space. + const VM = CLONE_VM as u64; + /// The caller and the child process share the same filesystem + /// information. 
+ const FS = CLONE_FS as u64; + /// The calling process and the child process share the same file + /// descriptor table. + const FILES = CLONE_FILES as u64; + /// The calling process and the child process share the same table + /// of signal handlers. + const SIGHAND = CLONE_SIGHAND as u64; + /// Sets pidfd to the child process's PID file descriptor. + const PIDFD = CLONE_PIDFD as u64; + /// If the calling process is being traced, then trace the child + /// also. + const PTRACE = CLONE_PTRACE as u64; + /// The execution of the calling process is suspended until the + /// child releases its virtual memory resources via a call to + /// execve(2) or _exit(2) (as with vfork(2)). + const VFORK = CLONE_VFORK as u64; + /// The parent of the new child (as returned by getppid(2)) + /// will be the same as that of the calling process. + const PARENT = CLONE_PARENT as u64; + /// The child is placed in the same thread group as the calling + /// process. + const THREAD = CLONE_THREAD as u64; + /// The cloned child is started in a new mount namespace. + const NEWNS = CLONE_NEWNS as u64; + /// The child and the calling process share a single list of System + /// V semaphore adjustment values + const SYSVSEM = CLONE_SYSVSEM as u64; + /// The TLS (Thread Local Storage) descriptor is set to tls. + const SETTLS = CLONE_SETTLS as u64; + /// Store the child thread ID in the parent's memory. + const PARENT_SETTID = CLONE_PARENT_SETTID as u64; + /// Clear (zero) the child thread ID in child memory when the child + /// exits, and do a wakeup on the futex at that address. + const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64; + /// A tracing process cannot force `CLONE_PTRACE` on this child + /// process. + const UNTRACED = CLONE_UNTRACED as u64; + /// Store the child thread ID in the child's memory. + const CHILD_SETTID = CLONE_CHILD_SETTID as u64; + /// Create the process in a new cgroup namespace. 
+ const NEWCGROUP = CLONE_NEWCGROUP as u64; + /// Create the process in a new UTS namespace. + const NEWUTS = CLONE_NEWUTS as u64; + /// Create the process in a new IPC namespace. + const NEWIPC = CLONE_NEWIPC as u64; + /// Create the process in a new user namespace. + const NEWUSER = CLONE_NEWUSER as u64; + /// Create the process in a new PID namespace. + const NEWPID = CLONE_NEWPID as u64; + /// Create the process in a new network namespace. + const NEWNET = CLONE_NEWNET as u64; + /// The new process shares an I/O context with the calling process. + const IO = CLONE_IO as u64; + /// Clear signal handlers on clone (since Linux 5.5) + const CLEAR_SIGHAND = 0x100000000u64; + /// Clone into specific cgroup (since Linux 5.7) + const INTO_CGROUP = 0x200000000u64; + } +} + +/// Structure passed to clone3() system call +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct CloneArgs { + /// Flags bit mask + pub flags: u64, + /// Where to store PID file descriptor (int *) + pub pidfd: u64, + /// Where to store child TID, in child's memory (pid_t *) + pub child_tid: u64, + /// Where to store child TID, in parent's memory (pid_t *) + pub parent_tid: u64, + /// Signal to deliver to parent on child termination + pub exit_signal: u64, + /// Pointer to lowest byte of stack + pub stack: u64, + /// Size of stack + pub stack_size: u64, + /// Location of new TLS + pub tls: u64, + /// Pointer to a pid_t array (since Linux 5.5) + pub set_tid: u64, + /// Number of elements in set_tid (since Linux 5.5) + pub set_tid_size: u64, + /// File descriptor for target cgroup of child (since Linux 5.7) + pub cgroup: u64, +} + +/// The minimum size of clone_args structure we support +const MIN_CLONE_ARGS_SIZE: usize = core::mem::size_of::() * 8; // First 8 fields + +/// Validate clone_args structure and flags +fn validate_clone_args(args: &CloneArgs) -> AxResult<()> { + let flags = CloneFlags::from_bits_truncate(args.flags); + + // Check for unsupported flag combinations + if 
args.exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { + return Err(AxError::InvalidInput); + } + + // CLONE_THREAD requires CLONE_VM and CLONE_SIGHAND + if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) { + return Err(AxError::InvalidInput); + } + + // Validate signal number + if args.exit_signal > 0 && args.exit_signal >= 64 { + return Err(AxError::InvalidInput); + } + + // Validate set_tid_size + if args.set_tid_size > 0 { + warn!("sys_clone3: set_tid/set_tid_size not fully supported, ignoring"); + // In a full implementation, we would validate: + // - set_tid_size <= nested PID namespace depth + // - PIDs in set_tid array are available + } + + // Validate cgroup fd + if args.cgroup > 0 { + warn!("sys_clone3: cgroup parameter not fully supported, ignoring"); + } + + // Namespace flags - stub support + let namespace_flags = CloneFlags::NEWNS + | CloneFlags::NEWIPC + | CloneFlags::NEWNET + | CloneFlags::NEWPID + | CloneFlags::NEWUSER + | CloneFlags::NEWUTS + | CloneFlags::NEWCGROUP; + + if flags.intersects(namespace_flags) { + warn!( + "sys_clone3: namespace flags detected ({:?}), stub support only", + flags & namespace_flags + ); + // Don't return error, just log warning for compatibility + } + + Ok(()) +} + +/// Implementation of clone3 system call +pub fn sys_clone3(uctx: &UserContext, args_ptr: usize, args_size: usize) -> AxResult { + debug!( + "sys_clone3 <= args_ptr: {:#x}, args_size: {}", + args_ptr, args_size + ); + + // Validate arguments size + if args_size < MIN_CLONE_ARGS_SIZE { + warn!( + "sys_clone3: args_size {} too small, minimum is {}", + args_size, MIN_CLONE_ARGS_SIZE + ); + return Err(AxError::InvalidInput); + } + + // Support larger structures for forward compatibility + if args_size > core::mem::size_of::() { + // Just use what we understand, ignore extra fields + debug!( + "sys_clone3: args_size {} larger than expected {}, using known fields only", + args_size, + 
core::mem::size_of::() + ); + } + + // Copy clone_args from user space + let args_uptr = UserPtr::::from(args_ptr); + let args = *args_uptr.get_as_mut()?; + + debug!("sys_clone3: args = {:?}", args); + + // Validate arguments + validate_clone_args(&args)?; + + let mut flags = CloneFlags::from_bits_truncate(args.flags); + + // Handle VFORK special case (same as sys_clone) + if flags.contains(CloneFlags::VFORK) { + debug!("sys_clone3: CLONE_VFORK slow path"); + flags.remove(CloneFlags::VM); + } + + debug!("sys_clone3: effective flags: {:?}", flags); + + // Parse exit signal + let exit_signal = if args.exit_signal > 0 { + Signo::from_repr(args.exit_signal as u8) + } else { + None + }; + + // Prepare new user context + let mut new_uctx = *uctx; + + // Set stack pointer if provided + if args.stack > 0 { + if args.stack_size > 0 { + // Stack grows downward, so set SP to stack + stack_size + new_uctx.set_sp((args.stack + args.stack_size) as usize); + } else { + new_uctx.set_sp(args.stack as usize); + } + } + + // Set TLS if requested + if flags.contains(CloneFlags::SETTLS) { + new_uctx.set_tls(args.tls as usize); + } + + // Child returns 0 + new_uctx.set_retval(0); + + // Prepare child_tid pointer if needed + let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) && args.child_tid > 0 { + Some(UserPtr::::from(args.child_tid as usize).get_as_mut()?) + } else { + None + }; + + let curr = current(); + let old_proc_data = &curr.as_thread().proc_data; + + // Create new task + let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); + let tid = new_task.id().as_u64() as Pid; + + // Set parent_tid if requested + if flags.contains(CloneFlags::PARENT_SETTID) && args.parent_tid > 0 { + *UserPtr::::from(args.parent_tid as usize).get_as_mut()? 
= tid; + } + + // Create process data based on flags + let new_proc_data = if flags.contains(CloneFlags::THREAD) { + // Thread creation: share address space + new_task + .ctx_mut() + .set_page_table_root(old_proc_data.aspace.lock().page_table_root()); + old_proc_data.clone() + } else { + // Process creation: fork or vfork + let proc = if flags.contains(CloneFlags::PARENT) { + old_proc_data.proc.parent().ok_or(AxError::InvalidInput)? + } else { + old_proc_data.proc.clone() + } + .fork(tid); + + // Handle address space + let aspace = if flags.contains(CloneFlags::VM) { + old_proc_data.aspace.clone() + } else { + let mut aspace = old_proc_data.aspace.lock(); + let aspace = aspace.try_clone()?; + copy_from_kernel(&mut aspace.lock())?; + aspace + }; + + new_task + .ctx_mut() + .set_page_table_root(aspace.lock().page_table_root()); + + // Handle signal handlers + let signal_actions = if flags.contains(CloneFlags::SIGHAND) { + old_proc_data.signal.actions.clone() + } else if flags.contains(CloneFlags::CLEAR_SIGHAND) { + // CLONE_CLEAR_SIGHAND: reset to default handlers + Arc::new(SpinNoIrq::new(Default::default())) + } else { + // Normal fork: copy signal handlers + Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) + }; + + let proc_data = ProcessData::new( + proc, + old_proc_data.exe_path.read().clone(), + old_proc_data.cmdline.read().clone(), + aspace, + signal_actions, + exit_signal, + ); + proc_data.set_umask(old_proc_data.umask()); + + // Handle file descriptors and filesystem context + { + let mut scope = proc_data.scope.write(); + + if flags.contains(CloneFlags::FILES) { + // Share file descriptor table + FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE); + } else { + // Copy file descriptor table + FD_TABLE + .scope_mut(&mut scope) + .write() + .clone_from(&FD_TABLE.read()); + } + + if flags.contains(CloneFlags::FS) { + // Share filesystem context + FS_CONTEXT.scope_mut(&mut scope).clone_from(&FS_CONTEXT); + } else { + // Copy filesystem 
context + FS_CONTEXT + .scope_mut(&mut scope) + .lock() + .clone_from(&FS_CONTEXT.lock()); + } + } + + proc_data + }; + + // Add thread to process + new_proc_data.proc.add_thread(tid); + + // Handle PIDFD if requested + if flags.contains(CloneFlags::PIDFD) && args.pidfd > 0 { + let pidfd = PidFd::new(&new_proc_data); + let fd = pidfd.add_to_fd_table(true)?; + *UserPtr::::from(args.pidfd as usize).get_as_mut()? = fd; + } + + // Create thread object + let thr = Thread::new(tid, new_proc_data); + + // Set clear_child_tid if requested + if flags.contains(CloneFlags::CHILD_CLEARTID) && args.child_tid > 0 { + thr.set_clear_child_tid(args.child_tid as usize); + } + + *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); + + // Spawn the task + let task = spawn_task(new_task); + add_task_to_table(&task); + + debug!("sys_clone3 => child tid: {}", tid); + + Ok(tid as _) +} diff --git a/api/src/syscall/task/mod.rs b/api/src/syscall/task/mod.rs index a6e77afc..2143a0e8 100644 --- a/api/src/syscall/task/mod.rs +++ b/api/src/syscall/task/mod.rs @@ -1,4 +1,5 @@ mod clone; +mod clone3; mod ctl; mod execve; mod exit; @@ -7,4 +8,6 @@ mod schedule; mod thread; mod wait; -pub use self::{clone::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*}; +pub use self::{ + clone::*, clone3::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*, +}; From c4f43ca5e94e108561fe58b97c8f1010780657a0 Mon Sep 17 00:00:00 2001 From: WyZ0125 Date: Sun, 14 Dec 2025 20:05:51 +0800 Subject: [PATCH 2/3] syscall: refactor clone/clone3 to share common do_clone logic --- api/src/syscall/task/clone.rs | 353 ++++++++++++++++++++++++------ api/src/syscall/task/clone3.rs | 387 ++++++--------------------------- 2 files changed, 351 insertions(+), 389 deletions(-) diff --git a/api/src/syscall/task/clone.rs b/api/src/syscall/task/clone.rs index 8a567993..0ed287f1 100644 --- a/api/src/syscall/task/clone.rs +++ b/api/src/syscall/task/clone.rs @@ -21,134 +21,234 @@ 
use crate::{ }; bitflags! { - /// Options for use with [`sys_clone`]. + /// Clone flags for process/thread creation. #[derive(Debug, Clone, Copy, Default)] - struct CloneFlags: u32 { + pub struct CloneFlags: u64 { /// The calling process and the child process run in the same /// memory space. - const VM = CLONE_VM; - /// The caller and the child process share the same filesystem + const VM = CLONE_VM as u64; + /// The caller and the child process share the same filesystem /// information. - const FS = CLONE_FS; + const FS = CLONE_FS as u64; /// The calling process and the child process share the same file /// descriptor table. - const FILES = CLONE_FILES; + const FILES = CLONE_FILES as u64; /// The calling process and the child process share the same table /// of signal handlers. - const SIGHAND = CLONE_SIGHAND; + const SIGHAND = CLONE_SIGHAND as u64; /// Sets pidfd to the child process's PID file descriptor. - const PIDFD = CLONE_PIDFD; + const PIDFD = CLONE_PIDFD as u64; /// If the calling process is being traced, then trace the child /// also. - const PTRACE = CLONE_PTRACE; + const PTRACE = CLONE_PTRACE as u64; /// The execution of the calling process is suspended until the /// child releases its virtual memory resources via a call to /// execve(2) or _exit(2) (as with vfork(2)). - const VFORK = CLONE_VFORK; - /// The parent of the new child (as returned by getppid(2)) + const VFORK = CLONE_VFORK as u64; + /// The parent of the new child (as returned by getppid(2)) /// will be the same as that of the calling process. - const PARENT = CLONE_PARENT; + const PARENT = CLONE_PARENT as u64; /// The child is placed in the same thread group as the calling /// process. - const THREAD = CLONE_THREAD; + const THREAD = CLONE_THREAD as u64; /// The cloned child is started in a new mount namespace. 
- const NEWNS = CLONE_NEWNS; + const NEWNS = CLONE_NEWNS as u64; /// The child and the calling process share a single list of System /// V semaphore adjustment values - const SYSVSEM = CLONE_SYSVSEM; + const SYSVSEM = CLONE_SYSVSEM as u64; /// The TLS (Thread Local Storage) descriptor is set to tls. - const SETTLS = CLONE_SETTLS; + const SETTLS = CLONE_SETTLS as u64; /// Store the child thread ID in the parent's memory. - const PARENT_SETTID = CLONE_PARENT_SETTID; + const PARENT_SETTID = CLONE_PARENT_SETTID as u64; /// Clear (zero) the child thread ID in child memory when the child /// exits, and do a wakeup on the futex at that address. - const CHILD_CLEARTID = CLONE_CHILD_CLEARTID; + const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64; /// A tracing process cannot force `CLONE_PTRACE` on this child /// process. - const UNTRACED = CLONE_UNTRACED; + const UNTRACED = CLONE_UNTRACED as u64; /// Store the child thread ID in the child's memory. - const CHILD_SETTID = CLONE_CHILD_SETTID; + const CHILD_SETTID = CLONE_CHILD_SETTID as u64; /// Create the process in a new cgroup namespace. - const NEWCGROUP = CLONE_NEWCGROUP; + const NEWCGROUP = CLONE_NEWCGROUP as u64; /// Create the process in a new UTS namespace. - const NEWUTS = CLONE_NEWUTS; + const NEWUTS = CLONE_NEWUTS as u64; /// Create the process in a new IPC namespace. - const NEWIPC = CLONE_NEWIPC; + const NEWIPC = CLONE_NEWIPC as u64; /// Create the process in a new user namespace. - const NEWUSER = CLONE_NEWUSER; + const NEWUSER = CLONE_NEWUSER as u64; /// Create the process in a new PID namespace. - const NEWPID = CLONE_NEWPID; + const NEWPID = CLONE_NEWPID as u64; /// Create the process in a new network namespace. - const NEWNET = CLONE_NEWNET; + const NEWNET = CLONE_NEWNET as u64; /// The new process shares an I/O context with the calling process. 
- const IO = CLONE_IO; + const IO = CLONE_IO as u64; + /// Clear signal handlers on clone (since Linux 5.5) + const CLEAR_SIGHAND = 0x100000000u64; + /// Clone into specific cgroup (since Linux 5.7) + const INTO_CGROUP = 0x200000000u64; } } -pub fn sys_clone( - uctx: &UserContext, - flags: u32, - stack: usize, - parent_tid: usize, - #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, - tls: usize, - #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, -) -> AxResult { - const FLAG_MASK: u32 = 0xff; - let exit_signal = flags & FLAG_MASK; - let mut flags = CloneFlags::from_bits_truncate(flags & !FLAG_MASK); - if flags.contains(CloneFlags::VFORK) { - debug!("sys_clone: CLONE_VFORK slow path"); - flags.remove(CloneFlags::VM); - } +/// Trait for providing clone parameters in a flexible way. +/// +/// This allows clone() and clone3() to have different parameter semantics +/// while sharing the core implementation logic. +pub trait CloneParamProvider { + /// Get clone flags + fn flags(&self) -> CloneFlags; - debug!( - "sys_clone <= flags: {flags:?}, exit_signal: {exit_signal}, stack: {stack:#x}, ptid: \ - {parent_tid:#x}, ctid: {child_tid:#x}, tls: {tls:#x}" - ); + /// Get exit signal (0 means no signal) + fn exit_signal(&self) -> u64; - if exit_signal != 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { + /// Get new stack pointer (0 means inherit parent's) + fn stack_pointer(&self) -> usize; + + /// Get TLS value + fn tls(&self) -> usize; + + /// Get child_tid pointer for CHILD_SETTID + fn child_settid_ptr(&self) -> usize; + fn child_cleartid_ptr(&self) -> usize; + + /// Get parent_tid pointer for PARENT_SETTID (used by both clone and clone3) + fn parent_tid_ptr(&self) -> usize; + + /// Get pidfd pointer (0 if not used) + /// - For clone(): returns 0 (uses parent_tid_ptr instead) + /// - For clone3(): returns the pidfd field + fn pidfd_ptr(&self) -> usize; + + /// Validate parameters 
(different rules for clone vs clone3) + fn validate(&self) -> AxResult<()>; +} + +/// Common validation logic shared by all clone variants +fn validate_common(flags: CloneFlags, exit_signal: u64) -> AxResult<()> { + // Check for invalid flag combinations + // The original check is retained for now. + // In the future, some cases could ignore the exit signal (treating it as 0) instead of reporting an error. + if exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { return Err(AxError::InvalidInput); } + if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) { return Err(AxError::InvalidInput); } - if flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) { + + // https://man7.org/linux/man-pages/man2/clone.2.html + // CLONE_SIGHAND + // Since Linux 2.6.0, the flags mask must also include CLONE_VM if CLONE_SIGHAND is specified. + if flags.contains(CloneFlags::SIGHAND) && !flags.contains(CloneFlags::VM) { + return Err(AxError::InvalidInput); + } + + if flags.contains(CloneFlags::VFORK) && flags.contains(CloneFlags::THREAD) { + return Err(AxError::InvalidInput); + } + + // Validate exit signal range + if exit_signal >= 64 { return Err(AxError::InvalidInput); } - let exit_signal = Signo::from_repr(exit_signal as u8); + // Namespace flags warning + let namespace_flags = CloneFlags::NEWNS + | CloneFlags::NEWIPC + | CloneFlags::NEWNET + | CloneFlags::NEWPID + | CloneFlags::NEWUSER + | CloneFlags::NEWUTS + | CloneFlags::NEWCGROUP; + + if flags.intersects(namespace_flags) { + warn!( + "sys_clone/sys_clone3: namespace flags detected ({:?}), stub support only", + flags & namespace_flags + ); + } + + Ok(()) +} + +/// Core implementation of clone/clone3/fork/vfork. +/// +/// This function contains the shared logic for creating new tasks. +/// Different parameter semantics are handled through the `CloneParamProvider` trait.
+pub fn do_clone(uctx: &UserContext, params: &P) -> AxResult { + // Validate parameters + params.validate()?; + + let mut flags = params.flags(); + let exit_signal = params.exit_signal(); + + // Common validation + validate_common(flags, exit_signal)?; + + // Handle VFORK special case + // NOTE: + // CLONE_VFORK currently clears CLONE_VM, so the child gets a copy of the + // address space instead of sharing it, and the parent is NOT suspended. + // This is a partial implementation. + if flags.contains(CloneFlags::VFORK) { + debug!("do_clone: CLONE_VFORK slow path"); + flags.remove(CloneFlags::VM); + } + + debug!( + "do_clone: flags={flags:?}, exit_signal={exit_signal}, stack={:#x}, tls={:#x}", + params.stack_pointer(), + params.tls() + ); + + let exit_signal = if exit_signal > 0 { + Signo::from_repr(exit_signal as u8) + } else { + None + }; + + // Prepare new user context let mut new_uctx = *uctx; - if stack != 0 { - new_uctx.set_sp(stack); + let stack_ptr = params.stack_pointer(); + if stack_ptr != 0 { + new_uctx.set_sp(stack_ptr); } if flags.contains(CloneFlags::SETTLS) { - new_uctx.set_tls(tls); + new_uctx.set_tls(params.tls()); } new_uctx.set_retval(0); - let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) { - Some(UserPtr::::from(child_tid).get_as_mut()?) - } else { - None + // Prepare child_tid pointer if needed + let set_child_tid = { + let p = params.child_settid_ptr(); + if flags.contains(CloneFlags::CHILD_SETTID) && p != 0 { + Some(UserPtr::::from(p).get_as_mut()?) + } else { + None + } }; let curr = current(); let old_proc_data = &curr.as_thread().proc_data; + // Create new task let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); - let tid = new_task.id().as_u64() as Pid; - if flags.contains(CloneFlags::PARENT_SETTID) { - *UserPtr::::from(parent_tid).get_as_mut()?
= tid; + + // Write parent TID if PARENT_SETTID is set + let parent_tid_ptr = params.parent_tid_ptr(); + if flags.contains(CloneFlags::PARENT_SETTID) && parent_tid_ptr != 0 { + *UserPtr::::from(parent_tid_ptr).get_as_mut()? = tid; } + // Create process data based on flags (keep original inline logic) let new_proc_data = if flags.contains(CloneFlags::THREAD) { + // Thread creation: share address space new_task .ctx_mut() .set_page_table_root(old_proc_data.aspace.lock().page_table_root()); old_proc_data.clone() } else { + // Process creation let proc = if flags.contains(CloneFlags::PARENT) { old_proc_data.proc.parent().ok_or(AxError::InvalidInput)? } else { @@ -156,6 +256,7 @@ pub fn sys_clone( } .fork(tid); + // Handle address space let aspace = if flags.contains(CloneFlags::VM) { old_proc_data.aspace.clone() } else { @@ -164,15 +265,22 @@ pub fn sys_clone( copy_from_kernel(&mut aspace.lock())?; aspace }; + new_task .ctx_mut() .set_page_table_root(aspace.lock().page_table_root()); + // Handle signal handlers let signal_actions = if flags.contains(CloneFlags::SIGHAND) { old_proc_data.signal.actions.clone() + } else if flags.contains(CloneFlags::CLEAR_SIGHAND) { + // CLONE_CLEAR_SIGHAND: reset to default handlers + Arc::new(SpinNoIrq::new(Default::default())) } else { + // Normal fork: copy signal handlers Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) }; + let proc_data = ProcessData::new( proc, old_proc_data.exe_path.read().clone(), @@ -183,8 +291,10 @@ pub fn sys_clone( ); proc_data.set_umask(old_proc_data.umask()); + // Handle file descriptors and filesystem context { let mut scope = proc_data.scope.write(); + if flags.contains(CloneFlags::FILES) { FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE); } else { @@ -207,25 +317,142 @@ pub fn sys_clone( proc_data }; + // Add thread to process new_proc_data.proc.add_thread(tid); + // Handle PIDFD if requested + // Different behavior for clone() vs clone3() if flags.contains(CloneFlags::PIDFD) { 
let pidfd = PidFd::new(&new_proc_data); - *UserPtr::::from(parent_tid).get_as_mut()? = pidfd.add_to_fd_table(true)?; + let fd = pidfd.add_to_fd_table(true)?; + + // Get the correct pointer based on clone variant + let pidfd_target_ptr = params.pidfd_ptr(); + if pidfd_target_ptr != 0 { + // clone3: write to pidfd field + *UserPtr::::from(pidfd_target_ptr).get_as_mut()? = fd; + } else if parent_tid_ptr != 0 { + // clone: write to parent_tid (historical behavior) + *UserPtr::::from(parent_tid_ptr).get_as_mut()? = fd; + } } + // Create thread object let thr = Thread::new(tid, new_proc_data); - if flags.contains(CloneFlags::CHILD_CLEARTID) { - thr.set_clear_child_tid(child_tid); + + // Set clear_child_tid if requested + let clear_child_tid_ptr = params.child_cleartid_ptr(); + if flags.contains(CloneFlags::CHILD_CLEARTID) && clear_child_tid_ptr != 0 { + thr.set_clear_child_tid(clear_child_tid_ptr); } + *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); + // Spawn the task let task = spawn_task(new_task); add_task_to_table(&task); Ok(tid as _) } +// ================================ +// Clone (legacy) parameters +// ================================ + +/// Parameters for the clone() system call. +/// +/// Note: In clone(), the parent_tid parameter serves dual purpose: +/// - If CLONE_PIDFD: receives the pidfd +/// - If CLONE_PARENT_SETTID: receives the child TID +/// These two flags are mutually exclusive in clone(). 
+pub struct CloneParams { + flags: u32, + stack: usize, + parent_tid: usize, + child_tid: usize, + tls: usize, +} + +impl CloneParams { + pub fn new(flags: u32, stack: usize, parent_tid: usize, child_tid: usize, tls: usize) -> Self { + Self { + flags, + stack, + parent_tid, + child_tid, + tls, + } + } +} + +impl CloneParamProvider for CloneParams { + fn flags(&self) -> CloneFlags { + const FLAG_MASK: u32 = 0xff; + CloneFlags::from_bits_truncate((self.flags & !FLAG_MASK) as u64) + } + + fn exit_signal(&self) -> u64 { + const FLAG_MASK: u32 = 0xff; + (self.flags & FLAG_MASK) as u64 + } + + fn stack_pointer(&self) -> usize { + // For clone(), stack directly specifies the new SP + self.stack + } + + fn tls(&self) -> usize { + self.tls + } + + fn child_settid_ptr(&self) -> usize { + self.child_tid + } + + fn child_cleartid_ptr(&self) -> usize { + self.child_tid + } + + fn parent_tid_ptr(&self) -> usize { + self.parent_tid + } + + fn pidfd_ptr(&self) -> usize { + // For clone(), PIDFD uses parent_tid, so return 0 here + // The core logic will use parent_tid_ptr() instead + 0 + } + + fn validate(&self) -> AxResult<()> { + let flags = self.flags(); + + // In clone(), PIDFD and PARENT_SETTID are mutually exclusive + // because they share the parent_tid parameter + if flags.contains(CloneFlags::PIDFD) && flags.contains(CloneFlags::PARENT_SETTID) { + return Err(AxError::InvalidInput); + } + + Ok(()) + } +} + +// ================================ +// System call wrappers +// ================================ + +pub fn sys_clone( + uctx: &UserContext, + flags: u32, + stack: usize, + parent_tid: usize, + #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, + tls: usize, + #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, +) -> AxResult { + let params = CloneParams::new(flags, stack, parent_tid, child_tid, tls); + do_clone(uctx, &params) +} + #[cfg(target_arch = "x86_64")] pub fn sys_fork(uctx: &UserContext) ->
AxResult { sys_clone(uctx, SIGCHLD, 0, 0, 0, 0) diff --git a/api/src/syscall/task/clone3.rs b/api/src/syscall/task/clone3.rs index d584d9e1..6183df55 100644 --- a/api/src/syscall/task/clone3.rs +++ b/api/src/syscall/task/clone3.rs @@ -1,375 +1,110 @@ -use alloc::sync::Arc; - use axerrno::{AxError, AxResult}; -use axfs::FS_CONTEXT; use axhal::uspace::UserContext; -use axtask::{AxTaskExt, current, spawn_task}; -use bitflags::bitflags; -use kspin::SpinNoIrq; -use linux_raw_sys::general::*; -use starry_core::{ - mm::copy_from_kernel, - task::{AsThread, ProcessData, Thread, add_task_to_table}, -}; -use starry_process::Pid; -use starry_signal::Signo; - -use crate::{ - file::{FD_TABLE, FileLike, PidFd}, - mm::UserPtr, - task::new_user_task, -}; +use starry_vm::VmPtr; -bitflags! { - /// Options for use with [`sys_clone3`]. - #[derive(Debug, Clone, Copy, Default)] - struct CloneFlags: u64 { - /// The calling process and the child process run in the same - /// memory space. - const VM = CLONE_VM as u64; - /// The caller and the child process share the same filesystem - /// information. - const FS = CLONE_FS as u64; - /// The calling process and the child process share the same file - /// descriptor table. - const FILES = CLONE_FILES as u64; - /// The calling process and the child process share the same table - /// of signal handlers. - const SIGHAND = CLONE_SIGHAND as u64; - /// Sets pidfd to the child process's PID file descriptor. - const PIDFD = CLONE_PIDFD as u64; - /// If the calling process is being traced, then trace the child - /// also. - const PTRACE = CLONE_PTRACE as u64; - /// The execution of the calling process is suspended until the - /// child releases its virtual memory resources via a call to - /// execve(2) or _exit(2) (as with vfork(2)). - const VFORK = CLONE_VFORK as u64; - /// The parent of the new child (as returned by getppid(2)) - /// will be the same as that of the calling process. 
- const PARENT = CLONE_PARENT as u64; - /// The child is placed in the same thread group as the calling - /// process. - const THREAD = CLONE_THREAD as u64; - /// The cloned child is started in a new mount namespace. - const NEWNS = CLONE_NEWNS as u64; - /// The child and the calling process share a single list of System - /// V semaphore adjustment values - const SYSVSEM = CLONE_SYSVSEM as u64; - /// The TLS (Thread Local Storage) descriptor is set to tls. - const SETTLS = CLONE_SETTLS as u64; - /// Store the child thread ID in the parent's memory. - const PARENT_SETTID = CLONE_PARENT_SETTID as u64; - /// Clear (zero) the child thread ID in child memory when the child - /// exits, and do a wakeup on the futex at that address. - const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64; - /// A tracing process cannot force `CLONE_PTRACE` on this child - /// process. - const UNTRACED = CLONE_UNTRACED as u64; - /// Store the child thread ID in the child's memory. - const CHILD_SETTID = CLONE_CHILD_SETTID as u64; - /// Create the process in a new cgroup namespace. - const NEWCGROUP = CLONE_NEWCGROUP as u64; - /// Create the process in a new UTS namespace. - const NEWUTS = CLONE_NEWUTS as u64; - /// Create the process in a new IPC namespace. - const NEWIPC = CLONE_NEWIPC as u64; - /// Create the process in a new user namespace. - const NEWUSER = CLONE_NEWUSER as u64; - /// Create the process in a new PID namespace. - const NEWPID = CLONE_NEWPID as u64; - /// Create the process in a new network namespace. - const NEWNET = CLONE_NEWNET as u64; - /// The new process shares an I/O context with the calling process. 
- const IO = CLONE_IO as u64; - /// Clear signal handlers on clone (since Linux 5.5) - const CLEAR_SIGHAND = 0x100000000u64; - /// Clone into specific cgroup (since Linux 5.7) - const INTO_CGROUP = 0x200000000u64; - } -} +use super::clone::{CloneFlags, CloneParamProvider, do_clone}; -/// Structure passed to clone3() system call +/// Structure passed to clone3() system call. #[repr(C)] #[derive(Debug, Clone, Copy, Default)] -pub struct CloneArgs { - /// Flags bit mask +pub struct Clone3Args { pub flags: u64, - /// Where to store PID file descriptor (int *) pub pidfd: u64, - /// Where to store child TID, in child's memory (pid_t *) pub child_tid: u64, - /// Where to store child TID, in parent's memory (pid_t *) pub parent_tid: u64, - /// Signal to deliver to parent on child termination pub exit_signal: u64, - /// Pointer to lowest byte of stack pub stack: u64, - /// Size of stack pub stack_size: u64, - /// Location of new TLS pub tls: u64, - /// Pointer to a pid_t array (since Linux 5.5) pub set_tid: u64, - /// Number of elements in set_tid (since Linux 5.5) pub set_tid_size: u64, - /// File descriptor for target cgroup of child (since Linux 5.7) pub cgroup: u64, } -/// The minimum size of clone_args structure we support -const MIN_CLONE_ARGS_SIZE: usize = core::mem::size_of::() * 8; // First 8 fields - -/// Validate clone_args structure and flags -fn validate_clone_args(args: &CloneArgs) -> AxResult<()> { - let flags = CloneFlags::from_bits_truncate(args.flags); - - // Check for unsupported flag combinations - if args.exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { - return Err(AxError::InvalidInput); - } - - // CLONE_THREAD requires CLONE_VM and CLONE_SIGHAND - if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) { - return Err(AxError::InvalidInput); - } - - // Validate signal number - if args.exit_signal > 0 && args.exit_signal >= 64 { - return Err(AxError::InvalidInput); - } - - // Validate 
set_tid_size - if args.set_tid_size > 0 { - warn!("sys_clone3: set_tid/set_tid_size not fully supported, ignoring"); - // In a full implementation, we would validate: - // - set_tid_size <= nested PID namespace depth - // - PIDs in set_tid array are available - } +const MIN_CLONE_ARGS_SIZE: usize = core::mem::size_of::() * 8; - // Validate cgroup fd - if args.cgroup > 0 { - warn!("sys_clone3: cgroup parameter not fully supported, ignoring"); +impl CloneParamProvider for Clone3Args { + fn flags(&self) -> CloneFlags { + CloneFlags::from_bits_truncate(self.flags) } - // Namespace flags - stub support - let namespace_flags = CloneFlags::NEWNS - | CloneFlags::NEWIPC - | CloneFlags::NEWNET - | CloneFlags::NEWPID - | CloneFlags::NEWUSER - | CloneFlags::NEWUTS - | CloneFlags::NEWCGROUP; - - if flags.intersects(namespace_flags) { - warn!( - "sys_clone3: namespace flags detected ({:?}), stub support only", - flags & namespace_flags - ); - // Don't return error, just log warning for compatibility + fn exit_signal(&self) -> u64 { + self.exit_signal } - Ok(()) -} - -/// Implementation of clone3 system call -pub fn sys_clone3(uctx: &UserContext, args_ptr: usize, args_size: usize) -> AxResult { - debug!( - "sys_clone3 <= args_ptr: {:#x}, args_size: {}", - args_ptr, args_size - ); - - // Validate arguments size - if args_size < MIN_CLONE_ARGS_SIZE { - warn!( - "sys_clone3: args_size {} too small, minimum is {}", - args_size, MIN_CLONE_ARGS_SIZE - ); - return Err(AxError::InvalidInput); + fn stack_pointer(&self) -> usize { + // For clone3(), stack + stack_size gives the SP + if self.stack > 0 { + if self.stack_size > 0 { + // Stack grows downward, SP = base + size + (self.stack + self.stack_size) as usize + } else { + // If only stack provided, treat as SP directly + self.stack as usize + } + } else { + 0 + } } - // Support larger structures for forward compatibility - if args_size > core::mem::size_of::() { - // Just use what we understand, ignore extra fields - debug!( - 
"sys_clone3: args_size {} larger than expected {}, using known fields only", - args_size, - core::mem::size_of::() - ); + fn tls(&self) -> usize { + self.tls as usize } - // Copy clone_args from user space - let args_uptr = UserPtr::::from(args_ptr); - let args = *args_uptr.get_as_mut()?; - - debug!("sys_clone3: args = {:?}", args); - - // Validate arguments - validate_clone_args(&args)?; - - let mut flags = CloneFlags::from_bits_truncate(args.flags); - - // Handle VFORK special case (same as sys_clone) - if flags.contains(CloneFlags::VFORK) { - debug!("sys_clone3: CLONE_VFORK slow path"); - flags.remove(CloneFlags::VM); + fn child_settid_ptr(&self) -> usize { + self.child_tid as usize } - debug!("sys_clone3: effective flags: {:?}", flags); - - // Parse exit signal - let exit_signal = if args.exit_signal > 0 { - Signo::from_repr(args.exit_signal as u8) - } else { - None - }; - - // Prepare new user context - let mut new_uctx = *uctx; - - // Set stack pointer if provided - if args.stack > 0 { - if args.stack_size > 0 { - // Stack grows downward, so set SP to stack + stack_size - new_uctx.set_sp((args.stack + args.stack_size) as usize); - } else { - new_uctx.set_sp(args.stack as usize); - } + fn child_cleartid_ptr(&self) -> usize { + self.child_tid as usize // for glibc compatibility } - // Set TLS if requested - if flags.contains(CloneFlags::SETTLS) { - new_uctx.set_tls(args.tls as usize); + fn parent_tid_ptr(&self) -> usize { + self.parent_tid as usize } - // Child returns 0 - new_uctx.set_retval(0); - - // Prepare child_tid pointer if needed - let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) && args.child_tid > 0 { - Some(UserPtr::::from(args.child_tid as usize).get_as_mut()?) 
- } else { - None - }; - - let curr = current(); - let old_proc_data = &curr.as_thread().proc_data; - - // Create new task - let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); - let tid = new_task.id().as_u64() as Pid; - - // Set parent_tid if requested - if flags.contains(CloneFlags::PARENT_SETTID) && args.parent_tid > 0 { - *UserPtr::::from(args.parent_tid as usize).get_as_mut()? = tid; + fn pidfd_ptr(&self) -> usize { + // For clone3(), pidfd is a separate field + self.pidfd as usize } - // Create process data based on flags - let new_proc_data = if flags.contains(CloneFlags::THREAD) { - // Thread creation: share address space - new_task - .ctx_mut() - .set_page_table_root(old_proc_data.aspace.lock().page_table_root()); - old_proc_data.clone() - } else { - // Process creation: fork or vfork - let proc = if flags.contains(CloneFlags::PARENT) { - old_proc_data.proc.parent().ok_or(AxError::InvalidInput)? - } else { - old_proc_data.proc.clone() + fn validate(&self) -> AxResult<()> { + // Warn about unsupported features + if self.set_tid != 0 || self.set_tid_size != 0 { + warn!("sys_clone3: set_tid/set_tid_size not supported, ignoring"); } - .fork(tid); - - // Handle address space - let aspace = if flags.contains(CloneFlags::VM) { - old_proc_data.aspace.clone() - } else { - let mut aspace = old_proc_data.aspace.lock(); - let aspace = aspace.try_clone()?; - copy_from_kernel(&mut aspace.lock())?; - aspace - }; - - new_task - .ctx_mut() - .set_page_table_root(aspace.lock().page_table_root()); - - // Handle signal handlers - let signal_actions = if flags.contains(CloneFlags::SIGHAND) { - old_proc_data.signal.actions.clone() - } else if flags.contains(CloneFlags::CLEAR_SIGHAND) { - // CLONE_CLEAR_SIGHAND: reset to default handlers - Arc::new(SpinNoIrq::new(Default::default())) - } else { - // Normal fork: copy signal handlers - Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) - }; - - let proc_data = ProcessData::new( - proc, - 
old_proc_data.exe_path.read().clone(), - old_proc_data.cmdline.read().clone(), - aspace, - signal_actions, - exit_signal, - ); - proc_data.set_umask(old_proc_data.umask()); - - // Handle file descriptors and filesystem context - { - let mut scope = proc_data.scope.write(); - - if flags.contains(CloneFlags::FILES) { - // Share file descriptor table - FD_TABLE.scope_mut(&mut scope).clone_from(&FD_TABLE); - } else { - // Copy file descriptor table - FD_TABLE - .scope_mut(&mut scope) - .write() - .clone_from(&FD_TABLE.read()); - } - - if flags.contains(CloneFlags::FS) { - // Share filesystem context - FS_CONTEXT.scope_mut(&mut scope).clone_from(&FS_CONTEXT); - } else { - // Copy filesystem context - FS_CONTEXT - .scope_mut(&mut scope) - .lock() - .clone_from(&FS_CONTEXT.lock()); - } + if self.cgroup != 0 { + warn!("sys_clone3: cgroup parameter not supported, ignoring"); } - proc_data - }; - - // Add thread to process - new_proc_data.proc.add_thread(tid); + // In clone3(), PIDFD and PARENT_SETTID can coexist + // because they use separate fields (no validation needed) - // Handle PIDFD if requested - if flags.contains(CloneFlags::PIDFD) && args.pidfd > 0 { - let pidfd = PidFd::new(&new_proc_data); - let fd = pidfd.add_to_fd_table(true)?; - *UserPtr::::from(args.pidfd as usize).get_as_mut()? 
= fd; + Ok(()) } +} - // Create thread object - let thr = Thread::new(tid, new_proc_data); +pub fn sys_clone3(uctx: &UserContext, args_ptr: usize, args_size: usize) -> AxResult { + debug!("sys_clone3 <= args_ptr: {args_ptr:#x}, args_size: {args_size}"); - // Set clear_child_tid if requested - if flags.contains(CloneFlags::CHILD_CLEARTID) && args.child_tid > 0 { - thr.set_clear_child_tid(args.child_tid as usize); + // Validate size + if args_size < MIN_CLONE_ARGS_SIZE { + warn!("sys_clone3: args_size {args_size} too small, minimum is {MIN_CLONE_ARGS_SIZE}"); + return Err(AxError::InvalidInput); } - *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); + if args_size > core::mem::size_of::() { + debug!("sys_clone3: args_size {args_size} larger than expected, using known fields only"); + } - // Spawn the task - let task = spawn_task(new_task); - add_task_to_table(&task); + // Copy arguments from user space + let args_ptr = args_ptr as *const Clone3Args; + let args = unsafe { args_ptr.vm_read_uninit()?.assume_init() }; + debug!("sys_clone3: args = {args:?}"); - debug!("sys_clone3 => child tid: {}", tid); + // Use common implementation + let result = do_clone(uctx, &args)?; + debug!("sys_clone3 => child tid: {result}"); - Ok(tid as _) + Ok(result) } From c9ddecddb0329619143a6ee1eba34cebcb94ab21 Mon Sep 17 00:00:00 2001 From: WyZ0125 Date: Sun, 14 Dec 2025 20:14:03 +0800 Subject: [PATCH 3/3] fix clippy error --- api/src/syscall/task/clone.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/src/syscall/task/clone.rs b/api/src/syscall/task/clone.rs index 0ed287f1..afebdc3d 100644 --- a/api/src/syscall/task/clone.rs +++ b/api/src/syscall/task/clone.rs @@ -364,7 +364,7 @@ pub fn do_clone(uctx: &UserContext, params: &P) -> AxResu /// Note: In clone(), the parent_tid parameter serves dual purpose: /// - If CLONE_PIDFD: receives the pidfd /// - If CLONE_PARENT_SETTID: receives the child TID -/// These two flags are mutually exclusive in 
clone(). +/// These two flags are mutually exclusive in clone(). pub struct CloneParams { flags: u32, stack: usize,