diff --git a/Makefile b/Makefile index 45a921f..72297d0 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ SRCS := \ syscall/translate.c \ syscall/mem.c \ syscall/path.c \ + syscall/fuse.c \ syscall/sidecar.c \ syscall/fs.c \ syscall/fs-stat.c \ @@ -149,6 +150,11 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-fuse-basic runs a guest daemon thread and consumer in one process +$(BUILD_DIR)/test-fuse-basic: tests/test-fuse-basic.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-sched-policy spawns a pthread to verify per-thread TID lookup $(BUILD_DIR)/test-sched-policy: tests/test-sched-policy.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" diff --git a/docs/testing.md b/docs/testing.md index 661bb88..1021f1b 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -48,6 +48,8 @@ What they do: downloaded into `build/busybox` on first run. 
- `make test-busybox`: just the BusyBox suite, useful when iterating on a single applet failure without rerunning the unit suite +- `make test-fuse-alpine`: validate guest `/dev/fuse` + `mount("fuse")` + against the Alpine musl sysroot fixture - `make test-gdbstub`: debugger integration checks against the built-in GDB stub - `make test-matrix`: broader `elfuse` and QEMU cross-check - `make lint`: static analysis through `clang-tidy` @@ -61,7 +63,7 @@ make elfuse make check ``` -For changes that touch procfs, path handling, networking, dynamic linking, or +For changes that touch procfs, path handling, `/dev`, FUSE, networking, dynamic linking, or guest process semantics, run the matrix as well: ```sh diff --git a/mk/tests.mk b/mk/tests.mk index 844b16c..35f48aa 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -8,7 +8,7 @@ test-full test-multi-vcpu test-rwx test-sysroot-rename \ test-case-collision test-case-collision-fallback test-sysroot-create-paths \ test-proctitle-low-stack \ - test-sysroot-procfs-exec test-timeout-disable \ + test-sysroot-procfs-exec test-timeout-disable test-fuse-alpine \ test-sysroot-nofollow test-sysroot-chdir perf ## Build and run the assembly hello world test @@ -29,6 +29,8 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage @$(MAKE) --no-print-directory test-busybox @printf "\n$(BLUE)━━━ sysroot procfs exec validation ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-sysroot-procfs-exec + @printf "\n$(BLUE)━━━ Alpine sysroot FUSE validation ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-fuse-alpine @printf "\n$(BLUE)━━━ timeout=0 validation ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-timeout-disable @@ -297,6 +299,14 @@ test-dynamic: $(ELFUSE_BIN) @printf "$(BLUE)▸ Running$(RESET) dynamic hello-dynamic (--sysroot)\n" $(ELFUSE_BIN) --sysroot $(SYSROOT_DIR) $(GUEST_DYNAMIC_TESTS)/bin/hello-dynamic +## Run guest FUSE validation against the Alpine musl sysroot +test-fuse-alpine: $(ELFUSE_BIN) $(BUILD_DIR)/test-fuse-basic + @if 
[ -z "$(SYSROOT_DIR)" ] || [ ! -d "$(SYSROOT_DIR)" ]; then \ + printf "$(YELLOW)SKIP$(RESET) Alpine sysroot not found. Set SYSROOT_DIR=/path/to/sysroot or run tests/fetch-fixtures.sh.\n"; \ + exit 0; \ + fi + @bash tests/test-fuse-alpine.sh $(ELFUSE_BIN) $(SYSROOT_DIR) $(BUILD_DIR)/test-fuse-basic + ## Run dynamically-linked coreutils tests (--sysroot) test-dynamic-coreutils: $(ELFUSE_BIN) @if [ -z "$(SYSROOT_DIR)" ] || [ ! -d "$(SYSROOT_DIR)" ]; then \ diff --git a/src/core/guest.h b/src/core/guest.h index e0ea521..f0b5dd5 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -314,9 +314,9 @@ static inline void guest_pt_gen_bump(guest_t *g) * a synchronous IPI into a sibling vCPU thread, so the window remains. * The guest is responsible for serializing concurrent PT mutations * against concurrent accesses (futex / pthread_mutex), which is the same - * contract real Linux requires of well-behaved multi-threaded code. See - * TODO.md "Bounded retry on stale TLB data abort" (P3 hardening) for the - * tracked follow-up if a workload ever surfaces an actual reproducer. + * contract real Linux requires of well-behaved multi-threaded code. A + * bounded-retry on stale-TLB data aborts is a known hardening direction + * if a workload ever surfaces an actual reproducer. 
*/ extern _Thread_local tlbi_request_t cpu_tlbi_req; diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 852f475..86d2a07 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -49,6 +49,7 @@ #include "syscall/abi.h" #include "syscall/fd.h" +#include "syscall/fuse.h" #include "syscall/internal.h" #include "syscall/proc.h" #include "syscall/sys.h" @@ -2000,6 +2001,7 @@ int proc_intercept_open(const guest_t *g, "\tproc\n" "\tsysfs\n" "\tdevtmpfs\n" + "\tfuse\n" "\text4\n" "\tvfat\n"); } @@ -2009,23 +2011,35 @@ int proc_intercept_open(const guest_t *g, * - type source super_options */ if (!strcmp(path, "/proc/self/mountinfo")) { - return proc_emit_literal( + char buf[8192]; + size_t off = (size_t) snprintf( + buf, sizeof(buf), "1 0 0:1 / / rw,relatime - ext4 /dev/root rw\n" "2 1 0:2 / /proc rw,nosuid,nodev,noexec - proc proc rw\n" "3 1 0:3 / /tmp rw,nosuid,nodev - tmpfs tmpfs rw\n" "4 1 0:4 / /dev rw,nosuid - devtmpfs devtmpfs rw\n" "5 4 0:5 / /dev/shm rw,nosuid,nodev - tmpfs tmpfs rw\n"); + if (off >= sizeof(buf) || + fuse_append_mountinfo(buf, sizeof(buf), &off) < 0) + return -1; + return proc_synthetic_fd(buf, off); } /* /proc/mounts, /etc/mtab -> synthetic mount table */ if (!strcmp(path, "/proc/mounts") || !strcmp(path, "/proc/self/mounts") || !strcmp(path, "/etc/mtab")) { - return proc_emit_literal( - "/ / ext4 rw,relatime 0 0\n" - "proc /proc proc rw,nosuid,nodev,noexec 0 0\n" - "tmpfs /tmp tmpfs rw,nosuid,nodev 0 0\n" - "devtmpfs /dev devtmpfs rw,nosuid 0 0\n" - "tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0\n"); + char buf[8192]; + size_t off = + (size_t) snprintf(buf, sizeof(buf), + "/ / ext4 rw,relatime 0 0\n" + "proc /proc proc rw,nosuid,nodev,noexec 0 0\n" + "tmpfs /tmp tmpfs rw,nosuid,nodev 0 0\n" + "devtmpfs /dev devtmpfs rw,nosuid 0 0\n" + "tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0\n"); + if (off >= sizeof(buf) || + fuse_append_mounts(buf, sizeof(buf), &off) < 0) + return -1; + return proc_synthetic_fd(buf, off); } /* OOM nodes share 
one stored adjustment. @@ -2114,12 +2128,15 @@ int proc_intercept_open(const guest_t *g, } } + int mnt_id = 0; + if (fuse_fd_mnt_id(n, &mnt_id) < 0) + mnt_id = 0; return proc_emit_fmt( "pos:\t%lld\n" "flags:\t0%o\n" - "mnt_id:\t0\n" + "mnt_id:\t%d\n" "%s", - (long long) pos, snap.linux_flags, extra); + (long long) pos, snap.linux_flags, mnt_id, extra); } /* /proc/self/fdinfo -> directory listing. Each open gets its own scratch @@ -2367,6 +2384,9 @@ int proc_intercept_stat(const char *path, struct stat *st) * irrelevant here; callers need stat to succeed before opening the * synthetic file. */ + if (!strcmp(path, "/dev/fuse")) + return fuse_proc_stat(st); + /* /dev/shm is a directory */ if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) { stat_fill_proc_dir(st, 01777, 2, diff --git a/src/syscall/abi.h b/src/syscall/abi.h index a0ab74a..8716cec 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -36,6 +36,7 @@ #define SYS_symlinkat 36 #define SYS_linkat 37 #define SYS_renameat 38 +#define SYS_mount 40 #define SYS_truncate 45 #define SYS_statfs 43 #define SYS_fstatfs 44 @@ -278,12 +279,14 @@ typedef struct { #define LINUX_EBUSY 16 #define LINUX_EEXIST 17 #define LINUX_EXDEV 18 +#define LINUX_ENODEV 19 #define LINUX_ENOTDIR 20 #define LINUX_EINVAL 22 #define LINUX_EMFILE 24 #define LINUX_ENOTTY 25 #define LINUX_EFBIG 27 #define LINUX_ENOSPC 28 +#define LINUX_ESPIPE 29 #define LINUX_ERANGE 34 #define LINUX_EDEADLK 35 #define LINUX_ENAMETOOLONG 36 @@ -629,6 +632,9 @@ typedef struct { #define FD_PATH 11 #define FD_NETLINK 12 #define FD_PIDFD 13 +#define FD_FUSE_DEV 14 +#define FD_FUSE_FILE 15 +#define FD_FUSE_DIR 16 #define FD_VIRTUAL_PATH_MAX 64 /* File sealing flags (F_SEAL_*) for memfd_create. Tracked per-FD. 
*/ diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 56857db..2111449 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -60,6 +60,7 @@ SYS_symlinkat sc_symlinkat 1 SYS_linkat sc_linkat 1 SYS_renameat sc_renameat 1 SYS_renameat2 sc_renameat2 1 +SYS_mount sc_mount 1 SYS_readlinkat sc_readlinkat 1 SYS_newfstatat sc_newfstatat 1 SYS_fstat sc_fstat 0 diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 964193b..43a8097 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -31,6 +31,7 @@ #include "syscall/abi.h" #include "syscall/exec.h" +#include "syscall/fuse.h" #include "syscall/internal.h" #include "syscall/path.h" #include "syscall/proc.h" @@ -118,6 +119,9 @@ int64_t sys_execve(hv_vcpu_t vcpu, char path_host_buf[LINUX_PATH_MAX]; const char *path_host = path; + bool path_host_temp = false; + char interp_host_buf[LINUX_PATH_MAX]; + bool interp_host_temp = false; #define MAX_ARGS 256 #define MAX_ENVS 4096 @@ -172,7 +176,15 @@ int64_t sys_execve(hv_vcpu_t vcpu, err = linux_errno(); goto fail; } - str_copy_trunc(path_host_buf, tx.host_path, sizeof(path_host_buf)); + if (tx.fuse_path) { + err = fuse_materialize_path(tx.intercept_path, path_host_buf, + sizeof(path_host_buf)); + if (err < 0) + goto fail; + path_host_temp = true; + } else { + str_copy_trunc(path_host_buf, tx.host_path, sizeof(path_host_buf)); + } path_host = path_host_buf; } if (!path_host) { @@ -304,9 +316,23 @@ int64_t sys_execve(hv_vcpu_t vcpu, err = linux_errno(); goto fail; } - str_copy_trunc(path_host_buf, interp_tx.host_path, - sizeof(path_host_buf)); - path_host = path_host_buf; + if (path_host_temp) { + unlink(path_host_buf); + path_host_temp = false; + } + if (interp_tx.fuse_path) { + err = + fuse_materialize_path(interp_tx.intercept_path, interp_host_buf, + sizeof(interp_host_buf)); + if (err < 0) + goto fail; + interp_host_temp = true; + path_host = interp_host_buf; + } else { + str_copy_trunc(path_host_buf, interp_tx.host_path, + 
sizeof(path_host_buf)); + path_host = path_host_buf; + } if (elf_load(path_host, &elf_info) < 0) { err = -LINUX_ENOENT; @@ -383,6 +409,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, */ if (0) { fail: + if (path_host_temp) + unlink(path_host_buf); + if (interp_host_temp) + unlink(interp_host_buf); free(argv_buf); free(envp_buf); return err; @@ -693,8 +723,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, elf_info.segments[i].gpa + elf_info.segments[i].memsz + elf_load_base, elf_pf_to_prot(elf_info.segments[i].flags), - LINUX_MAP_PRIVATE, elf_info.segments[i].offset, - path_host); + LINUX_MAP_PRIVATE, elf_info.segments[i].offset, path); } /* interp_resolved was computed before guest_reset so no filesystem lookup * is needed after the point of no return. @@ -803,6 +832,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, log_debug("execve: loaded %s, entry=0x%llx sp=0x%llx", path_host, (unsigned long long) entry_ipa, (unsigned long long) sp_ipa); + if (path_host_temp) + unlink(path_host_buf); + if (interp_host_temp) + unlink(interp_host_buf); free(argv_buf); free(envp_buf); diff --git a/src/syscall/fs-stat.c b/src/syscall/fs-stat.c index a1e17e8..688a5f2 100644 --- a/src/syscall/fs-stat.c +++ b/src/syscall/fs-stat.c @@ -14,6 +14,7 @@ #include "runtime/procemu.h" #include "syscall/abi.h" +#include "syscall/fuse.h" #include "syscall/fs.h" #include "syscall/internal.h" #include "syscall/path.h" @@ -182,6 +183,13 @@ static int64_t stat_at_path(guest_t *g, sizeof(path), &pathp) < 0) return -LINUX_EFAULT; + if (pathp[0] == '/' && fuse_path_matches_mount(pathp)) { + int frc = fuse_stat_path(pathp, mac_st, flags); + if (frc < 0) + return frc; + return 0; + } + path_translation_t tx; if (path_translate_at(dirfd, pathp, (flags & LINUX_AT_SYMLINK_NOFOLLOW) ? 
PATH_TR_NOFOLLOW @@ -189,6 +197,13 @@ static int64_t stat_at_path(guest_t *g, &tx) < 0) return linux_errno(); + if (tx.fuse_path) { + int frc = fuse_stat_path(tx.intercept_path, mac_st, flags); + if (frc < 0) + return frc; + return 0; + } + if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && pathp[0] != '\0' && !proc_get_sysroot()) { int mac_flags = translate_at_flags(flags); @@ -244,13 +259,21 @@ static int64_t stat_at_path(guest_t *g, int64_t sys_fstat(guest_t *g, int fd, uint64_t stat_gva) { + struct stat mac_st; + int frc = fuse_fstat_fd(fd, &mac_st); + if (frc == 0) { + if (write_linux_stat(g, stat_gva, &mac_st) < 0) + return -LINUX_EFAULT; + return 0; + } + if (frc != -LINUX_EBADF) + return frc; + host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) { log_debug("fstat(%d): invalid guest fd", fd); return -LINUX_EBADF; } - - struct stat mac_st; if (fstat(host_ref.fd, &mac_st) < 0) { log_debug("fstat(%d->%d): host fstat failed errno=%d", fd, host_ref.fd, errno); @@ -297,6 +320,8 @@ int64_t sys_statfs(guest_t *g, uint64_t path_gva, uint64_t buf_gva) path_translation_t tx; if (path_translate_at(LINUX_AT_FDCWD, path, PATH_TR_NONE, &tx) < 0) return linux_errno(); + if (tx.fuse_path) + return -LINUX_ENOSYS; struct statfs mac_st; if (statfs(tx.host_path, &mac_st) < 0) diff --git a/src/syscall/fs-xattr.c b/src/syscall/fs-xattr.c index f8fdcff..68e224e 100644 --- a/src/syscall/fs-xattr.c +++ b/src/syscall/fs-xattr.c @@ -83,6 +83,8 @@ int64_t sys_getxattr(guest_t *g, if (path_translate_at(LINUX_AT_FDCWD, path, nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) return linux_errno(); + if (tx.fuse_path) + return -LINUX_ENOSYS; int opts = nofollow ? XATTR_NOFOLLOW : 0; @@ -120,6 +122,8 @@ int64_t sys_setxattr(guest_t *g, if (path_translate_at(LINUX_AT_FDCWD, path, nofollow ? 
PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) return linux_errno(); + if (tx.fuse_path) + return -LINUX_ENOSYS; void *buf; int64_t err = xattr_alloc_buf(size, &buf); @@ -156,6 +160,8 @@ int64_t sys_listxattr(guest_t *g, if (path_translate_at(LINUX_AT_FDCWD, path, nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) return linux_errno(); + if (tx.fuse_path) + return -LINUX_ENOSYS; int opts = nofollow ? XATTR_NOFOLLOW : 0; @@ -190,6 +196,8 @@ int64_t sys_removexattr(guest_t *g, if (path_translate_at(LINUX_AT_FDCWD, path, nofollow ? PATH_TR_NOFOLLOW : PATH_TR_NONE, &tx) < 0) return linux_errno(); + if (tx.fuse_path) + return -LINUX_ENOSYS; int opts = nofollow ? XATTR_NOFOLLOW : 0; int ret = removexattr(tx.host_path, name, opts); diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 396afad..2465ca4 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -27,6 +27,7 @@ #include "runtime/procemu.h" #include "syscall/abi.h" +#include "syscall/fuse.h" #include "syscall/fs.h" #include "syscall/internal.h" #include "syscall/net.h" /* absock_unregister_fd */ @@ -223,6 +224,11 @@ static int64_t read_translated_path(guest_t *g, return 0; } +static int64_t reject_unsupported_fuse_path_op(const path_translation_t *tx) +{ + return tx && tx->fuse_path ? -LINUX_ENOSYS : INT64_MIN; +} + /* open/close. 
*/ int64_t sys_openat_path(guest_t *g, @@ -261,8 +267,8 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); int flags = translate_open_flags(linux_flags); - if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && pathp[0] != '/' && - !proc_get_sysroot()) { + if (!tx.fuse_path && tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && + pathp[0] != '/' && !proc_get_sysroot()) { int host_fd = openat(AT_FDCWD, pathp, flags, mode); if (host_fd < 0) return linux_errno(); @@ -282,6 +288,12 @@ int64_t sys_openat_path(guest_t *g, /* Intercept /proc and /dev paths before touching the host filesystem */ if (path_might_use_open_intercept(tx.intercept_path)) { + if (!strcmp(tx.intercept_path, "/dev/fuse")) + return fuse_proc_open(linux_flags); + int64_t fuse_fd = + fuse_open_path(g, tx.intercept_path, linux_flags, mode); + if (fuse_fd != INT64_MIN) + return fuse_fd; int intercepted = proc_intercept_open(g, tx.intercept_path, linux_flags, mode); if (intercepted >= 0) { @@ -445,6 +457,13 @@ static int duplicate_guest_fd(int src_fd, bool fixed_slot, int linux_flags) { + if (RANGE_CHECK(src_fd, 0, FD_TABLE_SIZE)) { + int t = fd_table[src_fd].type; + if (t == FD_FUSE_DEV || t == FD_FUSE_FILE || t == FD_FUSE_DIR) + return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, + linux_flags); + } + host_fd_ref_t host_ref; if (host_fd_ref_open(src_fd, &host_ref) < 0) { errno = EBADF; @@ -531,6 +550,10 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -LINUX_EBADF; + int fd_type = fd_table[fd].type; + bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE || + fd_type == FD_FUSE_DIR); + /* Linux F_DUPFD=0, F_GETFD=1, F_SETFD=2, F_GETFL=3, F_SETFL=4, * F_DUPFD_CLOEXEC=1030 */ @@ -547,6 +570,8 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (gfd < 0) { if (errno == EBADF) return -LINUX_EBADF; + if (errno == EOPNOTSUPP) + return -LINUX_EOPNOTSUPP; return -LINUX_EMFILE; } return gfd; @@ 
-561,6 +586,8 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC; return 0; case 3: { /* F_GETFL */ + if (fuse_fd) + return fd_table[fd].linux_flags; fd_entry_t snap; if (!fd_snapshot(fd, &snap)) return -LINUX_EBADF; @@ -582,6 +609,18 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) } case 4: /* F_SETFL */ { + if (fuse_fd) { + int preserved = + fd_table[fd].linux_flags & + (LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | + LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); + fd_table[fd].linux_flags = + preserved | + ((int) arg & + ~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | + LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE)); + return 0; + } host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; @@ -736,6 +775,8 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) return -LINUX_EBADF; if (fd_table[fd].type == FD_CLOSED) return -LINUX_EBADF; + if (fuse_is_dir_fd(fd)) + return fuse_getdents64(g, fd, buf_gva, count); /* Linux: getdents on an O_PATH fd returns EBADF, even when the underlying * inode is a directory. The early gate keeps the next NOTDIR fallback * specific to non-directory regular fds. @@ -840,6 +881,21 @@ int64_t sys_chdir(guest_t *g, uint64_t path_gva) return 0; } + if (tx.intercept_path && fuse_path_matches_mount(tx.intercept_path)) { + struct stat st; + /* chdir() always follows symlinks, so do not pass AT_SYMLINK_NOFOLLOW. 
+ */ + int stat_rc = fuse_stat_path(tx.intercept_path, &st, 0); + if (stat_rc < 0) + return stat_rc; + if (!S_ISDIR(st.st_mode)) + return -LINUX_ENOTDIR; + if (chdir(tx.host_path) < 0) + return linux_errno(); + proc_cwd_set_virtual(tx.intercept_path); + return 0; + } + if (chdir(tx.host_path) < 0) return linux_errno(); @@ -850,6 +906,10 @@ int64_t sys_chdir(guest_t *g, uint64_t path_gva) int64_t sys_fchdir(int fd) { + int64_t fuse_rc = fuse_fchdir(fd); + if (fuse_rc != INT64_MIN) + return fuse_rc; + host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; @@ -943,6 +1003,10 @@ int64_t sys_pipe2(guest_t *g, uint64_t fds_gva, int linux_flags) int64_t sys_lseek(int fd, int64_t offset, int whence) { + int64_t frc = fuse_lseek_fd(fd, offset, whence); + if (frc != INT64_MIN) + return frc; + host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_io(fd, &host_ref); if (err < 0) @@ -984,6 +1048,9 @@ int64_t sys_readlinkat(guest_t *g, } /* intercepted == PROC_NOT_INTERCEPTED: fall through */ + if (tx.fuse_path) + return -LINUX_ENOSYS; + host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; @@ -1019,6 +1086,9 @@ int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) read_translated_path(g, dirfd, path_gva, PATH_TR_CREATE, path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1066,6 +1136,9 @@ int64_t sys_mkdirat(guest_t *g, int dirfd, uint64_t path_gva, int mode) g, dirfd, path_gva, PATH_TR_CREATE | PATH_TR_CREATE_PARENTS, path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1119,6 +1192,8 @@ int64_t sys_renameat2(guest_t *g, if (path_translate_at(olddirfd, oldpath, PATH_TR_NONE, &old_tx) < 0 || path_translate_at(newdirfd, 
newpath, PATH_TR_CREATE, &new_tx) < 0) return linux_errno(); + if (old_tx.fuse_path || new_tx.fuse_path) + return -LINUX_ENOSYS; host_fd_ref_t olddir_ref, newdir_ref; if (host_dirfd_ref_open(olddirfd, &olddir_ref) < 0) @@ -1202,6 +1277,9 @@ int64_t sys_mknodat(guest_t *g, int dirfd, uint64_t path_gva, int mode, int dev) g, dirfd, path_gva, PATH_TR_CREATE | PATH_TR_CREATE_PARENTS, path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1246,6 +1324,9 @@ int64_t sys_symlinkat(guest_t *g, linkpath, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1285,6 +1366,8 @@ int64_t sys_linkat(guest_t *g, if (path_translate_at(olddirfd, oldpath, PATH_TR_NONE, &old_tx) < 0 || path_translate_at(newdirfd, newpath, PATH_TR_CREATE, &new_tx) < 0) return linux_errno(); + if (old_tx.fuse_path || new_tx.fuse_path) + return -LINUX_ENOSYS; host_fd_ref_t olddir_ref, newdir_ref; if (host_dirfd_ref_open(olddirfd, &olddir_ref) < 0) @@ -1318,6 +1401,17 @@ int64_t sys_faccessat(guest_t *g, char dot_path[2]; if (guest_read_small(g, path_gva, dot_path, sizeof(dot_path)) == 0 && dot_path[0] == '.' 
&& dot_path[1] == '\0') { + proc_cwd_view_t view; + if (proc_acquire_cwd_view(&view) == 0) { + if (view.path && view.path[0] == '/' && + fuse_path_matches_mount(view.path)) { + char cwd_path[LINUX_PATH_MAX]; + str_copy_trunc(cwd_path, view.path, sizeof(cwd_path)); + proc_release_cwd_view(&view); + return fuse_access_path(cwd_path, mode, flags); + } + proc_release_cwd_view(&view); + } int mac_flags = translate_faccessat_flags(flags); if (faccessat(AT_FDCWD, ".", mode, mac_flags) < 0) return linux_errno(); @@ -1337,6 +1431,9 @@ int64_t sys_faccessat(guest_t *g, if (!validate_at_flags(flags, LINUX_AT_EACCESS | LINUX_AT_SYMLINK_NOFOLLOW)) return -LINUX_EINVAL; + if (tx.fuse_path) + return fuse_access_path(tx.intercept_path, mode, flags); + if (tx.proc_resolved == 0 && dirfd == LINUX_AT_FDCWD && path[0] != '/') { int mac_flags = translate_faccessat_flags(flags); if (faccessat(AT_FDCWD, path, mode, mac_flags) < 0) @@ -1421,6 +1518,9 @@ int64_t sys_truncate(guest_t *g, uint64_t path_gva, int64_t length) path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; if (truncate(tx.host_path, length) < 0) return linux_errno(); @@ -1462,6 +1562,9 @@ int64_t sys_fchmodat(guest_t *g, path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1494,6 +1597,9 @@ int64_t sys_fchownat(guest_t *g, path, &tx); if (rc < 0) return rc; + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) + return rc; host_fd_ref_t dir_ref; if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) @@ -1554,6 +1660,11 @@ int64_t sys_utimensat(guest_t *g, host_fd_ref_close(&dir_ref); return rc; } + rc = reject_unsupported_fuse_path_op(&tx); + if (rc != INT64_MIN) { + host_fd_ref_close(&dir_ref); + return rc; + } path_arg = tx.host_path; } diff --git a/src/syscall/fs.h b/src/syscall/fs.h index 8dd987e..c938075 100644 --- 
a/src/syscall/fs.h +++ b/src/syscall/fs.h @@ -95,6 +95,12 @@ int64_t sys_faccessat(guest_t *g, uint64_t path_gva, int mode, int flags); +int64_t sys_mount(guest_t *g, + uint64_t source_gva, + uint64_t target_gva, + uint64_t fstype_gva, + unsigned long flags, + uint64_t data_gva); /* truncate */ int64_t sys_ftruncate(int fd, int64_t length); diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c new file mode 100644 index 0000000..d990aa0 --- /dev/null +++ b/src/syscall/fuse.c @@ -0,0 +1,2415 @@ +/* + * Guest-internal FUSE transport and minimal VFS dispatch + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#include "runtime/procemu.h" + +#include "syscall/abi.h" +#include "syscall/fuse.h" +#include "syscall/internal.h" +#include "syscall/path.h" +#include "syscall/proc.h" + +#define FUSE_KERNEL_VERSION 7 +#define FUSE_KERNEL_MINOR_VERSION 45 +#define FUSE_ROOT_ID 1 + +#define FUSE_ASYNC_READ (1u << 0) +#define FUSE_BIG_WRITES (1u << 5) +#define FUSE_MAX_PAGES (1u << 22) + +enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_GETATTR = 3, + FUSE_OPEN = 14, + FUSE_READ = 15, + FUSE_RELEASE = 18, + FUSE_INIT = 26, + FUSE_OPENDIR = 27, + FUSE_READDIR = 28, + FUSE_RELEASEDIR = 29, +}; + +typedef struct { + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t rdev; + uint32_t blksize; + uint32_t flags; +} fuse_attr_t; + +typedef struct { + uint64_t nodeid; + uint64_t generation; + uint64_t entry_valid; + uint64_t attr_valid; + uint32_t entry_valid_nsec; + uint32_t attr_valid_nsec; + fuse_attr_t attr; +} fuse_entry_out_t; + +typedef struct { + uint64_t attr_valid; + uint32_t attr_valid_nsec; + uint32_t dummy; 
+    fuse_attr_t attr;
+} fuse_attr_out_t;
+
+typedef struct {
+    uint32_t flags;
+    uint32_t open_flags;
+} fuse_open_in_t;
+
+typedef struct {
+    uint64_t fh;
+    uint32_t open_flags;
+    int32_t backing_id;
+} fuse_open_out_t;
+
+typedef struct {
+    uint64_t fh;
+    uint64_t offset;
+    uint32_t size;
+    uint32_t read_flags;
+    uint64_t lock_owner;
+    uint32_t flags;
+    uint32_t padding;
+} fuse_read_in_t;
+
+typedef struct {
+    uint64_t fh;
+    uint32_t flags;
+    uint32_t release_flags;
+    uint64_t lock_owner;
+} fuse_release_in_t;
+
+typedef struct {
+    uint32_t major;
+    uint32_t minor;
+    uint32_t max_readahead;
+    uint32_t flags;
+    uint32_t flags2;
+    uint32_t unused[11];
+} fuse_init_in_t;
+
+typedef struct {
+    uint32_t major;
+    uint32_t minor;
+    uint32_t max_readahead;
+    uint32_t flags;
+    uint16_t max_background;
+    uint16_t congestion_threshold;
+    uint32_t max_write;
+    uint32_t time_gran;
+    uint16_t max_pages;
+    uint16_t map_alignment;
+    uint32_t flags2;
+    uint32_t max_stack_depth;
+    uint16_t request_timeout;
+    uint16_t unused[11];
+} fuse_init_out_t;
+
+typedef struct {
+    uint32_t len;
+    uint32_t opcode;
+    uint64_t unique;
+    uint64_t nodeid;
+    uint32_t uid;
+    uint32_t gid;
+    uint32_t pid;
+    uint16_t total_extlen;
+    uint16_t padding;
+} fuse_in_header_t;
+
+typedef struct {
+    uint32_t len;
+    int32_t error;
+    uint64_t unique;
+} fuse_out_header_t;
+
+typedef struct {
+    uint64_t ino;
+    uint64_t off;
+    uint32_t namelen;
+    uint32_t type;
+    char name[];
+} fuse_dirent_t;
+
+#define FUSE_REC_ALIGN(x) \
+    (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
+#define FUSE_NAME_OFFSET offsetof(fuse_dirent_t, name)
+#define FUSE_DIRENT_SIZE_RAW(namelen) \
+    FUSE_REC_ALIGN(FUSE_NAME_OFFSET + (namelen))
+
+#define FUSE_MAX_SESSIONS 8
+#define FUSE_MAX_MOUNTS 8
+#define FUSE_MAX_OPEN_FILES 128
+#define FUSE_MAX_PENDING 128
+#define FUSE_FAKE_DEV 0xF00D
+
+/* Implementation ceiling for a single FUSE frame (header + payload). The
+ * kernel FUSE protocol caps a READ or WRITE payload at max_pages *
+ * page_size = ~1 MiB by default and up to 4 MiB under recent kernels
+ * (FUSE_MAX_PAGES in this file is only the INIT flag bit, not a count). The
+ * 8 MiB hard cap below leaves headroom for the FUSE header, in-band
+ * sub-headers, and any future readahead growth while still bounding the
+ * largest single malloc the daemon can force. Daemon-negotiated max_write
+ * is bounded by FUSE_MAX_NEGOTIATED_WRITE (FUSE_FRAME_CAP minus a 256-byte
+ * header allowance) so it never exceeds what fuse_dev_write will accept.
+ */
+#define FUSE_FRAME_CAP ((size_t) (8 * 1024 * 1024))
+#define FUSE_MAX_NEGOTIATED_WRITE ((uint32_t) (FUSE_FRAME_CAP - 256))
+
+typedef struct fuse_request {
+    bool used;
+    bool answered;
+    uint64_t unique;
+    uint8_t *frame;
+    size_t frame_len;
+    uint8_t *reply;
+    size_t reply_len;
+    int error;
+    pthread_cond_t cond;
+    struct fuse_request *next;
+} fuse_request_t;
+
+typedef struct {
+    bool used;
+    int guest_fd;
+    int refcount;
+    pthread_mutex_t lock;
+    pthread_cond_t queue_cond;
+    pthread_cond_t init_cond;
+    bool closed;
+    bool daemon_dead;
+    bool init_done;
+    uint32_t max_write;
+    uint16_t max_pages;
+    uint64_t next_unique;
+    fuse_request_t requests[FUSE_MAX_PENDING];
+    fuse_request_t *queue_head;
+    fuse_request_t *queue_tail;
+} fuse_session_t;
+
+typedef struct {
+    bool used;
+    char path[LINUX_PATH_MAX];
+    char source[256];
+    char fstype[16];
+    int mount_id;
+    /* session is the live transport for this mount; NULL once the owning
+     * /dev/fuse fd is closed (the slot is tombstoned, keeping path/source/
+     * fstype/mount_id intact so consumers stuck on this mount path can be
+     * routed to a deterministic -LINUX_ENOTCONN instead of leaking through
+     * to host-filesystem resolution). Re-binding the slot requires
+     * fuse_alloc_mount_locked or sys_mount replacing a tombstoned entry.
+ */ + fuse_session_t *session; +} fuse_mount_t; + +typedef struct { + bool used; + /* refcount keeps the slot alive while any thread holds a snapshot or + * does an in-flight FUSE request against this fd. 1 = held by the + * underlying open fd; +1 per in-flight op acquired via + * fuse_file_get_locked. The slot is zeroed only when refcount hits 0 + * so a concurrent close cannot pull the io_cond out from under a + * waiting reader. + */ + int refcount; + int guest_fd; + bool dir; + uint64_t nodeid; + uint64_t fh; + uint64_t offset; + int linux_flags; + bool path_only; + /* session is pinned by the file's own session ref taken at open time. + * The mount slot the file came from may be reassigned independently; + * mount_id is the stable identifier used to detect that case without + * dereferencing a possibly-recycled fuse_mount_t. + */ + fuse_session_t *session; + int mount_id; + char path[LINUX_PATH_MAX]; + fuse_attr_t attr; + /* Serialize stream read() / readdir() against the offset field. lseek + * also waits on io_in_progress to avoid clobbering an in-flight read's + * post-update. 
+     */
+    bool io_in_progress;
+    pthread_cond_t io_cond;
+} fuse_file_t;
+
+typedef struct {
+    bool used;
+    int guest_fd;
+    fuse_session_t *session;
+} fuse_dev_binding_t;
+
+typedef struct {
+    bool used;
+    int guest_fd;
+    fuse_file_t *file;
+} fuse_file_binding_t;
+
+static pthread_mutex_t fuse_lock = PTHREAD_MUTEX_INITIALIZER;
+static fuse_session_t fuse_sessions[FUSE_MAX_SESSIONS];
+static fuse_mount_t fuse_mounts[FUSE_MAX_MOUNTS];
+static fuse_file_t fuse_files[FUSE_MAX_OPEN_FILES];
+static fuse_dev_binding_t fuse_dev_bindings[FD_TABLE_SIZE];
+static fuse_file_binding_t fuse_file_bindings[FD_TABLE_SIZE];
+static int fuse_next_mount_id = 100;
+
+static const char *skip_slashes(const char *s)
+{
+    while (s && *s == '/')
+        s++;
+    return s;
+}
+
+static void trim_mount_path(char *path)
+{
+    size_t len = strlen(path);
+    while (len > 1 && path[len - 1] == '/') {
+        path[len - 1] = '\0';
+        len--;
+    }
+}
+
+static bool path_component_boundary(const char *base, const char *path)
+{
+    size_t len = strlen(base);
+    return !strncmp(base, path, len) && (path[len] == '\0' || path[len] == '/');
+}
+
+/* Lexically join path onto the absolute base and write the canonical
+ * absolute result to out (capacity outsz, NUL included). Leading slashes
+ * in path are skipped, so it is always applied on top of base. Empty and
+ * "." components are dropped; ".." removes the previous surviving component
+ * and is ignored at the root. Returns 0 on success, or -1 with errno set:
+ * EINVAL for a NULL argument, a base not starting with '/', or outsz < 2
+ * (the "/" seed plus its NUL must fit); ENAMETOOLONG when the result does
+ * not fit in out or exceeds the marks[] depth / uint16_t offset limits.
+ */
+static int fuse_join_virtual_path(const char *base,
+                                  const char *path,
+                                  char *out,
+                                  size_t outsz)
+{
+    if (!base || !path || !out || outsz < 2 || base[0] != '/') {
+        errno = EINVAL;
+        return -1;
+    }
+
+    size_t depth = 0;
+    /* marks[i] stores the output index at which the i-th surviving component
+     * begins. Each value is bounded by outsz (capped at LINUX_PATH_MAX =
+     * 4096), so uint16_t is sufficient and shrinks the host-stack footprint
+     * from 16 KiB to 4 KiB per call.
+     */
+    uint16_t marks[LINUX_PATH_MAX / 2];
+    out[0] = '/';
+    out[1] = '\0';
+
+    const char *segments[2] = {base, path};
+    for (size_t seg = 0; seg < ARRAY_SIZE(segments); seg++) {
+        const char *cur = segments[seg];
+        while (*cur) {
+            while (*cur == '/')
+                cur++;
+            if (*cur == '\0')
+                break;
+            const char *start = cur;
+            while (*cur && *cur != '/')
+                cur++;
+            size_t len = (size_t) (cur - start);
+            if (len == 1 && start[0] == '.')
+                continue;
+            if (len == 2 && start[0] == '.' && start[1] == '.') {
+                if (depth > 0)
+                    out[marks[--depth]] = '\0';
+                continue;
+            }
+
+            size_t cur_len = strlen(out);
+            size_t prefix_len = cur_len;
+            size_t needed = cur_len + len + 1;
+            if (cur_len > 1)
+                needed++;
+            if (needed > outsz || depth >= ARRAY_SIZE(marks) ||
+                prefix_len > UINT16_MAX) {
+                errno = ENAMETOOLONG;
+                return -1;
+            }
+            if (cur_len > 1)
+                out[cur_len++] = '/';
+            else
+                cur_len = 1;
+            memcpy(out + cur_len, start, len);
+            cur_len += len;
+            out[cur_len] = '\0';
+            marks[depth++] = (uint16_t) (prefix_len > 1 ? prefix_len : 1);
+        }
+    }
+
+    return 0;
+}
+
+/* Canonicalize an absolute guest path, collapsing "." and ".." against the
+ * filesystem root. Returns 0 on success with the canonical form written to
+ * out, or -1 (errno set) on overflow. Always overwrites out on success.
+ * Callers use this before mount-prefix matching so paths like
+ * "/mnt/fuse/./foo" and "/mnt/fuse/sub/../foo" route consistently with
+ * "/mnt/fuse/foo" and paths like "/mnt/fuse/../etc" escape FUSE land
+ * deterministically.
+ */ +static int fuse_canonical_abs(const char *in, char *out, size_t outsz) +{ + return fuse_join_virtual_path("/", in, out, outsz); +} + +static fuse_session_t *fuse_session_by_fd_locked(int guest_fd) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_dev_bindings[i].used && + fuse_dev_bindings[i].guest_fd == guest_fd) + return fuse_dev_bindings[i].session; + } + return NULL; +} + +static int fuse_bind_dev_fd_locked(int guest_fd, fuse_session_t *session) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (!fuse_dev_bindings[i].used) { + fuse_dev_bindings[i].used = true; + fuse_dev_bindings[i].guest_fd = guest_fd; + fuse_dev_bindings[i].session = session; + return 0; + } + } + errno = EMFILE; + return -1; +} + +static fuse_session_t *fuse_unbind_dev_fd_locked(int guest_fd) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_dev_bindings[i].used && + fuse_dev_bindings[i].guest_fd == guest_fd) { + fuse_session_t *session = fuse_dev_bindings[i].session; + memset(&fuse_dev_bindings[i], 0, sizeof(fuse_dev_bindings[i])); + return session; + } + } + return NULL; +} + +static int fuse_dev_alias_count_locked(fuse_session_t *session) +{ + int count = 0; + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_dev_bindings[i].used && + fuse_dev_bindings[i].session == session) + count++; + } + return count; +} + +/* Bump session reference under fuse_lock. Each live mount, each FUSE-backed + * file fd, and the /dev/fuse fd itself hold one ref. Destruction is deferred + * until refcount drops to zero so a slow read/write on a mount-backed fd + * cannot race with the daemon closing /dev/fuse and tearing down the lock. + */ +static void fuse_session_get_locked(fuse_session_t *session) +{ + session->refcount++; +} + +/* Drop a session reference under fuse_lock. When the last reference is + * dropped and the session has been closed, destroy the synchronization + * primitives and clear the slot. Callers must not hold session->lock. 
+ */ +static void fuse_session_put_locked(fuse_session_t *session) +{ + if (--session->refcount > 0) + return; + pthread_mutex_destroy(&session->lock); + pthread_cond_destroy(&session->queue_cond); + pthread_cond_destroy(&session->init_cond); + memset(session, 0, sizeof(*session)); +} + +static fuse_mount_t *fuse_mount_for_path_locked(const char *path, + const char **relpath_out) +{ + fuse_mount_t *best = NULL; + size_t best_len = 0; + for (int i = 0; i < FUSE_MAX_MOUNTS; i++) { + if (!fuse_mounts[i].used) + continue; + size_t len = strlen(fuse_mounts[i].path); + if (len < best_len || + !path_component_boundary(fuse_mounts[i].path, path)) + continue; + best = &fuse_mounts[i]; + best_len = len; + } + if (best && relpath_out) { + const char *rel = path + best_len; + if (*rel == '/') + rel++; + *relpath_out = rel; + } + return best; +} + +/* Fully free a mount slot. Drops any live session ref (tombstoned slots + * have NULL session and have already dropped their ref via fuse_fd_cleanup). + * Used by sys_mount error paths and by fuse_alloc_mount_locked when + * reclaiming a tombstoned slot. + */ +static void fuse_uninstall_mount_locked(fuse_mount_t *mount) +{ + if (!mount || !mount->used) + return; + if (mount->session) + fuse_session_put_locked(mount->session); + memset(mount, 0, sizeof(*mount)); +} + +/* Allocate a mount slot, preferring fully-free entries. If none are free, + * reclaim the first tombstoned slot (used==true with session==NULL). + * Returns NULL when no slot is available even after tombstone reclamation. 
+ */ +static fuse_mount_t *fuse_alloc_mount_locked(void) +{ + for (int i = 0; i < FUSE_MAX_MOUNTS; i++) { + if (!fuse_mounts[i].used) { + memset(&fuse_mounts[i], 0, sizeof(fuse_mounts[i])); + fuse_mounts[i].used = true; + return &fuse_mounts[i]; + } + } + for (int i = 0; i < FUSE_MAX_MOUNTS; i++) { + if (fuse_mounts[i].used && fuse_mounts[i].session == NULL) { + fuse_uninstall_mount_locked(&fuse_mounts[i]); + memset(&fuse_mounts[i], 0, sizeof(fuse_mounts[i])); + fuse_mounts[i].used = true; + return &fuse_mounts[i]; + } + } + return NULL; +} + +static void fuse_fill_stat_from_attr(const fuse_attr_t *attr, struct stat *st) +{ + memset(st, 0, sizeof(*st)); + st->st_dev = (dev_t) FUSE_FAKE_DEV; + st->st_ino = (ino_t) attr->ino; + st->st_mode = (mode_t) attr->mode; + st->st_nlink = attr->nlink; + st->st_uid = attr->uid; + st->st_gid = attr->gid; + st->st_rdev = attr->rdev; + st->st_size = (off_t) attr->size; + st->st_blocks = (blkcnt_t) attr->blocks; + st->st_blksize = (blksize_t) (attr->blksize ? 
attr->blksize : 4096); + st->st_atimespec.tv_sec = (time_t) attr->atime; + st->st_atimespec.tv_nsec = attr->atimensec; + st->st_mtimespec.tv_sec = (time_t) attr->mtime; + st->st_mtimespec.tv_nsec = attr->mtimensec; + st->st_ctimespec.tv_sec = (time_t) attr->ctime; + st->st_ctimespec.tv_nsec = attr->ctimensec; +} + +static fuse_request_t *fuse_alloc_request_locked(fuse_session_t *session) +{ + for (int i = 0; i < FUSE_MAX_PENDING; i++) { + if (!session->requests[i].used) { + memset(&session->requests[i], 0, sizeof(session->requests[i])); + session->requests[i].used = true; + pthread_cond_init(&session->requests[i].cond, NULL); + return &session->requests[i]; + } + } + return NULL; +} + +static void fuse_free_request_locked(fuse_request_t *req) +{ + pthread_cond_destroy(&req->cond); + free(req->frame); + free(req->reply); + memset(req, 0, sizeof(*req)); +} + +static int fuse_wait_for_init_locked(fuse_session_t *session) +{ + while (!session->closed && !session->daemon_dead && !session->init_done) + pthread_cond_wait(&session->init_cond, &session->lock); + if (session->closed || session->daemon_dead) + return -LINUX_ENOTCONN; + return 0; +} + +static int fuse_request_locked(fuse_session_t *session, + uint32_t opcode, + uint64_t nodeid, + const void *payload, + size_t payload_len, + uint8_t **reply_out, + size_t *reply_len_out); + +static int fuse_send_init_locked(fuse_session_t *session) +{ + fuse_init_in_t init = { + .major = FUSE_KERNEL_VERSION, + .minor = FUSE_KERNEL_MINOR_VERSION, + .max_readahead = 1024 * 1024, + .flags = FUSE_ASYNC_READ | FUSE_BIG_WRITES | FUSE_MAX_PAGES, + }; + return fuse_request_locked(session, FUSE_INIT, FUSE_ROOT_ID, &init, + sizeof(init), NULL, NULL); +} + +/* Issue one FUSE request and wait for the reply. Returns 0 on success, or a + * negative Linux errno on failure. The caller must hold session->lock. 
+ * + * Known limitation: the wait uses a plain pthread_cond_wait, so a signal + * delivered to a blocked consumer does not return -EINTR and does not emit + * a FUSE_INTERRUPT frame to the daemon. Honoring SA_RESTART and emitting + * FUSE_INTERRUPT requires integrating with the per-thread signal eventfd + * and remains a deferred Tier B item. Until then, daemon death or session + * close are the only paths that wake a blocked consumer. + */ +static int fuse_request_locked(fuse_session_t *session, + uint32_t opcode, + uint64_t nodeid, + const void *payload, + size_t payload_len, + uint8_t **reply_out, + size_t *reply_len_out) +{ + if (!session->init_done && opcode != FUSE_INIT) { + int init_rc = fuse_wait_for_init_locked(session); + if (init_rc < 0) + return init_rc; + } + if (session->closed || session->daemon_dead) + return -LINUX_ENOTCONN; + + fuse_request_t *req = fuse_alloc_request_locked(session); + if (!req) + return -LINUX_ENOMEM; + + req->unique = session->next_unique++; + req->frame_len = sizeof(fuse_in_header_t) + payload_len; + req->frame = calloc(1, req->frame_len); + if (!req->frame) { + fuse_free_request_locked(req); + return -LINUX_ENOMEM; + } + + fuse_in_header_t *hdr = (fuse_in_header_t *) req->frame; + hdr->len = (uint32_t) req->frame_len; + hdr->opcode = opcode; + hdr->unique = req->unique; + hdr->nodeid = nodeid; + hdr->uid = proc_get_uid(); + hdr->gid = proc_get_gid(); + hdr->pid = (uint32_t) proc_get_pid(); + if (payload_len) + memcpy(req->frame + sizeof(*hdr), payload, payload_len); + + if (session->queue_tail) + session->queue_tail->next = req; + else + session->queue_head = req; + session->queue_tail = req; + pthread_cond_broadcast(&session->queue_cond); + + while (!req->answered && !session->closed && !session->daemon_dead) { + sched_yield(); + pthread_cond_wait(&req->cond, &session->lock); + } + + int rc = 0; + if (req->answered) { + if (req->error < 0) + rc = req->error; + } else if (session->closed || session->daemon_dead) { + rc = 
-LINUX_ENOTCONN; + } + + if (rc == 0 && reply_out && reply_len_out) { + *reply_len_out = req->reply_len; + *reply_out = NULL; + if (req->reply_len) { + *reply_out = malloc(req->reply_len); + if (!*reply_out) + rc = -LINUX_ENOMEM; + else + memcpy(*reply_out, req->reply, req->reply_len); + } + } + + fuse_free_request_locked(req); + return rc; +} + +static int fuse_lookup_locked(fuse_session_t *session, + uint64_t parent, + const char *name, + fuse_entry_out_t *out) +{ + uint8_t *reply = NULL; + size_t reply_len = 0; + int rc = fuse_request_locked(session, FUSE_LOOKUP, parent, name, + strlen(name) + 1, &reply, &reply_len); + if (rc < 0) + return rc; + if (reply_len < sizeof(*out)) { + free(reply); + return -LINUX_EPROTO; + } + memcpy(out, reply, sizeof(*out)); + free(reply); + return 0; +} + +static int fuse_getattr_locked(fuse_session_t *session, + uint64_t nodeid, + fuse_attr_t *attr) +{ + uint8_t *reply = NULL; + size_t reply_len = 0; + int rc = fuse_request_locked(session, FUSE_GETATTR, nodeid, NULL, 0, &reply, + &reply_len); + if (rc < 0) + return rc; + if (reply_len < sizeof(fuse_attr_out_t)) { + free(reply); + return -LINUX_EPROTO; + } + fuse_attr_out_t out; + memcpy(&out, reply, sizeof(out)); + *attr = out.attr; + free(reply); + return 0; +} + +static int fuse_walk_path_locked(fuse_session_t *session, + const char *relpath, + uint64_t *nodeid_out, + fuse_attr_t *attr_out) +{ + uint64_t nodeid = FUSE_ROOT_ID; + fuse_attr_t attr = {0}; + const char *p = skip_slashes(relpath); + + if (*p == '\0') { + int rc = fuse_getattr_locked(session, nodeid, &attr); + if (rc < 0) + return rc; + *nodeid_out = nodeid; + if (attr_out) + *attr_out = attr; + return 0; + } + + while (*p) { + const char *slash = strchr(p, '/'); + size_t len = slash ? (size_t) (slash - p) : strlen(p); + char name[LINUX_PATH_MAX]; + if (len == 0 || len >= sizeof(name)) + return -LINUX_ENOENT; + memcpy(name, p, len); + name[len] = '\0'; + /* The path is canonicalized before reaching the walk, so "." 
and + * ".." should never appear as a real component. Defend against + * accidental forwarding to the daemon (which has no notion of the + * mount root's containment) just in case a future caller skips + * canonicalization. The advance-then-continue order matters: a + * bare "continue" without advancing p would spin on the same + * component forever if a non-canonical path ever reached here. + */ + if (len == 1 && name[0] == '.') { + if (!slash) + break; + p = skip_slashes(slash); + continue; + } + if (len == 2 && name[0] == '.' && name[1] == '.') + return -LINUX_ENOENT; + + fuse_entry_out_t entry; + int rc = fuse_lookup_locked(session, nodeid, name, &entry); + if (rc < 0) + return rc; + nodeid = entry.nodeid; + attr = entry.attr; + if (!slash) + break; + p = skip_slashes(slash); + } + + *nodeid_out = nodeid; + if (attr_out) + *attr_out = attr; + return 0; +} + +static fuse_file_t *fuse_file_by_fd_locked(int guest_fd) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_file_bindings[i].used && + fuse_file_bindings[i].guest_fd == guest_fd) + return fuse_file_bindings[i].file; + } + return NULL; +} + +static int fuse_bind_file_fd_locked(int guest_fd, fuse_file_t *file) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (!fuse_file_bindings[i].used) { + fuse_file_bindings[i].used = true; + fuse_file_bindings[i].guest_fd = guest_fd; + fuse_file_bindings[i].file = file; + return 0; + } + } + errno = EMFILE; + return -1; +} + +static fuse_file_t *fuse_unbind_file_fd_locked(int guest_fd) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_file_bindings[i].used && + fuse_file_bindings[i].guest_fd == guest_fd) { + fuse_file_t *file = fuse_file_bindings[i].file; + memset(&fuse_file_bindings[i], 0, sizeof(fuse_file_bindings[i])); + return file; + } + } + return NULL; +} + +static int fuse_file_alias_count_locked(fuse_file_t *file) +{ + int count = 0; + for (int i = 0; i < FD_TABLE_SIZE; i++) { + if (fuse_file_bindings[i].used && fuse_file_bindings[i].file == 
file) + count++; + } + return count; +} + +/* Bump file slot refcount. Holder must release via fuse_file_put_locked. */ +static void fuse_file_get_locked(fuse_file_t *file) +{ + file->refcount++; +} + +/* Drop a file slot refcount under fuse_lock. The slot is destroyed and + * cleared only when no one else holds a reference, so a concurrent close + * cannot tear down io_cond out from under a thread that has already + * snapshotted this slot. + */ +static void fuse_file_put_locked(fuse_file_t *file) +{ + if (--file->refcount > 0) + return; + if (file->session) + fuse_session_put_locked(file->session); + pthread_cond_destroy(&file->io_cond); + memset(file, 0, sizeof(*file)); +} + +static fuse_file_t *fuse_alloc_file_locked(void) +{ + for (int i = 0; i < FUSE_MAX_OPEN_FILES; i++) { + /* A slot with refcount > 0 is still owned by an in-flight op even + * if its underlying fd has been closed; skip it until that op + * releases. used==false && refcount==0 is the only fully-free + * state. + */ + if (!fuse_files[i].used && fuse_files[i].refcount == 0) { + memset(&fuse_files[i], 0, sizeof(fuse_files[i])); + fuse_files[i].used = true; + fuse_files[i].refcount = 1; /* held by the open fd */ + pthread_cond_init(&fuse_files[i].io_cond, NULL); + return &fuse_files[i]; + } + } + return NULL; +} + +static int fuse_open_common_locked(fuse_session_t *session, + uint64_t nodeid, + int linux_flags, + bool dir, + fuse_open_out_t *out) +{ + fuse_open_in_t in = {.flags = (uint32_t) linux_flags}; + uint8_t *reply = NULL; + size_t reply_len = 0; + int rc = fuse_request_locked(session, dir ? FUSE_OPENDIR : FUSE_OPEN, + nodeid, &in, sizeof(in), &reply, &reply_len); + if (rc < 0) + return rc; + if (reply_len < sizeof(*out)) { + free(reply); + return -LINUX_EPROTO; + } + memcpy(out, reply, sizeof(*out)); + free(reply); + return 0; +} + +/* Emit FUSE_RELEASE / FUSE_RELEASEDIR for a file snapshot. 
session must be + * non-NULL and pinned by the caller; if the daemon has died the request is + * skipped silently. + */ +static int fuse_release_common_locked(fuse_session_t *session, + bool dir, + uint64_t nodeid, + uint64_t fh, + int linux_flags) +{ + if (!session || session->daemon_dead || session->closed) + return 0; + if (linux_flags & LINUX_O_PATH) + return 0; + + fuse_release_in_t in = { + .fh = fh, + .flags = (uint32_t) linux_flags, + }; + return fuse_request_locked(session, dir ? FUSE_RELEASEDIR : FUSE_RELEASE, + nodeid, &in, sizeof(in), NULL, NULL); +} + +static void fuse_fd_cleanup(int guest_fd) +{ + /* Step 1: snapshot the file slot's release-relevant fields and detach + * the slot from the fd. The slot itself stays alive (refcount > 0) + * until any in-flight op releases its ref; only then is io_cond + * destroyed and the slot zeroed. + */ + fuse_session_t *file_session = NULL; + bool have_file = false; + bool file_dir = false; + uint64_t file_nodeid = 0, file_fh = 0; + int file_linux_flags = 0; + + pthread_mutex_lock(&fuse_lock); + fuse_file_t *file = fuse_unbind_file_fd_locked(guest_fd); + if (file) { + file_session = file->session; + file_dir = file->dir; + file_nodeid = file->nodeid; + file_fh = file->fh; + file_linux_flags = file->linux_flags; + have_file = true; + bool final_alias = (fuse_file_alias_count_locked(file) == 0); + if (final_alias) { + /* Mark the slot logically closed but keep refcount intact so an + * in-flight read sees io_cond / session through its existing snap. + * fuse_file_put_locked decrements when those ops release. 
+ */ + file->used = false; + pthread_cond_broadcast(&file->io_cond); + fuse_session_get_locked(file_session); + } else { + have_file = false; + } + fuse_file_put_locked(file); /* drop this fd alias ref */ + } + pthread_mutex_unlock(&fuse_lock); + + if (have_file && file_session) { + pthread_mutex_lock(&file_session->lock); + fuse_release_common_locked(file_session, file_dir, file_nodeid, file_fh, + file_linux_flags); + pthread_mutex_unlock(&file_session->lock); + pthread_mutex_lock(&fuse_lock); + fuse_session_put_locked(file_session); + pthread_mutex_unlock(&fuse_lock); + } + + pthread_mutex_lock(&fuse_lock); + fuse_session_t *session = fuse_unbind_dev_fd_locked(guest_fd); + if (!session) { + pthread_mutex_unlock(&fuse_lock); + return; + } + + if (fuse_dev_alias_count_locked(session) > 0) { + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); + return; + } + + /* Mark dead and wake every blocked waiter under the session lock so they + * see daemon_dead=true and exit with -LINUX_ENOTCONN before the lock + * itself goes away. + */ + pthread_mutex_lock(&session->lock); + session->closed = true; + session->daemon_dead = true; + pthread_cond_broadcast(&session->queue_cond); + pthread_cond_broadcast(&session->init_cond); + for (int i = 0; i < FUSE_MAX_PENDING; i++) { + if (session->requests[i].used) { + session->requests[i].answered = true; + session->requests[i].error = -LINUX_ENOTCONN; + pthread_cond_broadcast(&session->requests[i].cond); + } + } + pthread_mutex_unlock(&session->lock); + + /* Wake any file slots blocked on io_in_progress; their owners will see + * the session's daemon_dead flag and exit with -LINUX_ENOTCONN. 
+ */ + for (int i = 0; i < FUSE_MAX_OPEN_FILES; i++) { + if (fuse_files[i].used && fuse_files[i].session == session) + pthread_cond_broadcast(&fuse_files[i].io_cond); + } + + /* Tombstone each mount whose transport has just died: drop the + * per-mount session ref but keep the slot's path/source/fstype/ + * mount_id intact so a process whose virtual cwd is on this mount + * still routes to a deterministic -LINUX_ENOTCONN instead of falling + * through to host-filesystem resolution. session is cleared to NULL + * as the tombstone marker; the slot is reclaimed by a later + * sys_mount at the same path or by fuse_alloc_mount_locked. + */ + for (int i = 0; i < FUSE_MAX_MOUNTS; i++) { + if (fuse_mounts[i].used && fuse_mounts[i].session == session) { + fuse_mounts[i].session = NULL; + fuse_session_put_locked(session); + } + } + + /* Drop the /dev/fuse fd's own session ref. Destruction is deferred until + * the last mount-backed fd closes. + */ + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); +} + +void fuse_init(void) +{ + pthread_mutex_lock(&fuse_lock); + memset(fuse_sessions, 0, sizeof(fuse_sessions)); + memset(fuse_mounts, 0, sizeof(fuse_mounts)); + memset(fuse_files, 0, sizeof(fuse_files)); + memset(fuse_dev_bindings, 0, sizeof(fuse_dev_bindings)); + memset(fuse_file_bindings, 0, sizeof(fuse_file_bindings)); + fuse_next_mount_id = 100; + pthread_mutex_unlock(&fuse_lock); +} + +int fuse_proc_open(int linux_flags) +{ + pthread_mutex_lock(&fuse_lock); + fuse_session_t *slot = NULL; + for (int i = 0; i < FUSE_MAX_SESSIONS; i++) { + if (!fuse_sessions[i].used) { + slot = &fuse_sessions[i]; + memset(slot, 0, sizeof(*slot)); + slot->used = true; + slot->refcount = 1; /* held by the /dev/fuse fd itself */ + pthread_mutex_init(&slot->lock, NULL); + pthread_cond_init(&slot->queue_cond, NULL); + pthread_cond_init(&slot->init_cond, NULL); + slot->max_write = 64 * 1024; + slot->max_pages = 16; + slot->next_unique = 1; + break; + } + } + 
/* Extract the /dev/fuse descriptor number from a mount(2) option string.
 *
 * data is a comma-separated option list such as
 * "fd=3,rootmode=40000,user_id=0,group_id=0". Only an option whose name is
 * exactly "fd" is accepted, so "nofd=5" or "subfd=5" cannot be mistaken for
 * it. The first "fd" option decides the result. Its value must be a plain
 * run of decimal digits that fills the whole option and fits in an int: no
 * sign, no whitespace, no trailing characters.
 *
 * Returns the descriptor (>= 0), or -1 when data is NULL, has no "fd"
 * option, or the value is malformed or out of range.
 */
static int parse_mount_fd(const char *data)
{
    if (!data)
        return -1;

    const char *opt = data;
    while (*opt) {
        const char *comma = strchr(opt, ',');
        size_t opt_len = comma ? (size_t) (comma - opt) : strlen(opt);

        if (opt_len >= 3 && strncmp(opt, "fd=", 3) == 0) {
            const char *val = opt + 3;
            const char *val_end = opt + opt_len;
            /* strtol would skip whitespace and accept a sign, so insist on
             * a leading digit first.
             */
            if (val == val_end || *val < '0' || *val > '9')
                return -1;
            char *endp;
            errno = 0;
            long fd = strtol(val, &endp, 10);
            /* endp must land on the option's end: rejects "fd=7x". */
            if (endp != val_end || errno != 0 || fd < 0 || fd > INT_MAX)
                return -1;
            return (int) fd;
        }

        if (!comma)
            break;
        opt = comma + 1;
    }
    return -1;
}
+ */ + pthread_mutex_unlock(&fuse_lock); + return -LINUX_EBUSY; + } + } + /* Prefer reclaiming a tombstoned slot at the same path so the mount_id + * sequence stays stable for consumers that cached it. + */ + fuse_mount_t *mount = NULL; + for (int i = 0; i < FUSE_MAX_MOUNTS; i++) { + if (fuse_mounts[i].used && fuse_mounts[i].session == NULL && + !strcmp(fuse_mounts[i].path, target_canon)) { + fuse_uninstall_mount_locked(&fuse_mounts[i]); + memset(&fuse_mounts[i], 0, sizeof(fuse_mounts[i])); + fuse_mounts[i].used = true; + mount = &fuse_mounts[i]; + break; + } + } + if (!mount) + mount = fuse_alloc_mount_locked(); + if (mount) { + mount->session = session; + fuse_session_get_locked(session); + str_copy_trunc(mount->path, target_canon, sizeof(mount->path)); + str_copy_trunc(mount->source, source, sizeof(mount->source)); + str_copy_trunc(mount->fstype, fstype, sizeof(mount->fstype)); + mount->mount_id = fuse_next_mount_id++; + } + pthread_mutex_unlock(&fuse_lock); + if (!mount) + return -LINUX_ENOMEM; + + pthread_mutex_lock(&session->lock); + int init_rc = fuse_send_init_locked(session); + pthread_mutex_unlock(&session->lock); + if (init_rc < 0) { + pthread_mutex_lock(&fuse_lock); + fuse_uninstall_mount_locked(mount); + pthread_mutex_unlock(&fuse_lock); + return init_rc; + } + return 0; +} + +bool fuse_path_matches_mount(const char *path) +{ + if (!path || path[0] != '/') + return false; + char canon[LINUX_PATH_MAX]; + if (fuse_canonical_abs(path, canon, sizeof(canon)) < 0) + return false; + pthread_mutex_lock(&fuse_lock); + /* Matches both live and tombstoned mounts so post-daemon-death + * operations get routed to a deterministic -LINUX_ENOTCONN instead of + * silently falling through to host-filesystem resolution. + */ + bool matched = fuse_mount_for_path_locked(canon, NULL) != NULL; + pthread_mutex_unlock(&fuse_lock); + return matched; +} + +/* Resolve a guest-absolute path to a (session, mount_id, nodeid, attr). 
+ * Returns 0 on success, or a negative Linux errno. The session refcount is + * bumped on success; callers must drop it via fuse_session_put_locked when + * done with the resolution. + */ +static int fuse_path_lookup(const char *path, + fuse_session_t **session_out, + int *mount_id_out, + uint64_t *nodeid_out, + fuse_attr_t *attr_out) +{ + char canon[LINUX_PATH_MAX]; + if (fuse_canonical_abs(path, canon, sizeof(canon)) < 0) + return -LINUX_ENAMETOOLONG; + + pthread_mutex_lock(&fuse_lock); + const char *relpath = NULL; + fuse_mount_t *mount = fuse_mount_for_path_locked(canon, &relpath); + if (!mount) { + pthread_mutex_unlock(&fuse_lock); + return -LINUX_ENOENT; + } + if (!mount->session) { + /* Tombstoned mount: daemon dropped /dev/fuse already. */ + pthread_mutex_unlock(&fuse_lock); + return -LINUX_ENOTCONN; + } + fuse_session_t *session = mount->session; + int mount_id = mount->mount_id; + fuse_session_get_locked(session); + pthread_mutex_lock(&session->lock); + pthread_mutex_unlock(&fuse_lock); + + int rc = fuse_walk_path_locked(session, relpath, nodeid_out, attr_out); + pthread_mutex_unlock(&session->lock); + + if (rc < 0) { + pthread_mutex_lock(&fuse_lock); + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); + return rc; + } + if (session_out) + *session_out = session; + else { + pthread_mutex_lock(&fuse_lock); + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); + } + if (mount_id_out) + *mount_id_out = mount_id; + return 0; +} + +/* Stat a FUSE-mounted path. Returns 0 on success, or a negative Linux errno + * (LINUX_E*). Errno-via-globals is intentionally avoided: callers in + * src/syscall/fs-stat.c return this value directly to the guest. + * + * at_flags is the LINUX_AT_* mask from the caller. Today only + * LINUX_AT_SYMLINK_NOFOLLOW is honored: when the daemon's final LOOKUP returns + * S_IFLNK and NOFOLLOW is unset, the call returns -LINUX_ENOSYS because symlink + * target resolution is not implemented yet. 
/* Write exactly len bytes from buf to fd, retrying short writes and EINTR.
 *
 * Returns 0 once everything is written (including when len is 0), or -1
 * with errno set. A write() that reports zero bytes written while data
 * remains is treated as a failure with errno = EIO: retrying it could loop
 * forever without making progress.
 */
static int fuse_write_all(int fd, const void *buf, size_t len)
{
    const uint8_t *p = buf;

    while (len > 0) {
        ssize_t nw = write(fd, p, len);
        if (nw < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (nw == 0) {
            /* No progress and no error reported: bail out instead of
             * spinning. Callers read errno after a -1 return.
             */
            errno = EIO;
            return -1;
        }
        p += (size_t) nw;
        len -= (size_t) nw;
    }
    return 0;
}
session->max_write : 65536; + for (;;) { + fuse_read_in_t in = { + .fh = fh, + .offset = offset, + .size = size, + .flags = (uint32_t) linux_flags, + }; + uint8_t *reply = NULL; + size_t reply_len = 0; + rc = fuse_request_locked(session, FUSE_READ, nodeid, &in, sizeof(in), + &reply, &reply_len); + if (rc < 0) { + free(reply); + break; + } + if (reply_len == 0) { + free(reply); + break; + } + if (fuse_write_all(tmp_fd, reply, reply_len) < 0) { + rc = linux_errno(); + free(reply); + break; + } + offset += reply_len; + free(reply); + } + if (close(tmp_fd) < 0 && rc == 0) + rc = linux_errno(); + + if (rc < 0) { + unlink(tmp_template); + return rc; + } + + memcpy(out_path, tmp_template, sizeof(tmp_template)); + return 0; +} + +int fuse_materialize_path(const char *path, char *out_path, size_t outsz) +{ + fuse_session_t *session = NULL; + int mount_id = 0; + uint64_t nodeid = 0; + fuse_attr_t attr; + int rc = fuse_path_lookup(path, &session, &mount_id, &nodeid, &attr); + (void) mount_id; + if (rc < 0) + return rc; + if (S_ISDIR(attr.mode)) { + pthread_mutex_lock(&fuse_lock); + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); + return -LINUX_EACCES; + } + + pthread_mutex_lock(&session->lock); + fuse_open_out_t out; + bool opened = false; + rc = fuse_open_common_locked(session, nodeid, LINUX_O_RDONLY, false, &out); + if (rc == 0) { + opened = true; + rc = fuse_materialize_open_file_locked(session, nodeid, out.fh, + LINUX_O_RDONLY, out_path, outsz); + } + if (opened) { + int rel_rc = fuse_release_common_locked(session, false, nodeid, out.fh, + LINUX_O_RDONLY); + if (rc == 0 && rel_rc < 0) + rc = rel_rc; + } + pthread_mutex_unlock(&session->lock); + + pthread_mutex_lock(&fuse_lock); + fuse_session_put_locked(session); + pthread_mutex_unlock(&fuse_lock); + return rc; +} + +/* fstat against a FUSE-backed fd. Returns 0 on success, negative Linux errno + * otherwise. 
Returns -LINUX_EBADF when the fd does not refer to a live FUSE
+ * file so callers can distinguish "not ours" from "ours but failed".
+ * A /dev/fuse fd is answered by fuse_proc_stat() without a daemon
+ * round-trip.
+ */
+int fuse_fstat_fd(int fd, struct stat *st)
+{
+    fd_entry_t snap;
+    if (fd_snapshot(fd, &snap) && snap.type == FD_FUSE_DEV)
+        return fuse_proc_stat(st);
+
+    pthread_mutex_lock(&fuse_lock);
+    fuse_file_t *file = fuse_file_by_fd_locked(fd);
+    if (!file || !file->session) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    fuse_session_t *session = file->session;
+    int mount_id = file->mount_id;
+    uint64_t nodeid = file->nodeid;
+    fuse_session_get_locked(session);
+    pthread_mutex_lock(&session->lock);
+    pthread_mutex_unlock(&fuse_lock);
+    fuse_attr_t attr;
+    int rc = fuse_getattr_locked(session, nodeid, &attr);
+    pthread_mutex_unlock(&session->lock);
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_put_locked(session);
+    if (rc == 0) {
+        /* file may have been concurrently closed and the slot reused. Refetch
+         * by guest_fd and only update the attr cache if it still maps to the
+         * same (session, mount_id, nodeid) identity.
+         */
+        file = fuse_file_by_fd_locked(fd);
+        if (file && file->session == session && file->mount_id == mount_id &&
+            file->nodeid == nodeid)
+            file->attr = attr;
+    }
+    pthread_mutex_unlock(&fuse_lock);
+    if (rc < 0)
+        return rc;
+    fuse_fill_stat_from_attr(&attr, st);
+    return 0;
+}
+
+/* Drop one session reference under fuse_lock. The caller must NOT hold
+ * session->lock: the nesting order used in this file is fuse_lock first,
+ * session->lock second.
+ */
+static void fuse_open_path_put_session(fuse_session_t *session)
+{
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_put_locked(session);
+    pthread_mutex_unlock(&fuse_lock);
+}
+
+/* Open a path that lives under a FUSE mount and bind it to a new guest fd.
+ *
+ * g and mode are unused. Returns the new guest fd on success. Returns
+ * INT64_MIN when path is NULL, not absolute, or not covered by any FUSE
+ * mount (presumably the caller's cue to try another backend -- confirm
+ * against the callers). Otherwise returns a negative Linux errno:
+ *   -LINUX_ENAMETOOLONG  the path could not be canonicalized
+ *   -LINUX_ENOTCONN      the mount is tombstoned (no session)
+ *   -LINUX_ENOTDIR       O_DIRECTORY was given for a non-directory
+ *   -LINUX_EACCES        non-read-only open of a non-directory without O_PATH
+ *   -LINUX_EMFILE        no guest fd available, or binding the fd failed
+ *   -LINUX_ENOMEM        no fuse_file_t slot available
+ *   or whatever the path walk / daemon open returned.
+ *
+ * Locking: session->lock is held only around the two daemon round-trips
+ * (path walk, open) and is always released before fuse_lock is taken, so
+ * this function never nests the two locks in the reverse order.
+ */
+int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode)
+{
+    (void) g;
+    (void) mode;
+    if (!path || path[0] != '/')
+        return INT64_MIN;
+
+    char canon[LINUX_PATH_MAX];
+    if (fuse_canonical_abs(path, canon, sizeof(canon)) < 0)
+        return -LINUX_ENAMETOOLONG;
+
+    pthread_mutex_lock(&fuse_lock);
+    const char *relpath = NULL;
+    fuse_mount_t *mount = fuse_mount_for_path_locked(canon, &relpath);
+    if (!mount) {
+        pthread_mutex_unlock(&fuse_lock);
+        return INT64_MIN;
+    }
+    if (!mount->session) {
+        /* Tombstoned mount; daemon already dropped /dev/fuse. */
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_ENOTCONN;
+    }
+    fuse_session_t *session = mount->session;
+    int mount_id = mount->mount_id;
+    fuse_session_get_locked(session);
+    pthread_mutex_lock(&session->lock);
+    pthread_mutex_unlock(&fuse_lock);
+
+    uint64_t nodeid = 0;
+    fuse_attr_t attr;
+    int rc = fuse_walk_path_locked(session, relpath, &nodeid, &attr);
+    /* Release session->lock before any step below takes fuse_lock; holding
+     * it across the file-slot allocation would invert the lock order used
+     * by every other path and can deadlock against fuse_fstat_fd().
+     */
+    pthread_mutex_unlock(&session->lock);
+    if (rc < 0) {
+        fuse_open_path_put_session(session);
+        return rc;
+    }
+
+    bool want_dir = (linux_flags & LINUX_O_DIRECTORY) || S_ISDIR(attr.mode);
+    bool path_only = (linux_flags & LINUX_O_PATH) != 0;
+    if ((linux_flags & LINUX_O_DIRECTORY) && !S_ISDIR(attr.mode)) {
+        fuse_open_path_put_session(session);
+        return -LINUX_ENOTDIR;
+    }
+    /* Linux open(2): when O_PATH is set, access-mode bits (O_RDONLY /
+     * O_WRONLY / O_RDWR) are ignored. The descriptor is opaque to
+     * read/write but usable for fstat, fchdir, *at() dirfd, etc. Reject
+     * non-RDONLY only for ordinary file opens until FUSE write support
+     * exists for FD_FUSE_FILE.
+     */
+    if (!want_dir && !path_only && (linux_flags & 3) != LINUX_O_RDONLY) {
+        fuse_open_path_put_session(session);
+        return -LINUX_EACCES;
+    }
+
+    int guest_fd =
+        fd_alloc(want_dir ? FD_FUSE_DIR : FD_FUSE_FILE, -1, fuse_fd_cleanup);
+    if (guest_fd < 0) {
+        fuse_open_path_put_session(session);
+        return -LINUX_EMFILE;
+    }
+
+    fuse_file_t *file = NULL;
+    pthread_mutex_lock(&fuse_lock);
+    file = fuse_alloc_file_locked();
+    pthread_mutex_unlock(&fuse_lock);
+    if (!file) {
+        fd_mark_closed(guest_fd);
+        fuse_open_path_put_session(session);
+        return -LINUX_ENOMEM;
+    }
+
+    fuse_open_out_t out;
+    memset(&out, 0, sizeof(out));
+    if (!path_only) {
+        pthread_mutex_lock(&session->lock);
+        rc = fuse_open_common_locked(session, nodeid, linux_flags, want_dir,
+                                     &out);
+        pthread_mutex_unlock(&session->lock);
+        if (rc < 0) {
+            pthread_mutex_lock(&fuse_lock);
+            fuse_file_put_locked(file); /* releases the open-fd ref */
+            fuse_session_put_locked(session);
+            pthread_mutex_unlock(&fuse_lock);
+            fd_mark_closed(guest_fd);
+            return rc;
+        }
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    file->dir = want_dir;
+    file->nodeid = nodeid;
+    file->fh = out.fh;
+    file->linux_flags = linux_flags;
+    file->path_only = path_only;
+    /* Donate the session ref taken above to the file's own ref slot. The
+     * mount pointer itself is not cached; mount_id is enough to detect
+     * stale mount-slot reassignment without dereferencing a recycled
+     * fuse_mount_t.
+     */
+    file->session = session;
+    file->mount_id = mount_id;
+    str_copy_trunc(file->path, canon, sizeof(file->path));
+    file->attr = attr;
+    if (fuse_bind_file_fd_locked(guest_fd, file) < 0) {
+        /* Take a private session ref so the session outlives the file put. */
+        fuse_session_get_locked(session);
+        fuse_file_put_locked(file);
+        pthread_mutex_unlock(&fuse_lock);
+        /* An O_PATH open never sent an open request to the daemon, so there
+         * is no handle to release; out.fh is still the zeroed placeholder.
+         */
+        if (!path_only) {
+            pthread_mutex_lock(&session->lock);
+            (void) fuse_release_common_locked(session, want_dir, nodeid,
+                                              out.fh, linux_flags);
+            pthread_mutex_unlock(&session->lock);
+        }
+        fuse_open_path_put_session(session);
+        fd_mark_closed(guest_fd);
+        return -LINUX_EMFILE;
+    }
+    fd_table[guest_fd].linux_flags = linux_flags;
+    pthread_mutex_unlock(&fuse_lock);
+    return guest_fd;
+}
+
+/* Snapshot of fuse_file_t fields needed for a single read/readdir request.
+ * The snapshot pins both the session (via session_get_locked) and the file
+ * slot (via file_get_locked) for the duration of the operation. file is the
+ * actual fuse_file_t pointer so io_in_progress / io_cond can be touched on
+ * release without re-looking-up by guest_fd.
+ */
+typedef struct {
+    fuse_file_t *file;
+    fuse_session_t *session;
+    uint64_t nodeid;
+    uint64_t fh;
+    int mount_id;
+    int linux_flags;
+    bool path_only;
+    bool dir;
+    bool serialize; /* true for stream reads/readdir; false for pread */
+} fuse_file_snap_t;
+
+/* Acquire exclusive offset-affecting access to a FUSE file, bump the file
+ * and session refcounts, and snapshot the identity into snap. Stream reads
+ * (and readdir) pass serialize=true so concurrent read() calls on the same
+ * fd are serialized via io_in_progress, matching Linux f_pos_lock
+ * semantics. pread/getattr-style operations that do not touch the stream
+ * offset pass serialize=false. Returns 0 on success or a negative Linux
+ * errno.
+ *
+ * Fails with -LINUX_EBADF when guest_fd has no live FUSE file or when a
+ * directory fd is used with want_dir=false, and with -LINUX_ENOTDIR when
+ * want_dir=true but the fd is not a directory. Every successful call must
+ * be paired with fuse_file_release() on the same snapshot.
+ */
+static int fuse_file_acquire(int guest_fd,
+                             bool want_dir,
+                             bool serialize,
+                             fuse_file_snap_t *snap)
+{
+    pthread_mutex_lock(&fuse_lock);
+    for (;;) {
+        fuse_file_t *file = fuse_file_by_fd_locked(guest_fd);
+        if (!file || !file->session) {
+            pthread_mutex_unlock(&fuse_lock);
+            return -LINUX_EBADF;
+        }
+        if (file->dir != want_dir) {
+            pthread_mutex_unlock(&fuse_lock);
+            return want_dir ? -LINUX_ENOTDIR : -LINUX_EBADF;
+        }
+        if (serialize && file->io_in_progress) {
+            /* Bump file ref so the slot (and io_cond) survive the wait. */
+            fuse_file_get_locked(file);
+            pthread_cond_wait(&file->io_cond, &fuse_lock);
+            fuse_file_put_locked(file);
+            /* Re-look-up: the slot may have been closed or replaced. */
+            continue;
+        }
+        /* Claim the stream; cleared again by fuse_file_release(). */
+        if (serialize)
+            file->io_in_progress = true;
+        snap->file = file;
+        snap->session = file->session;
+        snap->nodeid = file->nodeid;
+        snap->fh = file->fh;
+        snap->mount_id = file->mount_id;
+        snap->linux_flags = file->linux_flags;
+        snap->path_only = file->path_only;
+        snap->dir = file->dir;
+        snap->serialize = serialize;
+        /* Pin both objects so they stay valid after fuse_lock is dropped. */
+        fuse_file_get_locked(file);
+        fuse_session_get_locked(snap->session);
+        pthread_mutex_unlock(&fuse_lock);
+        return 0;
+    }
+}
+
+/* Undo fuse_file_acquire(): when the snapshot was taken with serialize set,
+ * clear io_in_progress and wake every thread waiting on io_cond, then drop
+ * the file and session references taken by the acquire.
+ */
+static void fuse_file_release(fuse_file_snap_t *snap)
+{
+    pthread_mutex_lock(&fuse_lock);
+    if (snap->serialize) {
+        /* Clearing io_in_progress on the pinned slot is safe even if the
+         * slot has been logically closed; in that case the broadcast goes
+         * to no live waiter and the slot will be zeroed on the final put.
+         */
+        snap->file->io_in_progress = false;
+        pthread_cond_broadcast(&snap->file->io_cond);
+    }
+    fuse_file_put_locked(snap->file);
+    fuse_session_put_locked(snap->session);
+    pthread_mutex_unlock(&fuse_lock);
+}
+
+/* Read up to count bytes from a FUSE-backed file or directory at offset.
+ * Writes the daemon's reply into the guest buffer at buf_gva.
Updates the
+ * stream offset on success when advance_offset is true and the fd still
+ * references the same (session, mount_id, nodeid) identity (post-close
+ * races leave offsets untouched).
+ *
+ * The request size is count capped at the session's max_write. A reply
+ * longer than the requested size is a daemon protocol violation: nothing is
+ * copied to the guest and -LINUX_EIO is returned, so the daemon can never
+ * make this function write past the count bytes the guest supplied.
+ *
+ * Returns the number of bytes copied to the guest, -LINUX_EBADF for an
+ * O_PATH snapshot, -LINUX_EFAULT when the guest buffer cannot be written,
+ * or the negative Linux errno of the failed request.
+ */
+static int64_t fuse_read_common(guest_t *g,
+                                int guest_fd,
+                                fuse_file_snap_t *snap,
+                                uint64_t buf_gva,
+                                uint64_t count,
+                                uint64_t offset,
+                                bool advance_offset)
+{
+    if (snap->path_only)
+        return -LINUX_EBADF;
+
+    pthread_mutex_lock(&snap->session->lock);
+    uint32_t size = (count < (uint64_t) snap->session->max_write)
+                        ? (uint32_t) count
+                        : snap->session->max_write;
+    fuse_read_in_t in = {
+        .fh = snap->fh,
+        .offset = offset,
+        .size = size,
+        .flags = (uint32_t) snap->linux_flags,
+    };
+    uint8_t *reply = NULL;
+    size_t reply_len = 0;
+    int rc =
+        fuse_request_locked(snap->session, snap->dir ? FUSE_READDIR : FUSE_READ,
+                            snap->nodeid, &in, sizeof(in), &reply, &reply_len);
+    pthread_mutex_unlock(&snap->session->lock);
+    if (rc < 0) {
+        /* Same cleanup as the materialize loop; free(NULL) is a no-op. */
+        free(reply);
+        return rc;
+    }
+    /* Never trust the daemon's length: size <= count, so this bound also
+     * keeps the copy inside the guest buffer.
+     */
+    if (reply_len > size) {
+        free(reply);
+        return -LINUX_EIO;
+    }
+    if (guest_write(g, buf_gva, reply, reply_len) < 0) {
+        free(reply);
+        return -LINUX_EFAULT;
+    }
+    free(reply);
+    if (advance_offset) {
+        pthread_mutex_lock(&fuse_lock);
+        fuse_file_t *file = fuse_file_by_fd_locked(guest_fd);
+        if (file && file->session == snap->session &&
+            file->mount_id == snap->mount_id && file->nodeid == snap->nodeid)
+            file->offset = offset + reply_len;
+        pthread_mutex_unlock(&fuse_lock);
+    }
+    return (int64_t) reply_len;
+}
+
+/* read(2) on a FUSE-backed regular file: reads at the fd's stream offset
+ * and advances it. Returns the byte count or a negative Linux errno.
+ */
+int64_t fuse_read_fd(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
+{
+    fuse_file_snap_t snap;
+    int err = fuse_file_acquire(fd, false, true, &snap);
+    if (err < 0)
+        return err;
+    /* The acquire holds io_in_progress, so the offset is stable.
*/
+    uint64_t offset = snap.file->offset;
+    int64_t rc = fuse_read_common(g, fd, &snap, buf_gva, count, offset, true);
+    fuse_file_release(&snap);
+    return rc;
+}
+
+/* pread64(2) on a FUSE-backed regular file: reads at the caller-supplied
+ * offset and leaves the stream offset untouched. A negative offset yields
+ * -LINUX_EINVAL. The acquire is not serialized against stream reads.
+ */
+int64_t fuse_pread_fd(guest_t *g,
+                      int fd,
+                      uint64_t buf_gva,
+                      uint64_t count,
+                      int64_t offset)
+{
+    if (offset < 0)
+        return -LINUX_EINVAL;
+    fuse_file_snap_t snap;
+    int err = fuse_file_acquire(fd, false, false, &snap);
+    if (err < 0)
+        return err;
+    int64_t rc = fuse_read_common(g, fd, &snap, buf_gva, count,
+                                  (uint64_t) offset, false);
+    fuse_file_release(&snap);
+    return rc;
+}
+
+/* getdents64(2) on a FUSE-backed directory.
+ *
+ * Issues one FUSE_READDIR at the directory's current offset, then rewrites
+ * the daemon's fuse_dirent_t records as Linux dirent64 records in the guest
+ * buffer. The raw reply is first written to buf_gva by fuse_read_common()
+ * and copied back into a host buffer, so the conversion never reads from
+ * memory it is overwriting. After each converted entry the fd's offset is
+ * set to that entry's off value, provided the fd still names the same
+ * (session, mount_id, nodeid).
+ *
+ * Returns the number of bytes of dirent64 data written, 0 or a negative
+ * Linux errno straight from the read, -LINUX_ENOMEM, -LINUX_EFAULT, or
+ * -LINUX_EIO when the first record carries a name longer than 255 bytes.
+ * A failure after at least one entry was written returns the bytes written
+ * so far instead of the error.
+ */
+int64_t fuse_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
+{
+    fuse_file_snap_t snap;
+    int err = fuse_file_acquire(fd, true, true, &snap);
+    if (err < 0)
+        return err;
+
+    uint64_t dir_off = snap.file->offset;
+
+    int64_t raw =
+        fuse_read_common(g, fd, &snap, buf_gva, count, dir_off, false);
+    if (raw <= 0) {
+        fuse_file_release(&snap);
+        return raw;
+    }
+
+    uint8_t *tmp = malloc((size_t) raw);
+    if (!tmp) {
+        fuse_file_release(&snap);
+        return -LINUX_ENOMEM;
+    }
+    if (guest_read(g, buf_gva, tmp, (size_t) raw) < 0) {
+        free(tmp);
+        fuse_file_release(&snap);
+        return -LINUX_EFAULT;
+    }
+
+    /* src walks the raw FUSE records in tmp; dst is the write position of
+     * the converted records in the guest buffer.
+     */
+    size_t src = 0;
+    size_t dst = 0;
+    while (src + FUSE_NAME_OFFSET <= (size_t) raw) {
+        fuse_dirent_t *fde = (fuse_dirent_t *) (tmp + src);
+        /* The daemon supplies fde->namelen; bound it to Linux NAME_MAX before
+         * any further arithmetic so a malicious daemon cannot make
+         * lreclen overflow the fixed entry[] buffer below or exceed the
+         * remaining frame body.
+         */
+        if (fde->namelen > 255) {
+            free(tmp);
+            fuse_file_release(&snap);
+            return dst ? (int64_t) dst : -LINUX_EIO;
+        }
+        size_t freclen = FUSE_DIRENT_SIZE_RAW(fde->namelen);
+        if (freclen < FUSE_NAME_OFFSET + fde->namelen)
+            break;
+        /* A record truncated by the end of the reply ends the conversion. */
+        if (src + freclen > (size_t) raw)
+            break;
+
+        /* 19 = bytes before d_name in the record built below; +1 for the
+         * NUL, then round up to a multiple of 8.
+         */
+        size_t lreclen = (19 + fde->namelen + 1 + 7) & ~7ULL;
+        /* d_ino(8) + d_off(8) + d_reclen(2) + d_type(1) + name(<=255) +
+         * NUL(1) + padding(<=7) <= 280. Defense in depth against an
+         * arithmetic error -- never trust the daemon's record length.
+         */
+        uint8_t entry[280];
+        if (lreclen > sizeof(entry))
+            break;
+        if (dst + lreclen > count)
+            break;
+
+        struct {
+            uint64_t d_ino;
+            int64_t d_off;
+            uint16_t d_reclen;
+            uint8_t d_type;
+        } lde = {
+            .d_ino = fde->ino,
+            .d_off = (int64_t) fde->off,
+            .d_reclen = (uint16_t) lreclen,
+            .d_type = (uint8_t) fde->type,
+        };
+        /* Any bytes of lde copied at or past offset 19 are overwritten by
+         * the name, the NUL and the zero fill that follow.
+         */
+        memcpy(entry, &lde, sizeof(lde));
+        memcpy(entry + 19, fde->name, fde->namelen);
+        entry[19 + fde->namelen] = '\0';
+        if (19 + fde->namelen + 1 < lreclen)
+            memset(entry + 19 + fde->namelen + 1, 0,
+                   lreclen - (19 + fde->namelen + 1));
+        if (guest_write(g, buf_gva + dst, entry, lreclen) < 0) {
+            free(tmp);
+            fuse_file_release(&snap);
+            return dst ? (int64_t) dst : -LINUX_EFAULT;
+        }
+        dst += lreclen;
+        src += freclen;
+        pthread_mutex_lock(&fuse_lock);
+        fuse_file_t *cur = fuse_file_by_fd_locked(fd);
+        if (cur && cur->session == snap.session &&
+            cur->mount_id == snap.mount_id && cur->nodeid == snap.nodeid)
+            cur->offset = fde->off;
+        pthread_mutex_unlock(&fuse_lock);
+    }
+
+    free(tmp);
+    fuse_file_release(&snap);
+    return (int64_t) dst;
+}
+
+/* read(2) on /dev/fuse: hand the request at the head of the session queue
+ * to the daemon. Blocks until a request is queued or the session closes,
+ * unless the fd has LINUX_O_NONBLOCK set, in which case an empty queue
+ * yields -LINUX_EAGAIN. Returns the frame length on success or a negative
+ * Linux errno.
+ */
+int64_t fuse_dev_read(int guest_fd,
+                      guest_t *g,
+                      uint64_t buf_gva,
+                      uint64_t count)
+{
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_t *session = fuse_session_by_fd_locked(guest_fd);
+    if (!session) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    fuse_session_get_locked(session);
+    pthread_mutex_unlock(&fuse_lock);
+
+    pthread_mutex_lock(&session->lock);
+    while (!session->closed && !session->queue_head) {
+        if (fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK) {
+            pthread_mutex_unlock(&session->lock);
+            pthread_mutex_lock(&fuse_lock);
+            fuse_session_put_locked(session);
+            pthread_mutex_unlock(&fuse_lock);
+            return -LINUX_EAGAIN;
+        }
+        pthread_cond_wait(&session->queue_cond, &session->lock);
+    }
+    if (session->closed || session->daemon_dead) {
+        pthread_mutex_unlock(&session->lock);
+
pthread_mutex_lock(&fuse_lock);
+        fuse_session_put_locked(session);
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_ENOTCONN;
+    }
+
+    /* Peek the head request: short reads MUST NOT consume the frame, or the
+     * consumer blocks forever waiting for a reply that can never be matched.
+     * fuse(4) lets the daemon retry the read with a larger buffer.
+     */
+    fuse_request_t *req = session->queue_head;
+    size_t frame_len = req->frame_len;
+    if (count < frame_len) {
+        pthread_mutex_unlock(&session->lock);
+        pthread_mutex_lock(&fuse_lock);
+        fuse_session_put_locked(session);
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EINVAL;
+    }
+    if (guest_write(g, buf_gva, req->frame, frame_len) < 0) {
+        pthread_mutex_unlock(&session->lock);
+        pthread_mutex_lock(&fuse_lock);
+        fuse_session_put_locked(session);
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EFAULT;
+    }
+    /* Commit: dequeue only after the copy succeeded. */
+    session->queue_head = req->next;
+    if (!session->queue_head)
+        session->queue_tail = NULL;
+    req->next = NULL;
+    pthread_mutex_unlock(&session->lock);
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_put_locked(session);
+    pthread_mutex_unlock(&fuse_lock);
+    return (int64_t) frame_len;
+}
+
+/* write(2) on /dev/fuse: accept one reply frame from the daemon and hand it
+ * to the request whose unique id it carries.
+ *
+ * The frame is a fuse_out_header_t followed by hdr.len - sizeof(hdr) bytes
+ * of payload. The payload is copied into req->reply, req->error is set, and
+ * the waiter on req->cond is woken. A reply to FUSE_INIT additionally
+ * negotiates max_write / max_pages for the session or marks the daemon dead.
+ *
+ * Returns count on success. Returns -LINUX_EINVAL for a frame that is too
+ * short, larger than FUSE_FRAME_CAP, has an inconsistent hdr.len, carries an
+ * error value outside (-512, 0], or names no request that is still awaiting
+ * its reply; -LINUX_ENOMEM / -LINUX_EFAULT when the frame cannot be copied
+ * in; -LINUX_EBADF when guest_fd has no session.
+ */
+int64_t fuse_dev_write(guest_t *g,
+                       int guest_fd,
+                       uint64_t buf_gva,
+                       uint64_t count)
+{
+    if (count < sizeof(fuse_out_header_t))
+        return -LINUX_EINVAL;
+    /* Reject any daemon write that exceeds the implementation hard ceiling.
+     * The same ceiling is applied at FUSE_INIT negotiation, so a daemon
+     * cannot advertise max_write larger than this and then have its reply
+     * payload silently rejected here.
+     */
+    if (count > FUSE_FRAME_CAP)
+        return -LINUX_EINVAL;
+
+    uint8_t *buf = malloc((size_t) count);
+    if (!buf)
+        return -LINUX_ENOMEM;
+    if (guest_read(g, buf_gva, buf, (size_t) count) < 0) {
+        free(buf);
+        return -LINUX_EFAULT;
+    }
+
+    fuse_out_header_t hdr;
+    memcpy(&hdr, buf, sizeof(hdr));
+    if (hdr.len > count || hdr.len < sizeof(hdr)) {
+        free(buf);
+        return -LINUX_EINVAL;
+    }
+    /* The consumer side treats req->error as zero or a negative Linux
+     * errno. Refuse anything else up front, using the same window the Linux
+     * FUSE device applies to reply headers, so a positive value can never
+     * reach a waiter and be mistaken for success.
+     */
+    if (hdr.error > 0 || hdr.error <= -512) {
+        free(buf);
+        return -LINUX_EINVAL;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_t *session = fuse_session_by_fd_locked(guest_fd);
+    if (!session) {
+        pthread_mutex_unlock(&fuse_lock);
+        free(buf);
+        return -LINUX_EBADF;
+    }
+    fuse_session_get_locked(session);
+    pthread_mutex_unlock(&fuse_lock);
+
+    pthread_mutex_lock(&session->lock);
+    fuse_request_t *req = NULL;
+    for (int i = 0; i < FUSE_MAX_PENDING; i++) {
+        /* Skip requests that already have their reply: answering one twice
+         * would overwrite (and leak) the first req->reply buffer.
+         */
+        if (session->requests[i].used && !session->requests[i].answered &&
+            session->requests[i].unique == hdr.unique) {
+            req = &session->requests[i];
+            break;
+        }
+    }
+    if (!req) {
+        pthread_mutex_unlock(&session->lock);
+        pthread_mutex_lock(&fuse_lock);
+        fuse_session_put_locked(session);
+        pthread_mutex_unlock(&fuse_lock);
+        free(buf);
+        return -LINUX_EINVAL;
+    }
+
+    req->answered = true;
+    /* The daemon's error field is in Linux errno space (negative). Pass it
+     * through unchanged; the consumer side already treats req->error as a
+     * negative Linux errno.
+     */
+    req->error = hdr.error;
+    req->reply_len = hdr.len - sizeof(hdr);
+    if (req->reply_len) {
+        req->reply = malloc(req->reply_len);
+        if (!req->reply) {
+            /* Keep (reply, reply_len) consistent so nothing downstream,
+             * including the FUSE_INIT parsing below, reads from NULL.
+             */
+            req->error = -LINUX_ENOMEM;
+            req->reply_len = 0;
+        } else {
+            memcpy(req->reply, buf + sizeof(hdr), req->reply_len);
+        }
+    }
+
+    if (req->frame && ((fuse_in_header_t *) req->frame)->opcode == FUSE_INIT) {
+        /* fuse(4): the daemon may return a fuse_init_out_t shorter than the
+         * current struct size (older libfuse), and may negotiate a minor
+         * version different from ours. Accept any reply whose major matches
+         * and that is large enough to carry max_write. Reject only on an
+         * actual major-version mismatch or daemon-reported error.
+         */
+        const size_t init_min_len = offsetof(fuse_init_out_t, max_write) +
+                                    sizeof(((fuse_init_out_t *) 0)->max_write);
+        if (hdr.error == 0 && req->reply_len >= init_min_len) {
+            fuse_init_out_t init_out;
+            memset(&init_out, 0, sizeof(init_out));
+            size_t copy_len = req->reply_len < sizeof(init_out)
+                                  ? req->reply_len
+                                  : sizeof(init_out);
+            memcpy(&init_out, req->reply, copy_len);
+            if (init_out.major != FUSE_KERNEL_VERSION) {
+                req->error = -LINUX_EPROTO;
+                session->daemon_dead = true;
+            } else {
+                uint32_t neg_write =
+                    init_out.max_write ? init_out.max_write : 65536;
+                if (neg_write > FUSE_MAX_NEGOTIATED_WRITE)
+                    neg_write = FUSE_MAX_NEGOTIATED_WRITE;
+                session->max_write = neg_write;
+                session->max_pages =
+                    init_out.max_pages ? init_out.max_pages : 16;
+                session->init_done = true;
+            }
+        } else {
+            /* req->error already holds the daemon's negative error or the
+             * local -LINUX_ENOMEM; only a too-short reply with error 0
+             * still needs an error code.
+             */
+            if (req->error >= 0)
+                req->error = -LINUX_EPROTO;
+            session->daemon_dead = true;
+        }
+        pthread_cond_broadcast(&session->init_cond);
+    }
+    pthread_cond_broadcast(&req->cond);
+    pthread_mutex_unlock(&session->lock);
+    pthread_mutex_lock(&fuse_lock);
+    fuse_session_put_locked(session);
+    pthread_mutex_unlock(&fuse_lock);
+    free(buf);
+    return (int64_t) count;
+}
+
+/* True when fd is an open /dev/fuse descriptor. */
+bool fuse_is_device_fd(int fd)
+{
+    fd_entry_t snap;
+    return fd_snapshot(fd, &snap) && snap.type == FD_FUSE_DEV;
+}
+
+/* True when fd is a FUSE-backed non-directory descriptor. */
+bool fuse_is_file_fd(int fd)
+{
+    fd_entry_t snap;
+    return fd_snapshot(fd, &snap) && snap.type == FD_FUSE_FILE;
+}
+
+/* True when fd is a FUSE-backed directory descriptor. */
+bool fuse_is_dir_fd(int fd)
+{
+    fd_entry_t snap;
+    return fd_snapshot(fd, &snap) && snap.type == FD_FUSE_DIR;
+}
+
+/* True when fd is a FUSE-backed file or directory with a live fuse_file_t
+ * that was not opened with O_PATH. False for every other fd.
+ */
+bool fuse_fd_refuse_mmap(int fd)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(fd, &snap) ||
+        (snap.type != FD_FUSE_FILE && snap.type != FD_FUSE_DIR))
+        return false;
+
+    pthread_mutex_lock(&fuse_lock);
+    fuse_file_t *file = fuse_file_by_fd_locked(fd);
+    bool refuse = file && !file->path_only;
+    pthread_mutex_unlock(&fuse_lock);
+    return refuse;
+}
+
+/* lseek(2) on a FUSE fd. Returns the new offset, INT64_MIN when fd is not a
+ * FUSE file or directory, or a negative Linux errno: -LINUX_ESPIPE for
+ * /dev/fuse, -LINUX_EBADF for an unknown or O_PATH fd, -LINUX_EINVAL for an
+ * unsupported whence or a negative result, -LINUX_EOVERFLOW when
+ * base + offset does not fit in int64_t. SEEK_END uses the size cached in
+ * file->attr; no daemon request is sent.
+ */
+int64_t fuse_lseek_fd(int fd, int64_t offset, int whence)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(fd, &snap))
+        return -LINUX_EBADF;
+    /* /dev/fuse: stream-like, no absolute position. Linux returns ESPIPE on
+     * lseek of a pipe-equivalent fd.
+     */
+    if (snap.type == FD_FUSE_DEV)
+        return -LINUX_ESPIPE;
+    if (snap.type != FD_FUSE_FILE && snap.type != FD_FUSE_DIR)
+        return INT64_MIN;
+
+    /* Fast-reject O_PATH fds before the wait loop, then re-lookup under
+     * the lock for the seek itself.
+     */
+    pthread_mutex_lock(&fuse_lock);
+    fuse_file_t *probe = fuse_file_by_fd_locked(fd);
+    if (!probe || !probe->session) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    if (probe->path_only) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    pthread_mutex_unlock(&fuse_lock);
+
+    /* Linux SEEK_SET=0, SEEK_CUR=1, SEEK_END=2. SEEK_HOLE/SEEK_DATA are not
+     * supported against FUSE files without daemon round-trip; reject for now.
+     */
+    if (whence != 0 && whence != 1 && whence != 2)
+        return -LINUX_EINVAL;
+
+    pthread_mutex_lock(&fuse_lock);
+    /* Block while a stream read is in flight on this fd so the seek does
+     * not race the post-read offset update. The wait holds a file ref so
+     * io_cond cannot be destroyed under it.
+     */
+    for (;;) {
+        fuse_file_t *waiter = fuse_file_by_fd_locked(fd);
+        if (!waiter || !waiter->session) {
+            pthread_mutex_unlock(&fuse_lock);
+            return -LINUX_EBADF;
+        }
+        if (!waiter->io_in_progress)
+            break;
+        fuse_file_get_locked(waiter);
+        pthread_cond_wait(&waiter->io_cond, &fuse_lock);
+        fuse_file_put_locked(waiter);
+    }
+    fuse_file_t *file = fuse_file_by_fd_locked(fd);
+    if (!file || !file->session) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    int64_t cur = (int64_t) file->offset;
+    int64_t end_size = (int64_t) file->attr.size;
+    int64_t base;
+    switch (whence) {
+    case 0:
+        base = 0;
+        break;
+    case 1:
+        base = cur;
+        break;
+    case 2:
+        base = end_size;
+        break;
+    default:
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EINVAL;
+    }
+    /* Both overflow checks complete before the addition itself; INT64_MIN
+     * for offset would otherwise produce signed-overflow UB on the bounds
+     * test.
+     */
+    if (offset > 0 && base > INT64_MAX - offset) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EOVERFLOW;
+    }
+    if (offset < 0 && base < INT64_MIN - offset) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EOVERFLOW;
+    }
+    int64_t new_off = base + offset;
+    if (new_off < 0) {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EINVAL;
+    }
+    file->offset = (uint64_t) new_off;
+    pthread_mutex_unlock(&fuse_lock);
+    return new_off;
+}
+
+/* fchdir(2) onto a FUSE-backed directory fd: records the directory's stored
+ * absolute path as the virtual cwd. Returns INT64_MIN when fd is not a FUSE
+ * fd at all, -LINUX_ENOTDIR for a FUSE file or /dev/fuse fd, -LINUX_EBADF
+ * when the fd has no live directory entry with an absolute path.
+ */
+int64_t fuse_fchdir(int fd)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(fd, &snap))
+        return INT64_MIN;
+    if (snap.type != FD_FUSE_DIR) {
+        if (snap.type == FD_FUSE_FILE || snap.type == FD_FUSE_DEV)
+            return -LINUX_ENOTDIR;
+        return INT64_MIN;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    fuse_file_t *file = fuse_file_by_fd_locked(fd);
+    if (!file || !file->session || !file->dir || file->path[0] != '/') {
+        pthread_mutex_unlock(&fuse_lock);
+        return -LINUX_EBADF;
+    }
+    char path[LINUX_PATH_MAX];
+    str_copy_trunc(path, file->path, sizeof(path));
+    pthread_mutex_unlock(&fuse_lock);
+    proc_cwd_set_virtual(path);
+
return 0;
+}
+
+/* Duplicate a FUSE fd (/dev/fuse, file or directory) onto a new guest fd.
+ *
+ * With fixed_slot the new fd is allocated at fixed_guest_fd, otherwise at
+ * the first free slot from min_guest_fd upward. The new fd is bound to the
+ * same session (for /dev/fuse) or the same fuse_file_t (for files and
+ * directories) as src_fd, taking one extra reference on it. Its flags are
+ * the O_PATH / O_DIRECTORY / O_NOFOLLOW / O_DIRECT / O_LARGEFILE bits of
+ * src_fd OR-ed with linux_flags.
+ *
+ * Unlike most functions in this file, failure is reported host-style:
+ * returns -1 with errno set (EBADF or EMFILE here; when a non-fixed
+ * allocation fails, errno is whatever fd_alloc_from_relaxed() left).
+ */
+int fuse_dup_fd(int src_fd,
+                int min_guest_fd,
+                int fixed_guest_fd,
+                bool fixed_slot,
+                int linux_flags)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(src_fd, &snap)) {
+        errno = EBADF;
+        return -1;
+    }
+    if (snap.type != FD_FUSE_DEV && snap.type != FD_FUSE_FILE &&
+        snap.type != FD_FUSE_DIR) {
+        errno = EBADF;
+        return -1;
+    }
+
+    int guest_fd = fixed_slot
+                       ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type, -1)
+                       : fd_alloc_from_relaxed(min_guest_fd, snap.type, -1);
+    if (guest_fd < 0) {
+        if (fixed_slot)
+            errno = EBADF;
+        return -1;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    if (snap.type == FD_FUSE_DEV) {
+        fuse_session_t *session = fuse_session_by_fd_locked(src_fd);
+        if (!session) {
+            pthread_mutex_unlock(&fuse_lock);
+            fd_mark_closed(guest_fd);
+            errno = EBADF;
+            return -1;
+        }
+        /* The new fd owns this reference; it is returned if binding fails. */
+        fuse_session_get_locked(session);
+        if (fuse_bind_dev_fd_locked(guest_fd, session) < 0) {
+            fuse_session_put_locked(session);
+            pthread_mutex_unlock(&fuse_lock);
+            fd_mark_closed(guest_fd);
+            errno = EMFILE;
+            return -1;
+        }
+    } else {
+        fuse_file_t *file = fuse_file_by_fd_locked(src_fd);
+        if (!file || !file->session) {
+            pthread_mutex_unlock(&fuse_lock);
+            fd_mark_closed(guest_fd);
+            errno = EBADF;
+            return -1;
+        }
+        /* Both fds share the one fuse_file_t, including its offset. */
+        fuse_file_get_locked(file);
+        if (fuse_bind_file_fd_locked(guest_fd, file) < 0) {
+            fuse_file_put_locked(file);
+            pthread_mutex_unlock(&fuse_lock);
+            fd_mark_closed(guest_fd);
+            errno = EMFILE;
+            return -1;
+        }
+    }
+
+    int preserved_flags = fd_table[src_fd].linux_flags &
+                          (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
+                           LINUX_O_DIRECT | LINUX_O_LARGEFILE);
+    fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
+    fd_table[guest_fd].cleanup = fuse_fd_cleanup;
+    pthread_mutex_unlock(&fuse_lock);
+    return guest_fd;
+}
+
+/* Turn a relative path given to an *at() call into an absolute virtual path
+ * when its base directory lives in a FUSE mount.
+ *
+ * Returns 1 and fills out when the path was resolved into FUSE land, 0 when
+ * FUSE is not involved (NULL, empty or absolute path; dirfd that is not a
+ * FUSE directory; cwd outside every FUSE mount or unavailable), and -1 on
+ * failure (errno = EBADF for a dirfd without a live directory entry; a
+ * failed join also returns -1).
+ */
+int fuse_resolve_at_path(guest_fd_t dirfd,
+                         const char *path,
+                         char *out,
+                         size_t outsz)
+{
+    if (!path || path[0] == '\0' || path[0] == '/')
+        return 0;
+
+    if (dirfd != LINUX_AT_FDCWD) {
+        fd_entry_t snap;
+        if (!fd_snapshot(dirfd, &snap) || snap.type != FD_FUSE_DIR)
+            return 0;
+
+        pthread_mutex_lock(&fuse_lock);
+        fuse_file_t *file = fuse_file_by_fd_locked(dirfd);
+        if (!file || !file->session || !file->dir || file->path[0] != '/') {
+            pthread_mutex_unlock(&fuse_lock);
+            errno = EBADF;
+            return -1;
+        }
+        /* Copy the base path so the join runs without fuse_lock held. */
+        char base[LINUX_PATH_MAX];
+        str_copy_trunc(base, file->path, sizeof(base));
+        pthread_mutex_unlock(&fuse_lock);
+        if (fuse_join_virtual_path(base, path, out, outsz) < 0)
+            return -1;
+        return 1;
+    }
+
+    proc_cwd_view_t view;
+    if (proc_acquire_cwd_view(&view) < 0)
+        return 0;
+
+    /* fuse_path_matches_mount returns true for both live and tombstoned
+     * mounts, so a virtual cwd left dangling by daemon death still routes
+     * the relative lookup into FUSE land. The follow-on
+     * fuse_path_lookup / fuse_open_path / fuse_stat_path call detects the
+     * tombstoned mount and surfaces -LINUX_ENOTCONN instead of letting the
+     * resolution fall back to host-relative open against the host cwd.
+     */
+    int rc = 0;
+    if (view.path && view.path[0] == '/' &&
+        fuse_path_matches_mount(view.path)) {
+        rc = fuse_join_virtual_path(view.path, path, out, outsz) < 0 ? -1 : 1;
+    }
+    proc_release_cwd_view(&view);
+    return rc;
+}
+
+/* Store the mount id of the FUSE file behind fd in *mnt_id_out. Returns 0
+ * on success, or -1 with errno = EINVAL for a NULL output pointer and
+ * errno = ENOENT when fd has no live FUSE file.
+ */
+int fuse_fd_mnt_id(int fd, int *mnt_id_out)
+{
+    if (!mnt_id_out) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    fuse_file_t *file = fuse_file_by_fd_locked(fd);
+    if (!file || !file->session) {
+        pthread_mutex_unlock(&fuse_lock);
+        errno = ENOENT;
+        return -1;
+    }
+    *mnt_id_out = file->mount_id;
+    pthread_mutex_unlock(&fuse_lock);
+    return 0;
+}
+
+/* Append one mountinfo-style line per used FUSE mount slot to buf, starting
+ * at *off and advancing *off past each line written. Returns 0 on success,
+ * or -1 with errno = EINVAL for bad arguments and errno = ENAMETOOLONG when
+ * a line does not fit; lines appended before the failure stay counted in
+ * *off.
+ */
+int fuse_append_mountinfo(char *buf, size_t bufsz, size_t *off)
+{
+    if (!buf || !off || *off > bufsz) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    for (int i = 0; i < FUSE_MAX_MOUNTS; i++) {
+        if (!fuse_mounts[i].used)
+            continue;
+        int n = snprintf(
+            buf + *off, bufsz - *off,
+            "%d 1 0:%d / %s rw,nosuid,nodev,relatime - %s %s rw\n",
+            fuse_mounts[i].mount_id, FUSE_FAKE_DEV, fuse_mounts[i].path,
+            fuse_mounts[i].fstype[0] ? fuse_mounts[i].fstype : "fuse",
+            fuse_mounts[i].source[0] ? fuse_mounts[i].source : "fuse");
+        /* n >= remaining space means snprintf truncated the line. */
+        if (n < 0 || (size_t) n >= bufsz - *off) {
+            pthread_mutex_unlock(&fuse_lock);
+            errno = ENAMETOOLONG;
+            return -1;
+        }
+        *off += (size_t) n;
+    }
+    pthread_mutex_unlock(&fuse_lock);
+    return 0;
+}
+
+/* Same contract as fuse_append_mountinfo(), but emits the shorter
+ * "source path fstype options 0 0" line format.
+ */
+int fuse_append_mounts(char *buf, size_t bufsz, size_t *off)
+{
+    if (!buf || !off || *off > bufsz) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    pthread_mutex_lock(&fuse_lock);
+    for (int i = 0; i < FUSE_MAX_MOUNTS; i++) {
+        if (!fuse_mounts[i].used)
+            continue;
+        int n = snprintf(
+            buf + *off, bufsz - *off, "%s %s %s rw,nosuid,nodev,relatime 0 0\n",
+            fuse_mounts[i].source[0] ? fuse_mounts[i].source : "fuse",
+            fuse_mounts[i].path,
+            fuse_mounts[i].fstype[0] ?
fuse_mounts[i].fstype : "fuse"); + if (n < 0 || (size_t) n >= bufsz - *off) { + pthread_mutex_unlock(&fuse_lock); + errno = ENAMETOOLONG; + return -1; + } + *off += (size_t) n; + } + pthread_mutex_unlock(&fuse_lock); + return 0; +} diff --git a/src/syscall/fuse.h b/src/syscall/fuse.h new file mode 100644 index 0000000..0b5eaec --- /dev/null +++ b/src/syscall/fuse.h @@ -0,0 +1,70 @@ +#pragma once + +#include +#include +#include + +#include "core/guest.h" + +void fuse_init(void); + +int64_t sys_mount(guest_t *g, + uint64_t source_gva, + uint64_t target_gva, + uint64_t fstype_gva, + unsigned long flags, + uint64_t data_gva); + +int fuse_proc_open(int linux_flags); +int fuse_proc_stat(struct stat *st); + +int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode); +bool fuse_path_matches_mount(const char *path); +/* Stat a FUSE-mounted path. at_flags carries the Linux AT_* mask from the + * caller; only LINUX_AT_SYMLINK_NOFOLLOW is consulted today. When the + * daemon returns S_IFLNK for the final component and the caller did not + * request NOFOLLOW, the call surfaces -LINUX_ENOSYS because symlink + * target resolution is not implemented yet. With NOFOLLOW the symlink's + * own attrs are returned unchanged. 
+ */ +int fuse_stat_path(const char *path, struct stat *st, int at_flags); +int fuse_access_path(const char *path, int mode, int flags); +int fuse_materialize_path(const char *path, char *out_path, size_t outsz); +int fuse_fstat_fd(int fd, struct stat *st); +int64_t fuse_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); +int64_t fuse_read_fd(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); +int64_t fuse_pread_fd(guest_t *g, + int fd, + uint64_t buf_gva, + uint64_t count, + int64_t offset); +int64_t fuse_dev_read(int guest_fd, + guest_t *g, + uint64_t buf_gva, + uint64_t count); +int64_t fuse_dev_write(guest_t *g, + int guest_fd, + uint64_t buf_gva, + uint64_t count); + +bool fuse_is_device_fd(int fd); +bool fuse_is_file_fd(int fd); +bool fuse_is_dir_fd(int fd); +bool fuse_fd_refuse_mmap(int fd); + +/* Move the per-fd offset for a FUSE-backed regular file. Returns the new offset + * on success or a negative Linux errno. /dev/fuse and FUSE-backed directories + * return -ESPIPE/-EINVAL to match Linux semantics for fds that do not support + * absolute seeking. + */ +int64_t fuse_lseek_fd(int fd, int64_t offset, int whence); +int64_t fuse_fchdir(int fd); +int fuse_dup_fd(int src_fd, + int min_guest_fd, + int fixed_guest_fd, + bool fixed_slot, + int linux_flags); +int fuse_resolve_at_path(int dirfd, const char *path, char *out, size_t outsz); +int fuse_fd_mnt_id(int fd, int *mnt_id_out); +int fuse_append_mountinfo(char *buf, size_t bufsz, size_t *off); +int fuse_append_mounts(char *buf, size_t bufsz, size_t *off); diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 3a5a2d1..2760ce9 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -138,8 +138,9 @@ int64_t linux_errno(void); */ int translate_at_flags(int linux_flags); -/* Reject any flag bits outside `allowed`. Caller returns -LINUX_EINVAL on - * failure. Shared by every *at() handler that validates its flags argument. 
+/* Reject any flag bits outside the allowed mask. Caller returns + * -LINUX_EINVAL on failure. Shared by every *at() handler that validates + * its flags argument. */ static inline int validate_at_flags(int flags, int allowed) { diff --git a/src/syscall/io.c b/src/syscall/io.c index 3fa5b13..748ff98 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -29,6 +29,7 @@ #include "syscall/abi.h" #include "syscall/fd.h" +#include "syscall/fuse.h" #include "syscall/internal.h" #include "syscall/inotify.h" #include "syscall/io.h" @@ -591,8 +592,12 @@ static int64_t io_write_result(ssize_t ret) int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_EVENTFD) - return eventfd_write(fd, g, buf_gva, count); + if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { + if (fd_table[fd].type == FD_FUSE_DEV) + return fuse_dev_write(g, fd, buf_gva, count); + if (fd_table[fd].type == FD_EVENTFD) + return eventfd_write(fd, g, buf_gva, count); + } host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_checked(fd, &host_ref, true); @@ -640,6 +645,10 @@ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { + if (fd_table[fd].type == FD_FUSE_DEV) + return fuse_dev_read(fd, g, buf_gva, count); + if (fd_table[fd].type == FD_FUSE_FILE) + return fuse_read_fd(g, fd, buf_gva, count); if (fd_table[fd].type == FD_EVENTFD) return eventfd_read(fd, g, buf_gva, count); if (fd_table[fd].type == FD_SIGNALFD) @@ -694,6 +703,9 @@ int64_t sys_pread64(guest_t *g, uint64_t count, int64_t offset) { + if (fuse_is_file_fd(fd)) + return fuse_pread_fd(g, fd, buf_gva, count, offset); + host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_regular_io(fd, &host_ref); if (err < 0) diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 7f7c374..5edba87 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ 
-23,6 +23,7 @@ #include "runtime/thread.h" #include "syscall/abi.h" +#include "syscall/fuse.h" #include "syscall/internal.h" #include "syscall/mem.h" @@ -1177,6 +1178,9 @@ int64_t sys_mmap(guest_t *g, if (!is_anon && (offset & 4095)) return -LINUX_EINVAL; + if (!is_anon && fuse_fd_refuse_mmap(fd)) + return -LINUX_ENODEV; + /* Round length up to page size (overflow-safe) */ if (length > UINT64_MAX - 4095) return -LINUX_ENOMEM; diff --git a/src/syscall/path.c b/src/syscall/path.c index 971accf..c82b46f 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -17,6 +17,7 @@ #include "utils.h" #include "syscall/abi.h" +#include "syscall/fuse.h" #include "syscall/path.h" #include "syscall/proc.h" #include "syscall/sidecar.h" @@ -50,6 +51,8 @@ bool path_might_use_open_intercept(const char *path) return true; if (!strncmp(path, "/dev", 4)) return true; + if (fuse_path_matches_mount(path)) + return true; if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) return true; if (!strcmp(path, "/etc/mtab") || !strcmp(path, "/etc/passwd") || @@ -70,6 +73,10 @@ bool path_might_use_stat_intercept(const char *path) return true; if (!strncmp(path, "/dev/shm", 8)) return true; + if (!strcmp(path, "/dev/fuse")) + return true; + if (fuse_path_matches_mount(path)) + return true; if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) return true; @@ -149,6 +156,16 @@ int path_translate_at(guest_fd_t dirfd, if (tx->proc_resolved > 0) { tx->guest_path = tx->proc_path; tx->intercept_path = tx->proc_path; + } else { + int fuse_rc = fuse_resolve_at_path(dirfd, path, tx->guest_buf, + sizeof(tx->guest_buf)); + if (fuse_rc < 0) + return -1; + if (fuse_rc > 0) { + tx->guest_path = tx->guest_buf; + tx->intercept_path = tx->guest_buf; + tx->fuse_path = true; + } } errno = 0; diff --git a/src/syscall/path.h b/src/syscall/path.h index 0b95f23..765a6e3 100644 --- a/src/syscall/path.h +++ b/src/syscall/path.h @@ -25,7 +25,9 @@ typedef struct { const char 
*intercept_path; const char *host_path; int proc_resolved; + bool fuse_path; char proc_path[LINUX_PATH_MAX]; + char guest_buf[LINUX_PATH_MAX]; char host_buf[LINUX_PATH_MAX]; } path_translation_t; diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index b0b15e1..4548021 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -46,6 +46,7 @@ #include "syscall/abi.h" #include "syscall/exec.h" #include "syscall/fd.h" +#include "syscall/fuse.h" #include "syscall/fs.h" #include "syscall/inotify.h" #include "syscall/internal.h" @@ -94,6 +95,7 @@ void syscall_init(void) timerfd_init(); inotify_init(); netlink_init(); + fuse_init(); wakeup_pipe_init(); } @@ -202,6 +204,7 @@ SC_FORWARD(sc_readlinkat, sys_readlinkat(g, (int) x0, x1, x2, x3)) SC_FORWARD(sc_mknodat, sys_mknodat(g, (int) x0, x1, (int) x2, (int) x3)) SC_FORWARD(sc_symlinkat, sys_symlinkat(g, x0, (int) x1, x2)) SC_FORWARD(sc_linkat, sys_linkat(g, (int) x0, x1, (int) x2, x3, (int) x4)) +SC_FORWARD(sc_mount, sys_mount(g, x0, x1, x2, (unsigned long) x3, x4)) SC_FORWARD(sc_fchmod, sys_fchmod((int) x0, (uint32_t) x1)) /* Linux fchmodat (SYS 53) is 3-arg: dirfd, path, mode. diff --git a/src/utils.h b/src/utils.h index efe55c5..1a1fc08 100644 --- a/src/utils.h +++ b/src/utils.h @@ -45,7 +45,7 @@ /* Branchless range check: true when minx <= x < minx + size. * - * Replaces the recurring pair `x >= minx && x < minx + size` with a single + * Replaces the recurring pair (x >= minx && x < minx + size) with a single * unsigned compare: shift x into a [0, size) window and let unsigned * wraparound flag both underflow (x < minx) and overflow (x >= minx + size). * Width-safe for any operand up to uint64_t. @@ -188,7 +188,7 @@ static inline void timespec_deadline_in_ms(struct timespec *out, long rel_ms) * the word and pass the bit position within it. 
Centralizing the shift and * compiler-intrinsic calls here keeps the meaning ("the bit for slot N", * "lowest set bit") visible at the call site instead of leaving readers to - * decode `1ULL << (n)` and `__builtin_ctzll`. + * decode 1ULL << (n) and __builtin_ctzll. */ /* The bit value for position n (0..63). n is evaluated once. */ @@ -218,6 +218,6 @@ static inline int bit_popcount64(uint64_t word) * * PACKED removes inter-field padding, used for Linux ABI structures whose * layout must match the kernel exactly (e.g., linux_dirent64). Apply at the - * end of a struct definition: `} PACKED name_t;`. + * end of a struct definition: } PACKED name_t;. */ #define PACKED __attribute__((packed)) diff --git a/tests/hvf-test.h b/tests/hvf-test.h index ef89478..c9d2690 100644 --- a/tests/hvf-test.h +++ b/tests/hvf-test.h @@ -21,7 +21,7 @@ enum { /* AArch64 instruction encoders. * * Three families share an encoding shape; the lists below drive a token- - * pasted `a64_` definition for each entry, eliminating the per- + * pasted a64_ definition for each entry, eliminating the per- * instruction boilerplate. Each encoder asserts its operand preconditions * so a bad caller surfaces immediately instead of silently emitting a * different (but still valid) instruction. diff --git a/tests/test-fuse-alpine.sh b/tests/test-fuse-alpine.sh new file mode 100755 index 0000000..537ab12 --- /dev/null +++ b/tests/test-fuse-alpine.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# test-fuse-alpine.sh — Validate guest FUSE inside the Alpine musl sysroot. +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +ELFUSE="${1:?Usage: $0 }" +SYSROOT="${2:?Usage: $0 }" +TEST_BIN="${3:?Usage: $0 }" + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TEST_LABEL_WIDTH=14 +TEST_TIMEOUT=20 +source "$SCRIPT_DIR/lib/test-runner.sh" + +TEST_RUNNER=("$ELFUSE" --sysroot "$SYSROOT") + +if [ ! 
-d "$SYSROOT" ]; then + printf "%s\n" "missing sysroot: $SYSROOT" >&2 + exit 1 +fi + +if [ ! -x "$TEST_BIN" ]; then + printf "%s\n" "missing test binary: $TEST_BIN" >&2 + exit 1 +fi + +mkdir -p "$SYSROOT/mnt/fuse" + +printf "%b\n" "${BLUE}Dynamic FUSE test suite (Alpine sysroot)${RESET}" +if output=$(timeout "$TEST_TIMEOUT" "${TEST_RUNNER[@]}" "$TEST_BIN" 2>&1); then + test_report ok "fuse-basic" + pass=$((pass + 1)) +else + rc=$? + test_report fail "fuse-basic" " (exit rc=$rc)" + test_excerpt "$output" + fail=$((fail + 1)) +fi + +printf "%b\n" "${BLUE}FUSE results${RESET}" +printf " pass=%d fail=%d skip=%d xfail=%d\n" "$pass" "$fail" "$skip" "$expected_fail" + +if [ "$fail" -gt 0 ]; then + exit 1 +fi diff --git a/tests/test-fuse-basic.c b/tests/test-fuse-basic.c new file mode 100644 index 0000000..33c6edf --- /dev/null +++ b/tests/test-fuse-basic.c @@ -0,0 +1,643 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef SYS_mount +#define SYS_mount 40 +#endif + +#ifndef SYS_getdents64 +#define SYS_getdents64 61 +#endif + +#ifndef O_PATH +#define O_PATH 010000000 +#endif + +#define FUSE_ROOT_ID 1 +#define FUSE_ASYNC_READ (1u << 0) +#define FUSE_BIG_WRITES (1u << 5) +#define FUSE_MAX_PAGES (1u << 22) + +enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_GETATTR = 3, + FUSE_OPEN = 14, + FUSE_READ = 15, + FUSE_RELEASE = 18, + FUSE_INIT = 26, + FUSE_OPENDIR = 27, + FUSE_READDIR = 28, + FUSE_RELEASEDIR = 29, +}; + +struct fuse_attr { + uint64_t ino, size, blocks, atime, mtime, ctime; + uint32_t atimensec, mtimensec, ctimensec; + uint32_t mode, nlink, uid, gid, rdev, blksize, flags; +}; + +struct fuse_entry_out { + uint64_t nodeid, generation, entry_valid, attr_valid; + uint32_t entry_valid_nsec, attr_valid_nsec; + struct fuse_attr attr; +}; + +struct fuse_attr_out { + uint64_t attr_valid; + uint32_t attr_valid_nsec; + uint32_t dummy; + struct fuse_attr attr; +}; + +struct fuse_open_out { + uint64_t 
fh; + uint32_t open_flags; + int32_t backing_id; +}; + +struct fuse_read_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t read_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; +}; + +struct fuse_init_out { + uint32_t major, minor, max_readahead, flags; + uint16_t max_background, congestion_threshold; + uint32_t max_write, time_gran; + uint16_t max_pages, map_alignment; + uint32_t flags2, max_stack_depth; + uint16_t request_timeout, unused[11]; +}; + +struct fuse_in_header { + uint32_t len, opcode; + uint64_t unique, nodeid; + uint32_t uid, gid, pid; + uint16_t total_extlen, padding; +}; + +struct fuse_out_header { + uint32_t len; + int32_t error; + uint64_t unique; +}; + +struct fuse_dirent { + uint64_t ino; + uint64_t off; + uint32_t namelen; + uint32_t type; + char name[]; +}; + +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +static const char hello_name[] = "hello"; +static const char hello_data[] = "hello from guest fuse\n"; +static const char source_name[] = "elfuse-test"; + +typedef struct { + int fusefd; + int saw_release; + int saw_releasedir; + int init_error; +} daemon_ctx_t; + +static void fill_dir_attr(struct fuse_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->ino = 1; + attr->mode = S_IFDIR | 0755; + attr->nlink = 2; + attr->uid = getuid(); + attr->gid = getgid(); + attr->blksize = 4096; +} + +static void fill_file_attr(struct fuse_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->ino = 2; + attr->mode = S_IFREG | 0644; + attr->nlink = 1; + attr->uid = getuid(); + attr->gid = getgid(); + attr->size = sizeof(hello_data) - 1; + attr->blocks = 1; + attr->blksize = 4096; +} + +static int reply_frame(int fd, + uint64_t unique, + int32_t error, + const void *payload, + size_t len) +{ + uint8_t buf[4096]; + struct fuse_out_header out = { + .len = (uint32_t) (sizeof(out) + len), + .error = error, + .unique = unique, + }; 
+ memcpy(buf, &out, sizeof(out)); + if (len) + memcpy(buf + sizeof(out), payload, len); + if (write(fd, buf, sizeof(out) + len) != (ssize_t) (sizeof(out) + len)) { + perror("write(/dev/fuse)"); + return -1; + } + return 0; +} + +static size_t append_dirent(uint8_t *buf, + size_t off, + uint64_t ino, + uint64_t next_off, + unsigned type, + const char *name) +{ + struct fuse_dirent *de = (struct fuse_dirent *) (buf + off); + size_t namelen = strlen(name); + size_t reclen = (24 + namelen + 7) & ~7ULL; + de->ino = ino; + de->off = next_off; + de->namelen = (uint32_t) namelen; + de->type = type; + memcpy(de->name, name, namelen); + memset(((uint8_t *) de) + 24 + namelen, 0, reclen - (24 + namelen)); + return off + reclen; +} + +static void *daemon_main(void *arg) +{ + daemon_ctx_t *ctx = arg; + uint8_t buf[4096]; + for (;;) { + ssize_t nr = read(ctx->fusefd, buf, sizeof(buf)); + if (nr < 0) { + if (errno == ENOTCONN) + return NULL; + perror("read(/dev/fuse)"); + exit(1); + } + + struct fuse_in_header *in = (struct fuse_in_header *) buf; + switch (in->opcode) { + case FUSE_INIT: { + if (ctx->init_error) { + if (reply_frame(ctx->fusefd, in->unique, -ctx->init_error, NULL, + 0) < 0) + exit(1); + return NULL; + } + struct fuse_init_out out = { + .major = 7, + .minor = 45, + .max_readahead = 1024 * 1024, + .flags = FUSE_ASYNC_READ | FUSE_BIG_WRITES | FUSE_MAX_PAGES, + .max_write = 65536, + .max_pages = 16, + }; + if (reply_frame(ctx->fusefd, in->unique, 0, &out, sizeof(out)) < 0) + exit(1); + break; + } + case FUSE_GETATTR: { + struct fuse_attr_out out; + memset(&out, 0, sizeof(out)); + if (in->nodeid == FUSE_ROOT_ID) + fill_dir_attr(&out.attr); + else if (in->nodeid == 2) + fill_file_attr(&out.attr); + else if (reply_frame(ctx->fusefd, in->unique, -ENOENT, NULL, 0) < 0) + exit(1); + if (in->nodeid == FUSE_ROOT_ID || in->nodeid == 2) { + if (reply_frame(ctx->fusefd, in->unique, 0, &out, sizeof(out)) < + 0) + exit(1); + } + break; + } + case FUSE_LOOKUP: { + const char *name 
= (const char *) (buf + sizeof(*in)); + if (in->nodeid != FUSE_ROOT_ID || strcmp(name, hello_name)) { + if (reply_frame(ctx->fusefd, in->unique, -ENOENT, NULL, 0) < 0) + exit(1); + break; + } + struct fuse_entry_out out; + memset(&out, 0, sizeof(out)); + out.nodeid = 2; + fill_file_attr(&out.attr); + if (reply_frame(ctx->fusefd, in->unique, 0, &out, sizeof(out)) < 0) + exit(1); + break; + } + case FUSE_OPENDIR: { + struct fuse_open_out out = {.fh = 10}; + if (reply_frame(ctx->fusefd, in->unique, 0, &out, sizeof(out)) < 0) + exit(1); + break; + } + case FUSE_OPEN: { + struct fuse_open_out out = {.fh = 11}; + if (reply_frame(ctx->fusefd, in->unique, 0, &out, sizeof(out)) < 0) + exit(1); + break; + } + case FUSE_READDIR: { + struct fuse_read_in *rin = + (struct fuse_read_in *) (buf + sizeof(*in)); + uint8_t out[256]; + size_t out_len = 0; + if (rin->offset == 0) { + out_len = append_dirent(out, out_len, 1, 1, DT_DIR, "."); + out_len = append_dirent(out, out_len, 1, 2, DT_DIR, ".."); + out_len = append_dirent(out, out_len, 2, 3, DT_REG, hello_name); + } + if (reply_frame(ctx->fusefd, in->unique, 0, out, out_len) < 0) + exit(1); + break; + } + case FUSE_READ: { + struct fuse_read_in *rin = + (struct fuse_read_in *) (buf + sizeof(*in)); + size_t len = sizeof(hello_data) - 1; + if (rin->offset >= len) { + if (reply_frame(ctx->fusefd, in->unique, 0, NULL, 0) < 0) + exit(1); + break; + } + size_t avail = len - (size_t) rin->offset; + if (avail > rin->size) + avail = rin->size; + if (reply_frame(ctx->fusefd, in->unique, 0, + hello_data + rin->offset, avail) < 0) + exit(1); + break; + } + case FUSE_RELEASE: + ctx->saw_release = 1; + if (reply_frame(ctx->fusefd, in->unique, 0, NULL, 0) < 0) + exit(1); + if (ctx->saw_release && ctx->saw_releasedir) + return NULL; + break; + case FUSE_RELEASEDIR: + ctx->saw_releasedir = 1; + if (reply_frame(ctx->fusefd, in->unique, 0, NULL, 0) < 0) + exit(1); + if (ctx->saw_release && ctx->saw_releasedir) + return NULL; + break; + default: + if 
(reply_frame(ctx->fusefd, in->unique, -ENOSYS, NULL, 0) < 0) + exit(1); + break; + } + } +} + +static void die(const char *msg) +{ + perror(msg); + exit(1); +} + +static void expect_contains(const char *path, const char *needle) +{ + int fd = open(path, O_RDONLY); + if (fd < 0) + die(path); + char buf[4096]; + ssize_t nr = read(fd, buf, sizeof(buf) - 1); + if (nr < 0) + die("read(procfs)"); + buf[nr] = '\0'; + close(fd); + if (!strstr(buf, needle)) { + fprintf(stderr, "%s missing '%s'\n", path, needle); + exit(1); + } +} + +static void expect_hello_fd(int fd) +{ + char buf[64]; + ssize_t nr = read(fd, buf, sizeof(buf)); + if (nr != (ssize_t) (sizeof(hello_data) - 1) || + memcmp(buf, hello_data, sizeof(hello_data) - 1) != 0) { + fprintf(stderr, "unexpected read payload\n"); + exit(1); + } +} + +int main(void) +{ + const char *mount_dir = "/mnt/fuse"; + if (access(mount_dir, F_OK) < 0) + die("access(mountpoint)"); + if (access("/dev/fuse", F_OK) < 0) + die("access(/dev/fuse)"); + + struct stat st; + if (stat("/dev/fuse", &st) < 0) + die("stat(/dev/fuse)"); + if (!S_ISCHR(st.st_mode)) { + fprintf(stderr, "/dev/fuse is not a character device\n"); + return 1; + } + + int fusefd = open("/dev/fuse", O_RDWR); + if (fusefd < 0) + die("open(/dev/fuse)"); + if (fstat(fusefd, &st) < 0) + die("fstat(/dev/fuse)"); + if (!S_ISCHR(st.st_mode)) { + fprintf(stderr, "fstat(/dev/fuse) did not report char device\n"); + return 1; + } + close(fusefd); + + int bad_fusefd = open("/dev/fuse", O_RDWR); + if (bad_fusefd < 0) + die("open(/dev/fuse bad)"); + + daemon_ctx_t bad_ctx = {.fusefd = bad_fusefd, .init_error = EPROTO}; + pthread_t bad_tid; + if (pthread_create(&bad_tid, NULL, daemon_main, &bad_ctx) != 0) { + errno = EINVAL; + die("pthread_create bad daemon"); + } + + char opts[128]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=40000,user_id=%u,group_id=%u", + bad_fusefd, (unsigned) getuid(), (unsigned) getgid()); + errno = 0; + if (syscall(SYS_mount, source_name, mount_dir, "fuse", 
0, opts) >= 0 || + errno != EPROTO) { + fprintf(stderr, "expected mount INIT failure as EPROTO, got errno=%d\n", + errno); + return 1; + } + close(bad_fusefd); + if (pthread_join(bad_tid, NULL) != 0) { + errno = EINVAL; + die("pthread_join bad daemon"); + } + + fusefd = open("/dev/fuse", O_RDWR); + if (fusefd < 0) + die("open(/dev/fuse good)"); + + daemon_ctx_t ctx = {.fusefd = fusefd}; + pthread_t tid; + if (pthread_create(&tid, NULL, daemon_main, &ctx) != 0) { + errno = EINVAL; + die("pthread_create"); + } + + snprintf(opts, sizeof(opts), "fd=%d,rootmode=40000,user_id=%u,group_id=%u", + fusefd, (unsigned) getuid(), (unsigned) getgid()); + if (syscall(SYS_mount, source_name, mount_dir, "fuse", 0, opts) < 0) + die("mount(fuse)"); + + if (stat(mount_dir, &st) < 0) + die("stat(mountpoint)"); + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "mountpoint is not a directory\n"); + return 1; + } + + expect_contains("/proc/self/mountinfo", mount_dir); + expect_contains("/proc/self/mountinfo", " - fuse "); + expect_contains("/proc/mounts", mount_dir); + expect_contains("/proc/mounts", source_name); + + char hello_path[256]; + snprintf(hello_path, sizeof(hello_path), "%s/%s", mount_dir, hello_name); + if (stat(hello_path, &st) < 0) + die("stat(file)"); + if (!S_ISREG(st.st_mode) || + st.st_size != (off_t) (sizeof(hello_data) - 1)) { + fprintf(stderr, "unexpected file stat\n"); + return 1; + } + + int pathfd = open(hello_path, O_PATH); + if (pathfd < 0) + die("open(file O_PATH)"); + char path_buf[8]; + errno = 0; + if (read(pathfd, path_buf, sizeof(path_buf)) >= 0 || errno != EBADF) { + fprintf(stderr, "expected read(O_PATH file) to fail with EBADF\n"); + return 1; + } + close(pathfd); + + int pathdfd = open(mount_dir, O_PATH | O_DIRECTORY); + if (pathdfd < 0) + die("open(dir O_PATH)"); + char path_dents[128]; + errno = 0; + if (syscall(SYS_getdents64, pathdfd, path_dents, sizeof(path_dents)) >= 0 || + errno != EBADF) { + fprintf(stderr, "expected getdents64(O_PATH dir) to 
fail with EBADF\n"); + return 1; + } + if (fchdir(pathdfd) < 0) + die("fchdir(O_PATH fuse-dir)"); + if (chdir("/") < 0) + die("chdir(/ after O_PATH)"); + close(pathdfd); + + int fd = open(hello_path, O_RDONLY); + if (fd < 0) + die("open(file)"); + errno = 0; + int wfd = open(hello_path, O_RDWR); + if (wfd >= 0 || errno != EACCES) { + fprintf(stderr, "expected O_RDWR FUSE open to fail with EACCES\n"); + return 1; + } + if (fstat(fd, &st) < 0) + die("fstat(file)"); + char fdinfo_path[64]; + snprintf(fdinfo_path, sizeof(fdinfo_path), "/proc/self/fdinfo/%d", fd); + expect_contains(fdinfo_path, "mnt_id:\t"); + expect_hello_fd(fd); + int dupfd = dup(fd); + if (dupfd < 0) + die("dup(fuse-file)"); + char eof_probe[8]; + ssize_t dup_nr = read(dupfd, eof_probe, sizeof(eof_probe)); + if (dup_nr != 0) { + fprintf(stderr, "expected dup'd FUSE file to share EOF offset\n"); + return 1; + } + close(dupfd); + + void *map = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0); + if (map != MAP_FAILED || errno != ENODEV) { + fprintf(stderr, + "expected mmap ENODEV on FUSE fd, got map=%p errno=%d\n", map, + errno); + return 1; + } + close(fd); + + /* Canonicalization: ./ and intermediate-up traversals must collapse to + * the same file rather than be forwarded as literal FUSE_LOOKUP names. + * Without canonicalization the daemon would receive LOOKUP for "." and + * fail. + */ + char dot_path[256]; + snprintf(dot_path, sizeof(dot_path), "%s/./%s", mount_dir, hello_name); + int dotfd = open(dot_path, O_RDONLY); + if (dotfd < 0) + die("open(./hello)"); + expect_hello_fd(dotfd); + close(dotfd); + + /* sub/../hello must canonicalize to hello before the FUSE walk; the + * daemon does not implement sub or "..", so this would fail with + * ENOENT if the path were forwarded literally. 
+ */ + char up_path[256]; + snprintf(up_path, sizeof(up_path), "%s/sub/../%s", mount_dir, hello_name); + int upfd = open(up_path, O_RDONLY); + if (upfd < 0) + die("open(sub/../hello)"); + expect_hello_fd(upfd); + close(upfd); + + int dfd = open(mount_dir, O_RDONLY | O_DIRECTORY); + if (dfd < 0) + die("open(dir)"); + int dupdfd = dup(dfd); + if (dupdfd < 0) + die("dup(fuse-dir)"); + int relfd = openat(dfd, hello_name, O_RDONLY); + if (relfd < 0) + die("openat(dirfd, hello)"); + expect_hello_fd(relfd); + close(relfd); + + if (fchdir(dfd) < 0) + die("fchdir(fuse-dir)"); + relfd = open(hello_name, O_RDONLY); + if (relfd < 0) + die("open(cwd-relative hello)"); + expect_hello_fd(relfd); + close(relfd); + if (chdir("/") < 0) + die("chdir(/)"); + + if (chdir(mount_dir) < 0) + die("chdir(mountpoint)"); + if (access(".", F_OK) < 0) + die("access(.) inside fuse"); + if (stat(".", &st) < 0) + die("stat(.) inside fuse"); + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "stat(.) inside fuse is not a directory\n"); + return 1; + } + relfd = open(hello_name, O_RDONLY); + if (relfd < 0) + die("open(chdir-relative hello)"); + expect_hello_fd(relfd); + close(relfd); + if (chdir("/") < 0) + die("chdir(/ restore)"); + + char dents[512]; + ssize_t nr = syscall(SYS_getdents64, dfd, dents, sizeof(dents)); + if (nr < 0) + die("getdents64"); + + int found = 0; + for (size_t off = 0; off < (size_t) nr;) { + struct linux_dirent64 *de = (struct linux_dirent64 *) (dents + off); + if (!strcmp(de->d_name, hello_name)) + found = 1; + off += de->d_reclen; + } + if (!found) { + fprintf(stderr, "readdir did not report %s\n", hello_name); + return 1; + } + nr = syscall(SYS_getdents64, dupdfd, dents, sizeof(dents)); + if (nr != 0) { + fprintf(stderr, "expected dup'd FUSE dir to share EOF offset\n"); + return 1; + } + + /* Daemon-death + post-tombstone routing test. + * + * Set the virtual cwd to the FUSE mount via fchdir(dfd) so the consumer + * has a FUSE-rooted relative-path baseline. 
Close /dev/fuse from the + * consumer side to simulate daemon death: fuse_fd_cleanup tombstones + * the mount (mount->session = NULL but the slot path/source/fstype/ + * mount_id stay intact), wakes any blocked requests, and the daemon + * thread's next read returns ENOTCONN and the thread exits. + * + * After the daemon is gone, a relative open from the still-FUSE-rooted + * virtual cwd MUST return -LINUX_ENOTCONN rather than silently falling + * through to host-relative open against the host cwd. The tombstoned + * mount keeps the path matching FUSE, and fuse_open_path detects + * session==NULL and returns ENOTCONN. + */ + if (fchdir(dfd) < 0) + die("fchdir(dfd) before daemon death"); + int alive_fd = open(hello_name, O_RDONLY); + if (alive_fd < 0) + die("open(hello) pre-death sanity"); + close(alive_fd); + + close(fusefd); + fusefd = -1; + if (pthread_join(tid, NULL) != 0) { + errno = EINVAL; + die("pthread_join after daemon death"); + } + + errno = 0; + int dead_fd = open(hello_name, O_RDONLY); + if (dead_fd >= 0 || errno != ENOTCONN) { + fprintf(stderr, + "expected ENOTCONN on post-death relative open;" + " got fd=%d errno=%d (%s)\n", + dead_fd, errno, strerror(errno)); + if (dead_fd >= 0) + close(dead_fd); + return 1; + } + + if (chdir("/") < 0) + die("chdir(/) after daemon death"); + close(dupdfd); + close(dfd); + return 0; +} diff --git a/tests/test-negative.c b/tests/test-negative.c index 191728e..2c068c0 100644 --- a/tests/test-negative.c +++ b/tests/test-negative.c @@ -379,11 +379,11 @@ static void test_einval(void) TEST("clock_nanosleep(bad clockid) -> EINVAL"); { - /* Replaces an earlier "bad flags" probe that was unreliable: the - * Linux kernel's clock_nanosleep only validates `flags & - * TIMER_ABSTIME` and silently ignores other bits, so a raw-syscall - * flag check round-trips to glibc territory rather than the kernel. - * An invalid clockid is a real kernel-side error path. 
+ /* Replaces an earlier "bad flags" probe that was unreliable: Linux's + * clock_nanosleep only validates the (flags & TIMER_ABSTIME) bit and + * silently ignores other bits, so a raw-syscall flag check round-trips + * to glibc territory rather than the kernel. An invalid clockid is a + * real kernel-side error path. */ struct timespec ts = {.tv_sec = 0, .tv_nsec = 1}; EXPECT_RAW_ERRNO(raw_syscall4(__NR_clock_nanosleep, 99 /* invalid */, 0,