diff --git a/fs/namespace.c b/fs/namespace.c index 854f4fc66469..fe919abd2f01 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2646,6 +2646,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (unlikely(shorter) && child != source_mnt) mp = shorter; + /* + * If @q was locked it was meant to hide + * whatever was under it. Let @child take over + * that job and lock it, then we can unlock @q. + * That'll allow another namespace to shed @q + * and reveal @child. Clearly, that mounter + * consented to this by not severing the mount + * relationship. Otherwise, what's the point. + */ + if (IS_MNT_LOCKED(q)) { + child->mnt.mnt_flags |= MNT_LOCKED; + q->mnt.mnt_flags &= ~MNT_LOCKED; + } mnt_change_mountpoint(r, mp, q); } } @@ -2722,7 +2735,7 @@ static inline struct mount *where_to_mount(const struct path *path, * In all cases the location must not have been unmounted and the * chosen mountpoint must be allowed to be mounted on. For "beneath" * case we also require the location to be at the root of a mount - * that has a parent (i.e. is not a root of some namespace). + * that has something mounted on top of it (i.e. has an overmount). 
*/ static void do_lock_mount(const struct path *path, struct pinned_mountpoint *res, @@ -2958,10 +2971,9 @@ static inline bool may_copy_tree(const struct path *path) } static struct mount *__do_loopback(const struct path *old_path, - unsigned int flags, unsigned int copy_flags) + bool recurse, unsigned int copy_flags) { struct mount *old = real_mount(old_path->mnt); - bool recurse = flags & AT_RECURSIVE; if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); @@ -2972,18 +2984,6 @@ static struct mount *__do_loopback(const struct path *old_path, if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); - /* - * When creating a new mount namespace we don't want to copy over - * mounts of mount namespaces to avoid the risk of cycles and also to - * minimize the default complex interdependencies between mount - * namespaces. - * - * We could ofc just check whether all mount namespace files aren't - * creating cycles but really let's keep this simple. - */ - if (!(flags & OPEN_TREE_NAMESPACE)) - copy_flags |= CL_COPY_MNT_NS_FILE; - if (recurse) return copy_tree(old, old_path->dentry, copy_flags); @@ -2998,7 +2998,6 @@ static int do_loopback(const struct path *path, const char *old_name, { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; - unsigned int flags = recurse ? 
AT_RECURSIVE : 0; int err; if (!old_name || !*old_name) @@ -3017,7 +3016,7 @@ static int do_loopback(const struct path *path, const char *old_name, if (!check_mnt(mp.parent)) return -EINVAL; - mnt = __do_loopback(&old_path, flags, 0); + mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -3055,7 +3054,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned ns->seq_origin = src_mnt_ns->ns.ns_id; } - mnt = __do_loopback(path, flags, 0); + mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); @@ -3087,7 +3086,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags) return file; } -static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) +enum mount_copy_flags_t { + MOUNT_COPY_RECURSIVE = (1 << 0), + MOUNT_COPY_NEW = (1 << 1), +}; + +static struct mnt_namespace *create_new_namespace(struct path *path, + enum mount_copy_flags_t flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); @@ -3096,7 +3101,7 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in struct path to_path; struct mount *mnt; unsigned int copy_flags = 0; - bool locked = false; + bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; @@ -3131,11 +3136,14 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in } /* - * We don't emulate unshare()ing a mount namespace. We stick - * to the restrictions of creating detached bind-mounts. It - * has a lot saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick to + * the restrictions of creating detached bind-mounts. It has a + * lot saner and simpler semantics. 
*/ - mnt = __do_loopback(path, flags, copy_flags); + if (flags & MOUNT_COPY_NEW) + mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags); + else + mnt = __do_loopback(path, recurse, copy_flags); scoped_guard(mount_writer) { if (IS_ERR(mnt)) { emptied_ns = new_ns; @@ -3164,7 +3172,8 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in return new_ns; } -static struct file *open_new_namespace(struct path *path, unsigned int flags) +static struct file *open_new_namespace(struct path *path, + enum mount_copy_flags_t flags) { struct mnt_namespace *new_ns; @@ -3217,7 +3226,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned return ERR_PTR(ret); if (flags & OPEN_TREE_NAMESPACE) - return open_new_namespace(&path, flags); + return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0); if (flags & OPEN_TREE_CLONE) return open_detached_copy(&path, flags); @@ -3513,8 +3522,6 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * @mnt_to: mount under which to mount * @mp: mountpoint of @mnt_to * - * - Make sure that nothing can be mounted beneath the caller's current - * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @mnt_from before we @@ -3528,26 +3535,14 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) */ static int can_move_mount_beneath(const struct mount *mnt_from, const struct mount *mnt_to, - const struct mountpoint *mp) + struct pinned_mountpoint *mp) { struct mount *parent_mnt_to = mnt_to->mnt_parent; - if (IS_MNT_LOCKED(mnt_to)) - return -EINVAL; - /* Avoid creating shadow mounts during mount propagation. 
*/ if (mnt_from->overmount) return -EINVAL; - /* - * Mounting beneath the rootfs only makes sense when the - * semantics of pivot_root(".", ".") are used. - */ - if (&mnt_to->mnt == current->fs->root.mnt) - return -EINVAL; - if (parent_mnt_to == current->nsproxy->mnt_ns->root) - return -EINVAL; - if (mount_is_ancestor(mnt_to, mnt_from)) return -EINVAL; @@ -3557,7 +3552,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from, * propagating a copy @c of @mnt_from on top of @mnt_to. This * defeats the whole purpose of mounting beneath another mount. */ - if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp)) return -EINVAL; /* @@ -3573,7 +3568,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from, * @mnt_from beneath @mnt_to. */ if (check_mnt(mnt_from) && - propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp)) return -EINVAL; return 0; @@ -3682,7 +3677,7 @@ static int do_move_mount(const struct path *old_path, if (mp.parent != over->mnt_parent) over = mp.parent->overmount; - err = can_move_mount_beneath(old, over, mp.mp); + err = can_move_mount_beneath(old, over, &mp); if (err) return err; } @@ -4231,8 +4226,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt __free(mntput) = NULL; - struct vfsmount *pwdmnt __free(mntput) = NULL; + struct path old_root __free(path_put) = {}; + struct path old_pwd __free(path_put) = {}; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4252,11 +4247,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, return new_ns; guard(namespace_excl)(); - /* First pass: copy the tree topology */ - copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + + if (flags & CLONE_EMPTY_MNTNS) + copy_flags = 0; + else + 
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; - new = copy_tree(old, old->mnt.mnt_root, copy_flags); + + if (flags & CLONE_EMPTY_MNTNS) + new = clone_mnt(old, old->mnt.mnt_root, copy_flags); + else + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); @@ -4267,33 +4269,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, } new_ns->root = new; - /* - * Second pass: switch the tsk->fs->* elements and mark new vfsmounts - * as belonging to new namespace. We have already acquired a private - * fs_struct, so tsk->fs->lock is not needed. - */ - p = old; - q = new; - while (p) { - mnt_add_to_ns(new_ns, q); - new_ns->nr_mounts++; + if (flags & CLONE_EMPTY_MNTNS) { + /* + * Empty mount namespace: only the root mount exists. + * Reset root and pwd to the cloned mount's root dentry. + */ if (new_fs) { - if (&p->mnt == new_fs->root.mnt) { - new_fs->root.mnt = mntget(&q->mnt); - rootmnt = &p->mnt; - } - if (&p->mnt == new_fs->pwd.mnt) { - new_fs->pwd.mnt = mntget(&q->mnt); - pwdmnt = &p->mnt; - } + old_root = new_fs->root; + old_pwd = new_fs->pwd; + + new_fs->root.mnt = mntget(&new->mnt); + new_fs->root.dentry = dget(new->mnt.mnt_root); + + new_fs->pwd.mnt = mntget(&new->mnt); + new_fs->pwd.dentry = dget(new->mnt.mnt_root); + } + mnt_add_to_ns(new_ns, new); + new_ns->nr_mounts++; + } else { + /* + * Full copy: walk old and new trees in parallel, switching + * the tsk->fs->* elements and marking new vfsmounts as + * belonging to new namespace. We have already acquired a + * private fs_struct, so tsk->fs->lock is not needed. 
+ */ + p = old; + q = new; + while (p) { + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + old_root.mnt = new_fs->root.mnt; + new_fs->root.mnt = mntget(&q->mnt); + } + if (&p->mnt == new_fs->pwd.mnt) { + old_pwd.mnt = new_fs->pwd.mnt; + new_fs->pwd.mnt = mntget(&q->mnt); + } + } + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + // an mntns binding we'd skipped? + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(skip_mnt_tree(p), old); } - p = next_mnt(p, old); - q = next_mnt(q, new); - if (!q) - break; - // an mntns binding we'd skipped? - while (p->mnt.mnt_root != q->mnt.mnt_root) - p = next_mnt(skip_mnt_tree(p), old); } ns_tree_add_raw(new_ns); return new_ns; @@ -4414,11 +4436,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int mnt_flags = 0; long ret; - if (!may_mount()) + if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0) + return -EINVAL; + + if ((flags & FSMOUNT_NAMESPACE) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; - if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) - return -EINVAL; + if (!(flags & FSMOUNT_NAMESPACE) && !may_mount()) + return -EPERM; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; @@ -4485,6 +4511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, */ vfs_clean_context(fc); + if (flags & FSMOUNT_NAMESPACE) + return FD_ADD((flags & FSMOUNT_CLOEXEC) ? 
O_CLOEXEC : 0, + open_new_namespace(&new_path, MOUNT_COPY_NEW)); + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) return PTR_ERR(ns); @@ -5649,14 +5679,14 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) if (mnt_ns_empty(ns)) return -ENOENT; - first = child = ns->root; - for (;;) { - child = listmnt_next(child, false); - if (!child) - return -ENOENT; - if (child->mnt_parent == first) + first = ns->root; + for (child = node_to_mount(ns->mnt_first_node); child; + child = listmnt_next(child, false)) { + if (child != first && child->mnt_parent == first) break; } + if (!child) + return -ENOENT; root->mnt = mntget(&child->mnt); root->dentry = dget(root->mnt->mnt_root); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index d9d86598d100..2204708dbf7a 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -110,6 +110,7 @@ enum fsconfig_command { * fsmount() flags. */ #define FSMOUNT_CLOEXEC 0x00000001 +#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */ /* * Mount attributes. diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 01e09609b605..33a4624285cd 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,6 +39,7 @@ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ #define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ #define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */ +#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -46,6 +47,12 @@ */ #define CLONE_NEWTIME 0x00000080 /* New time namespace */ +/* + * unshare flags share the bit space with clone flags but only apply to the + * unshare syscall: + */ +#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. 
*/ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/kernel/fork.c b/kernel/fork.c index 131ae7bbb0de..9c194fc58736 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2666,6 +2666,16 @@ pid_t kernel_clone(struct kernel_clone_args *args) int trace = 0; pid_t nr; + /* + * Creating an empty mount namespace implies creating a new mount + * namespace. Set this before copy_process() so that the + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. + */ + if (clone_flags & CLONE_EMPTY_MNTNS) { + clone_flags |= CLONE_NEWNS; + args->flags = clone_flags; + } + /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are @@ -2944,8 +2954,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | - CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | + CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP | + CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS)) return false; /* @@ -3096,7 +3107,7 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NS_ALL)) + CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3195,6 +3206,8 @@ int ksys_unshare(unsigned long unshare_flags) /* * If unsharing namespace, must also unshare filesystem information. 
*/ + if (unshare_flags & UNSHARE_EMPTY_MNTNS) + unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 63b44ee79847..d9d3d5973bf5 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -96,7 +96,8 @@ static struct nsproxy *create_new_namespaces(u64 flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, + user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; @@ -211,16 +212,26 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; + u64 flags = unshare_flags; int err = 0; - if (!(unshare_flags & (CLONE_NS_ALL & ~CLONE_NEWUSER))) + if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + /* + * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases + * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS. + */ + if (flags & UNSHARE_EMPTY_MNTNS) { + flags &= ~(u64)UNSHARE_EMPTY_MNTNS; + flags |= CLONE_EMPTY_MNTNS; + } + + *new_nsp = create_new_namespaces(flags, current, user_ns, new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h index 7fa67c2031a5..2204708dbf7a 100644 --- a/tools/include/uapi/linux/mount.h +++ b/tools/include/uapi/linux/mount.h @@ -61,7 +61,8 @@ /* * open_tree() flags. 
*/ -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */ +#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */ #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ /* @@ -109,6 +110,7 @@ enum fsconfig_command { * fsmount() flags. */ #define FSMOUNT_CLOEXEC 0x00000001 +#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */ /* * Mount attributes. @@ -197,7 +199,10 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 spare; + union { + __u32 mnt_ns_fd; + __u32 mnt_fd; + }; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; @@ -232,4 +237,9 @@ struct mnt_id_req { #define LSMT_ROOT 0xffffffffffffffff /* root mount */ #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ +/* + * @flag bits for statmount(2) + */ +#define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */ + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 1db72e6b05b8..984abb6d42ab 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -38,6 +38,9 @@ TARGETS += filesystems/overlayfs TARGETS += filesystems/statmount TARGETS += filesystems/mount-notify TARGETS += filesystems/fuse +TARGETS += filesystems/move_mount +TARGETS += filesystems/empty_mntns +TARGETS += filesystems/fsmount_ns TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/empty_mntns/.gitignore b/tools/testing/selftests/filesystems/empty_mntns/.gitignore new file mode 100644 index 000000000000..99f89d329db2 --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +clone3_empty_mntns_test +empty_mntns_test +overmount_chroot_test diff --git a/tools/testing/selftests/filesystems/empty_mntns/Makefile 
b/tools/testing/selftests/filesystems/empty_mntns/Makefile new file mode 100644 index 000000000000..22e3fb915e81 --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap + +TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test + +include ../../lib.mk + +$(OUTPUT)/empty_mntns_test: ../utils.c +$(OUTPUT)/overmount_chroot_test: ../utils.c +$(OUTPUT)/clone3_empty_mntns_test: ../utils.c diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c new file mode 100644 index 000000000000..6370086f886d --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c @@ -0,0 +1,938 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS + * + * These tests exercise the clone3() code path for creating empty mount + * namespaces, which is distinct from the unshare() path tested in + * empty_mntns_test.c. With clone3(), CLONE_EMPTY_MNTNS (0x2000000000ULL) + * is a 64-bit flag that implies CLONE_NEWNS. The implication happens in + * kernel_clone() before copy_process(), unlike unshare() where it goes + * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in + * unshare_nsproxy_namespaces(). 
+ * + * Copyright (c) 2024 Christian Brauner + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../utils.h" +#include "../wrappers.h" +#include "clone3/clone3_selftests.h" +#include "empty_mntns.h" +#include "kselftest_harness.h" + +static pid_t clone3_empty_mntns(uint64_t extra_flags) +{ + struct __clone_args args = { + .flags = CLONE_EMPTY_MNTNS | extra_flags, + .exit_signal = SIGCHLD, + }; + + return sys_clone3(&args, sizeof(args)); +} + +static bool clone3_empty_mntns_supported(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid < 0) + return false; + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + pid = clone3_empty_mntns(0); + if (pid < 0) + _exit(1); + + if (pid == 0) + _exit(0); + + _exit(wait_for_pid(pid) != 0); + } + + if (waitpid(pid, &status, 0) != pid) + return false; + + if (!WIFEXITED(status)) + return false; + + return WEXITSTATUS(status) == 0; +} + +FIXTURE(clone3_empty_mntns) {}; + +FIXTURE_SETUP(clone3_empty_mntns) +{ + if (!clone3_empty_mntns_supported()) + SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported"); +} + +FIXTURE_TEARDOWN(clone3_empty_mntns) {} + +/* + * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace + * with exactly 1 mount and root == cwd. + */ +TEST_F(clone3_empty_mntns, basic) +{ + pid_t pid, inner; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(2); + + if (inner == 0) { + uint64_t root_id, cwd_id; + + if (count_mounts() != 1) + _exit(3); + + root_id = get_unique_mnt_id("/"); + cwd_id = get_unique_mnt_id("."); + if (root_id == 0 || cwd_id == 0) + _exit(4); + + if (root_id != cwd_id) + _exit(5); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * CLONE_EMPTY_MNTNS implies CLONE_NEWNS. 
Verify that it works without + * explicitly setting CLONE_NEWNS (tests the CLONE_EMPTY_MNTNS handling in + * kernel_clone()). + */ +TEST_F(clone3_empty_mntns, implies_newns) +{ + pid_t pid, inner; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ssize_t parent_mounts; + + if (enter_userns()) + _exit(1); + + /* Verify we have mounts in our current namespace. */ + parent_mounts = count_mounts(); + if (parent_mounts < 1) + _exit(2); + + /* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */ + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(3); + + if (inner == 0) { + if (count_mounts() != 1) + _exit(4); + + _exit(0); + } + + /* Parent still has its mounts. */ + if (count_mounts() != parent_mounts) + _exit(5); + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS | + * @extra_flags and verifies the child has exactly one mount. + */ +#define TEST_CLONE3_FLAGS(test_name, extra_flags) \ +TEST_F(clone3_empty_mntns, test_name) \ +{ \ + pid_t pid, inner; \ + \ + pid = fork(); \ + ASSERT_GE(pid, 0); \ + \ + if (pid == 0) { \ + if (enter_userns()) \ + _exit(1); \ + \ + inner = clone3_empty_mntns(extra_flags); \ + if (inner < 0) \ + _exit(2); \ + \ + if (inner == 0) { \ + if (count_mounts() != 1) \ + _exit(3); \ + _exit(0); \ + } \ + \ + _exit(wait_for_pid(inner)); \ + } \ + \ + ASSERT_EQ(wait_for_pid(pid), 0); \ +} + +/* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */ +TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS) + +/* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */ +TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER) + +/* CLONE_EMPTY_MNTNS combined with other namespace flags. */ +TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC) + +/* + * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID. 
+ */ +TEST_F(clone3_empty_mntns, with_newpid) +{ + pid_t pid, inner; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + inner = clone3_empty_mntns(CLONE_NEWPID); + if (inner < 0) + _exit(2); + + if (inner == 0) { + if (count_mounts() != 1) + _exit(3); + + /* In a new PID namespace, getpid() returns 1. */ + if (getpid() != 1) + _exit(4); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS + * and CLONE_FS are mutually exclusive (checked in copy_process()). + */ +TEST_F(clone3_empty_mntns, with_clone_fs_fails) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct __clone_args args = { + .flags = CLONE_EMPTY_MNTNS | CLONE_FS, + .exit_signal = SIGCHLD, + }; + pid_t ret; + + if (enter_userns()) + _exit(1); + + ret = sys_clone3(&args, sizeof(args)); + if (ret >= 0) { + if (ret == 0) + _exit(0); + wait_for_pid(ret); + _exit(2); + } + + if (errno != EINVAL) + _exit(3); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd. + */ +TEST_F(clone3_empty_mntns, with_pidfd) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct __clone_args args = { + .flags = CLONE_EMPTY_MNTNS | CLONE_PIDFD, + .exit_signal = SIGCHLD, + }; + int pidfd = -1; + pid_t inner; + + if (enter_userns()) + _exit(1); + + args.pidfd = (uintptr_t)&pidfd; + + inner = sys_clone3(&args, sizeof(args)); + if (inner < 0) + _exit(2); + + if (inner == 0) { + if (count_mounts() != 1) + _exit(3); + + _exit(0); + } + + /* Verify we got a valid pidfd. */ + if (pidfd < 0) + _exit(4); + + close(pidfd); + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * clone3 without CAP_SYS_ADMIN must fail with EPERM. 
+ */ +TEST_F(clone3_empty_mntns, eperm_without_caps) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t ret; + + /* Skip if already root. */ + if (getuid() == 0) + _exit(0); + + ret = clone3_empty_mntns(0); + if (ret >= 0) { + if (ret == 0) + _exit(0); + wait_for_pid(ret); + _exit(1); + } + + if (errno != EPERM) + _exit(2); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS. + */ +TEST_F(clone3_empty_mntns, parent_unchanged) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ssize_t nr_before, nr_after; + pid_t inner; + + if (enter_userns()) + _exit(1); + + nr_before = count_mounts(); + if (nr_before < 1) + _exit(2); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(3); + + if (inner == 0) + _exit(0); + + if (wait_for_pid(inner) != 0) + _exit(4); + + nr_after = count_mounts(); + if (nr_after != nr_before) + _exit(5); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Parent with many mounts: child still gets exactly 1 mount. 
+ */ +TEST_F(clone3_empty_mntns, many_parent_mounts) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX"; + pid_t inner; + int i; + + if (enter_userns()) + _exit(1); + + if (unshare(CLONE_NEWNS)) + _exit(2); + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) + _exit(3); + + if (!mkdtemp(tmpdir)) + _exit(4); + + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) + _exit(5); + + for (i = 0; i < 5; i++) { + char subdir[256]; + + snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i); + if (mkdir(subdir, 0755) && errno != EEXIST) + _exit(6); + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) + _exit(7); + } + + if (count_mounts() < 5) + _exit(8); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(9); + + if (inner == 0) { + if (count_mounts() != 1) + _exit(10); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Verify the child's root mount is nullfs with expected statmount properties. + */ +TEST_F(clone3_empty_mntns, mount_properties) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t inner; + + if (enter_userns()) + _exit(1); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(2); + + if (inner == 0) { + struct statmount *sm; + uint64_t root_id; + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(3); + + sm = statmount_alloc(root_id, 0, + STATMOUNT_MNT_BASIC | + STATMOUNT_MNT_POINT | + STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(4); + + /* Root mount point is "/". */ + if (!(sm->mask & STATMOUNT_MNT_POINT)) + _exit(5); + if (strcmp(sm->str + sm->mnt_point, "/") != 0) + _exit(6); + + /* Filesystem type is nullfs. */ + if (!(sm->mask & STATMOUNT_FS_TYPE)) + _exit(7); + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) + _exit(8); + + /* Root mount is its own parent. 
*/ + if (!(sm->mask & STATMOUNT_MNT_BASIC)) + _exit(9); + if (sm->mnt_parent_id != sm->mnt_id) + _exit(10); + + free(sm); + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Listmount returns only the root mount in the child's empty namespace. + */ +TEST_F(clone3_empty_mntns, listmount_single_entry) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t inner; + + if (enter_userns()) + _exit(1); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(2); + + if (inner == 0) { + uint64_t list[16]; + ssize_t nr_mounts; + uint64_t root_id; + + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0); + if (nr_mounts != 1) + _exit(3); + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(4); + + if (list[0] != root_id) + _exit(5); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Child can mount tmpfs over nullfs root (the primary container use case). + * + * Uses the new mount API (fsopen/fsmount/move_mount) because resolving + * "/" returns the process root directly without following overmounts. + * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs. + */ +TEST_F(clone3_empty_mntns, child_overmount_tmpfs) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t inner; + + if (enter_userns()) + _exit(1); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(2); + + if (inner == 0) { + struct statmount *sm; + uint64_t root_id; + int fd, fsfd, mntfd; + + if (count_mounts() != 1) + _exit(3); + + /* Verify root is nullfs. */ + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(4); + + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(5); + if (!(sm->mask & STATMOUNT_FS_TYPE)) + _exit(6); + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) + _exit(7); + free(sm); + + /* Create tmpfs via the new mount API. 
*/ + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd < 0) + _exit(8); + + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, + "size", "1M", 0)) { + close(fsfd); + _exit(9); + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, + NULL, NULL, 0)) { + close(fsfd); + _exit(10); + } + + mntfd = sys_fsmount(fsfd, 0, 0); + close(fsfd); + if (mntfd < 0) + _exit(11); + + /* Attach tmpfs to "/". */ + if (sys_move_mount(mntfd, "", AT_FDCWD, "/", + MOVE_MOUNT_F_EMPTY_PATH)) { + close(mntfd); + _exit(12); + } + + if (count_mounts() != 2) { + close(mntfd); + _exit(13); + } + + /* Enter the tmpfs. */ + if (fchdir(mntfd)) { + close(mntfd); + _exit(14); + } + + if (chroot(".")) { + close(mntfd); + _exit(15); + } + + close(mntfd); + + /* Verify "/" is now tmpfs. */ + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(16); + + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(17); + if (!(sm->mask & STATMOUNT_FS_TYPE)) + _exit(18); + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) + _exit(19); + free(sm); + + /* Verify tmpfs is writable. */ + fd = open("/testfile", O_CREAT | O_RDWR, 0644); + if (fd < 0) + _exit(20); + + if (write(fd, "test", 4) != 4) { + close(fd); + _exit(21); + } + close(fd); + + if (access("/testfile", F_OK)) + _exit(22); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with + * distinct mount namespace root mount IDs. 
+ */ +TEST_F(clone3_empty_mntns, repeated) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int pipe1[2], pipe2[2]; + uint64_t id1 = 0, id2 = 0; + pid_t inner1, inner2; + + if (enter_userns()) + _exit(1); + + if (pipe(pipe1) || pipe(pipe2)) + _exit(2); + + inner1 = clone3_empty_mntns(0); + if (inner1 < 0) + _exit(3); + + if (inner1 == 0) { + uint64_t root_id; + + close(pipe1[0]); + root_id = get_unique_mnt_id("/"); + if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id)) + _exit(1); + close(pipe1[1]); + _exit(0); + } + + inner2 = clone3_empty_mntns(0); + if (inner2 < 0) + _exit(4); + + if (inner2 == 0) { + uint64_t root_id; + + close(pipe2[0]); + root_id = get_unique_mnt_id("/"); + if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id)) + _exit(1); + close(pipe2[1]); + _exit(0); + } + + close(pipe1[1]); + close(pipe2[1]); + + if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1)) + _exit(5); + if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2)) + _exit(6); + + close(pipe1[0]); + close(pipe2[0]); + + if (wait_for_pid(inner1) || wait_for_pid(inner2)) + _exit(7); + + /* Each child must have a distinct root mount ID. */ + if (id1 == 0 || id2 == 0) + _exit(8); + if (id1 == id2) + _exit(9); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Verify setns() into a child's empty mount namespace works. + * + * The child must stay alive while the parent opens its namespace + * file, so two pipes are used: pipe_fd carries the child's "ready" + * byte to the parent and done_fd keeps the child blocked until the + * parent is finished. + */ +TEST_F(clone3_empty_mntns, setns_into_child_mntns) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int pipe_fd[2], done_fd[2]; + pid_t inner; + char c; + + if (enter_userns()) + _exit(1); + + if (pipe(pipe_fd) || pipe(done_fd)) + _exit(2); + + inner = clone3_empty_mntns(0); + if (inner < 0) + _exit(3); + + if (inner == 0) { + /* + * Drop the ends we don't use. Our inherited copy + * of done_fd[1] has to go, otherwise the read + * below could never see EOF. + */ + close(pipe_fd[0]); + close(done_fd[1]); + + /* Signal parent we're ready. */ + if (write(pipe_fd[1], "r", 1) != 1) + _exit(1); + + /* + * Wait for parent to finish. The parent holds the + * only remaining write end of done_fd, so this + * read blocks until the parent closes it (or + * exits) and then returns EOF. The result is + * deliberately ignored.
+ */ + if (read(done_fd[0], &c, 1) < 0) + ; + close(done_fd[0]); + close(pipe_fd[1]); + _exit(0); + } + + close(pipe_fd[1]); + close(done_fd[0]); + + /* Wait for child to be ready. */ + if (read(pipe_fd[0], &c, 1) != 1) + _exit(4); + + /* Open child's mount namespace. */ + { + char path[64]; + int mntns_fd; + + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner); + mntns_fd = open(path, O_RDONLY); + if (mntns_fd < 0) + _exit(5); + + if (setns(mntns_fd, CLONE_NEWNS)) + _exit(6); + + close(mntns_fd); + } + + /* Now we should be in the child's empty mntns. */ + if (count_mounts() != 1) + _exit(7); + + close(pipe_fd[0]); + /* Closing the last write end releases the child. */ + close(done_fd[1]); + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Tests below do not require CLONE_EMPTY_MNTNS support. + */ + +/* + * Unknown 64-bit flags beyond the known set are rejected. + */ +TEST(unknown_flags_rejected) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct __clone_args args = { + .flags = 0x800000000ULL, + .exit_signal = SIGCHLD, + }; + pid_t ret; + + ret = sys_clone3(&args, sizeof(args)); + if (ret >= 0) { + if (ret == 0) + _exit(0); + wait_for_pid(ret); + _exit(1); + } + + if (errno != EINVAL) + _exit(2); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still + * copies the full mount tree. + */ +TEST(clone3_newns_full_copy) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct __clone_args args = { + .flags = CLONE_NEWNS, + .exit_signal = SIGCHLD, + }; + ssize_t parent_mounts; + pid_t inner; + + if (enter_userns()) + _exit(1); + + parent_mounts = count_mounts(); + if (parent_mounts < 1) + _exit(2); + + inner = sys_clone3(&args, sizeof(args)); + if (inner < 0) + _exit(3); + + if (inner == 0) { + /* Full copy should have at least as many mounts.
*/ + if (count_mounts() < parent_mounts) + _exit(1); + + _exit(0); + } + + _exit(wait_for_pid(inner)); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h new file mode 100644 index 000000000000..3d9c6b14bbef --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef EMPTY_MNTNS_H +#define EMPTY_MNTNS_H + +#include +#include + +#include "../statmount/statmount.h" + +#ifndef UNSHARE_EMPTY_MNTNS +#define UNSHARE_EMPTY_MNTNS 0x00100000 +#endif + +#ifndef CLONE_EMPTY_MNTNS +#define CLONE_EMPTY_MNTNS (1ULL << 37) +#endif + +static inline ssize_t count_mounts(void) +{ + uint64_t list[4096]; + + return listmount(LSMT_ROOT, 0, 0, list, sizeof(list) / sizeof(list[0]), 0); +} + +#endif /* EMPTY_MNTNS_H */ diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c new file mode 100644 index 000000000000..43e296b97d84 --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c @@ -0,0 +1,725 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Tests for empty mount namespace creation via UNSHARE_EMPTY_MNTNS + * + * Copyright (c) 2024 Christian Brauner + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../utils.h" +#include "../wrappers.h" +#include "empty_mntns.h" +#include "kselftest_harness.h" + +static bool unshare_empty_mntns_supported(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid < 0) + return false; + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS) && errno == EINVAL) + _exit(1); + _exit(0); + } + + if (waitpid(pid, &status, 0) != pid) + return false; 
+ + if (!WIFEXITED(status)) + return false; + + return WEXITSTATUS(status) == 0; +} + + +FIXTURE(empty_mntns) {}; + +FIXTURE_SETUP(empty_mntns) +{ + if (!unshare_empty_mntns_supported()) + SKIP(return, "UNSHARE_EMPTY_MNTNS not supported"); +} + +FIXTURE_TEARDOWN(empty_mntns) {} + +/* Verify unshare succeeds, produces exactly 1 mount, and root == cwd */ +TEST_F(empty_mntns, basic) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t root_id, cwd_id; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + if (count_mounts() != 1) + _exit(3); + + root_id = get_unique_mnt_id("/"); + cwd_id = get_unique_mnt_id("."); + if (root_id == 0 || cwd_id == 0) + _exit(4); + + if (root_id != cwd_id) + _exit(5); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * UNSHARE_EMPTY_MNTNS combined with CLONE_NEWUSER. + * + * The user namespace must be created first so /proc is still accessible + * for writing uid_map/gid_map. The empty mount namespace is created + * afterwards. 
+ */ +TEST_F(empty_mntns, with_clone_newuser) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uid_t uid = getuid(); + gid_t gid = getgid(); + char map[100]; + + if (unshare(CLONE_NEWUSER)) + _exit(1); + + snprintf(map, sizeof(map), "0 %d 1", uid); + if (write_file("/proc/self/uid_map", map)) + _exit(2); + + if (write_file("/proc/self/setgroups", "deny")) + _exit(3); + + snprintf(map, sizeof(map), "0 %d 1", gid); + if (write_file("/proc/self/gid_map", map)) + _exit(4); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(5); + + if (count_mounts() != 1) + _exit(6); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* UNSHARE_EMPTY_MNTNS combined with other namespace flags */ +TEST_F(empty_mntns, with_other_ns_flags) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS | CLONE_NEWUTS | CLONE_NEWIPC)) + _exit(2); + + if (count_mounts() != 1) + _exit(3); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* EPERM without proper capabilities */ +TEST_F(empty_mntns, eperm_without_caps) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Skip if already root */ + if (getuid() == 0) + _exit(0); + + if (unshare(UNSHARE_EMPTY_MNTNS) == 0) + _exit(1); + + if (errno != EPERM) + _exit(2); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Many source mounts still result in exactly 1 mount */ +TEST_F(empty_mntns, many_source_mounts) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + char tmpdir[] = "/tmp/empty_mntns_test.XXXXXX"; + int i; + + if (enter_userns()) + _exit(1); + + if (unshare(CLONE_NEWNS)) + _exit(2); + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) + _exit(3); + + if (!mkdtemp(tmpdir)) + _exit(4); + + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) + _exit(5); + + for (i = 0; i < 5; i++) { + char subdir[256]; + + snprintf(subdir, sizeof(subdir), 
"%s/sub%d", tmpdir, i); + if (mkdir(subdir, 0755) && errno != EEXIST) + _exit(6); + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) + _exit(7); + } + + if (count_mounts() < 5) + _exit(8); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(9); + + if (count_mounts() != 1) + _exit(10); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* CWD on a different mount gets reset to root */ +TEST_F(empty_mntns, cwd_reset) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + char tmpdir[] = "/tmp/empty_mntns_cwd.XXXXXX"; + uint64_t root_id, cwd_id; + struct statmount *sm; + + if (enter_userns()) + _exit(1); + + if (unshare(CLONE_NEWNS)) + _exit(2); + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) + _exit(3); + + if (!mkdtemp(tmpdir)) + _exit(4); + + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) + _exit(5); + + if (chdir(tmpdir)) + _exit(6); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(7); + + root_id = get_unique_mnt_id("/"); + cwd_id = get_unique_mnt_id("."); + if (root_id == 0 || cwd_id == 0) + _exit(8); + + if (root_id != cwd_id) + _exit(9); + + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, 0); + if (!sm) + _exit(10); + + if (strcmp(sm->str + sm->mnt_point, "/") != 0) + _exit(11); + + free(sm); + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Verify statmount properties of the root mount */ +TEST_F(empty_mntns, mount_properties) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct statmount *sm; + uint64_t root_id; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(3); + + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(4); + + if (!(sm->mask & STATMOUNT_MNT_POINT)) + _exit(5); + + if (strcmp(sm->str + sm->mnt_point, "/") != 0) + _exit(6); + + if (!(sm->mask & 
STATMOUNT_MNT_BASIC)) + _exit(7); + + if (sm->mnt_id != root_id) + _exit(8); + + free(sm); + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Consecutive UNSHARE_EMPTY_MNTNS calls produce new namespaces */ +TEST_F(empty_mntns, repeated_unshare) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t first_root_id, second_root_id; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + if (count_mounts() != 1) + _exit(3); + + first_root_id = get_unique_mnt_id("/"); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(4); + + if (count_mounts() != 1) + _exit(5); + + second_root_id = get_unique_mnt_id("/"); + + if (first_root_id == second_root_id) + _exit(6); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Root mount's parent is itself */ +TEST_F(empty_mntns, root_is_own_parent) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct statmount sm; + uint64_t root_id; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(3); + + if (statmount(root_id, 0, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0) < 0) + _exit(4); + + if (!(sm.mask & STATMOUNT_MNT_BASIC)) + _exit(5); + + if (sm.mnt_parent_id != sm.mnt_id) + _exit(6); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Listmount returns only the root mount */ +TEST_F(empty_mntns, listmount_single_entry) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t list[16]; + ssize_t nr_mounts; + uint64_t root_id; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0); + if (nr_mounts != 1) + _exit(3); + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(4); + + if (list[0] != root_id) + _exit(5); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Mount tmpfs over nullfs 
root to build a writable filesystem from scratch. + * This exercises the intended usage pattern: create an empty mount namespace + * (which has a nullfs root), then mount a real filesystem over it. + * + * Because resolving "/" returns the process root directly (via nd_jump_root) + * without following overmounts, we use the new mount API (fsopen/fsmount) + * to obtain a mount fd, then fchdir + chroot to enter the new filesystem. + */ +TEST_F(empty_mntns, overmount_tmpfs) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct statmount *sm; + uint64_t root_id, cwd_id; + int fd, fsfd, mntfd; + + if (enter_userns()) + _exit(1); + + if (unshare(UNSHARE_EMPTY_MNTNS)) + _exit(2); + + if (count_mounts() != 1) + _exit(3); + + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(4); + + /* Verify root is nullfs */ + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(5); + + if (!(sm->mask & STATMOUNT_FS_TYPE)) + _exit(6); + + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) + _exit(7); + + free(sm); + + cwd_id = get_unique_mnt_id("."); + if (!cwd_id || root_id != cwd_id) + _exit(8); + + /* + * nullfs root is immutable. open(O_CREAT) returns ENOENT + * because empty_dir_lookup() returns -ENOENT before the + * IS_IMMUTABLE permission check in may_o_create() is reached. + */ + fd = open("/test", O_CREAT | O_RDWR, 0644); + if (fd >= 0) { + close(fd); + _exit(9); + } + if (errno != ENOENT) + _exit(10); + + /* + * Use the new mount API to create tmpfs and get a mount fd. + * We need the fd because after attaching the tmpfs on top of + * "/", path resolution of "/" still returns the process root + * (nullfs) without following the overmount. The mount fd + * lets us fchdir + chroot into the tmpfs. 
+ */ + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd < 0) + _exit(11); + + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "1M", 0)) { + close(fsfd); + _exit(12); + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + close(fsfd); + _exit(13); + } + + mntfd = sys_fsmount(fsfd, 0, 0); + close(fsfd); + if (mntfd < 0) + _exit(14); + + if (sys_move_mount(mntfd, "", AT_FDCWD, "/", + MOVE_MOUNT_F_EMPTY_PATH)) { + close(mntfd); + _exit(15); + } + + if (count_mounts() != 2) { + close(mntfd); + _exit(16); + } + + /* Enter the tmpfs via the mount fd */ + if (fchdir(mntfd)) { + close(mntfd); + _exit(17); + } + + if (chroot(".")) { + close(mntfd); + _exit(18); + } + + close(mntfd); + + /* Verify "/" now resolves to tmpfs */ + root_id = get_unique_mnt_id("/"); + if (!root_id) + _exit(19); + + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(20); + + if (!(sm->mask & STATMOUNT_FS_TYPE)) + _exit(21); + + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) + _exit(22); + + free(sm); + + /* Verify tmpfs is writable */ + fd = open("/testfile", O_CREAT | O_RDWR, 0644); + if (fd < 0) + _exit(23); + + if (write(fd, "test", 4) != 4) { + close(fd); + _exit(24); + } + + close(fd); + + if (access("/testfile", F_OK)) + _exit(25); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* + * Tests below do not require UNSHARE_EMPTY_MNTNS support. 
+ */ + +/* Invalid unshare flags return EINVAL */ +TEST(invalid_flags) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + if (enter_userns()) + _exit(1); + + if (unshare(0x80000000) == 0) + _exit(2); + + if (errno != EINVAL) + _exit(3); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Regular CLONE_NEWNS still copies the full mount tree */ +TEST(clone_newns_full_copy) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ssize_t nr_mounts_before, nr_mounts_after; + char tmpdir[] = "/tmp/empty_mntns_regr.XXXXXX"; + int i; + + if (enter_userns()) + _exit(1); + + if (unshare(CLONE_NEWNS)) + _exit(2); + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) + _exit(3); + + if (!mkdtemp(tmpdir)) + _exit(4); + + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) + _exit(5); + + for (i = 0; i < 3; i++) { + char subdir[256]; + + snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i); + if (mkdir(subdir, 0755) && errno != EEXIST) + _exit(6); + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) + _exit(7); + } + + nr_mounts_before = count_mounts(); + if (nr_mounts_before < 3) + _exit(8); + + if (unshare(CLONE_NEWNS)) + _exit(9); + + nr_mounts_after = count_mounts(); + if (nr_mounts_after < nr_mounts_before) + _exit(10); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +/* Other namespace unshares are unaffected */ +TEST(other_ns_unaffected) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + char hostname[256]; + + if (enter_userns()) + _exit(1); + + if (unshare(CLONE_NEWUTS)) + _exit(2); + + if (sethostname("test-empty-mntns", 16)) + _exit(3); + + if (gethostname(hostname, sizeof(hostname))) + _exit(4); + + if (strcmp(hostname, "test-empty-mntns") != 0) + _exit(5); + + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c 
b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c new file mode 100644 index 000000000000..6e21c58258c3 --- /dev/null +++ b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test: rootfs overmounted multiple times with chroot into topmost + * + * This test creates a scenario where: + * 1. A new mount namespace is created with a tmpfs root (via pivot_root) + * 2. A mountpoint is created and overmounted multiple times + * 3. The caller chroots into the topmost mount layer + * + * The test verifies that: + * - Multiple overmounts create separate mount layers + * - Each layer's files are isolated + * - chroot correctly sets the process's root to the topmost layer + * - After chroot, only the topmost layer's files are visible + * + * Copyright (c) 2024 Christian Brauner + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../utils.h" +#include "empty_mntns.h" +#include "kselftest_harness.h" + +#define NR_OVERMOUNTS 5 + +/* + * Setup a proper root filesystem using pivot_root. + * This ensures we own the root directory in our user namespace. 
+ */ +static int setup_root(void) +{ + char tmpdir[] = "/tmp/overmount_test.XXXXXX"; + char oldroot[256]; + + if (!mkdtemp(tmpdir)) + return -1; + + /* Mount tmpfs at the temporary directory */ + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=10M")) + return -1; + + /* Create directory for old root */ + snprintf(oldroot, sizeof(oldroot), "%s/oldroot", tmpdir); + if (mkdir(oldroot, 0755)) + return -1; + + /* pivot_root to use the tmpfs as new root */ + if (syscall(SYS_pivot_root, tmpdir, oldroot)) + return -1; + + if (chdir("/")) + return -1; + + /* Unmount old root */ + if (umount2("/oldroot", MNT_DETACH)) + return -1; + + /* Remove oldroot directory */ + if (rmdir("/oldroot")) + return -1; + + return 0; +} + +/* + * Test scenario: + * 1. Enter a user namespace to gain CAP_SYS_ADMIN + * 2. Create a new mount namespace + * 3. Setup a tmpfs root via pivot_root + * 4. Create a mountpoint /newroot and overmount it multiple times + * 5. Create a marker file in each layer + * 6. Chroot into /newroot (the topmost overmount) + * 7. 
Verify we're in the topmost layer (only topmost marker visible) + */ +TEST(overmount_chroot) +{ + pid_t pid; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ssize_t nr_mounts; + uint64_t mnt_ids[NR_OVERMOUNTS + 1]; + uint64_t root_id_before, root_id_after; + struct statmount *sm; + char marker[64]; + int fd, i; + + /* Step 1: Enter user namespace for privileges */ + if (enter_userns()) + _exit(1); + + /* Step 2: Create a new mount namespace */ + if (unshare(CLONE_NEWNS)) + _exit(2); + + /* Step 3: Make the mount tree private */ + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) + _exit(3); + + /* Step 4: Setup a proper tmpfs root via pivot_root */ + if (setup_root()) + _exit(4); + + /* Create the base mount point for overmounting */ + if (mkdir("/newroot", 0755)) + _exit(5); + + /* Mount base tmpfs on /newroot */ + if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M")) + _exit(6); + + /* Record base mount ID */ + mnt_ids[0] = get_unique_mnt_id("/newroot"); + if (!mnt_ids[0]) + _exit(7); + + /* Create marker in base layer */ + fd = open("/newroot/layer_0", O_CREAT | O_RDWR, 0644); + if (fd < 0) + _exit(8); + if (write(fd, "layer_0", 7) != 7) { + close(fd); + _exit(9); + } + close(fd); + + /* Step 5: Overmount /newroot multiple times with tmpfs */ + for (i = 0; i < NR_OVERMOUNTS; i++) { + if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M")) + _exit(10); + + /* Record mount ID for this layer */ + mnt_ids[i + 1] = get_unique_mnt_id("/newroot"); + if (!mnt_ids[i + 1]) + _exit(11); + + /* Create a marker file in each layer */ + snprintf(marker, sizeof(marker), "/newroot/layer_%d", i + 1); + fd = open(marker, O_CREAT | O_RDWR, 0644); + if (fd < 0) + _exit(12); + + if (write(fd, marker, strlen(marker)) != (ssize_t)strlen(marker)) { + close(fd); + _exit(13); + } + close(fd); + } + + /* Verify mount count increased */ + nr_mounts = count_mounts(); + if (nr_mounts < NR_OVERMOUNTS + 2) + _exit(14); + + /* Record root mount ID before chroot */ + 
root_id_before = get_unique_mnt_id("/newroot"); + + /* Verify this is the topmost layer's mount */ + if (root_id_before != mnt_ids[NR_OVERMOUNTS]) + _exit(15); + + /* Step 6: Chroot into /newroot (the topmost overmount) */ + if (chroot("/newroot")) + _exit(16); + + /* Change to root directory within the chroot */ + if (chdir("/")) + _exit(17); + + /* Step 7: Verify we're in the topmost layer */ + root_id_after = get_unique_mnt_id("/"); + + /* The mount ID should be the same as the topmost layer */ + if (root_id_after != mnt_ids[NR_OVERMOUNTS]) + _exit(18); + + /* Verify the topmost layer's marker file exists */ + snprintf(marker, sizeof(marker), "/layer_%d", NR_OVERMOUNTS); + if (access(marker, F_OK)) + _exit(19); + + /* Verify we cannot see markers from lower layers (they're hidden) */ + for (i = 0; i < NR_OVERMOUNTS; i++) { + snprintf(marker, sizeof(marker), "/layer_%d", i); + if (access(marker, F_OK) == 0) + _exit(20); + } + + /* Verify the root mount is tmpfs */ + sm = statmount_alloc(root_id_after, 0, + STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0); + if (!sm) + _exit(21); + + if (sm->mask & STATMOUNT_FS_TYPE) { + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) { + free(sm); + _exit(22); + } + } + + free(sm); + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/fsmount_ns/.gitignore b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore new file mode 100644 index 000000000000..f1ecf6c6e37b --- /dev/null +++ b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore @@ -0,0 +1 @@ +fsmount_ns_test diff --git a/tools/testing/selftests/filesystems/fsmount_ns/Makefile b/tools/testing/selftests/filesystems/fsmount_ns/Makefile new file mode 100644 index 000000000000..d9647efc0739 --- /dev/null +++ b/tools/testing/selftests/filesystems/fsmount_ns/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := fsmount_ns_test + 
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS := -lcap + +include ../../lib.mk + +$(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) diff --git a/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c new file mode 100644 index 000000000000..b70b3051eed4 --- /dev/null +++ b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 Christian Brauner + * + * Test for FSMOUNT_NAMESPACE flag. + * + * Test that fsmount() with FSMOUNT_NAMESPACE creates a new mount + * namespace containing the specified mount. + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../wrappers.h" +#include "../statmount/statmount.h" +#include "../utils.h" +#include "../../kselftest_harness.h" + +#ifndef FSMOUNT_NAMESPACE +#define FSMOUNT_NAMESPACE 0x00000002 +#endif + +#ifndef FSMOUNT_CLOEXEC +#define FSMOUNT_CLOEXEC 0x00000001 +#endif + +#ifndef FSCONFIG_CMD_CREATE +#define FSCONFIG_CMD_CREATE 6 +#endif + +static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) +{ + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) + return -errno; + return 0; +} + +static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + ret = get_mnt_ns_id(fd, mnt_ns_id); + close(fd); + return ret; +} + +static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) +{ + const char *fs_type = ""; + const char *mnt_root = ""; + const char *mnt_point = ""; + + if (sm->mask & STATMOUNT_FS_TYPE) + fs_type = sm->str + sm->fs_type; + if (sm->mask & STATMOUNT_MNT_ROOT) + mnt_root = sm->str + sm->mnt_root; + if (sm->mask & STATMOUNT_MNT_POINT) + mnt_point = sm->str + 
sm->mnt_point; + + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s", + (unsigned long long)sm->mnt_id, + (unsigned long long)sm->mnt_parent_id, + fs_type, mnt_root, mnt_point); +} + +static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) +{ + uint64_t list[256]; + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) { + TH_LOG("listmount failed: %s", strerror(errno)); + return; + } + + TH_LOG("Mount namespace %llu contains %zd mount(s):", + (unsigned long long)mnt_ns_id, nr_mounts); + + for (ssize_t i = 0; i < nr_mounts; i++) { + struct statmount *sm; + + sm = statmount_alloc(list[i], mnt_ns_id, + STATMOUNT_MNT_BASIC | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT, 0); + if (!sm) { + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", + i, (unsigned long long)list[i], strerror(errno)); + continue; + } + + log_mount(_metadata, sm); + free(sm); + } +} + +/* + * Open a tmpfs filesystem context and create its superblock. + * + * Returns the filesystem context fd on success or a negative errno + * value if fsopen() or fsconfig(FSCONFIG_CMD_CREATE) fails. + */ +static int create_tmpfs_fd(void) +{ + int fs_fd, ret; + + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + return -errno; + + ret = sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); + if (ret < 0) { + /* Save errno first: close() below may overwrite it. */ + ret = -errno; + close(fs_fd); + return ret; + } + + return fs_fd; +} + +FIXTURE(fsmount_ns) +{ + int fd; + int fs_fd; + uint64_t current_ns_id; +}; + +FIXTURE_VARIANT(fsmount_ns) +{ + const char *fstype; + unsigned int flags; + bool expect_success; + bool expect_different_ns; + int min_mounts; +}; + +FIXTURE_VARIANT_ADD(fsmount_ns, basic_tmpfs) +{ + .fstype = "tmpfs", + .flags = FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(fsmount_ns, cloexec_only) +{ + .fstype = "tmpfs", + .flags = FSMOUNT_CLOEXEC, + .expect_success = true, + .expect_different_ns = false, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(fsmount_ns, namespace_only) +{ + .fstype = "tmpfs", + .flags = FSMOUNT_NAMESPACE, +
.expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_SETUP(fsmount_ns) +{ + int ret; + + self->fd = -1; + self->fs_fd = -1; + + /* Check if fsopen syscall is supported */ + ret = sys_fsopen("tmpfs", 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "fsopen() syscall not supported"); + if (ret >= 0) + close(ret); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); + + /* Get current mount namespace ID for comparison */ + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); + if (ret < 0) + SKIP(return, "Failed to get current mount namespace ID"); +} + +FIXTURE_TEARDOWN(fsmount_ns) +{ + if (self->fd >= 0) + close(self->fd); + if (self->fs_fd >= 0) + close(self->fs_fd); +} + +TEST_F(fsmount_ns, create_namespace) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, variant->flags, 0); + + if (!variant->expect_success) { + ASSERT_LT(self->fd, 0); + return; + } + + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + if (variant->expect_different_ns) { + /* Verify we can get the namespace ID from the fd */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + /* Verify it's a different namespace */ + ASSERT_NE(new_ns_id, self->current_ns_id); + + /* List mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("%m - listmount failed"); + } + + /* Verify minimum expected mounts */ + ASSERT_GE(nr_mounts, variant->min_mounts); + TH_LOG("Namespace contains %zd mounts", nr_mounts); + } +} + +TEST_F(fsmount_ns, setns_into_namespace) +{ + uint64_t new_ns_id; + pid_t pid; + int status; + int ret; + + /* 
Only test with FSMOUNT_NAMESPACE flag */ + if (!(variant->flags & FSMOUNT_NAMESPACE)) + SKIP(return, "setns test only for FSMOUNT_NAMESPACE case"); + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, variant->flags, 0); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Get namespace ID and dump all mounts */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + dump_mounts(_metadata, new_ns_id); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: try to enter the namespace */ + if (setns(self->fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(fsmount_ns, verify_mount_properties) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + /* Only test with basic FSMOUNT_NAMESPACE flags */ + if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC)) + SKIP(return, "mount properties test only for basic case"); + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + /* Get info about the root mount */ + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + TH_LOG("Root mount id: %llu, parent: %llu", + (unsigned long long)sm.mnt_id, + (unsigned long long)sm.mnt_parent_id); +} + +TEST_F(fsmount_ns, verify_tmpfs_type) +{ + struct statmount *sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + const 
char *fs_type; + int ret; + + /* Only test with basic FSMOUNT_NAMESPACE flags */ + if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC)) + SKIP(return, "fs type test only for basic case"); + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + sm = statmount_alloc(list[0], new_ns_id, STATMOUNT_FS_TYPE, 0); + ASSERT_NE(sm, NULL); + + fs_type = sm->str + sm->fs_type; + ASSERT_STREQ(fs_type, "tmpfs"); + + free(sm); +} + +FIXTURE(fsmount_ns_caps) +{ + bool has_caps; +}; + +FIXTURE_SETUP(fsmount_ns_caps) +{ + int ret; + + /* Check if fsopen syscall is supported */ + ret = sys_fsopen("tmpfs", 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "fsopen() syscall not supported"); + if (ret >= 0) + close(ret); + + self->has_caps = (geteuid() == 0); +} + +FIXTURE_TEARDOWN(fsmount_ns_caps) +{ +} + +TEST_F(fsmount_ns_caps, requires_cap_sys_admin) +{ + pid_t pid; + int status; + int fs_fd; + + /* + * Prepare the configured filesystem fd as root before forking. + * fsopen() requires CAP_SYS_ADMIN in the mount namespace's + * user_ns, which won't be available after enter_userns(). 
+ */ + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_GE(fs_fd, 0); + + ASSERT_EQ(sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + + /* Child: drop privileges using utils.h helper */ + if (enter_userns() != 0) + _exit(2); + + /* Drop all caps using utils.h helper */ + if (caps_down() == 0) + _exit(3); + + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + close(fs_fd); + + if (fd >= 0) { + close(fd); + /* Should have failed without caps */ + _exit(1); + } + + if (errno == EPERM) + _exit(0); + + /* EINVAL means FSMOUNT_NAMESPACE not supported */ + if (errno == EINVAL) + _exit(6); + + /* Unexpected error */ + _exit(7); + } + + close(fs_fd); + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Expected: EPERM without caps */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("FSMOUNT_NAMESPACE succeeded without caps"); + break; + case 2: + SKIP(return, "enter_userns failed"); + break; + case 3: + SKIP(return, "caps_down failed"); + break; + case 6: + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(fsmount_ns_userns) +{ + int fd; + int fs_fd; +}; + +FIXTURE_SETUP(fsmount_ns_userns) +{ + int ret; + + self->fd = -1; + self->fs_fd = -1; + + /* Check if fsopen syscall is supported */ + ret = sys_fsopen("tmpfs", 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "fsopen() syscall not supported"); + if (ret >= 0) + close(ret); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(fsmount_ns_userns) +{ + if (self->fd >= 0) + close(self->fd); + if (self->fs_fd >= 0) + close(self->fs_fd); +} + 
+/*
+ * Check that fsmount(FSMOUNT_NAMESPACE) works after setup_userns() and that
+ * the resulting namespace can be identified and listed.
+ *
+ * All work happens in a forked child which reports its outcome through its
+ * exit status; the parent turns that status into a pass, skip or failure:
+ *
+ *   0  success
+ *   1  fsmount() failed for a reason other than EINVAL
+ *   2  setup_userns() failed                      (skip)
+ *   3  fsopen() failed                            (skip)
+ *   4  fsconfig(FSCONFIG_CMD_CREATE) failed       (skip)
+ *   6  fsmount() failed with EINVAL               (skip, flag unsupported)
+ *   7  get_mnt_ns_id() failed
+ *   8  listmount() failed
+ *   9  listmount() reported no mounts
+ */
+TEST_F(fsmount_ns_userns, create_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+		int saved_errno;
+
+		/* Create new user namespace (also creates mount namespace) */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		/* Now we have CAP_SYS_ADMIN in the user namespace */
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		/*
+		 * Capture errno before close(): the exit code chosen below
+		 * depends on why fsmount() failed, and errno is not
+		 * guaranteed to survive an intervening call.
+		 */
+		saved_errno = errno;
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (saved_errno == EINVAL)
+				_exit(6); /* FSMOUNT_NAMESPACE not supported */
+			_exit(1);
+		}
+
+		/* Verify we can get the namespace ID */
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Verify we can list mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		if (nr_mounts < 0)
+			_exit(8);
+
+		/* Should have at least 1 mount (the tmpfs) */
+		if (nr_mounts < 1)
+			_exit(9);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	/* Translate the child's exit code (see table above) */
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("New namespace has no mounts");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+			WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, setns_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + int fs_fd, fd; + pid_t inner_pid; + int inner_status; + + /* Create new user namespace */ + if (setup_userns() != 0) + _exit(2); + + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + _exit(3); + + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + close(fs_fd); + _exit(4); + } + + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + close(fs_fd); + + if (fd < 0) { + if (errno == EINVAL) + _exit(6); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(7); + + /* Fork again to test setns into the new namespace */ + inner_pid = fork(); + if (inner_pid < 0) + _exit(10); + + if (inner_pid == 0) { + /* Inner child: enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) + _exit(11); + + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) + _exit(12); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("fsmount or setns failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "fsopen failed in userns"); + break; + case 4: + SKIP(return, "fsconfig CMD_CREATE failed in userns"); + break; + case 6: + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); + break; + case 12: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + 
WEXITSTATUS(status)); + break; + } +} + +TEST_F(fsmount_ns_userns, umount_fails_einval) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fs_fd, fd; + ssize_t i; + + /* Create new user namespace */ + if (setup_userns() != 0) + _exit(2); + + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + _exit(3); + + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + close(fs_fd); + _exit(4); + } + + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + close(fs_fd); + + if (fd < 0) { + if (errno == EINVAL) + _exit(6); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(7); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(13); + + if (nr_mounts < 1) + _exit(14); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(8); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT, 0); + if (!sm) + _exit(15); + + mnt_point = sm->str + sm->mnt_point; + + if (umount2(mnt_point, MNT_DETACH) == 0) { + free(sm); + _exit(9); + } + + if (errno != EINVAL) { + /* Wrong error */ + free(sm); + _exit(10); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "fsopen failed in userns"); + break; + case 4: + SKIP(return, "fsconfig CMD_CREATE failed in userns"); + break; + case 6: + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + 
break; + case 8: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)"); + break; + case 13: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 14: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 15: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(fsmount_ns_userns, umount_succeeds) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fs_fd, fd; + ssize_t i; + + if (unshare(CLONE_NEWNS)) + _exit(1); + + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) + _exit(1); + + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + _exit(3); + + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + close(fs_fd); + _exit(4); + } + + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); + close(fs_fd); + + if (fd < 0) { + if (errno == EINVAL) + _exit(6); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(7); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(13); + + if (nr_mounts < 1) + _exit(14); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(8); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT, 0); + if (!sm) + _exit(15); + + mnt_point = sm->str + sm->mnt_point; + + if (umount2(mnt_point, MNT_DETACH) != 0) { + free(sm); + _exit(9); + } + + free(sm); + } + + close(fd); + _exit(0); + } 
+ + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed or unshare failed"); + break; + case 3: + SKIP(return, "fsopen failed"); + break; + case 4: + SKIP(return, "fsconfig CMD_CREATE failed"); + break; + case 6: + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded"); + break; + case 13: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 14: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 15: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(fsmount_ns_mount_attrs) +{ + int fd; + int fs_fd; +}; + +FIXTURE_SETUP(fsmount_ns_mount_attrs) +{ + int ret; + + self->fd = -1; + self->fs_fd = -1; + + /* Check if fsopen syscall is supported */ + ret = sys_fsopen("tmpfs", 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "fsopen() syscall not supported"); + if (ret >= 0) + close(ret); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(fsmount_ns_mount_attrs) +{ + if (self->fd >= 0) + close(self->fd); + if (self->fs_fd >= 0) + close(self->fs_fd); +} + +TEST_F(fsmount_ns_mount_attrs, readonly) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + 
MOUNT_ATTR_RDONLY); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + /* Verify the mount is read-only */ + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY); +} + +TEST_F(fsmount_ns_mount_attrs, noexec) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + MOUNT_ATTR_NOEXEC); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + /* Verify the mount is noexec */ + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC); +} + +TEST_F(fsmount_ns_mount_attrs, nosuid) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + MOUNT_ATTR_NOSUID); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + /* Verify the mount is 
nosuid */ + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID); +} + +TEST_F(fsmount_ns_mount_attrs, noatime) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + MOUNT_ATTR_NOATIME); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + /* Verify the mount is noatime */ + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME); +} + +TEST_F(fsmount_ns_mount_attrs, combined) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fs_fd = create_tmpfs_fd(); + ASSERT_GE(self->fs_fd, 0); + + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, + MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC | + MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOATIME); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "FSMOUNT_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + /* Verify all attributes are set */ + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY); + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC); + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID); + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/move_mount/.gitignore b/tools/testing/selftests/filesystems/move_mount/.gitignore new 
file mode 100644 index 000000000000..c7557db30671 --- /dev/null +++ b/tools/testing/selftests/filesystems/move_mount/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +move_mount_test diff --git a/tools/testing/selftests/filesystems/move_mount/Makefile b/tools/testing/selftests/filesystems/move_mount/Makefile new file mode 100644 index 000000000000..5c5b199b464b --- /dev/null +++ b/tools/testing/selftests/filesystems/move_mount/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap + +TEST_GEN_PROGS := move_mount_test + +include ../../lib.mk + +$(OUTPUT)/move_mount_test: ../utils.c diff --git a/tools/testing/selftests/filesystems/move_mount/move_mount_test.c b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c new file mode 100644 index 000000000000..f08f94b1f0ec --- /dev/null +++ b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2026 Christian Brauner + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../wrappers.h" +#include "../utils.h" +#include "../statmount/statmount.h" +#include "../../kselftest_harness.h" + +#include + +#ifndef MOVE_MOUNT_BENEATH +#define MOVE_MOUNT_BENEATH 0x00000200 +#endif + +static uint64_t get_unique_mnt_id_fd(int fd) +{ + struct statx sx; + int ret; + + ret = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &sx); + if (ret) + return 0; + + if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) + return 0; + + return sx.stx_mnt_id; +} + +/* + * Create a locked overmount stack at /mnt_dir for testing MNT_LOCKED + * transfer on non-rootfs mounts. + * + * Mounts tmpfs A at /mnt_dir, overmounts with tmpfs B, then enters a + * new user+mount namespace where both become locked. Returns the exit + * code to use on failure, or 0 on success. 
+ */ +static int setup_locked_overmount(void) +{ + /* Isolate so mounts don't leak. */ + if (unshare(CLONE_NEWNS)) + return 1; + if (mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL)) + return 2; + + /* + * Create mounts while still in the initial user namespace so + * they become locked after the subsequent user namespace + * unshare. + */ + rmdir("/mnt_dir"); + if (mkdir("/mnt_dir", 0755)) + return 3; + + /* Mount tmpfs A */ + if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL)) + return 4; + + /* Overmount with tmpfs B */ + if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL)) + return 5; + + /* + * Create user+mount namespace. Mounts A and B become locked + * because they might be covering something that is not supposed + * to be revealed. + */ + if (setup_userns()) + return 6; + + /* Sanity check: B must be locked */ + if (!umount2("/mnt_dir", MNT_DETACH) || errno != EINVAL) + return 7; + + return 0; +} + +/* + * Create a detached tmpfs mount and return its fd, or -1 on failure. + */ +static int create_detached_tmpfs(void) +{ + int fs_fd, mnt_fd; + + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); + if (fs_fd < 0) + return -1; + + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + close(fs_fd); + return -1; + } + + mnt_fd = sys_fsmount(fs_fd, FSMOUNT_CLOEXEC, 0); + close(fs_fd); + return mnt_fd; +} + +FIXTURE(move_mount) { + uint64_t orig_root_id; +}; + +FIXTURE_SETUP(move_mount) +{ + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0); + + self->orig_root_id = get_unique_mnt_id("/"); + ASSERT_NE(self->orig_root_id, 0); +} + +FIXTURE_TEARDOWN(move_mount) +{ +} + +/* + * Test successful MOVE_MOUNT_BENEATH on the rootfs. + * Mount a clone beneath /, fchdir to the clone, chroot to switch root, + * then detach the old root. 
+ */
+TEST_F(move_mount, beneath_rootfs_success)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id); /* clone is a mount of its own */
+
+	ASSERT_EQ(fchdir(fd_tree), 0); /* cwd now refers to the clone */
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	/*
+	 * Switch root to the clone. The working directory was pointed
+	 * at the clone by the fchdir() above, so "." names it here.
+	 */
+	ASSERT_EQ(chroot("."), 0);
+
+	/* Verify "/" is now the clone (compare unique mount IDs) */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that after MOVE_MOUNT_BENEATH on the rootfs the old root is
+ * stacked on top of the clone. Verify via statmount that the old
+ * root's parent is the clone.
+ *
+ * The clone/fchdir/move_mount/chroot sequence is the same as in
+ * beneath_rootfs_success; what differs is the check, which looks at
+ * mnt_parent_id of the mount recorded as orig_root_id by the fixture.
+ */
+TEST_F(move_mount, beneath_rootfs_old_root_stacked)
+{
+	int fd_tree, ret;
+	uint64_t clone_id;
+	struct statmount sm;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	/*
+	 * Old root's parent should now be the clone. Only the basic
+	 * mount fields are requested, which is where mnt_parent_id is
+	 * read from below.
+	 */
+	ASSERT_EQ(statmount(self->orig_root_id, 0, 0,
+			    STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id);
+
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs fails when chroot'd into a
+ * subdirectory of the same mount. The caller's fs->root.dentry doesn't
+ * match mnt->mnt_root so the kernel rejects it.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_fail)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id;
+
+	rmdir("/chroot_dir"); /* result ignored; mkdir() below must succeed */
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+
+	chroot_id = get_unique_mnt_id("/chroot_dir");
+	ASSERT_NE(chroot_id, 0);
+	ASSERT_EQ(self->orig_root_id, chroot_id); /* plain directory on the root mount */
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	/*
+	 * Should fail: fs->root.dentry (/chroot_dir) doesn't match
+	 * the mount's mnt_root (/).
+	 *
+	 * The test requires exactly -1 with errno EINVAL; any other
+	 * error code is treated as a failure too.
+	 */
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	close(fd_tree);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs succeeds when chroot'd into a
+ * separate tmpfs mount. The caller's root dentry matches the mount's
+ * mnt_root since it's a dedicated mount.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_success)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id, root_id;
+	struct statmount sm;
+
+	rmdir("/chroot_dir"); /* result ignored; mkdir() below must succeed */
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+	ASSERT_EQ(mount("tmpfs", "/chroot_dir", "tmpfs", 0, NULL), 0);
+
+	chroot_id = get_unique_mnt_id("/chroot_dir"); /* ID of the tmpfs mount */
+	ASSERT_NE(chroot_id, 0);
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+
+	ASSERT_EQ(get_unique_mnt_id("/"), chroot_id); /* "/" is the tmpfs mount now */
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0); /* cwd now refers to the clone */
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0); /* second chroot: switch root to the clone */
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+
+	ASSERT_EQ(statmount(chroot_id, 0, 0,
+			    STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id); /* former root is stacked on the clone */
+
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath rootfs in a user+mount
+ * namespace. After mount-beneath the new root gets MNT_LOCKED and the
+ * old root has MNT_LOCKED cleared so it can be unmounted.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_transfer)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0); /* helper from ../utils.h */
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0); /* cwd now refers to the clone */
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0); /* switch root to the clone */
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * The old root should be unmountable (MNT_LOCKED was
+	 * transferred to the clone). If MNT_LOCKED wasn't
+	 * cleared, this would fail with EINVAL.
+	 */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+}
+
+/*
+ * Test containment invariant: after mount-beneath rootfs in a user+mount
+ * namespace, the new root must be MNT_LOCKED. The lock transfer from the
+ * old root preserves containment -- the process cannot unmount the new root
+ * to escape the namespace.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_containment)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0); /* helper from ../utils.h */
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	/*
+	 * Sanity: rootfs must be locked in the new userns, i.e. a
+	 * detach attempt has to be refused with EINVAL before
+	 * anything is mounted beneath it.
+	 */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0); /* cwd now refers to the clone */
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0); /* switch root to the clone */
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root (MNT_LOCKED was cleared from it) */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * The new root must be locked (MNT_LOCKED was transferred
+	 * from the old root). Attempting to unmount it must fail
+	 * with EINVAL, preserving the containment invariant.
+	 */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath a non-rootfs locked mount.
+ * Mounts created before unshare(CLONE_NEWUSER | CLONE_NEWNS) become locked
+ * in the new namespace. Mount-beneath transfers the lock from the displaced
+ * mount to the new mount, so the displaced mount can be unmounted.
+ */ +TEST_F(move_mount, beneath_non_rootfs_locked_transfer) +{ + int mnt_fd, ret; + uint64_t mnt_new_id, mnt_visible_id; + + ASSERT_EQ(setup_locked_overmount(), 0); + + mnt_fd = create_detached_tmpfs(); + ASSERT_GE(mnt_fd, 0); + + mnt_new_id = get_unique_mnt_id_fd(mnt_fd); + ASSERT_NE(mnt_new_id, 0); + + /* Move mount beneath B (which is locked) */ + ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir", + MOVE_MOUNT_F_EMPTY_PATH | + MOVE_MOUNT_BENEATH); + ASSERT_EQ(ret, 0); + + close(mnt_fd); + + /* + * B should now be unmountable (MNT_LOCKED was transferred + * to the new mount beneath it). If MNT_LOCKED wasn't + * cleared from B, this would fail with EINVAL. + */ + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0); + + /* Verify the new mount is now visible */ + mnt_visible_id = get_unique_mnt_id("/mnt_dir"); + ASSERT_EQ(mnt_visible_id, mnt_new_id); +} + +/* + * Test MNT_LOCKED containment when mounting beneath a non-rootfs mount + * that was locked during unshare(CLONE_NEWUSER | CLONE_NEWNS). + * Mounts created before unshare become locked in the new namespace. + * Mount-beneath transfers the lock, preserving containment: the new + * mount cannot be unmounted, but the displaced mount can. + */ +TEST_F(move_mount, beneath_non_rootfs_locked_containment) +{ + int mnt_fd, ret; + uint64_t mnt_new_id, mnt_visible_id; + + ASSERT_EQ(setup_locked_overmount(), 0); + + mnt_fd = create_detached_tmpfs(); + ASSERT_GE(mnt_fd, 0); + + mnt_new_id = get_unique_mnt_id_fd(mnt_fd); + ASSERT_NE(mnt_new_id, 0); + + /* + * Move new tmpfs beneath B at /mnt_dir. + * Stack becomes: A -> new -> B + * Lock transfers from B to new. + */ + ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir", + MOVE_MOUNT_F_EMPTY_PATH | + MOVE_MOUNT_BENEATH); + ASSERT_EQ(ret, 0); + + close(mnt_fd); + + /* + * B lost MNT_LOCKED -- unmounting it must succeed. + * This reveals the new mount at /mnt_dir. 
+ */ + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0); + + /* Verify the new mount is now visible */ + mnt_visible_id = get_unique_mnt_id("/mnt_dir"); + ASSERT_EQ(mnt_visible_id, mnt_new_id); + + /* + * The new mount gained MNT_LOCKED -- unmounting it must + * fail with EINVAL, preserving the containment invariant. + */ + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), -1); + ASSERT_EQ(errno, EINVAL); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile index 73c03c4a7ef6..4976ed1d7d4a 100644 --- a/tools/testing/selftests/filesystems/open_tree_ns/Makefile +++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := open_tree_ns_test -CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) LDLIBS := -lcap include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c index 9711556280ae..82f3c8c02c9a 100644 --- a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c +++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (c) 2026 Christian Brauner + * * Test for OPEN_TREE_NAMESPACE flag. 
* * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount @@ -50,31 +52,6 @@ static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) return ret; } -#define STATMOUNT_BUFSIZE (1 << 15) - -static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) -{ - struct statmount *buf; - size_t bufsize = STATMOUNT_BUFSIZE; - int ret; - - for (;;) { - buf = malloc(bufsize); - if (!buf) - return NULL; - - ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); - if (ret == 0) - return buf; - - free(buf); - if (errno != EOVERFLOW) - return NULL; - - bufsize <<= 1; - } -} - static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) { const char *fs_type = ""; @@ -115,7 +92,7 @@ static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) STATMOUNT_MNT_BASIC | STATMOUNT_FS_TYPE | STATMOUNT_MNT_ROOT | - STATMOUNT_MNT_POINT); + STATMOUNT_MNT_POINT, 0); if (!sm) { TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", i, (unsigned long long)list[i], strerror(errno)); @@ -221,7 +198,7 @@ FIXTURE_SETUP(open_tree_ns) SKIP(return, "open_tree() syscall not supported"); /* Check if statmount/listmount are supported */ - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); if (ret == -1 && errno == ENOSYS) SKIP(return, "statmount() syscall not supported"); @@ -340,7 +317,7 @@ TEST_F(open_tree_ns, verify_mount_properties) ASSERT_GE(nr_mounts, 1); /* Get info about the root mount (the bind mount, rootfs is hidden) */ - ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); ASSERT_EQ(ret, 0); ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); @@ -452,7 +429,7 @@ FIXTURE_SETUP(open_tree_ns_userns) SKIP(return, "open_tree() syscall not supported"); /* Check if statmount/listmount are supported */ - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 
0); if (ret == -1 && errno == ENOSYS) SKIP(return, "statmount() syscall not supported"); } @@ -746,7 +723,7 @@ TEST_F(open_tree_ns_userns, umount_fails_einval) const char *mnt_point; sm = statmount_alloc(list[i], new_ns_id, - STATMOUNT_MNT_POINT); + STATMOUNT_MNT_POINT, 0); if (!sm) _exit(11); @@ -863,7 +840,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds) const char *mnt_point; sm = statmount_alloc(list[i], new_ns_id, - STATMOUNT_MNT_POINT); + STATMOUNT_MNT_POINT, 0); if (!sm) _exit(11); @@ -904,7 +881,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds) ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); break; case 7: - ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded"); break; case 9: ASSERT_FALSE(true) TH_LOG("listmount failed"); @@ -1003,7 +980,7 @@ TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable) struct statmount *sm; const char *mnt_point; - sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT, 0); ASSERT_NE(sm, NULL) { TH_LOG("statmount_alloc failed for mnt_id %llu", (unsigned long long)list[i]); diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index e1cba4bfd8d9..675f7cc00076 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -3,10 +3,14 @@ #ifndef __STATMOUNT_H #define __STATMOUNT_H +#include <errno.h> #include <stdint.h> +#include <stdlib.h> #include <linux/mount.h> #include <asm/unistd.h> +#define STATMOUNT_BUFSIZE (1 << 15) + #ifndef __NR_statmount #if defined __alpha__ #define __NR_statmount 567 @@ -84,4 +88,51 @@ static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, return syscall(__NR_listmount, &req, list, num, flags); } +static inline struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, + uint64_t mask, unsigned int 
flags) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, flags); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + +static inline struct statmount *statmount_alloc_by_fd(int fd, uint64_t mask) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(0, 0, fd, mask, buf, bufsize, STATMOUNT_BY_FD); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + #endif /* __STATMOUNT_H */ diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index a04bcaace126..8dc018d47a93 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -33,45 +33,6 @@ static const char *const known_fs[] = { "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) -{ - size_t bufsize = 1 << 15; - struct statmount *buf = NULL, *tmp = NULL; - int tofree = 0; - int ret; - - if (flags & STATMOUNT_BY_FD && fd < 0) - return NULL; - - tmp = alloca(bufsize); - - for (;;) { - if (flags & STATMOUNT_BY_FD) - ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); - else - ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); - - if (ret != -1) - break; - if (tofree) - free(tmp); - if (errno != EOVERFLOW) - return NULL; - bufsize <<= 1; - tofree = 1; - tmp = malloc(bufsize); - if (!tmp) - return NULL; - } - buf = malloc(tmp->size); - if (buf) - memcpy(buf, tmp, tmp->size); - 
if (tofree) - free(tmp); - - return buf; -} - static void write_file(const char *path, const char *val) { int fd = open(path, O_WRONLY); @@ -715,7 +676,7 @@ static void test_statmount_by_fd(void) goto err_fd; } - sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT); if (!sm) { ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); goto err_chroot; @@ -750,7 +711,7 @@ static void test_statmount_by_fd(void) } free(sm); - sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT); if (!sm) { ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); goto err_fd; @@ -844,7 +805,7 @@ static void test_statmount_by_fd_unmounted(void) goto err_fd; } - sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); + sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT); if (!sm) { ksft_test_result_fail("statmount by fd unmounted: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index 063d9de46431..e500905e4c07 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -34,31 +34,6 @@ static void handle_result(int ret, const char *testname) ksft_test_result_skip("%s\n", testname); } -static inline int wait_for_pid(pid_t pid) -{ - int status, ret; - -again: - ret = waitpid(pid, &status, 0); - if (ret == -1) { - if (errno == EINTR) - goto again; - - ksft_print_msg("waitpid returned -1, errno=%d\n", errno); - return -1; - } - - if (!WIFEXITED(status)) { - ksft_print_msg( - "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", - WIFSIGNALED(status), WTERMSIG(status)); - return -1; - 
} - - ret = WEXITSTATUS(status); - return ret; -} - static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id) { int fd = open(mnt_ns, O_RDONLY); diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index d6f26f849053..d73d7d8171db 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -158,7 +158,7 @@ static int get_userns_fd_cb(void *data) _exit(0); } -static int wait_for_pid(pid_t pid) +int wait_for_pid(pid_t pid) { int status, ret; @@ -450,7 +450,7 @@ out_close: return fret; } -static int write_file(const char *path, const char *val) +int write_file(const char *path, const char *val) { int fd = open(path, O_WRONLY); size_t len = strlen(val); diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h index 0bccfed666a9..d03085cef5cb 100644 --- a/tools/testing/selftests/filesystems/utils.h +++ b/tools/testing/selftests/filesystems/utils.h @@ -44,6 +44,8 @@ static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) return true; } +extern int wait_for_pid(pid_t pid); +extern int write_file(const char *path, const char *val); extern uint64_t get_unique_mnt_id(const char *path); #endif /* __IDMAP_UTILS_H */ diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c index c7ed4023d7a8..b570746e917c 100644 --- a/tools/testing/selftests/namespaces/listns_efault_test.c +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -19,7 +19,6 @@ #include #include #include "../kselftest_harness.h" -#include "../filesystems/utils.h" #include "../pidfd/pidfd.h" #include "wrappers.h"