diff --git a/fs/namespace.c b/fs/namespace.c index 702e93243505..555f0a10de9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt __free(mntput) = NULL; - struct vfsmount *pwdmnt __free(mntput) = NULL; + struct path old_root __free(path_put) = {}; + struct path old_pwd __free(path_put) = {}; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, return new_ns; guard(namespace_excl)(); - /* First pass: copy the tree topology */ - copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + + if (flags & CLONE_EMPTY_MNTNS) + copy_flags = 0; + else + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; - new = copy_tree(old, old->mnt.mnt_root, copy_flags); + + if (flags & CLONE_EMPTY_MNTNS) + new = clone_mnt(old, old->mnt.mnt_root, copy_flags); + else + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); @@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, } new_ns->root = new; - /* - * Second pass: switch the tsk->fs->* elements and mark new vfsmounts - * as belonging to new namespace. We have already acquired a private - * fs_struct, so tsk->fs->lock is not needed. - */ - p = old; - q = new; - while (p) { - mnt_add_to_ns(new_ns, q); - new_ns->nr_mounts++; + if (flags & CLONE_EMPTY_MNTNS) { + /* + * Empty mount namespace: only the root mount exists. + * Reset root and pwd to the cloned mount's root dentry. + */ if (new_fs) { - if (&p->mnt == new_fs->root.mnt) { - new_fs->root.mnt = mntget(&q->mnt); - rootmnt = &p->mnt; - } - if (&p->mnt == new_fs->pwd.mnt) { - new_fs->pwd.mnt = mntget(&q->mnt); - pwdmnt = &p->mnt; - } + old_root = new_fs->root; + old_pwd = new_fs->pwd; + + new_fs->root.mnt = mntget(&new->mnt); + new_fs->root.dentry = dget(new->mnt.mnt_root); + + new_fs->pwd.mnt = mntget(&new->mnt); + new_fs->pwd.dentry = dget(new->mnt.mnt_root); + } + mnt_add_to_ns(new_ns, new); + new_ns->nr_mounts++; + } else { + /* + * Full copy: walk old and new trees in parallel, switching + * the tsk->fs->* elements and marking new vfsmounts as + * belonging to new namespace. We have already acquired a + * private fs_struct, so tsk->fs->lock is not needed. + */ + p = old; + q = new; + while (p) { + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + old_root.mnt = new_fs->root.mnt; + new_fs->root.mnt = mntget(&q->mnt); + } + if (&p->mnt == new_fs->pwd.mnt) { + old_pwd.mnt = new_fs->pwd.mnt; + new_fs->pwd.mnt = mntget(&q->mnt); + } + } + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + // an mntns binding we'd skipped? + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(skip_mnt_tree(p), old); } - p = next_mnt(p, old); - q = next_mnt(q, new); - if (!q) - break; - // an mntns binding we'd skipped? - while (p->mnt.mnt_root != q->mnt.mnt_root) - p = next_mnt(skip_mnt_tree(p), old); } ns_tree_add_raw(new_ns); return new_ns; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..4e76fce9f777 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,6 +36,7 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -43,6 +44,12 @@ */ #define CLONE_NEWTIME 0x00000080 /* New time namespace */ +/* + * unshare flags share the bit space with clone flags but only apply to the + * unshare syscall: + */ +#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/kernel/fork.c b/kernel/fork.c index 65113a304518..dea6b3454447 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args) int trace = 0; pid_t nr; + /* + * Creating an empty mount namespace implies creating a new mount + * namespace. Set this before copy_process() so that the + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. + */ + if (clone_flags & CLONE_EMPTY_MNTNS) { + clone_flags |= CLONE_NEWNS; + args->flags = clone_flags; + } + /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are @@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | + CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS)) return false; /* @@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| - CLONE_NEWTIME)) + CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags) /* * If unsharing namespace, must also unshare filesystem information. */ + if (unshare_flags & UNSHARE_EMPTY_MNTNS) + unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 259c4b4f1eeb..1bdc5be2dd20 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, + user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; @@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; + u64 flags = unshare_flags; int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | - CLONE_NEWTIME))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + /* + * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases + * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS. + */ + if (flags & UNSHARE_EMPTY_MNTNS) { + flags &= ~(u64)UNSHARE_EMPTY_MNTNS; + flags |= CLONE_EMPTY_MNTNS; + } + + *new_nsp = create_new_namespaces(flags, current, user_ns, new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp);