mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
namespace: allow creating empty mount namespaces
Add support for creating a mount namespace that contains only a copy of the root mount from the caller's mount namespace, with none of the child mounts. This is useful for containers and sandboxes that want to start with a minimal mount table and populate it from scratch rather than inheriting and then tearing down the full mount tree.

Two new flags are introduced:

- CLONE_EMPTY_MNTNS for clone3(), using the 64-bit flag space.
- UNSHARE_EMPTY_MNTNS for unshare(), reusing the CLONE_PARENT_SETTID bit which has no meaning for unshare.

Both flags imply CLONE_NEWNS. For the unshare path, UNSHARE_EMPTY_MNTNS is converted to CLONE_EMPTY_MNTNS in unshare_nsproxy_namespaces() before it reaches copy_mnt_ns(), so the mount namespace code only needs to handle a single flag.

In copy_mnt_ns(), when CLONE_EMPTY_MNTNS is set, clone_mnt() is used instead of copy_tree() to clone only the root mount. The caller's root and working directory are both reset to the root dentry of the new mount.

The cleanup variables are changed from vfsmount pointers with __free(mntput) to struct path with __free(path_put) because the empty mount namespace path needs to release both mount and dentry references when replacing the caller's root and pwd. In the normal (non-empty) path only the mount component is set, and dput(NULL) is a no-op so path_put remains correct there as well.

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-1-6eb30529bbb0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
|
||||
struct user_namespace *user_ns, struct fs_struct *new_fs)
|
||||
{
|
||||
struct mnt_namespace *new_ns;
|
||||
struct vfsmount *rootmnt __free(mntput) = NULL;
|
||||
struct vfsmount *pwdmnt __free(mntput) = NULL;
|
||||
struct path old_root __free(path_put) = {};
|
||||
struct path old_pwd __free(path_put) = {};
|
||||
struct mount *p, *q;
|
||||
struct mount *old;
|
||||
struct mount *new;
|
||||
@@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
|
||||
return new_ns;
|
||||
|
||||
guard(namespace_excl)();
|
||||
/* First pass: copy the tree topology */
|
||||
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
|
||||
|
||||
if (flags & CLONE_EMPTY_MNTNS)
|
||||
copy_flags = 0;
|
||||
else
|
||||
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
|
||||
if (user_ns != ns->user_ns)
|
||||
copy_flags |= CL_SLAVE;
|
||||
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
|
||||
|
||||
if (flags & CLONE_EMPTY_MNTNS)
|
||||
new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
|
||||
else
|
||||
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
|
||||
if (IS_ERR(new)) {
|
||||
emptied_ns = new_ns;
|
||||
return ERR_CAST(new);
|
||||
@@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
|
||||
}
|
||||
new_ns->root = new;
|
||||
|
||||
/*
|
||||
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
|
||||
* as belonging to new namespace. We have already acquired a private
|
||||
* fs_struct, so tsk->fs->lock is not needed.
|
||||
*/
|
||||
p = old;
|
||||
q = new;
|
||||
while (p) {
|
||||
mnt_add_to_ns(new_ns, q);
|
||||
new_ns->nr_mounts++;
|
||||
if (flags & CLONE_EMPTY_MNTNS) {
|
||||
/*
|
||||
* Empty mount namespace: only the root mount exists.
|
||||
* Reset root and pwd to the cloned mount's root dentry.
|
||||
*/
|
||||
if (new_fs) {
|
||||
if (&p->mnt == new_fs->root.mnt) {
|
||||
new_fs->root.mnt = mntget(&q->mnt);
|
||||
rootmnt = &p->mnt;
|
||||
}
|
||||
if (&p->mnt == new_fs->pwd.mnt) {
|
||||
new_fs->pwd.mnt = mntget(&q->mnt);
|
||||
pwdmnt = &p->mnt;
|
||||
}
|
||||
old_root = new_fs->root;
|
||||
old_pwd = new_fs->pwd;
|
||||
|
||||
new_fs->root.mnt = mntget(&new->mnt);
|
||||
new_fs->root.dentry = dget(new->mnt.mnt_root);
|
||||
|
||||
new_fs->pwd.mnt = mntget(&new->mnt);
|
||||
new_fs->pwd.dentry = dget(new->mnt.mnt_root);
|
||||
}
|
||||
mnt_add_to_ns(new_ns, new);
|
||||
new_ns->nr_mounts++;
|
||||
} else {
|
||||
/*
|
||||
* Full copy: walk old and new trees in parallel, switching
|
||||
* the tsk->fs->* elements and marking new vfsmounts as
|
||||
* belonging to new namespace. We have already acquired a
|
||||
* private fs_struct, so tsk->fs->lock is not needed.
|
||||
*/
|
||||
p = old;
|
||||
q = new;
|
||||
while (p) {
|
||||
mnt_add_to_ns(new_ns, q);
|
||||
new_ns->nr_mounts++;
|
||||
if (new_fs) {
|
||||
if (&p->mnt == new_fs->root.mnt) {
|
||||
old_root.mnt = new_fs->root.mnt;
|
||||
new_fs->root.mnt = mntget(&q->mnt);
|
||||
}
|
||||
if (&p->mnt == new_fs->pwd.mnt) {
|
||||
old_pwd.mnt = new_fs->pwd.mnt;
|
||||
new_fs->pwd.mnt = mntget(&q->mnt);
|
||||
}
|
||||
}
|
||||
p = next_mnt(p, old);
|
||||
q = next_mnt(q, new);
|
||||
if (!q)
|
||||
break;
|
||||
// an mntns binding we'd skipped?
|
||||
while (p->mnt.mnt_root != q->mnt.mnt_root)
|
||||
p = next_mnt(skip_mnt_tree(p), old);
|
||||
}
|
||||
p = next_mnt(p, old);
|
||||
q = next_mnt(q, new);
|
||||
if (!q)
|
||||
break;
|
||||
// an mntns binding we'd skipped?
|
||||
while (p->mnt.mnt_root != q->mnt.mnt_root)
|
||||
p = next_mnt(skip_mnt_tree(p), old);
|
||||
}
|
||||
ns_tree_add_raw(new_ns);
|
||||
return new_ns;
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
/* Flags for the clone3() syscall. */
|
||||
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
||||
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||
#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */
|
||||
|
||||
/*
|
||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||
@@ -43,6 +44,12 @@
|
||||
*/
|
||||
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
|
||||
|
||||
/*
|
||||
* unshare flags share the bit space with clone flags but only apply to the
|
||||
* unshare syscall:
|
||||
*/
|
||||
#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
/**
|
||||
* struct clone_args - arguments for the clone3 syscall
|
||||
|
||||
@@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
|
||||
int trace = 0;
|
||||
pid_t nr;
|
||||
|
||||
/*
|
||||
* Creating an empty mount namespace implies creating a new mount
|
||||
* namespace. Set this before copy_process() so that the
|
||||
* CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
|
||||
*/
|
||||
if (clone_flags & CLONE_EMPTY_MNTNS) {
|
||||
clone_flags |= CLONE_NEWNS;
|
||||
args->flags = clone_flags;
|
||||
}
|
||||
|
||||
/*
|
||||
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
|
||||
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
|
||||
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
||||
{
|
||||
/* Verify that no unknown flags are passed along. */
|
||||
if (kargs->flags &
|
||||
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
|
||||
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
|
||||
CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
|
||||
return false;
|
||||
|
||||
/*
|
||||
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
|
||||
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
|
||||
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
|
||||
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
|
||||
CLONE_NEWTIME))
|
||||
CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* Not implemented, but pretend it works if there is nothing
|
||||
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
|
||||
/*
|
||||
* If unsharing namespace, must also unshare filesystem information.
|
||||
*/
|
||||
if (unshare_flags & UNSHARE_EMPTY_MNTNS)
|
||||
unshare_flags |= CLONE_NEWNS;
|
||||
if (unshare_flags & CLONE_NEWNS)
|
||||
unshare_flags |= CLONE_FS;
|
||||
|
||||
|
||||
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
|
||||
if (!new_nsp)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
|
||||
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
|
||||
user_ns, new_fs);
|
||||
if (IS_ERR(new_nsp->mnt_ns)) {
|
||||
err = PTR_ERR(new_nsp->mnt_ns);
|
||||
goto out_ns;
|
||||
@@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
|
||||
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
|
||||
{
|
||||
struct user_namespace *user_ns;
|
||||
u64 flags = unshare_flags;
|
||||
int err = 0;
|
||||
|
||||
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
|
||||
CLONE_NEWTIME)))
|
||||
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
|
||||
CLONE_NEWTIME)))
|
||||
return 0;
|
||||
|
||||
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
|
||||
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
|
||||
/*
|
||||
* Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
|
||||
* CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
|
||||
*/
|
||||
if (flags & UNSHARE_EMPTY_MNTNS) {
|
||||
flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
|
||||
flags |= CLONE_EMPTY_MNTNS;
|
||||
}
|
||||
|
||||
*new_nsp = create_new_namespaces(flags, current, user_ns,
|
||||
new_fs ? new_fs : current->fs);
|
||||
if (IS_ERR(*new_nsp)) {
|
||||
err = PTR_ERR(*new_nsp);
|
||||
|
||||
Reference in New Issue
Block a user