Files
linux/kernel/nsproxy.c
Linus Torvalds 7c8a4671dc Merge tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs mount updates from Christian Brauner:

 - Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
   namespace with the newly created filesystem attached to a copy of the
   real rootfs. This returns a namespace file descriptor instead of an
   O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for
   open_tree().

   This allows creating a new filesystem and immediately placing it in a
   new mount namespace in a single operation, which is useful for
   container runtimes and other namespace-based isolation mechanisms.

   This accompanies OPEN_TREE_NAMESPACE and avoids a needless detour via
   OPEN_TREE_NAMESPACE to get the same effect. Will be especially useful
   when you mount an actual filesystem to be used as the container
   rootfs.

 - Currently, creating a new mount namespace always copies the entire
   mount tree from the caller's namespace. For containers and sandboxes
   that intend to build their mount table from scratch this is wasteful:
   they inherit a potentially large mount tree only to immediately tear
   it down.

   This series adds support for creating a mount namespace that contains
   only a clone of the root mount, with none of the child mounts. Two
   new flags are introduced:

     - CLONE_EMPTY_MNTNS (0x400000000) for clone3(), using the 64-bit flag space
     - UNSHARE_EMPTY_MNTNS (0x00100000) for unshare()

   Both flags imply CLONE_NEWNS. The resulting namespace contains a
   single nullfs root mount with an immutable empty directory. The
   intended workflow is to then mount a real filesystem (e.g., tmpfs)
   over the root and build the mount table from there.

 - Allow MOVE_MOUNT_BENEATH to target the caller's rootfs, making it
   possible to switch out the rootfs without pivot_root(2).

   The traditional approach to switching the rootfs involves
   pivot_root(2) or a chroot_fs_refs()-based mechanism that atomically
   updates fs->root for all tasks sharing the same fs_struct. This has
   consequences for fork(), unshare(CLONE_FS), and setns().

   This series instead decomposes root-switching into individually
   atomic, locally-scoped steps:

	fd_tree = open_tree(-EBADF, "/newroot", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	fchdir(fd_tree);
	move_mount(fd_tree, "", AT_FDCWD, "/", MOVE_MOUNT_BENEATH | MOVE_MOUNT_F_EMPTY_PATH);
	chroot(".");
	umount2(".", MNT_DETACH);

   Since each step only modifies the caller's own state, the
   fork/unshare/setns races are eliminated by design.

   A key step to making this possible is to remove the locked mount
   restriction. Originally, MOVE_MOUNT_BENEATH did not support mounting
   beneath a mount that is locked. The locked mount protects the
   underlying mount from being revealed. This is a core mechanism of
   unshare(CLONE_NEWUSER | CLONE_NEWNS). The mounts in the new mount
   namespace become locked. That effectively makes the new mount table
   useless as the caller cannot ever get rid of any of the mounts no
   matter how useless they are.

   We can lift this restriction though. We simply transfer the locked
   property from the top mount to the mount beneath. This works because
   what we care about is to protect the underlying mount aka the parent.
   The mount mounted between the parent and the top mount takes over the
   job of protecting the parent mount from the top mount. This
   leaves us free to remove the locked property from the top mount, which
   can consequently be unmounted. For example, if we do:

	unshare(CLONE_NEWUSER | CLONE_NEWNS)

   and we inherit a clone of procfs on /proc, then currently we cannot
   unmount it, as:

	umount -l /proc

   will fail with EINVAL because the procfs mount is locked.

   After this series we can now do:

	mount --beneath -t tmpfs tmpfs /proc
	umount -l /proc

   after which a tmpfs mount has been placed beneath the procfs mount.
   The tmpfs mount has become locked and the procfs mount has become
   unlocked.

   This means you can safely modify an inherited mount table after
   unprivileged namespace creation.

   Afterwards we simply make it possible to move a mount beneath the
   rootfs, which allows upgrading the rootfs.

   Removing the locked restriction makes this very useful for containers
   created with unshare(CLONE_NEWUSER | CLONE_NEWNS) to reshuffle an
   inherited mount table safely and MOVE_MOUNT_BENEATH makes it possible
   to switch out the rootfs instead of using the costly pivot_root(2).

* tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests/namespaces: remove unused utils.h include from listns_efault_test
  selftests/fsmount_ns: add missing TARGETS and fix cap test
  selftests/empty_mntns: fix wrong CLONE_EMPTY_MNTNS hex value in comment
  selftests/empty_mntns: fix statmount_alloc() signature mismatch
  selftests/statmount: remove duplicate wait_for_pid()
  mount: always duplicate mount
  selftests/filesystems: add MOVE_MOUNT_BENEATH rootfs tests
  move_mount: allow MOVE_MOUNT_BENEATH on the rootfs
  move_mount: transfer MNT_LOCKED
  selftests/filesystems: add clone3 tests for empty mount namespaces
  selftests/filesystems: add tests for empty mount namespaces
  namespace: allow creating empty mount namespaces
  selftests: add FSMOUNT_NAMESPACE tests
  selftests/statmount: add statmount_alloc() helper
  tools: update mount.h header
  mount: add FSMOUNT_NAMESPACE
  mount: simplify __do_loopback()
  mount: start iterating from start of rbtree
2026-04-14 19:59:25 -07:00

614 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2006 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
*
* Jun 2006 - namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
*/
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/ns/ns_common_types.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/fs_struct.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/nstree.h>
/* Slab cache that struct nsproxy objects in this file are allocated from and freed to. */
static struct kmem_cache *nsproxy_cachep;
/*
 * Statically initialised nsproxy that points at the init_* namespace
 * objects. Its reference count starts at 1 and its mount namespace
 * pointer starts out as NULL. Namespaces guarded by a config option are
 * only wired up when that option is enabled.
 */
struct nsproxy init_nsproxy = {
	.count			= REFCOUNT_INIT(1),
	.mnt_ns			= NULL,
	.uts_ns			= &init_uts_ns,
	.pid_ns_for_children	= &init_pid_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
	.ipc_ns			= &init_ipc_ns,
#endif
#ifdef CONFIG_NET
	.net_ns			= &init_net,
#endif
#ifdef CONFIG_CGROUPS
	.cgroup_ns		= &init_cgroup_ns,
#endif
#ifdef CONFIG_TIME_NS
	.time_ns		= &init_time_ns,
	.time_ns_for_children	= &init_time_ns,
#endif
};
static inline struct nsproxy *create_nsproxy(void)
{
struct nsproxy *nsproxy;
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
refcount_set(&nsproxy->count, 1);
return nsproxy;
}
/*
 * Call the matching put helper on every namespace pointer stored in @ns
 * and then hand the nsproxy itself back to nsproxy_cachep. @ns must not
 * be used afterwards.
 */
static inline void nsproxy_free(struct nsproxy *ns)
{
	/* Release each namespace the proxy points at. */
	put_mnt_ns(ns->mnt_ns);
	put_uts_ns(ns->uts_ns);
	put_ipc_ns(ns->ipc_ns);
	put_pid_ns(ns->pid_ns_for_children);
	put_time_ns(ns->time_ns);
	put_time_ns(ns->time_ns_for_children);
	put_cgroup_ns(ns->cgroup_ns);
	put_net(ns->net_ns);

	/* Finally free the container object. */
	kmem_cache_free(nsproxy_cachep, ns);
}
/*
 * Tear down @ns: first call nsproxy_ns_active_put() on it, then release
 * its namespaces and its memory via nsproxy_free().
 */
void deactivate_nsproxy(struct nsproxy *ns)
{
	nsproxy_ns_active_put(ns);

	nsproxy_free(ns);
}
/*
 * Create a new nsproxy together with all of its associated namespaces.
 *
 * Every namespace pointer of the new nsproxy is produced by the matching
 * copy_*() helper, which is handed @flags, @user_ns and the namespace
 * currently referenced by @tsk->nsproxy. time_ns is the exception: it is
 * taken from @tsk->nsproxy->time_ns with get_time_ns().
 *
 * Return the newly created nsproxy. Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to the task.
 *
 * On failure an ERR_PTR() is returned: -ENOMEM if the nsproxy itself
 * could not be allocated, otherwise the error reported by the copy_*()
 * helper that failed. Namespaces obtained before the failure are put
 * again and the nsproxy is returned to nsproxy_cachep.
 */
static struct nsproxy *create_new_namespaces(u64 flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);
/* Mount namespace; the only helper that is also given new_fs. */
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
}
new_nsp->pid_ns_for_children =
copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
if (IS_ERR(new_nsp->pid_ns_for_children)) {
err = PTR_ERR(new_nsp->pid_ns_for_children);
goto out_pid;
}
new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
tsk->nsproxy->cgroup_ns);
if (IS_ERR(new_nsp->cgroup_ns)) {
err = PTR_ERR(new_nsp->cgroup_ns);
goto out_cgroup;
}
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
}
new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
tsk->nsproxy->time_ns_for_children);
if (IS_ERR(new_nsp->time_ns_for_children)) {
err = PTR_ERR(new_nsp->time_ns_for_children);
goto out_time;
}
/* time_ns is not copied: reference the task's current one. */
new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
return new_nsp;
/*
 * Error unwinding. Each label is named after the step that failed and
 * puts everything that was set up before that step, in reverse order of
 * creation, falling through to the labels below it.
 */
out_time:
put_net(new_nsp->net_ns);
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
put_uts_ns(new_nsp->uts_ns);
out_uts:
put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
}
/*
 * Called from clone to set up @tsk->nsproxy according to @flags.
 *
 * When no namespace flag other than CLONE_NEWUSER is set, the existing
 * nsproxy is shared by taking a reference, provided that either CLONE_VM
 * is set or time_ns_for_children already equals time_ns. Otherwise a
 * fresh nsproxy is built with create_new_namespaces() and installed.
 *
 * Returns 0 on success, -EPERM if namespace flags were requested without
 * CAP_SYS_ADMIN in the task's user namespace, -EINVAL for
 * CLONE_NEWIPC combined with CLONE_SYSVSEM, or the error from
 * create_new_namespaces().
 */
int copy_namespaces(u64 flags, struct task_struct *tsk)
{
	struct nsproxy *old_ns = tsk->nsproxy;
	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
	struct nsproxy *new_ns;
	const u64 ns_flags = flags & (CLONE_NS_ALL & ~CLONE_NEWUSER);

	if (likely(!ns_flags)) {
		if ((flags & CLONE_VM) ||
		    likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
			/* Nothing to unshare: keep using the current nsproxy. */
			get_nsproxy(old_ns);
			return 0;
		}
	} else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
		return -EPERM;
	}

	/*
	 * CLONE_NEWIPC must detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
	 * means share undolist with parent, so we must forbid using
	 * it along with CLONE_NEWIPC.
	 */
	if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM))
		return -EINVAL;

	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);

	if (!(flags & CLONE_VM))
		timens_on_fork(new_ns, tsk);

	nsproxy_ns_active_get(new_ns);
	tsk->nsproxy = new_ns;
	return 0;
}
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
struct user_namespace *user_ns;
u64 flags = unshare_flags;
int err = 0;
if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
* Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
* CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
*/
if (flags & UNSHARE_EMPTY_MNTNS) {
flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
flags |= CLONE_EMPTY_MNTNS;
}
*new_nsp = create_new_namespaces(flags, current, user_ns,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
}
out:
return err;
}
/*
 * Replace @p->nsproxy with @new (which may be NULL).
 *
 * nsproxy_ns_active_get() is called on a non-NULL @new before the swap.
 * The pointer exchange itself happens under task_lock(@p). The nsproxy
 * that was installed before, if any, is released with put_nsproxy()
 * after the lock has been dropped. May sleep.
 */
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
	struct nsproxy *old;

	might_sleep();

	if (new)
		nsproxy_ns_active_get(new);

	task_lock(p);
	old = p->nsproxy;
	p->nsproxy = new;
	task_unlock(p);

	if (old)
		put_nsproxy(old);
}
/*
 * Detach @p from its nsproxy by switching it to a NULL nsproxy; the
 * previous one is released by switch_task_namespaces().
 */
void exit_nsproxy_namespaces(struct task_struct *p)
{
	switch_task_namespaces(p, NULL);
}
/*
 * Move the active user-namespace reference from @old's user namespace to
 * @new's: ns_ref_active_get() is called on the new one first, then
 * ns_ref_active_put() on the old one.
 */
void switch_cred_namespaces(const struct cred *old, const struct cred *new)
{
	/* Get before put, so the new namespace is pinned first. */
	ns_ref_active_get(new->user_ns);

	ns_ref_active_put(old->user_ns);
}
/*
 * Call ns_ref_active_get() on the user namespace of @tsk's real
 * credentials.
 */
void get_cred_namespaces(struct task_struct *tsk)
{
	ns_ref_active_get(tsk->real_cred->user_ns);
}
/*
 * Call ns_ref_active_put() on the user namespace of @tsk's real
 * credentials; the counterpart of get_cred_namespaces().
 */
void exit_cred_namespaces(struct task_struct *tsk)
{
	ns_ref_active_put(tsk->real_cred->user_ns);
}
int exec_task_namespaces(void)
{
struct task_struct *tsk = current;
struct nsproxy *new;
if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
return 0;
new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new))
return PTR_ERR(new);
timens_on_fork(new, tsk);
switch_task_namespaces(tsk, new);
return 0;
}
/*
 * Validate the namespace flags passed to setns() for a pidfd.
 *
 * Returns -EINVAL if @flags is zero, contains bits outside CLONE_NS_ALL,
 * or requests a namespace type whose config option is not enabled in
 * this build. Returns 0 otherwise.
 */
static int check_setns_flags(unsigned long flags)
{
	/* Namespace types that this kernel was built without. */
	unsigned long unsupported = 0;

#ifndef CONFIG_USER_NS
	unsupported |= CLONE_NEWUSER;
#endif
#ifndef CONFIG_PID_NS
	unsupported |= CLONE_NEWPID;
#endif
#ifndef CONFIG_UTS_NS
	unsupported |= CLONE_NEWUTS;
#endif
#ifndef CONFIG_IPC_NS
	unsupported |= CLONE_NEWIPC;
#endif
#ifndef CONFIG_CGROUPS
	unsupported |= CLONE_NEWCGROUP;
#endif
#ifndef CONFIG_NET_NS
	unsupported |= CLONE_NEWNET;
#endif
#ifndef CONFIG_TIME_NS
	unsupported |= CLONE_NEWTIME;
#endif

	if (!flags || (flags & ~CLONE_NS_ALL))
		return -EINVAL;

	if (flags & unsupported)
		return -EINVAL;

	return 0;
}
/*
 * Release whatever @nsset still owns, driven by @nsset->flags:
 *  - the credentials, when CLONE_NEWUSER is set;
 *  - the fs_struct, when it is non-NULL and was a temporary copy;
 *  - the nsproxy, when it is non-NULL.
 */
static void put_nsset(struct nsset *nsset)
{
	unsigned flags = nsset->flags;
	/*
	 * We only created a temporary copy if we attached to more than just
	 * the mount namespace.
	 */
	int tmp_fs = (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS);

	if (flags & CLONE_NEWUSER)
		put_cred(nsset_cred(nsset));

	if (nsset->fs && tmp_fs)
		free_fs_struct(nsset->fs);

	if (nsset->nsproxy)
		nsproxy_free(nsset->nsproxy);
}
/*
 * Set up @nsset for a setns() request with the given @flags.
 *
 * A fresh nsproxy is created for the current task with no namespace
 * flags. Credentials are newly prepared when CLONE_NEWUSER is requested,
 * otherwise the current ones are used as they are. The fs_struct is the
 * task's own when only CLONE_NEWNS is requested, and a temporary copy
 * when CLONE_NEWNS is combined with other flags.
 *
 * @nsset is expected to be zero-initialised by the caller.
 *
 * Returns 0 on success, the create_new_namespaces() error if the nsproxy
 * could not be created, or -ENOMEM if preparing the credentials or
 * copying the fs_struct failed. On the -ENOMEM paths everything acquired
 * so far has been released again through put_nsset().
 */
static int prepare_nsset(unsigned flags, struct nsset *nsset)
{
	struct task_struct *me = current;

	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
	if (IS_ERR(nsset->nsproxy))
		return PTR_ERR(nsset->nsproxy);

	/*
	 * Record the flags before anything below can fail. put_nsset()
	 * decides what to release from nsset->flags, so leaving them at
	 * zero until the end would leak the prepared credentials when
	 * copy_fs_struct() fails after prepare_creds() succeeded.
	 * put_nsset() already copes with a NULL cred and a NULL fs.
	 */
	nsset->flags = flags;

	if (flags & CLONE_NEWUSER)
		nsset->cred = prepare_creds();
	else
		nsset->cred = current_cred();
	if (!nsset->cred)
		goto out;

	/* Only create a temporary copy of fs_struct if we really need to. */
	if (flags == CLONE_NEWNS) {
		nsset->fs = me->fs;
	} else if (flags & CLONE_NEWNS) {
		nsset->fs = copy_fs_struct(me->fs);
		if (!nsset->fs)
			goto out;
	}

	return 0;

out:
	put_nsset(nsset);
	return -ENOMEM;
}
/*
 * Ask namespace @ns to install itself into @nsset by invoking its
 * ->install() operation, and return that operation's result.
 */
static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
{
	return ns->ops->install(nsset, ns);
}
/*
 * This is the inverse operation to unshare().
 * Ordering is equivalent to the standard ordering used everywhere else
 * during unshare and process creation. The switch to the new set of
 * namespaces occurs at the point of no return after installation of
 * all requested namespaces was successful in commit_nsset().
 *
 * @nsset: prepared namespace set; nsset->flags selects what to install.
 * @pid:   the task whose namespaces are installed into @nsset.
 *
 * Returns 0 on success, -ESRCH if @pid has no task, the task has no
 * nsproxy, or (for CLONE_NEWPID) no active pid namespace, -EPERM if
 * ptrace_may_access() denies access, or the first non-zero result of
 * validate_ns().
 */
static int validate_nsset(struct nsset *nsset, struct pid *pid)
{
int ret = 0;
unsigned flags = nsset->flags;
struct user_namespace *user_ns = NULL;
struct pid_namespace *pid_ns = NULL;
struct nsproxy *nsp;
struct task_struct *tsk;
/* Take a "snapshot" of the target task's namespaces. */
rcu_read_lock();
tsk = pid_task(pid, PIDTYPE_PID);
if (!tsk) {
rcu_read_unlock();
return -ESRCH;
}
if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
return -EPERM;
}
/* Read tsk->nsproxy and take a reference on it under the task lock. */
task_lock(tsk);
nsp = tsk->nsproxy;
if (nsp)
get_nsproxy(nsp);
task_unlock(tsk);
if (!nsp) {
rcu_read_unlock();
return -ESRCH;
}
/*
 * The pid and user namespaces are not read from nsp. They are looked
 * up from the task itself and referenced while still inside the RCU
 * read-side section.
 */
#ifdef CONFIG_PID_NS
if (flags & CLONE_NEWPID) {
pid_ns = task_active_pid_ns(tsk);
if (unlikely(!pid_ns)) {
rcu_read_unlock();
ret = -ESRCH;
goto out;
}
get_pid_ns(pid_ns);
}
#endif
#ifdef CONFIG_USER_NS
if (flags & CLONE_NEWUSER)
user_ns = get_user_ns(__task_cred(tsk)->user_ns);
#endif
rcu_read_unlock();
/*
 * Install requested namespaces. The caller will have
 * verified earlier that the requested namespaces are
 * supported on this kernel. We don't report errors here
 * if a namespace is requested that isn't supported.
 */
#ifdef CONFIG_USER_NS
if (flags & CLONE_NEWUSER) {
ret = validate_ns(nsset, &user_ns->ns);
if (ret)
goto out;
}
#endif
if (flags & CLONE_NEWNS) {
ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
if (ret)
goto out;
}
#ifdef CONFIG_UTS_NS
if (flags & CLONE_NEWUTS) {
ret = validate_ns(nsset, &nsp->uts_ns->ns);
if (ret)
goto out;
}
#endif
#ifdef CONFIG_IPC_NS
if (flags & CLONE_NEWIPC) {
ret = validate_ns(nsset, &nsp->ipc_ns->ns);
if (ret)
goto out;
}
#endif
#ifdef CONFIG_PID_NS
if (flags & CLONE_NEWPID) {
ret = validate_ns(nsset, &pid_ns->ns);
if (ret)
goto out;
}
#endif
#ifdef CONFIG_CGROUPS
if (flags & CLONE_NEWCGROUP) {
ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
if (ret)
goto out;
}
#endif
#ifdef CONFIG_NET_NS
if (flags & CLONE_NEWNET) {
ret = validate_ns(nsset, &nsp->net_ns->ns);
if (ret)
goto out;
}
#endif
#ifdef CONFIG_TIME_NS
if (flags & CLONE_NEWTIME) {
ret = validate_ns(nsset, &nsp->time_ns->ns);
if (ret)
goto out;
}
#endif
/*
 * Drop the snapshot references taken above; reached on success and on
 * every failure after the nsproxy reference was obtained.
 * NOTE(review): put_user_ns() is called even when user_ns is still
 * NULL, so it is presumably NULL-safe - confirm.
 */
out:
if (pid_ns)
put_pid_ns(pid_ns);
if (nsp)
put_nsproxy(nsp);
put_user_ns(user_ns);
return ret;
}
/*
 * Point of no return for setns(): make the prepared @nsset the current
 * task's namespaces.
 *
 * Only a few namespaces have real work to do here, and it is small
 * enough that a dedicated ns_common operation does not seem warranted
 * yet; unshare does the same thing. Should a namespace need to do more,
 * or should one of the helpers used here stop being exported, a simple
 * per-namespace commit handler should be added to ns_common.
 *
 * Ownership of the credentials (for CLONE_NEWUSER) and of the nsproxy is
 * handed over, and the corresponding @nsset fields are cleared so that
 * put_nsset() does not release them again.
 */
static void commit_nsset(struct nsset *nsset)
{
	struct task_struct *tsk = current;
	const unsigned flags = nsset->flags;

#ifdef CONFIG_USER_NS
	if (flags & CLONE_NEWUSER) {
		/* The credentials now belong to the task. */
		commit_creds(nsset_cred(nsset));
		nsset->cred = NULL;
	}
#endif

	/*
	 * A temporary fs_struct was only used when CLONE_NEWNS came with
	 * other flags; copy its root and pwd over to the real one.
	 */
	if ((flags & CLONE_NEWNS) && flags != CLONE_NEWNS) {
		set_fs_root(tsk->fs, &nsset->fs->root);
		set_fs_pwd(tsk->fs, &nsset->fs->pwd);
	}

#ifdef CONFIG_IPC_NS
	if (flags & CLONE_NEWIPC)
		exit_sem(tsk);
#endif

#ifdef CONFIG_TIME_NS
	if (flags & CLONE_NEWTIME)
		timens_commit(tsk, nsset->nsproxy->time_ns);
#endif

	/* The nsproxy now belongs to the task. */
	switch_task_namespaces(tsk, nsset->nsproxy);
	nsset->nsproxy = NULL;
}
/*
 * setns() system call: attach the calling task to the namespace(s)
 * referred to by @fd.
 *
 * @fd may refer to either
 *  - a namespace file (proc_ns_file()): @flags must be 0 or equal to the
 *    namespace's ns_type, and that single namespace is installed; or
 *  - a pidfd: @flags selects which of the target task's namespaces to
 *    install and is checked by check_setns_flags().
 *
 * Returns 0 on success, -EBADF if @fd is empty, -EINVAL for any other
 * kind of file or for bad @flags, or the error from prepare_nsset(),
 * validate_ns() or validate_nsset().
 */
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
/*
 * NOTE(review): f is declared through the CLASS(fd, ...) helper and no
 * explicit release is visible here; presumably it is dropped
 * automatically when it goes out of scope - confirm.
 */
CLASS(fd, f)(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
if (fd_empty(f))
return -EBADF;
if (proc_ns_file(fd_file(f))) {
ns = get_proc_ns(file_inode(fd_file(f)));
if (flags && (ns->ns_type != flags))
err = -EINVAL;
/* From here on flags describes exactly this one namespace. */
flags = ns->ns_type;
} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
}
if (err)
goto out;
/* prepare_nsset() cleans up after itself on failure. */
err = prepare_nsset(flags, &nsset);
if (err)
goto out;
if (proc_ns_file(fd_file(f)))
err = validate_ns(&nsset, ns);
else
err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
/* Releases whatever commit_nsset() did not take ownership of. */
put_nsset(&nsset);
out:
return err;
}
/*
 * Create the slab cache for struct nsproxy with the SLAB_PANIC and
 * SLAB_ACCOUNT flags and store it in nsproxy_cachep. Always returns 0.
 */
int __init nsproxy_cache_init(void)
{
	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC | SLAB_ACCOUNT);

	return 0;
}