mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Pull vfs mount updates from Christian Brauner:
- Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for
open_tree().
This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for
container runtimes and other namespace-based isolation mechanisms.
This accompanies OPEN_TREE_NAMESPACE and avoids a needless detour via
OPEN_TREE_NAMESPACE to get the same effect. Will be especially useful
when you mount an actual filesystem to be used as the container
rootfs.
- Currently, creating a new mount namespace always copies the entire
mount tree from the caller's namespace. For containers and sandboxes
that intend to build their mount table from scratch this is wasteful:
they inherit a potentially large mount tree only to immediately tear
it down.
This series adds support for creating a mount namespace that contains
only a clone of the root mount, with none of the child mounts. Two
new flags are introduced:
- CLONE_EMPTY_MNTNS (0x400000000) for clone3(), using the 64-bit flag space
- UNSHARE_EMPTY_MNTNS (0x00100000) for unshare()
Both flags imply CLONE_NEWNS. The resulting namespace contains a
single nullfs root mount with an immutable empty directory. The
intended workflow is to then mount a real filesystem (e.g., tmpfs)
over the root and build the mount table from there.
- Allow MOVE_MOUNT_BENEATH to target the caller's rootfs, allowing to
switch out the rootfs without pivot_root(2).
The traditional approach to switching the rootfs involves
pivot_root(2) or a chroot_fs_refs()-based mechanism that atomically
updates fs->root for all tasks sharing the same fs_struct. This has
consequences for fork(), unshare(CLONE_FS), and setns().
This series instead decomposes root-switching into individually
atomic, locally-scoped steps:
fd_tree = open_tree(-EBADF, "/newroot", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
fchdir(fd_tree);
move_mount(fd_tree, "", AT_FDCWD, "/", MOVE_MOUNT_BENEATH | MOVE_MOUNT_F_EMPTY_PATH);
chroot(".");
umount2(".", MNT_DETACH);
Since each step only modifies the caller's own state, the
fork/unshare/setns races are eliminated by design.
A key step to making this possible is to remove the locked mount
restriction. Originally MOVE_MOUNT_BENEATH didn't support mounting
beneath a mount that is locked. The locked mount protects the
underlying mount from being revealed. This is a core mechanism of
unshare(CLONE_NEWUSER | CLONE_NEWNS). The mounts in the new mount
namespace become locked. That effectively makes the new mount table
useless as the caller cannot ever get rid of any of the mounts no
matter how useless they are.
We can lift this restriction though. We simply transfer the locked
property from the top mount to the mount beneath. This works because
what we care about is to protect the underlying mount aka the parent.
The mount mounted between the parent and the top mount takes over the
job of protecting the parent mount from the top mount. This
leaves us free to remove the locked property from the top mount which
can consequently be unmounted. For example, if we do:
unshare(CLONE_NEWUSER | CLONE_NEWNS)
and inherit a clone of procfs on /proc, then currently we cannot
unmount it, as:
umount -l /proc
will fail with EINVAL because the procfs mount is locked.
After this series we can now do:
mount --beneath -t tmpfs tmpfs /proc
umount -l /proc
after which a tmpfs mount has been placed beneath the procfs mount.
The tmpfs mount has become locked and the procfs mount has become
unlocked.
This means you can safely modify an inherited mount table after
unprivileged namespace creation.
Afterwards we simply make it possible to move a mount beneath the
rootfs allowing to upgrade the rootfs.
Removing the locked restriction makes this very useful for containers
created with unshare(CLONE_NEWUSER | CLONE_NEWNS) to reshuffle an
inherited mount table safely and MOVE_MOUNT_BENEATH makes it possible
to switch out the rootfs instead of using the costly pivot_root(2).
* tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
selftests/namespaces: remove unused utils.h include from listns_efault_test
selftests/fsmount_ns: add missing TARGETS and fix cap test
selftests/empty_mntns: fix wrong CLONE_EMPTY_MNTNS hex value in comment
selftests/empty_mntns: fix statmount_alloc() signature mismatch
selftests/statmount: remove duplicate wait_for_pid()
mount: always duplicate mount
selftests/filesystems: add MOVE_MOUNT_BENEATH rootfs tests
move_mount: allow MOVE_MOUNT_BENEATH on the rootfs
move_mount: transfer MNT_LOCKED
selftests/filesystems: add clone3 tests for empty mount namespaces
selftests/filesystems: add tests for empty mount namespaces
namespace: allow creating empty mount namespaces
selftests: add FSMOUNT_NAMESPACE tests
selftests/statmount: add statmount_alloc() helper
tools: update mount.h header
mount: add FSMOUNT_NAMESPACE
mount: simplify __do_loopback()
mount: start iterating from start of rbtree
614 lines
14 KiB
C
614 lines
14 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 2006 IBM Corporation
|
|
*
|
|
* Author: Serge Hallyn <serue@us.ibm.com>
|
|
*
|
|
* Jun 2006 - namespaces support
|
|
* OpenVZ, SWsoft Inc.
|
|
* Pavel Emelianov <xemul@openvz.org>
|
|
*/
|
|
|
|
#include <linux/slab.h>
|
|
#include <linux/export.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/ns/ns_common_types.h>
|
|
#include <linux/init_task.h>
|
|
#include <linux/mnt_namespace.h>
|
|
#include <linux/utsname.h>
|
|
#include <linux/pid_namespace.h>
|
|
#include <net/net_namespace.h>
|
|
#include <linux/ipc_namespace.h>
|
|
#include <linux/time_namespace.h>
|
|
#include <linux/fs_struct.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/proc_ns.h>
|
|
#include <linux/file.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/cgroup.h>
|
|
#include <linux/perf_event.h>
|
|
#include <linux/nstree.h>
|
|
|
|
/* Slab cache for struct nsproxy objects; created by nsproxy_cache_init(). */
static struct kmem_cache *nsproxy_cachep;
|
|
|
|
struct nsproxy init_nsproxy = {
|
|
.count = REFCOUNT_INIT(1),
|
|
.uts_ns = &init_uts_ns,
|
|
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
|
|
.ipc_ns = &init_ipc_ns,
|
|
#endif
|
|
.mnt_ns = NULL,
|
|
.pid_ns_for_children = &init_pid_ns,
|
|
#ifdef CONFIG_NET
|
|
.net_ns = &init_net,
|
|
#endif
|
|
#ifdef CONFIG_CGROUPS
|
|
.cgroup_ns = &init_cgroup_ns,
|
|
#endif
|
|
#ifdef CONFIG_TIME_NS
|
|
.time_ns = &init_time_ns,
|
|
.time_ns_for_children = &init_time_ns,
|
|
#endif
|
|
};
|
|
|
|
static inline struct nsproxy *create_nsproxy(void)
|
|
{
|
|
struct nsproxy *nsproxy;
|
|
|
|
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
|
|
if (nsproxy)
|
|
refcount_set(&nsproxy->count, 1);
|
|
return nsproxy;
|
|
}
|
|
|
|
static inline void nsproxy_free(struct nsproxy *ns)
|
|
{
|
|
put_mnt_ns(ns->mnt_ns);
|
|
put_uts_ns(ns->uts_ns);
|
|
put_ipc_ns(ns->ipc_ns);
|
|
put_pid_ns(ns->pid_ns_for_children);
|
|
put_time_ns(ns->time_ns);
|
|
put_time_ns(ns->time_ns_for_children);
|
|
put_cgroup_ns(ns->cgroup_ns);
|
|
put_net(ns->net_ns);
|
|
kmem_cache_free(nsproxy_cachep, ns);
|
|
}
|
|
|
|
/*
 * Drop the "active" references taken on @ns via nsproxy_ns_active_put()
 * and then free it together with its namespace references.
 *
 * NOTE(review): presumably called once the last reference to @ns is
 * gone -- confirm against put_nsproxy().
 */
void deactivate_nsproxy(struct nsproxy *ns)
{
	nsproxy_ns_active_put(ns);

	nsproxy_free(ns);
}
|
|
|
|
/*
|
|
* Create new nsproxy and all of its the associated namespaces.
|
|
* Return the newly created nsproxy. Do not attach this to the task,
|
|
* leave it to the caller to do proper locking and attach it to task.
|
|
*/
|
|
static struct nsproxy *create_new_namespaces(u64 flags,
|
|
struct task_struct *tsk, struct user_namespace *user_ns,
|
|
struct fs_struct *new_fs)
|
|
{
|
|
struct nsproxy *new_nsp;
|
|
int err;
|
|
|
|
new_nsp = create_nsproxy();
|
|
if (!new_nsp)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
|
|
user_ns, new_fs);
|
|
if (IS_ERR(new_nsp->mnt_ns)) {
|
|
err = PTR_ERR(new_nsp->mnt_ns);
|
|
goto out_ns;
|
|
}
|
|
|
|
new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
|
|
if (IS_ERR(new_nsp->uts_ns)) {
|
|
err = PTR_ERR(new_nsp->uts_ns);
|
|
goto out_uts;
|
|
}
|
|
|
|
new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
|
|
if (IS_ERR(new_nsp->ipc_ns)) {
|
|
err = PTR_ERR(new_nsp->ipc_ns);
|
|
goto out_ipc;
|
|
}
|
|
|
|
new_nsp->pid_ns_for_children =
|
|
copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
|
|
if (IS_ERR(new_nsp->pid_ns_for_children)) {
|
|
err = PTR_ERR(new_nsp->pid_ns_for_children);
|
|
goto out_pid;
|
|
}
|
|
|
|
new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
|
|
tsk->nsproxy->cgroup_ns);
|
|
if (IS_ERR(new_nsp->cgroup_ns)) {
|
|
err = PTR_ERR(new_nsp->cgroup_ns);
|
|
goto out_cgroup;
|
|
}
|
|
|
|
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
|
|
if (IS_ERR(new_nsp->net_ns)) {
|
|
err = PTR_ERR(new_nsp->net_ns);
|
|
goto out_net;
|
|
}
|
|
|
|
new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
|
|
tsk->nsproxy->time_ns_for_children);
|
|
if (IS_ERR(new_nsp->time_ns_for_children)) {
|
|
err = PTR_ERR(new_nsp->time_ns_for_children);
|
|
goto out_time;
|
|
}
|
|
new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
|
|
|
|
return new_nsp;
|
|
|
|
out_time:
|
|
put_net(new_nsp->net_ns);
|
|
out_net:
|
|
put_cgroup_ns(new_nsp->cgroup_ns);
|
|
out_cgroup:
|
|
put_pid_ns(new_nsp->pid_ns_for_children);
|
|
out_pid:
|
|
put_ipc_ns(new_nsp->ipc_ns);
|
|
out_ipc:
|
|
put_uts_ns(new_nsp->uts_ns);
|
|
out_uts:
|
|
put_mnt_ns(new_nsp->mnt_ns);
|
|
out_ns:
|
|
kmem_cache_free(nsproxy_cachep, new_nsp);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
/*
|
|
* called from clone. This now handles copy for nsproxy and all
|
|
* namespaces therein.
|
|
*/
|
|
int copy_namespaces(u64 flags, struct task_struct *tsk)
|
|
{
|
|
struct nsproxy *old_ns = tsk->nsproxy;
|
|
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
|
|
struct nsproxy *new_ns;
|
|
|
|
if (likely(!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))) {
|
|
if ((flags & CLONE_VM) ||
|
|
likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
|
|
get_nsproxy(old_ns);
|
|
return 0;
|
|
}
|
|
} else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
/*
|
|
* CLONE_NEWIPC must detach from the undolist: after switching
|
|
* to a new ipc namespace, the semaphore arrays from the old
|
|
* namespace are unreachable. In clone parlance, CLONE_SYSVSEM
|
|
* means share undolist with parent, so we must forbid using
|
|
* it along with CLONE_NEWIPC.
|
|
*/
|
|
if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
|
|
(CLONE_NEWIPC | CLONE_SYSVSEM))
|
|
return -EINVAL;
|
|
|
|
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
|
|
if (IS_ERR(new_ns))
|
|
return PTR_ERR(new_ns);
|
|
|
|
if ((flags & CLONE_VM) == 0)
|
|
timens_on_fork(new_ns, tsk);
|
|
|
|
nsproxy_ns_active_get(new_ns);
|
|
tsk->nsproxy = new_ns;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Called from unshare. Unshare all the namespaces part of nsproxy.
|
|
* On success, returns the new nsproxy.
|
|
*/
|
|
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
|
|
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
|
|
{
|
|
struct user_namespace *user_ns;
|
|
u64 flags = unshare_flags;
|
|
int err = 0;
|
|
|
|
if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
|
|
return 0;
|
|
|
|
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
|
|
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
/*
|
|
* Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
|
|
* CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
|
|
*/
|
|
if (flags & UNSHARE_EMPTY_MNTNS) {
|
|
flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
|
|
flags |= CLONE_EMPTY_MNTNS;
|
|
}
|
|
|
|
*new_nsp = create_new_namespaces(flags, current, user_ns,
|
|
new_fs ? new_fs : current->fs);
|
|
if (IS_ERR(*new_nsp)) {
|
|
err = PTR_ERR(*new_nsp);
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
return err;
|
|
}
|
|
|
|
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
|
|
{
|
|
struct nsproxy *ns;
|
|
|
|
might_sleep();
|
|
|
|
if (new)
|
|
nsproxy_ns_active_get(new);
|
|
|
|
task_lock(p);
|
|
ns = p->nsproxy;
|
|
p->nsproxy = new;
|
|
task_unlock(p);
|
|
|
|
if (ns)
|
|
put_nsproxy(ns);
|
|
}
|
|
|
|
void exit_nsproxy_namespaces(struct task_struct *p)
|
|
{
|
|
switch_task_namespaces(p, NULL);
|
|
}
|
|
|
|
void switch_cred_namespaces(const struct cred *old, const struct cred *new)
|
|
{
|
|
ns_ref_active_get(new->user_ns);
|
|
ns_ref_active_put(old->user_ns);
|
|
}
|
|
|
|
void get_cred_namespaces(struct task_struct *tsk)
|
|
{
|
|
ns_ref_active_get(tsk->real_cred->user_ns);
|
|
}
|
|
|
|
void exit_cred_namespaces(struct task_struct *tsk)
|
|
{
|
|
ns_ref_active_put(tsk->real_cred->user_ns);
|
|
}
|
|
|
|
int exec_task_namespaces(void)
|
|
{
|
|
struct task_struct *tsk = current;
|
|
struct nsproxy *new;
|
|
|
|
if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
|
|
return 0;
|
|
|
|
new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
|
|
if (IS_ERR(new))
|
|
return PTR_ERR(new);
|
|
|
|
timens_on_fork(new, tsk);
|
|
switch_task_namespaces(tsk, new);
|
|
return 0;
|
|
}
|
|
|
|
static int check_setns_flags(unsigned long flags)
|
|
{
|
|
if (!flags || (flags & ~CLONE_NS_ALL))
|
|
return -EINVAL;
|
|
|
|
#ifndef CONFIG_USER_NS
|
|
if (flags & CLONE_NEWUSER)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_PID_NS
|
|
if (flags & CLONE_NEWPID)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_UTS_NS
|
|
if (flags & CLONE_NEWUTS)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_IPC_NS
|
|
if (flags & CLONE_NEWIPC)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_CGROUPS
|
|
if (flags & CLONE_NEWCGROUP)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_NET_NS
|
|
if (flags & CLONE_NEWNET)
|
|
return -EINVAL;
|
|
#endif
|
|
#ifndef CONFIG_TIME_NS
|
|
if (flags & CLONE_NEWTIME)
|
|
return -EINVAL;
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void put_nsset(struct nsset *nsset)
|
|
{
|
|
unsigned flags = nsset->flags;
|
|
|
|
if (flags & CLONE_NEWUSER)
|
|
put_cred(nsset_cred(nsset));
|
|
/*
|
|
* We only created a temporary copy if we attached to more than just
|
|
* the mount namespace.
|
|
*/
|
|
if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
|
|
free_fs_struct(nsset->fs);
|
|
if (nsset->nsproxy)
|
|
nsproxy_free(nsset->nsproxy);
|
|
}
|
|
|
|
static int prepare_nsset(unsigned flags, struct nsset *nsset)
|
|
{
|
|
struct task_struct *me = current;
|
|
|
|
nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
|
|
if (IS_ERR(nsset->nsproxy))
|
|
return PTR_ERR(nsset->nsproxy);
|
|
|
|
if (flags & CLONE_NEWUSER)
|
|
nsset->cred = prepare_creds();
|
|
else
|
|
nsset->cred = current_cred();
|
|
if (!nsset->cred)
|
|
goto out;
|
|
|
|
/* Only create a temporary copy of fs_struct if we really need to. */
|
|
if (flags == CLONE_NEWNS) {
|
|
nsset->fs = me->fs;
|
|
} else if (flags & CLONE_NEWNS) {
|
|
nsset->fs = copy_fs_struct(me->fs);
|
|
if (!nsset->fs)
|
|
goto out;
|
|
}
|
|
|
|
nsset->flags = flags;
|
|
return 0;
|
|
|
|
out:
|
|
put_nsset(nsset);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
|
|
{
|
|
return ns->ops->install(nsset, ns);
|
|
}
|
|
|
|
/*
|
|
* This is the inverse operation to unshare().
|
|
* Ordering is equivalent to the standard ordering used everywhere else
|
|
* during unshare and process creation. The switch to the new set of
|
|
* namespaces occurs at the point of no return after installation of
|
|
* all requested namespaces was successful in commit_nsset().
|
|
*/
|
|
static int validate_nsset(struct nsset *nsset, struct pid *pid)
|
|
{
|
|
int ret = 0;
|
|
unsigned flags = nsset->flags;
|
|
struct user_namespace *user_ns = NULL;
|
|
struct pid_namespace *pid_ns = NULL;
|
|
struct nsproxy *nsp;
|
|
struct task_struct *tsk;
|
|
|
|
/* Take a "snapshot" of the target task's namespaces. */
|
|
rcu_read_lock();
|
|
tsk = pid_task(pid, PIDTYPE_PID);
|
|
if (!tsk) {
|
|
rcu_read_unlock();
|
|
return -ESRCH;
|
|
}
|
|
|
|
if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
|
|
rcu_read_unlock();
|
|
return -EPERM;
|
|
}
|
|
|
|
task_lock(tsk);
|
|
nsp = tsk->nsproxy;
|
|
if (nsp)
|
|
get_nsproxy(nsp);
|
|
task_unlock(tsk);
|
|
if (!nsp) {
|
|
rcu_read_unlock();
|
|
return -ESRCH;
|
|
}
|
|
|
|
#ifdef CONFIG_PID_NS
|
|
if (flags & CLONE_NEWPID) {
|
|
pid_ns = task_active_pid_ns(tsk);
|
|
if (unlikely(!pid_ns)) {
|
|
rcu_read_unlock();
|
|
ret = -ESRCH;
|
|
goto out;
|
|
}
|
|
get_pid_ns(pid_ns);
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_USER_NS
|
|
if (flags & CLONE_NEWUSER)
|
|
user_ns = get_user_ns(__task_cred(tsk)->user_ns);
|
|
#endif
|
|
rcu_read_unlock();
|
|
|
|
/*
|
|
* Install requested namespaces. The caller will have
|
|
* verified earlier that the requested namespaces are
|
|
* supported on this kernel. We don't report errors here
|
|
* if a namespace is requested that isn't supported.
|
|
*/
|
|
#ifdef CONFIG_USER_NS
|
|
if (flags & CLONE_NEWUSER) {
|
|
ret = validate_ns(nsset, &user_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
if (flags & CLONE_NEWNS) {
|
|
ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
|
|
#ifdef CONFIG_UTS_NS
|
|
if (flags & CLONE_NEWUTS) {
|
|
ret = validate_ns(nsset, &nsp->uts_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_IPC_NS
|
|
if (flags & CLONE_NEWIPC) {
|
|
ret = validate_ns(nsset, &nsp->ipc_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_PID_NS
|
|
if (flags & CLONE_NEWPID) {
|
|
ret = validate_ns(nsset, &pid_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_CGROUPS
|
|
if (flags & CLONE_NEWCGROUP) {
|
|
ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_NET_NS
|
|
if (flags & CLONE_NEWNET) {
|
|
ret = validate_ns(nsset, &nsp->net_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_TIME_NS
|
|
if (flags & CLONE_NEWTIME) {
|
|
ret = validate_ns(nsset, &nsp->time_ns->ns);
|
|
if (ret)
|
|
goto out;
|
|
}
|
|
#endif
|
|
|
|
out:
|
|
if (pid_ns)
|
|
put_pid_ns(pid_ns);
|
|
if (nsp)
|
|
put_nsproxy(nsp);
|
|
put_user_ns(user_ns);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* This is the point of no return. There are just a few namespaces
|
|
* that do some actual work here and it's sufficiently minimal that
|
|
* a separate ns_common operation seems unnecessary for now.
|
|
* Unshare is doing the same thing. If we'll end up needing to do
|
|
* more in a given namespace or a helper here is ultimately not
|
|
* exported anymore a simple commit handler for each namespace
|
|
* should be added to ns_common.
|
|
*/
|
|
static void commit_nsset(struct nsset *nsset)
|
|
{
|
|
unsigned flags = nsset->flags;
|
|
struct task_struct *me = current;
|
|
|
|
#ifdef CONFIG_USER_NS
|
|
if (flags & CLONE_NEWUSER) {
|
|
/* transfer ownership */
|
|
commit_creds(nsset_cred(nsset));
|
|
nsset->cred = NULL;
|
|
}
|
|
#endif
|
|
|
|
/* We only need to commit if we have used a temporary fs_struct. */
|
|
if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
|
|
set_fs_root(me->fs, &nsset->fs->root);
|
|
set_fs_pwd(me->fs, &nsset->fs->pwd);
|
|
}
|
|
|
|
#ifdef CONFIG_IPC_NS
|
|
if (flags & CLONE_NEWIPC)
|
|
exit_sem(me);
|
|
#endif
|
|
|
|
#ifdef CONFIG_TIME_NS
|
|
if (flags & CLONE_NEWTIME)
|
|
timens_commit(me, nsset->nsproxy->time_ns);
|
|
#endif
|
|
|
|
/* transfer ownership */
|
|
switch_task_namespaces(me, nsset->nsproxy);
|
|
nsset->nsproxy = NULL;
|
|
}
|
|
|
|
/*
 * setns() - attach the calling task to existing namespaces.
 *
 * @fd is either a namespace file or a pidfd. For a namespace file, a
 * non-zero @flags must equal the type of that namespace. For a pidfd,
 * @flags selects which of the target task's namespaces to join and is
 * checked by check_setns_flags().
 *
 * Returns 0 on success, -EBADF if @fd is not an open file, -EINVAL if
 * @fd is neither kind of file or @flags is not acceptable, or the error
 * from preparing or validating the namespace set.
 */
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
	CLASS(fd, f)(fd);
	struct nsset nsset = {};
	struct ns_common *ns = NULL;
	struct file *file;
	bool is_ns_file;
	int err = 0;

	if (fd_empty(f))
		return -EBADF;

	file = fd_file(f);
	is_ns_file = proc_ns_file(file);

	if (is_ns_file) {
		ns = get_proc_ns(file_inode(file));
		if (flags && (ns->ns_type != flags))
			err = -EINVAL;
		flags = ns->ns_type;
	} else if (!IS_ERR(pidfd_pid(file))) {
		err = check_setns_flags(flags);
	} else {
		err = -EINVAL;
	}
	if (err)
		return err;

	err = prepare_nsset(flags, &nsset);
	if (err)
		return err;

	if (is_ns_file)
		err = validate_ns(&nsset, ns);
	else
		err = validate_nsset(&nsset, pidfd_pid(file));

	/* Only commit once every requested namespace was installed. */
	if (!err) {
		commit_nsset(&nsset);
		perf_event_namespaces(current);
	}

	/* Releases whatever was not handed over by commit_nsset(). */
	put_nsset(&nsset);
	return err;
}
|
|
|
|
/*
 * Create the slab cache used for struct nsproxy allocations. The cache
 * is created with SLAB_PANIC and SLAB_ACCOUNT. Always returns 0.
 */
int __init nsproxy_cache_init(void)
{
	struct kmem_cache *cache;

	cache = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
	nsproxy_cachep = cache;

	return 0;
}
|