mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Merge tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pid_namespace updates from Christian Brauner:
- pid_namespace: make init creation more flexible
Annotate ->child_reaper accesses with {READ,WRITE}_ONCE() to protect
the unlocked readers from cpu/compiler reordering, and enforce that
pid 1 in a pid namespace is always the first allocated pid (the
set_tid path already required this).
On top of that, allow opening pid_for_children before the pid
namespace init has been created. This lets one process create the pid
namespace and a different process create the init via setns(), which
makes clone3(set_tid) usable in all cases evenly and is particularly
useful to CRIU when restoring nested containers.
A new selftest covers both the basic create-pidns-then-init flow and
the cross-process variant, and a MAINTAINERS entry for the pid
namespace code is added.
- unrelated signal cleanup: update outdated comment for the removed
freezable_schedule()
* tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
signal: update outdated comment for removed freezable_schedule()
MAINTAINERS: add a pid namespace entry
selftests: Add tests for creating pidns init via setns
pid_namespace: allow opening pid_for_children before init was created
pid: check init is created first after idr alloc
pid_namespace: avoid optimization of accesses to ->child_reaper
This commit is contained in:
11
MAINTAINERS
11
MAINTAINERS
@@ -18191,6 +18191,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/next
|
||||
F: drivers/mtd/nand/
|
||||
F: include/linux/mtd/*nand*.h
|
||||
|
||||
NAMESPACES:
|
||||
M: Christian Brauner <christian@brauner.io>
|
||||
R: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Maintained
|
||||
F: rust/kernel/pid_namespace.rs
|
||||
F: kernel/pid_namespace.c
|
||||
F: tools/testing/selftests/pid_namespace/
|
||||
|
||||
NATIONAL INSTRUMENTS SERIAL DRIVER
|
||||
M: Chaitanya Vadrevu <chaitanya.vadrevu@emerson.com>
|
||||
L: linux-serial@vger.kernel.org
|
||||
@@ -20804,10 +20813,8 @@ M: Christian Brauner <christian@brauner.io>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
|
||||
F: rust/kernel/pid_namespace.rs
|
||||
F: samples/pidfd/
|
||||
F: tools/testing/selftests/clone3/
|
||||
F: tools/testing/selftests/pid_namespace/
|
||||
F: tools/testing/selftests/pidfd/
|
||||
K: (?i)pidfd
|
||||
K: (?i)clone3
|
||||
|
||||
@@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
|
||||
|
||||
reaper = find_alive_thread(father);
|
||||
if (reaper) {
|
||||
pid_ns->child_reaper = reaper;
|
||||
ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper);
|
||||
WRITE_ONCE(pid_ns->child_reaper, reaper);
|
||||
return reaper;
|
||||
}
|
||||
|
||||
|
||||
@@ -2469,7 +2469,10 @@ __latent_entropy struct task_struct *copy_process(
|
||||
init_task_pid(p, PIDTYPE_SID, task_session(current));
|
||||
|
||||
if (is_child_reaper(pid)) {
|
||||
ns_of_pid(pid)->child_reaper = p;
|
||||
struct pid_namespace *ns = ns_of_pid(pid);
|
||||
|
||||
ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
|
||||
WRITE_ONCE(ns->child_reaper, p);
|
||||
p->signal->flags |= SIGNAL_UNKILLABLE;
|
||||
}
|
||||
p->signal->shared_pending.signal = delayed.signal;
|
||||
|
||||
19
kernel/pid.c
19
kernel/pid.c
@@ -128,7 +128,7 @@ void free_pid(struct pid *pid)
|
||||
* is the reaper wake up the reaper. The reaper
|
||||
* may be sleeping in zap_pid_ns_processes().
|
||||
*/
|
||||
wake_up_process(ns->child_reaper);
|
||||
wake_up_process(READ_ONCE(ns->child_reaper));
|
||||
break;
|
||||
case PIDNS_ADDING:
|
||||
/* Handle a fork failure of the first process */
|
||||
@@ -215,12 +215,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
||||
retval = -EINVAL;
|
||||
if (tid < 1 || tid >= pid_max[ns->level - i])
|
||||
goto out_abort;
|
||||
/*
|
||||
* Also fail if a PID != 1 is requested and
|
||||
* no PID 1 exists.
|
||||
*/
|
||||
if (tid != 1 && !tmp->child_reaper)
|
||||
goto out_abort;
|
||||
retval = -EPERM;
|
||||
if (!checkpoint_restore_ns_capable(tmp->user_ns))
|
||||
goto out_abort;
|
||||
@@ -296,9 +290,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
||||
|
||||
pid->numbers[i].nr = nr;
|
||||
pid->numbers[i].ns = tmp;
|
||||
tmp = tmp->parent;
|
||||
i--;
|
||||
retried_preload = false;
|
||||
|
||||
/*
|
||||
* PID 1 (init) must be created first.
|
||||
*/
|
||||
if (!READ_ONCE(tmp->child_reaper) && nr != 1) {
|
||||
retval = -EINVAL;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
tmp = tmp->parent;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task)
|
||||
}
|
||||
task_unlock(task);
|
||||
|
||||
if (ns) {
|
||||
read_lock(&tasklist_lock);
|
||||
if (!ns->child_reaper) {
|
||||
put_pid_ns(ns);
|
||||
ns = NULL;
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
|
||||
return ns ? &ns->ns : NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -2818,8 +2818,9 @@ bool get_signal(struct ksignal *ksig)
|
||||
|
||||
/*
|
||||
* Do this once, we can't return to user-mode if freezing() == T.
|
||||
* do_signal_stop() and ptrace_stop() do freezable_schedule() and
|
||||
* thus do not need another check after return.
|
||||
* do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED
|
||||
* and the freezer handles those states via TASK_FROZEN, thus they
|
||||
* do not need another check after return.
|
||||
*/
|
||||
try_to_freeze();
|
||||
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
pid_max
|
||||
pidns_init_via_setns
|
||||
regression_enomem
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
CFLAGS += -g $(KHDR_INCLUDES)
|
||||
|
||||
TEST_GEN_PROGS = regression_enomem pid_max
|
||||
TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns
|
||||
|
||||
LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h
|
||||
|
||||
|
||||
238
tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
Normal file
238
tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
Normal file
@@ -0,0 +1,238 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#define _GNU_SOURCE
|
||||
#include <fcntl.h>
|
||||
#include <sched.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "kselftest_harness.h"
|
||||
#include "../pidfd/pidfd.h"
|
||||
|
||||
/*
|
||||
* Test that a process can become PID 1 (init) in a new PID namespace
|
||||
* created via unshare() and joined via setns().
|
||||
*
|
||||
* Flow:
|
||||
* 1. Parent creates a pipe for synchronization.
|
||||
* 2. Parent forks a child.
|
||||
* 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace.
|
||||
* 4. Parent signals the child via the pipe.
|
||||
* 5. Child opens parent's /proc/<ppid>/ns/pid_for_children and calls
|
||||
* setns(fd, CLONE_NEWPID) to join the new namespace.
|
||||
* 6. Child forks a grandchild.
|
||||
* 7. Grandchild verifies getpid() == 1.
|
||||
*/
|
||||
TEST(pidns_init_via_setns)
|
||||
{
|
||||
pid_t child, parent_pid;
|
||||
int pipe_fd[2];
|
||||
char buf;
|
||||
|
||||
if (geteuid())
|
||||
ASSERT_EQ(0, unshare(CLONE_NEWUSER));
|
||||
|
||||
parent_pid = getpid();
|
||||
|
||||
ASSERT_EQ(0, pipe(pipe_fd));
|
||||
|
||||
child = fork();
|
||||
ASSERT_GE(child, 0);
|
||||
|
||||
if (child == 0) {
|
||||
char path[256];
|
||||
int nsfd;
|
||||
pid_t grandchild;
|
||||
|
||||
close(pipe_fd[1]);
|
||||
|
||||
/* Wait for parent to complete unshare */
|
||||
ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
|
||||
close(pipe_fd[0]);
|
||||
|
||||
snprintf(path, sizeof(path),
|
||||
"/proc/%d/ns/pid_for_children", parent_pid);
|
||||
nsfd = open(path, O_RDONLY);
|
||||
ASSERT_GE(nsfd, 0);
|
||||
|
||||
ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
|
||||
close(nsfd);
|
||||
|
||||
grandchild = fork();
|
||||
ASSERT_GE(grandchild, 0);
|
||||
|
||||
if (grandchild == 0) {
|
||||
/* Should be init (PID 1) in the new namespace */
|
||||
if (getpid() != 1)
|
||||
_exit(1);
|
||||
_exit(0);
|
||||
}
|
||||
|
||||
ASSERT_EQ(0, wait_for_pid(grandchild));
|
||||
_exit(0);
|
||||
}
|
||||
|
||||
close(pipe_fd[0]);
|
||||
|
||||
ASSERT_EQ(0, unshare(CLONE_NEWPID));
|
||||
|
||||
/* Signal child that the new PID namespace is ready */
|
||||
buf = 0;
|
||||
ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
|
||||
close(pipe_fd[1]);
|
||||
|
||||
ASSERT_EQ(0, wait_for_pid(child));
|
||||
}
|
||||
|
||||
/*
|
||||
* Similar to pidns_init_via_setns, but:
|
||||
* 1. Parent enters a new PID namespace right from the start to be able to
|
||||
* later freely use pid 1001 in it.
|
||||
* 2. After forking child, parent also calls unshare(CLONE_NEWUSER)
|
||||
* before unshare(CLONE_NEWPID) so that the old and new pid namespaces have
|
||||
* different user namespace owners.
|
||||
* 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and
|
||||
* grandchild checks that it gets the desired pids.
|
||||
*
|
||||
* Flow:
|
||||
* 1. Test process creates a new PID namespace and forks a wrapper
|
||||
* (PID 1 in the outer namespace).
|
||||
* 2. Wrapper forks a child.
|
||||
* 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID)
|
||||
* to create an inner PID namespace.
|
||||
* 4. Wrapper signals the child via pipe.
|
||||
* 5. Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls
|
||||
* setns(fd, CLONE_NEWPID) to join the inner namespace.
|
||||
* 6. Child calls clone3() with set_tid={1, 1001}.
|
||||
* 7. Grandchild verifies its NSpid ends with "1001 1".
|
||||
*/
|
||||
|
||||
/*
 * PIDs requested through clone3(set_tid), innermost PID namespace first:
 * 1 in the inner namespace and 1001 in the outer one. Only used within
 * this file, hence static.
 */
static pid_t set_tid[] = {1, 1001};
|
||||
|
||||
static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata *_metadata)
|
||||
{
|
||||
char *line = NULL;
|
||||
size_t len = 0;
|
||||
int found = 0;
|
||||
FILE *gf;
|
||||
|
||||
gf = fopen("/proc/self/status", "r");
|
||||
ASSERT_NE(gf, NULL);
|
||||
|
||||
while (getline(&line, &len, gf) != -1) {
|
||||
if (strncmp(line, "NSpid:", 6) != 0)
|
||||
continue;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
char *last = strrchr(line, '\t');
|
||||
pid_t pid;
|
||||
|
||||
ASSERT_NE(last, NULL);
|
||||
ASSERT_EQ(sscanf(last, "%d", &pid), 1);
|
||||
ASSERT_EQ(pid, set_tid[i]);
|
||||
*last = '\0';
|
||||
}
|
||||
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
|
||||
free(line);
|
||||
fclose(gf);
|
||||
ASSERT_TRUE(found);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pidns_init_via_setns_set_tid_child(struct __test_metadata *_metadata,
|
||||
pid_t parent_pid, int pipe_fd[2])
|
||||
{
|
||||
struct __clone_args args = {
|
||||
.exit_signal = SIGCHLD,
|
||||
.set_tid = ptr_to_u64(set_tid),
|
||||
.set_tid_size = 2,
|
||||
};
|
||||
pid_t grandchild;
|
||||
char path[256];
|
||||
char buf;
|
||||
int nsfd;
|
||||
|
||||
close(pipe_fd[1]);
|
||||
|
||||
ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
|
||||
close(pipe_fd[0]);
|
||||
|
||||
snprintf(path, sizeof(path),
|
||||
"/proc/%d/ns/pid_for_children", parent_pid);
|
||||
nsfd = open(path, O_RDONLY);
|
||||
ASSERT_GE(nsfd, 0);
|
||||
|
||||
ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
|
||||
close(nsfd);
|
||||
|
||||
grandchild = sys_clone3(&args, sizeof(args));
|
||||
ASSERT_GE(grandchild, 0);
|
||||
|
||||
if (grandchild == 0)
|
||||
_exit(pidns_init_via_setns_set_tid_grandchild(_metadata));
|
||||
|
||||
ASSERT_EQ(0, wait_for_pid(grandchild));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata *_metadata)
|
||||
{
|
||||
int pipe_fd[2];
|
||||
pid_t child, parent_pid;
|
||||
char buf;
|
||||
FILE *f;
|
||||
|
||||
/*
|
||||
* We are PID 1 inside the new namespace, but /proc is
|
||||
* mounted from the host. Read our host-visible PID so
|
||||
* the child can reach our pid_for_children via /proc.
|
||||
*/
|
||||
f = fopen("/proc/self/stat", "r");
|
||||
ASSERT_NE(f, NULL);
|
||||
ASSERT_EQ(fscanf(f, "%d", &parent_pid), 1);
|
||||
ASSERT_EQ(0, pipe(pipe_fd));
|
||||
|
||||
child = fork();
|
||||
ASSERT_GE(child, 0);
|
||||
|
||||
if (child == 0)
|
||||
_exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid, pipe_fd));
|
||||
|
||||
close(pipe_fd[0]);
|
||||
|
||||
ASSERT_EQ(0, unshare(CLONE_NEWUSER));
|
||||
ASSERT_EQ(0, unshare(CLONE_NEWPID));
|
||||
|
||||
buf = 0;
|
||||
ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
|
||||
close(pipe_fd[1]);
|
||||
|
||||
ASSERT_EQ(0, wait_for_pid(child));
|
||||
|
||||
fclose(f);
|
||||
return 0;
|
||||
}
|
||||
|
||||
TEST(pidns_init_via_setns_set_tid)
|
||||
{
|
||||
pid_t wrapper;
|
||||
|
||||
if (geteuid())
|
||||
SKIP(return, "This test needs root to run!");
|
||||
|
||||
ASSERT_EQ(0, unshare(CLONE_NEWPID));
|
||||
|
||||
wrapper = fork();
|
||||
ASSERT_GE(wrapper, 0);
|
||||
|
||||
if (wrapper == 0)
|
||||
_exit(pidns_init_via_setns_set_tid_wrapper(_metadata));
|
||||
|
||||
ASSERT_EQ(0, wait_for_pid(wrapper));
|
||||
}
|
||||
|
||||
TEST_HARNESS_MAIN
|
||||
Reference in New Issue
Block a user