Merge tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pid_namespace updates from Christian Brauner:

 - pid_namespace: make init creation more flexible

   Annotate ->child_reaper accesses with {READ,WRITE}_ONCE() to protect
   the unlocked readers from cpu/compiler reordering, and enforce that
   pid 1 in a pid namespace is always the first allocated pid (the
   set_tid path already required this).

   On top of that, allow opening pid_for_children before the pid
   namespace init has been created. This lets one process create the pid
   namespace and a different process create the init via setns(), which
   makes clone3(set_tid) usable in all cases evenly and is particularly
   useful to CRIU when restoring nested containers.

   A new selftest covers both the basic create-pidns-then-init flow and
   the cross-process variant, and a MAINTAINERS entry for the pid
   namespace code is added.

 - unrelated signal cleanup: update outdated comment for the removed
   freezable_schedule()

* tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  signal: update outdated comment for removed freezable_schedule()
  MAINTAINERS: add a pid namespace entry
  selftests: Add tests for creating pidns init via setns
  pid_namespace: allow opening pid_for_children before init was created
  pid: check init is created first after idr alloc
  pid_namespace: avoid optimization of accesses to ->child_reaper
This commit is contained in:
Linus Torvalds
2026-04-14 20:28:40 -07:00
9 changed files with 269 additions and 24 deletions

View File

@@ -18191,6 +18191,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/next
F: drivers/mtd/nand/
F: include/linux/mtd/*nand*.h
NAMESPACES:
M: Christian Brauner <christian@brauner.io>
R: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
L: linux-kernel@vger.kernel.org
S: Maintained
F: rust/kernel/pid_namespace.rs
F: kernel/pid_namespace.c
F: tools/testing/selftests/pid_namespace/
NATIONAL INSTRUMENTS SERIAL DRIVER
M: Chaitanya Vadrevu <chaitanya.vadrevu@emerson.com>
L: linux-serial@vger.kernel.org
@@ -20804,10 +20813,8 @@ M: Christian Brauner <christian@brauner.io>
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
F: rust/kernel/pid_namespace.rs
F: samples/pidfd/
F: tools/testing/selftests/clone3/
F: tools/testing/selftests/pid_namespace/
F: tools/testing/selftests/pidfd/
K: (?i)pidfd
K: (?i)clone3

View File

@@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
reaper = find_alive_thread(father);
if (reaper) {
pid_ns->child_reaper = reaper;
ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper);
WRITE_ONCE(pid_ns->child_reaper, reaper);
return reaper;
}

View File

@@ -2469,7 +2469,10 @@ __latent_entropy struct task_struct *copy_process(
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
struct pid_namespace *ns = ns_of_pid(pid);
ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
WRITE_ONCE(ns->child_reaper, p);
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;

View File

@@ -128,7 +128,7 @@ void free_pid(struct pid *pid)
* is the reaper wake up the reaper. The reaper
* may be sleeping in zap_pid_ns_processes().
*/
wake_up_process(ns->child_reaper);
wake_up_process(READ_ONCE(ns->child_reaper));
break;
case PIDNS_ADDING:
/* Handle a fork failure of the first process */
@@ -215,12 +215,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
retval = -EINVAL;
if (tid < 1 || tid >= pid_max[ns->level - i])
goto out_abort;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_abort;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_abort;
@@ -296,9 +290,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
i--;
retried_preload = false;
/*
* PID 1 (init) must be created first.
*/
if (!READ_ONCE(tmp->child_reaper) && nr != 1) {
retval = -EINVAL;
goto out_free;
}
tmp = tmp->parent;
}
/*

View File

@@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task)
}
task_unlock(task);
if (ns) {
read_lock(&tasklist_lock);
if (!ns->child_reaper) {
put_pid_ns(ns);
ns = NULL;
}
read_unlock(&tasklist_lock);
}
return ns ? &ns->ns : NULL;
}

View File

@@ -2818,8 +2818,9 @@ bool get_signal(struct ksignal *ksig)
/*
* Do this once, we can't return to user-mode if freezing() == T.
* do_signal_stop() and ptrace_stop() do freezable_schedule() and
* thus do not need another check after return.
* do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED
* and the freezer handles those states via TASK_FROZEN, thus they
* do not need another check after return.
*/
try_to_freeze();

View File

@@ -1,2 +1,3 @@
pid_max
pidns_init_via_setns
regression_enomem

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -g $(KHDR_INCLUDES)
TEST_GEN_PROGS = regression_enomem pid_max
TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns
LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h

View File

@@ -0,0 +1,238 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include "kselftest_harness.h"
#include "../pidfd/pidfd.h"
/*
* Test that a process can become PID 1 (init) in a new PID namespace
* created via unshare() and joined via setns().
*
* Flow:
* 1. Parent creates a pipe for synchronization.
* 2. Parent forks a child.
* 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace.
* 4. Parent signals the child via the pipe.
* 5. Child opens parent's /proc/<ppid>/ns/pid_for_children and calls
* setns(fd, CLONE_NEWPID) to join the new namespace.
* 6. Child forks a grandchild.
* 7. Grandchild verifies getpid() == 1.
*/
TEST(pidns_init_via_setns)
{
	pid_t child, parent_pid;
	pid_t grandchild;
	char ns_path[256];
	int pipe_fd[2];
	int nsfd;
	char buf;

	/* A non-root caller first moves into a user namespace of its own. */
	if (geteuid() != 0)
		ASSERT_EQ(0, unshare(CLONE_NEWUSER));

	parent_pid = getpid();
	ASSERT_EQ(0, pipe(pipe_fd));

	child = fork();
	ASSERT_GE(child, 0);

	if (child != 0) {
		/*
		 * Parent side: create the new PID namespace, then write one
		 * byte to the pipe so the child knows it may proceed.
		 */
		close(pipe_fd[0]);
		ASSERT_EQ(0, unshare(CLONE_NEWPID));

		buf = 0;
		ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
		close(pipe_fd[1]);

		ASSERT_EQ(0, wait_for_pid(child));
		return;
	}

	/* Child side: block until the parent has completed its unshare. */
	close(pipe_fd[1]);
	ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
	close(pipe_fd[0]);

	/* Join the parent's pid_for_children namespace through /proc. */
	snprintf(ns_path, sizeof(ns_path),
		 "/proc/%d/ns/pid_for_children", parent_pid);
	nsfd = open(ns_path, O_RDONLY);
	ASSERT_GE(nsfd, 0);
	ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
	close(nsfd);

	grandchild = fork();
	ASSERT_GE(grandchild, 0);
	if (grandchild == 0) {
		/* Exit 0 only when we became init (PID 1) of the namespace. */
		_exit(getpid() == 1 ? 0 : 1);
	}

	ASSERT_EQ(0, wait_for_pid(grandchild));
	_exit(0);
}
/*
* Similar to pidns_init_via_setns, but:
* 1. Parent enters a new PID namespace right from the start to be able to
* later freely use pid 1001 in it.
* 2. After forking child, parent also calls unshare(CLONE_NEWUSER)
* before unshare(CLONE_NEWPID) so that the old and new pid namespaces have
* different user namespace owners.
* 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and
* grandchild checks that it gets the desired pids.
*
* Flow:
* 1. Test process creates a new PID namespace and forks a wrapper
* (PID 1 in the outer namespace).
* 2. Wrapper forks a child.
* 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID)
* to create an inner PID namespace.
* 4. Wrapper signals the child via pipe.
* 5. Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls
* setns(fd, CLONE_NEWPID) to join the inner namespace.
* 6. Child calls clone3() with set_tid={1, 1001}.
* 7. Grandchild verifies its NSpid ends with "1001 1".
*/
/*
 * PIDs requested through clone3(set_tid): the grandchild's NSpid line is
 * expected to end with "1001 1", i.e. the last field matches set_tid[0]
 * and the field before it matches set_tid[1]. Only used inside this file,
 * so keep the symbol local.
 */
static pid_t set_tid[] = {1, 1001};
/*
 * Runs in the clone3()'d grandchild: locate the "NSpid:" line of
 * /proc/self/status and check its two trailing tab-separated fields
 * against set_tid[0] and set_tid[1], walking from the end of the line.
 *
 * Returns 0 when the line was found and both fields matched; any failed
 * assertion goes through the harness instead of returning.
 */
static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata *_metadata)
{
	char *status_line = NULL;
	size_t capacity = 0;
	int found = 0;
	FILE *gf;

	gf = fopen("/proc/self/status", "r");
	ASSERT_NE(gf, NULL);

	/* Stop reading as soon as the NSpid line has been checked. */
	while (!found && getline(&status_line, &capacity, gf) != -1) {
		int i = 0;

		if (strncmp(status_line, "NSpid:", 6) != 0)
			continue;

		while (i < 2) {
			/* Peel the last field off the line on every pass. */
			char *last = strrchr(status_line, '\t');
			pid_t pid;

			ASSERT_NE(last, NULL);
			ASSERT_EQ(sscanf(last, "%d", &pid), 1);
			ASSERT_EQ(pid, set_tid[i]);
			*last = '\0';
			i++;
		}

		found = true;
	}

	free(status_line);
	fclose(gf);

	ASSERT_TRUE(found);
	return 0;
}
/*
 * Runs in the wrapper's forked child: wait on the pipe until the wrapper
 * has unshared, join the wrapper's pid_for_children namespace via /proc,
 * then clone3() a grandchild with the pids listed in set_tid.
 *
 * @parent_pid: pid used to build the /proc/<pid>/ns/pid_for_children path
 * @pipe_fd:    synchronization pipe; the write end is closed here and one
 *              byte is read from the read end
 *
 * Returns 0 once the grandchild has been reaped successfully.
 */
static int pidns_init_via_setns_set_tid_child(struct __test_metadata *_metadata,
					      pid_t parent_pid, int pipe_fd[2])
{
	struct __clone_args args = {
		.set_tid_size = 2,
		.set_tid = ptr_to_u64(set_tid),
		.exit_signal = SIGCHLD,
	};
	char ns_path[256];
	pid_t grandchild;
	int nsfd;
	char buf;

	/* Wait for the single byte the wrapper writes after unsharing. */
	close(pipe_fd[1]);
	ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
	close(pipe_fd[0]);

	snprintf(ns_path, sizeof(ns_path),
		 "/proc/%d/ns/pid_for_children", parent_pid);
	nsfd = open(ns_path, O_RDONLY);
	ASSERT_GE(nsfd, 0);
	ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
	close(nsfd);

	grandchild = sys_clone3(&args, sizeof(args));
	ASSERT_GE(grandchild, 0);

	if (grandchild != 0) {
		ASSERT_EQ(0, wait_for_pid(grandchild));
		return 0;
	}

	/* Grandchild: report the outcome of the NSpid check as exit status. */
	_exit(pidns_init_via_setns_set_tid_grandchild(_metadata));
}
/*
 * Runs in the forked wrapper process: fork a child, create a new user
 * namespace and a new PID namespace, signal the child through a pipe and
 * wait for it.
 *
 * Returns 0 once the child has been reaped successfully; any failed
 * assertion goes through the harness instead of returning.
 */
static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata *_metadata)
{
	int pipe_fd[2];
	pid_t child, parent_pid;
	int nread;
	char buf;
	FILE *f;

	/*
	 * We are PID 1 inside the new namespace, but /proc is
	 * mounted from the host. Read our host-visible PID so
	 * the child can reach our pid_for_children via /proc.
	 */
	f = fopen("/proc/self/stat", "r");
	ASSERT_NE(f, NULL);
	nread = fscanf(f, "%d", &parent_pid);
	/*
	 * Close the stream before asserting and before fork(): only one
	 * value is read from it, so the child must not inherit it and it
	 * must not stay open across later assertion failures.
	 */
	fclose(f);
	ASSERT_EQ(nread, 1);

	ASSERT_EQ(0, pipe(pipe_fd));

	child = fork();
	ASSERT_GE(child, 0);

	if (child == 0)
		_exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid, pipe_fd));

	close(pipe_fd[0]);

	/* New user namespace first, then the new PID namespace. */
	ASSERT_EQ(0, unshare(CLONE_NEWUSER));
	ASSERT_EQ(0, unshare(CLONE_NEWPID));

	/* Tell the child that the namespaces are ready. */
	buf = 0;
	ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
	close(pipe_fd[1]);

	ASSERT_EQ(0, wait_for_pid(child));

	return 0;
}
TEST(pidns_init_via_setns_set_tid)
{
	pid_t wrapper;

	if (geteuid() != 0)
		SKIP(return, "This test needs root to run!");

	/* Children forked from here on are created in a new PID namespace. */
	ASSERT_EQ(0, unshare(CLONE_NEWPID));

	wrapper = fork();
	ASSERT_GE(wrapper, 0);

	if (wrapper != 0) {
		ASSERT_EQ(0, wait_for_pid(wrapper));
		return;
	}

	/* Wrapper process: its return value becomes the exit status. */
	_exit(pidns_init_via_setns_set_tid_wrapper(_metadata));
}
TEST_HARNESS_MAIN