From 0c0b046392b5b6e2402bf75215ab0c3a242d7af5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:46 +0100
Subject: [PATCH 01/18] mount: start iterating from start of rbtree

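If the root mount of the namespace has a mount id that's greater than
that of its child, the child would not be found because iteration
started at the root. Start iterating from the first node of the rbtree
to handle that case.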

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-1-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 854f4fc66469..9fd5114e5eb7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5649,14 +5649,14 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 	if (mnt_ns_empty(ns))
 		return -ENOENT;
 
-	first = child = ns->root;
-	for (;;) {
-		child = listmnt_next(child, false);
-		if (!child)
-			return -ENOENT;
-		if (child->mnt_parent == first)
+	first = ns->root;
+	for (child = node_to_mount(ns->mnt_first_node); child;
+	     child = listmnt_next(child, false)) {
+		if (child != first && child->mnt_parent == first)
 			break;
 	}
+	if (!child)
+		return -ENOENT;
 
 	root->mnt = mntget(&child->mnt);
 	root->dentry = dget(root->mnt->mnt_root);

From ad4a3599e58d5ac0caa3f576c48a4b62f38d400d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:47 +0100
Subject: [PATCH 02/18] mount: simplify __do_loopback()

Remove the OPEN_TREE_NAMESPACE flag checking from __do_loopback() and
instead have callers pass CL_COPY_MNT_NS_FILE directly in copy_flags.

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-2-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9fd5114e5eb7..b098d1131e69 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2958,10 +2958,9 @@ static inline bool may_copy_tree(const struct path *path)
 }
 
 static struct mount *__do_loopback(const struct path *old_path,
-				   unsigned int flags, unsigned int copy_flags)
+				   bool recurse, unsigned int copy_flags)
 {
 	struct mount *old = real_mount(old_path->mnt);
-	bool recurse = flags & AT_RECURSIVE;
 
 	if (IS_MNT_UNBINDABLE(old))
 		return ERR_PTR(-EINVAL);
@@ -2972,18 +2971,6 @@ static struct mount *__do_loopback(const struct path *old_path,
 	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * When creating a new mount namespace we don't want to copy over
-	 * mounts of mount namespaces to avoid the risk of cycles and also to
-	 * minimize the default complex interdependencies between mount
-	 * namespaces.
-	 *
-	 * We could ofc just check whether all mount namespace files aren't
-	 * creating cycles but really let's keep this simple.
-	 */
-	if (!(flags & OPEN_TREE_NAMESPACE))
-		copy_flags |= CL_COPY_MNT_NS_FILE;
-
 	if (recurse)
 		return copy_tree(old, old_path->dentry, copy_flags);
 
@@ -2998,7 +2985,6 @@ static int do_loopback(const struct path *path, const char *old_name,
 {
 	struct path old_path __free(path_put) = {};
 	struct mount *mnt = NULL;
-	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
 
 	if (!old_name || !*old_name)
@@ -3017,7 +3003,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	if (!check_mnt(mp.parent))
 		return -EINVAL;
 
-	mnt = __do_loopback(&old_path, flags, 0);
+	mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -3055,7 +3041,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned
 			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}
 
-	mnt = __do_loopback(path, flags, 0);
+	mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
 	if (IS_ERR(mnt)) {
 		emptied_ns = ns;
 		return ERR_CAST(mnt);
@@ -3087,7 +3073,8 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 	return file;
 }
 
-static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+static struct mnt_namespace *create_new_namespace(struct path *path,
+						  bool recurse)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct user_namespace *user_ns = current_user_ns();
@@ -3135,7 +3122,7 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	 * to the restrictions of creating detached bind-mounts. It
 	 * has a lot saner and simpler semantics.
 	 */
-	mnt = __do_loopback(path, flags, copy_flags);
+	mnt = __do_loopback(path, recurse, copy_flags);
 	scoped_guard(mount_writer) {
 		if (IS_ERR(mnt)) {
 			emptied_ns = new_ns;
@@ -3164,11 +3151,11 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	return new_ns;
 }
 
-static struct file *open_new_namespace(struct path *path, unsigned int flags)
+static struct file *open_new_namespace(struct path *path, bool recurse)
 {
 	struct mnt_namespace *new_ns;
 
-	new_ns = create_new_namespace(path, flags);
+	new_ns = create_new_namespace(path, recurse);
 	if (IS_ERR(new_ns))
 		return ERR_CAST(new_ns);
 	return open_namespace_file(to_ns_common(new_ns));
@@ -3217,7 +3204,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 		return ERR_PTR(ret);
 
 	if (flags & OPEN_TREE_NAMESPACE)
-		return open_new_namespace(&path, flags);
+		return open_new_namespace(&path, (flags & AT_RECURSIVE));
 
 	if (flags & OPEN_TREE_CLONE)
 		return open_detached_copy(&path, flags);

From 5e8969bd192712419aae511dd5ba26855c2c78db Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:48 +0100
Subject: [PATCH 03/18] mount: add FSMOUNT_NAMESPACE

Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for open_tree().

This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for container
runtimes and other namespace-based isolation mechanisms.

The rootfs mount is created before copying the real rootfs for the new
namespace, meaning that the mount id of the root mount of the namespace
is bigger than that of the child mounted on top of it. We've never
explicitly given a guarantee for such ordering and I doubt anyone
relies on it. Accepting that lets us avoid copying the mount again and
also avoids having to massage may_copy_tree() to grant an exception for
fsmount->mnt->mnt_ns being NULL.

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-3-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 37 ++++++++++++++++++++++++++++++-------
 include/uapi/linux/mount.h |  1 +
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index b098d1131e69..702e93243505 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3118,11 +3118,26 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	}
 
 	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick
-	 * to the restrictions of creating detached bind-mounts. It
-	 * has a lot saner and simpler semantics.
+	 * We don't emulate unshare()ing a mount namespace. We stick to
+	 * the restrictions of creating detached bind-mounts. It has a
+	 * lot saner and simpler semantics.
 	 */
-	mnt = __do_loopback(path, recurse, copy_flags);
+	mnt = real_mount(path->mnt);
+	if (!mnt->mnt_ns) {
+		/*
+		 * If we're moving into a new mount namespace via
+		 * fsmount() swap the mount ids so the nullfs mount id
+		 * is the lowest in the mount namespace avoiding another
+		 * useless copy. This is fine as we're not attached to any
+		 * mount namespace so the mount ids are pure decoration
+		 * at that point.
+		 */
+		swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique);
+		swap(mnt->mnt_id, new_ns_root->mnt_id);
+		mntget(&mnt->mnt);
+	} else {
+		mnt = __do_loopback(path, recurse, copy_flags);
+	}
 	scoped_guard(mount_writer) {
 		if (IS_ERR(mnt)) {
 			emptied_ns = new_ns;
@@ -4401,11 +4416,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	unsigned int mnt_flags = 0;
 	long ret;
 
-	if (!may_mount())
+	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
+		return -EINVAL;
+
+	if ((flags & FSMOUNT_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
-		return -EINVAL;
+	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
+		return -EPERM;
 
 	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
 		return -EINVAL;
@@ -4472,6 +4491,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	 */
 	vfs_clean_context(fc);
 
+	if (flags & FSMOUNT_NAMESPACE)
+		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+			      open_new_namespace(&new_path, 0));
+
 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
 	if (IS_ERR(ns))
 		return PTR_ERR(ns);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d9d86598d100..2204708dbf7a 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -110,6 +110,7 @@ enum fsconfig_command {
  * fsmount() flags.
  */
 #define FSMOUNT_CLOEXEC		0x00000001
+#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */
 
 /*
  * Mount attributes.

From fc1a05de0058bc1dbbc202f6f6cdb25ee0bae16d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:49 +0100
Subject: [PATCH 04/18] tools: update mount.h header

Update the mount.h header so we can rely on it in the selftests.

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-4-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/include/uapi/linux/mount.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h
index 7fa67c2031a5..2204708dbf7a 100644
--- a/tools/include/uapi/linux/mount.h
+++ b/tools/include/uapi/linux/mount.h
@@ -61,7 +61,8 @@
 /*
  * open_tree() flags.
  */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
 /*
@@ -109,6 +110,7 @@ enum fsconfig_command {
  * fsmount() flags.
  */
 #define FSMOUNT_CLOEXEC		0x00000001
+#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */
 
 /*
  * Mount attributes.
@@ -197,7 +199,10 @@ struct statmount {
  */
 struct mnt_id_req {
 	__u32 size;
-	__u32 spare;
+	union {
+		__u32 mnt_ns_fd;
+		__u32 mnt_fd;
+	};
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
@@ -232,4 +237,9 @@ struct mnt_id_req {
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
 #define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */
 
+/*
+ * @flag bits for statmount(2)
+ */
+#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */
+
 #endif /* _UAPI_LINUX_MOUNT_H */

From be1ca3ee8f97067fee87fda73ea5959d5ab75bbf Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:50 +0100
Subject: [PATCH 05/18] selftests/statmount: add statmount_alloc() helper

Add a helper to allocate a statmount buffer and call statmount(). This
helper will be shared by multiple test suites that need to query mount
information via statmount().

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-5-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../open_tree_ns/open_tree_ns_test.c          |  8 +--
 .../filesystems/statmount/statmount.h         | 51 +++++++++++++++++++
 .../filesystems/statmount/statmount_test.c    | 45 ++--------------
 3 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
index 9711556280ae..86d5d7a831c1 100644
--- a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
+++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
@@ -115,7 +115,7 @@ static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
 				     STATMOUNT_MNT_BASIC |
 				     STATMOUNT_FS_TYPE |
 				     STATMOUNT_MNT_ROOT |
-				     STATMOUNT_MNT_POINT);
+				     STATMOUNT_MNT_POINT, 0);
 		if (!sm) {
 			TH_LOG("  [%zd] mnt_id %llu: statmount failed: %s",
 			       i, (unsigned long long)list[i], strerror(errno));
@@ -746,7 +746,7 @@ TEST_F(open_tree_ns_userns, umount_fails_einval)
 			const char *mnt_point;
 
 			sm = statmount_alloc(list[i], new_ns_id,
-					     STATMOUNT_MNT_POINT);
+					     STATMOUNT_MNT_POINT, 0);
 			if (!sm)
 				_exit(11);
 
@@ -863,7 +863,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
 			const char *mnt_point;
 
 			sm = statmount_alloc(list[i], new_ns_id,
-					     STATMOUNT_MNT_POINT);
+					     STATMOUNT_MNT_POINT, 0);
 			if (!sm)
 				_exit(11);
 
@@ -1003,7 +1003,7 @@ TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable)
 		struct statmount *sm;
 		const char *mnt_point;
 
-		sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT);
+		sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT, 0);
 		ASSERT_NE(sm, NULL) {
 			TH_LOG("statmount_alloc failed for mnt_id %llu",
 			       (unsigned long long)list[i]);
diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
index e1cba4bfd8d9..675f7cc00076 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount.h
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -3,10 +3,14 @@
 #ifndef __STATMOUNT_H
 #define __STATMOUNT_H
 
+#include <errno.h>
 #include <stdint.h>
+#include <stdlib.h>
 #include <linux/mount.h>
 #include <asm/unistd.h>
 
+#define STATMOUNT_BUFSIZE (1 << 15)
+
 #ifndef __NR_statmount
 	#if defined __alpha__
 		#define __NR_statmount 567
@@ -84,4 +88,51 @@ static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
 	return syscall(__NR_listmount, &req, list, num, flags);
 }
 
+/* statmount() into a malloc()ed buffer, doubled on EOVERFLOW; caller free()s it. */
+static inline struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id,
+						 uint64_t mask, unsigned int flags)
+{
+	size_t bufsize = STATMOUNT_BUFSIZE;
+	struct statmount *buf;
+	int saved_errno;
+
+	for (;;) {
+		buf = malloc(bufsize);
+		if (!buf)
+			return NULL;
+		if (statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, flags) == 0)
+			return buf;
+
+		saved_errno = errno;	/* free() may clobber errno on older libcs */
+		free(buf);
+		errno = saved_errno;	/* callers inspect errno on a NULL return */
+		if (saved_errno != EOVERFLOW)
+			return NULL;
+		bufsize <<= 1;
+	}
+}
+
+/* statmount() by fd (STATMOUNT_BY_FD) into a malloc()ed buffer, doubled on EOVERFLOW. */
+static inline struct statmount *statmount_alloc_by_fd(int fd, uint64_t mask)
+{
+	size_t bufsize = STATMOUNT_BUFSIZE;
+	struct statmount *buf;
+	int saved_errno;
+
+	for (;;) {
+		buf = malloc(bufsize);
+		if (!buf)
+			return NULL;
+		if (statmount(0, 0, fd, mask, buf, bufsize, STATMOUNT_BY_FD) == 0)
+			return buf;
+
+		saved_errno = errno;	/* free() may clobber errno on older libcs */
+		free(buf);
+		errno = saved_errno;	/* callers inspect errno on a NULL return */
+		if (saved_errno != EOVERFLOW)
+			return NULL;
+		bufsize <<= 1;
+	}
+}
+
 #endif /* __STATMOUNT_H */
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index a04bcaace126..8dc018d47a93 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -33,45 +33,6 @@ static const char *const known_fs[] = {
 	"sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
 	"vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };
 
-static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags)
-{
-	size_t bufsize = 1 << 15;
-	struct statmount *buf = NULL, *tmp = NULL;
-	int tofree = 0;
-	int ret;
-
-	if (flags & STATMOUNT_BY_FD && fd < 0)
-		return NULL;
-
-	tmp = alloca(bufsize);
-
-	for (;;) {
-		if (flags & STATMOUNT_BY_FD)
-			ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags);
-		else
-			ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags);
-
-		if (ret != -1)
-			break;
-		if (tofree)
-			free(tmp);
-		if (errno != EOVERFLOW)
-			return NULL;
-		bufsize <<= 1;
-		tofree = 1;
-		tmp = malloc(bufsize);
-		if (!tmp)
-			return NULL;
-	}
-	buf = malloc(tmp->size);
-	if (buf)
-		memcpy(buf, tmp, tmp->size);
-	if (tofree)
-		free(tmp);
-
-	return buf;
-}
-
 static void write_file(const char *path, const char *val)
 {
 	int fd = open(path, O_WRONLY);
@@ -715,7 +676,7 @@ static void test_statmount_by_fd(void)
 		goto err_fd;
 	}
 
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
 		goto err_chroot;
@@ -750,7 +711,7 @@ static void test_statmount_by_fd(void)
 	}
 
 	free(sm);
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
 		goto err_fd;
@@ -844,7 +805,7 @@ static void test_statmount_by_fd_unmounted(void)
 		goto err_fd;
 	}
 
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd unmounted: %s\n",
 				      strerror(errno));

From 3ac7ea91f3d0442caf6b079e1ddc80e06b079ff9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Jan 2026 11:48:51 +0100
Subject: [PATCH 06/18] selftests: add FSMOUNT_NAMESPACE tests

Add selftests for FSMOUNT_NAMESPACE which creates a new mount namespace
with the newly created filesystem mounted onto a copy of the real
rootfs.

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-6-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/fsmount_ns/.gitignore         |    1 +
 .../selftests/filesystems/fsmount_ns/Makefile |   10 +
 .../filesystems/fsmount_ns/fsmount_ns_test.c  | 1138 +++++++++++++++++
 .../filesystems/open_tree_ns/Makefile         |    2 +-
 .../open_tree_ns/open_tree_ns_test.c          |   35 +-
 5 files changed, 1156 insertions(+), 30 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/fsmount_ns/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/fsmount_ns/Makefile
 create mode 100644 tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c

diff --git a/tools/testing/selftests/filesystems/fsmount_ns/.gitignore b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore
new file mode 100644
index 000000000000..f1ecf6c6e37b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore
@@ -0,0 +1 @@
+fsmount_ns_test
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/Makefile b/tools/testing/selftests/filesystems/fsmount_ns/Makefile
new file mode 100644
index 000000000000..d9647efc0739
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := fsmount_ns_test
+
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS := -lcap
+
+include ../../lib.mk
+
+$(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
new file mode 100644
index 000000000000..239abbe63814
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
@@ -0,0 +1,1138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026 Christian Brauner <brauner@kernel.org>
+ *
+ * Test for FSMOUNT_NAMESPACE flag.
+ *
+ * Test that fsmount() with FSMOUNT_NAMESPACE creates a new mount
+ * namespace containing the specified mount.
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/nsfs.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../wrappers.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+#include "../../kselftest_harness.h"
+
+#ifndef FSMOUNT_NAMESPACE
+#define FSMOUNT_NAMESPACE	0x00000002
+#endif
+
+#ifndef FSMOUNT_CLOEXEC
+#define FSMOUNT_CLOEXEC		0x00000001
+#endif
+
+#ifndef FSCONFIG_CMD_CREATE
+#define FSCONFIG_CMD_CREATE	6
+#endif
+
+static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id)
+{
+	if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0)
+		return -errno;
+	return 0;
+}
+
+static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
+{
+	int fd, ret;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	ret = get_mnt_ns_id(fd, mnt_ns_id);
+	close(fd);
+	return ret;
+}
+
+static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
+{
+	const char *fs_type = "";
+	const char *mnt_root = "";
+	const char *mnt_point = "";
+
+	if (sm->mask & STATMOUNT_FS_TYPE)
+		fs_type = sm->str + sm->fs_type;
+	if (sm->mask & STATMOUNT_MNT_ROOT)
+		mnt_root = sm->str + sm->mnt_root;
+	if (sm->mask & STATMOUNT_MNT_POINT)
+		mnt_point = sm->str + sm->mnt_point;
+
+	TH_LOG("  mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s",
+	       (unsigned long long)sm->mnt_id,
+	       (unsigned long long)sm->mnt_parent_id,
+	       fs_type, mnt_root, mnt_point);
+}
+
+static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
+{
+	uint64_t list[256];
+	ssize_t nr_mounts;
+
+	nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0);
+	if (nr_mounts < 0) {
+		TH_LOG("listmount failed: %s", strerror(errno));
+		return;
+	}
+
+	TH_LOG("Mount namespace %llu contains %zd mount(s):",
+	       (unsigned long long)mnt_ns_id, nr_mounts);
+
+	for (ssize_t i = 0; i < nr_mounts; i++) {
+		struct statmount *sm;
+
+		sm = statmount_alloc(list[i], mnt_ns_id,
+				     STATMOUNT_MNT_BASIC |
+				     STATMOUNT_FS_TYPE |
+				     STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT, 0);
+		if (!sm) {
+			TH_LOG("  [%zd] mnt_id %llu: statmount failed: %s",
+			       i, (unsigned long long)list[i], strerror(errno));
+			continue;
+		}
+
+		log_mount(_metadata, sm);
+		free(sm);
+	}
+}
+
+/* Open a tmpfs fs context and issue FSCONFIG_CMD_CREATE; returns the fd or -errno. */
+static int create_tmpfs_fd(void)
+{
+	int fs_fd, err;
+
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	if (fs_fd < 0)
+		return -errno;
+
+	if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+		err = -errno;	/* capture before close() can overwrite errno */
+		close(fs_fd);
+		return err;
+	}
+	return fs_fd;
+}
+
+FIXTURE(fsmount_ns)
+{
+	int fd;
+	int fs_fd;
+	uint64_t current_ns_id;
+};
+
+FIXTURE_VARIANT(fsmount_ns)
+{
+	const char *fstype;
+	unsigned int flags;
+	bool expect_success;
+	bool expect_different_ns;
+	int min_mounts;
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, basic_tmpfs)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, cloexec_only)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = false,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, namespace_only)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_NAMESPACE,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_SETUP(fsmount_ns)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Check if fsopen syscall is supported */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Check if statmount/listmount are supported */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+
+	/* Get current mount namespace ID for comparison */
+	ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id);
+	if (ret < 0)
+		SKIP(return, "Failed to get current mount namespace ID");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns, create_namespace)
+{
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, variant->flags, 0);
+
+	if (!variant->expect_success) {
+		ASSERT_LT(self->fd, 0);
+		return;
+	}
+
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	if (variant->expect_different_ns) {
+		/* Verify we can get the namespace ID from the fd */
+		ret = get_mnt_ns_id(self->fd, &new_ns_id);
+		ASSERT_EQ(ret, 0);
+
+		/* Verify it's a different namespace */
+		ASSERT_NE(new_ns_id, self->current_ns_id);
+
+		/* List mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		ASSERT_GE(nr_mounts, 0) {
+			TH_LOG("%m - listmount failed");
+		}
+
+		/* Verify minimum expected mounts */
+		ASSERT_GE(nr_mounts, variant->min_mounts);
+		TH_LOG("Namespace contains %zd mounts", nr_mounts);
+	}
+}
+
+TEST_F(fsmount_ns, setns_into_namespace)
+{
+	uint64_t new_ns_id;
+	pid_t pid;
+	int status;
+	int ret;
+
+	/* Only test with FSMOUNT_NAMESPACE flag */
+	if (!(variant->flags & FSMOUNT_NAMESPACE))
+		SKIP(return, "setns test only for FSMOUNT_NAMESPACE case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, variant->flags, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	/* Get namespace ID and dump all mounts */
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	dump_mounts(_metadata, new_ns_id);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: try to enter the namespace */
+		if (setns(self->fd, CLONE_NEWNS) < 0)
+			_exit(1);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(fsmount_ns, verify_mount_properties)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	/* Only test with basic FSMOUNT_NAMESPACE flags */
+	if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC))
+		SKIP(return, "mount properties test only for basic case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	/* Get info about the root mount */
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	TH_LOG("Root mount id: %llu, parent: %llu",
+	       (unsigned long long)sm.mnt_id,
+	       (unsigned long long)sm.mnt_parent_id);
+}
+
+TEST_F(fsmount_ns, verify_tmpfs_type)
+{
+	struct statmount *sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	const char *fs_type;
+	int ret;
+
+	/* Only test with basic FSMOUNT_NAMESPACE flags */
+	if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC))
+		SKIP(return, "fs type test only for basic case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	sm = statmount_alloc(list[0], new_ns_id, STATMOUNT_FS_TYPE, 0);
+	ASSERT_NE(sm, NULL);
+
+	fs_type = sm->str + sm->fs_type;
+	ASSERT_STREQ(fs_type, "tmpfs");
+
+	free(sm);
+}
+
+FIXTURE(fsmount_ns_caps)
+{
+	bool has_caps;
+};
+
+FIXTURE_SETUP(fsmount_ns_caps)
+{
+	int ret;
+
+	/* Check if fsopen syscall is supported */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	self->has_caps = (geteuid() == 0);
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_caps)
+{
+}
+
+/* Without capabilities, fsmount(FSMOUNT_NAMESPACE) is expected to fail with EPERM. */
+TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
+{
+	pid_t pid = fork();
+	int status;
+
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fs_fd, fd, err;
+
+		/*
+		 * Create the fs context while still privileged: fsopen() needs
+		 * mount privileges, so doing it later would always skip.
+		 */
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(4);
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
+			_exit(5);
+
+		/* Drop privileges using the utils.h helpers */
+		if (enter_userns() != 0)
+			_exit(2);
+		if (caps_down() == 0)
+			_exit(3);
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		err = errno;	/* keep fsmount()'s errno across close() */
+		close(fs_fd);
+
+		if (fd >= 0) {
+			close(fd);
+			/* Should have failed without caps */
+			_exit(1);
+		}
+
+		if (err == EPERM)
+			_exit(0);
+
+		/* EINVAL means FSMOUNT_NAMESPACE not supported */
+		if (err == EINVAL)
+			_exit(6);
+
+		/* Unexpected error */
+		_exit(7);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Expected: EPERM without caps */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("FSMOUNT_NAMESPACE succeeded without caps");
+		break;
+	case 2:
+		SKIP(return, "enter_userns failed");
+		break;
+	case 3:
+		SKIP(return, "caps_down failed");
+		break;
+	case 4:
+		SKIP(return, "fsopen failed");
+		break;
+	case 5:
+		SKIP(return, "fsconfig CMD_CREATE failed");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(fsmount_ns_userns)
+{
+	int fd;
+	int fs_fd;
+};
+
+FIXTURE_SETUP(fsmount_ns_userns)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Check if fsopen syscall is supported */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Check if statmount/listmount are supported */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_userns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns_userns, create_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+
+		/* Create new user namespace (also creates mount namespace) */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		/* Now we have CAP_SYS_ADMIN in the user namespace */
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6); /* FSMOUNT_NAMESPACE not supported */
+			_exit(1);
+		}
+
+		/* Verify we can get the namespace ID */
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Verify we can list mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		if (nr_mounts < 0)
+			_exit(8);
+
+		/* Should have at least 1 mount (the tmpfs) */
+		if (nr_mounts < 1)
+			_exit(9);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("New namespace has no mounts");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, setns_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		int fs_fd, fd;
+		pid_t inner_pid;
+		int inner_status;
+
+		/* Create new user namespace */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Fork again to test setns into the new namespace */
+		inner_pid = fork();
+		if (inner_pid < 0)
+			_exit(10);
+
+		if (inner_pid == 0) {
+			/* Inner child: enter the new namespace */
+			if (setns(fd, CLONE_NEWNS) < 0)
+				_exit(1); /* reported as exit code 12 by the outer child */
+			_exit(0);
+		}
+
+		if (waitpid(inner_pid, &inner_status, 0) != inner_pid)
+			_exit(11);
+
+		if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0)
+			_exit(12);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("Inner fork failed");
+		break;
+	case 11:
+		ASSERT_FALSE(true) TH_LOG("Inner waitpid failed");
+		break;
+	case 12:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, umount_fails_einval)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+		ssize_t i;
+
+		/* Create new user namespace */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Get all mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(13);
+
+		if (nr_mounts < 1)
+			_exit(14);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(8);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT, 0);
+			if (!sm)
+				_exit(15);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			if (umount2(mnt_point, MNT_DETACH) == 0) {
+				free(sm);
+				_exit(9);
+			}
+
+			if (errno != EINVAL) {
+				/* Wrong error */
+				free(sm);
+				_exit(10);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)");
+		break;
+	case 13:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 14:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 15:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, umount_succeeds)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+		ssize_t i;
+
+		if (unshare(CLONE_NEWNS))
+			_exit(1);
+
+		if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0)
+			_exit(1);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Get all mounts in the new namespace, listed with LISTMOUNT_REVERSE */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(13);
+
+		if (nr_mounts < 1)
+			_exit(14);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(8);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT, 0);
+			if (!sm)
+				_exit(15);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			if (umount2(mnt_point, MNT_DETACH) != 0) {
+				free(sm);
+				_exit(9);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("unshare, mount propagation setup or fsmount(FSMOUNT_NAMESPACE) failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded");
+		break;
+	case 13:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 14:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 15:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(fsmount_ns_mount_attrs)
+{
+	int fd;
+	int fs_fd;
+};
+
+FIXTURE_SETUP(fsmount_ns_mount_attrs)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Probe fsopen(): skip on ENOSYS, close the fd on success */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Probe statmount() with dummy arguments: only ENOSYS causes a skip */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_mount_attrs)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns_mount_attrs, readonly)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_RDONLY);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is read-only */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY);
+}
+
+TEST_F(fsmount_ns_mount_attrs, noexec)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOEXEC);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is noexec */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC);
+}
+
+TEST_F(fsmount_ns_mount_attrs, nosuid)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOSUID);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is nosuid */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID);
+}
+
+TEST_F(fsmount_ns_mount_attrs, noatime)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOATIME);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is noatime */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME);
+}
+
+TEST_F(fsmount_ns_mount_attrs, combined)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC |
+			       MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOATIME);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify all attributes are set */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
index 73c03c4a7ef6..4976ed1d7d4a 100644
--- a/tools/testing/selftests/filesystems/open_tree_ns/Makefile
+++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := open_tree_ns_test
 
-CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS := -lcap
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
index 86d5d7a831c1..82f3c8c02c9a 100644
--- a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
+++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
+ *
  * Test for OPEN_TREE_NAMESPACE flag.
  *
  * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount
@@ -50,31 +52,6 @@ static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
 	return ret;
 }
 
-#define STATMOUNT_BUFSIZE (1 << 15)
-
-static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask)
-{
-	struct statmount *buf;
-	size_t bufsize = STATMOUNT_BUFSIZE;
-	int ret;
-
-	for (;;) {
-		buf = malloc(bufsize);
-		if (!buf)
-			return NULL;
-
-		ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0);
-		if (ret == 0)
-			return buf;
-
-		free(buf);
-		if (errno != EOVERFLOW)
-			return NULL;
-
-		bufsize <<= 1;
-	}
-}
-
 static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
 {
 	const char *fs_type = "";
@@ -221,7 +198,7 @@ FIXTURE_SETUP(open_tree_ns)
 		SKIP(return, "open_tree() syscall not supported");
 
 	/* Check if statmount/listmount are supported */
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	if (ret == -1 && errno == ENOSYS)
 		SKIP(return, "statmount() syscall not supported");
 
@@ -340,7 +317,7 @@ TEST_F(open_tree_ns, verify_mount_properties)
 	ASSERT_GE(nr_mounts, 1);
 
 	/* Get info about the root mount (the bind mount, rootfs is hidden) */
-	ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
 	ASSERT_EQ(ret, 0);
 
 	ASSERT_NE(sm.mnt_id, sm.mnt_parent_id);
@@ -452,7 +429,7 @@ FIXTURE_SETUP(open_tree_ns_userns)
 		SKIP(return, "open_tree() syscall not supported");
 
 	/* Check if statmount/listmount are supported */
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	if (ret == -1 && errno == ENOSYS)
 		SKIP(return, "statmount() syscall not supported");
 }
@@ -904,7 +881,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
 		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
 		break;
 	case 7:
-		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded");
 		break;
 	case 9:
 		ASSERT_FALSE(true) TH_LOG("listmount failed");

From 9d4e752a24f740b31ca827bfab07010e4e7f34b0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Mar 2026 17:28:37 +0100
Subject: [PATCH 07/18] namespace: allow creating empty mount namespaces

Add support for creating a mount namespace that contains only a copy of
the root mount from the caller's mount namespace, with none of the
child mounts.  This is useful for containers and sandboxes that want to
start with a minimal mount table and populate it from scratch rather
than inheriting and then tearing down the full mount tree.

Two new flags are introduced:

- CLONE_EMPTY_MNTNS for clone3(), using the 64-bit flag space.

- UNSHARE_EMPTY_MNTNS for unshare(), reusing the
  CLONE_PARENT_SETTID bit which has no meaning for unshare.

Both flags imply CLONE_NEWNS.  For the unshare path,
UNSHARE_EMPTY_MNTNS is converted to CLONE_EMPTY_MNTNS in
unshare_nsproxy_namespaces() before it reaches copy_mnt_ns(), so the
mount namespace code only needs to handle a single flag.

In copy_mnt_ns(), when CLONE_EMPTY_MNTNS is set, clone_mnt() is used
instead of copy_tree() to clone only the root mount.  The caller's root
and working directory are both reset to the root dentry of the new
mount.

The cleanup variables are changed from vfsmount pointers with
__free(mntput) to struct path with __free(path_put) because the empty
mount namespace path needs to release both mount and dentry references
when replacing the caller's root and pwd.  In the normal (non-empty)
path only the mount component is set, and dput(NULL) is a no-op so
path_put remains correct there as well.

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-1-6eb30529bbb0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 87 +++++++++++++++++++++++++-------------
 include/uapi/linux/sched.h |  7 +++
 kernel/fork.c              | 17 +++++++-
 kernel/nsproxy.c           | 21 ++++++---
 4 files changed, 95 insertions(+), 37 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 702e93243505..555f0a10de9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
-	struct vfsmount *rootmnt __free(mntput) = NULL;
-	struct vfsmount *pwdmnt __free(mntput) = NULL;
+	struct path old_root __free(path_put) = {};
+	struct path old_pwd __free(path_put) = {};
 	struct mount *p, *q;
 	struct mount *old;
 	struct mount *new;
@@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		return new_ns;
 
 	guard(namespace_excl)();
-	/* First pass: copy the tree topology */
-	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+
+	if (flags & CLONE_EMPTY_MNTNS)
+		copy_flags = 0;
+	else
+		copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
-	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+
+	if (flags & CLONE_EMPTY_MNTNS)
+		new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
+	else
+		new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		emptied_ns = new_ns;
 		return ERR_CAST(new);
@@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 	}
 	new_ns->root = new;
 
-	/*
-	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
-	 * as belonging to new namespace.  We have already acquired a private
-	 * fs_struct, so tsk->fs->lock is not needed.
-	 */
-	p = old;
-	q = new;
-	while (p) {
-		mnt_add_to_ns(new_ns, q);
-		new_ns->nr_mounts++;
+	if (flags & CLONE_EMPTY_MNTNS) {
+		/*
+		 * Empty mount namespace: only the root mount exists.
+		 * Reset root and pwd to the cloned mount's root dentry.
+		 */
 		if (new_fs) {
-			if (&p->mnt == new_fs->root.mnt) {
-				new_fs->root.mnt = mntget(&q->mnt);
-				rootmnt = &p->mnt;
-			}
-			if (&p->mnt == new_fs->pwd.mnt) {
-				new_fs->pwd.mnt = mntget(&q->mnt);
-				pwdmnt = &p->mnt;
-			}
+			old_root = new_fs->root;
+			old_pwd = new_fs->pwd;
+
+			new_fs->root.mnt = mntget(&new->mnt);
+			new_fs->root.dentry = dget(new->mnt.mnt_root);
+
+			new_fs->pwd.mnt = mntget(&new->mnt);
+			new_fs->pwd.dentry = dget(new->mnt.mnt_root);
+		}
+		mnt_add_to_ns(new_ns, new);
+		new_ns->nr_mounts++;
+	} else {
+		/*
+		 * Full copy: walk old and new trees in parallel, switching
+		 * the tsk->fs->* elements and marking new vfsmounts as
+		 * belonging to new namespace.  We have already acquired a
+		 * private fs_struct, so tsk->fs->lock is not needed.
+		 */
+		p = old;
+		q = new;
+		while (p) {
+			mnt_add_to_ns(new_ns, q);
+			new_ns->nr_mounts++;
+			if (new_fs) {
+				if (&p->mnt == new_fs->root.mnt) {
+					old_root.mnt = new_fs->root.mnt;
+					new_fs->root.mnt = mntget(&q->mnt);
+				}
+				if (&p->mnt == new_fs->pwd.mnt) {
+					old_pwd.mnt = new_fs->pwd.mnt;
+					new_fs->pwd.mnt = mntget(&q->mnt);
+				}
+			}
+			p = next_mnt(p, old);
+			q = next_mnt(q, new);
+			if (!q)
+				break;
+			// an mntns binding we'd skipped?
+			while (p->mnt.mnt_root != q->mnt.mnt_root)
+				p = next_mnt(skip_mnt_tree(p), old);
 		}
-		p = next_mnt(p, old);
-		q = next_mnt(q, new);
-		if (!q)
-			break;
-		// an mntns binding we'd skipped?
-		while (p->mnt.mnt_root != q->mnt.mnt_root)
-			p = next_mnt(skip_mnt_tree(p), old);
 	}
 	ns_tree_add_raw(new_ns);
 	return new_ns;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..4e76fce9f777 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
 /* Flags for the clone3() syscall. */
 #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
 #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_EMPTY_MNTNS	(1ULL << 37) /* Create an empty mount namespace. */
 
 /*
  * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -43,6 +44,12 @@
  */
 #define CLONE_NEWTIME	0x00000080	/* New time namespace */
 
+/*
+ * unshare flags share the bit space with clone flags but only apply to the
+ * unshare syscall:
+ */
+#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
+
 #ifndef __ASSEMBLY__
 /**
  * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..dea6b3454447 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 	int trace = 0;
 	pid_t nr;
 
+	/*
+	 * Creating an empty mount namespace implies creating a new mount
+	 * namespace.  Set this before copy_process() so that the
+	 * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+	 */
+	if (clone_flags & CLONE_EMPTY_MNTNS) {
+		clone_flags |= CLONE_NEWNS;
+		args->flags = clone_flags;
+	}
+
 	/*
 	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
 	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
 	if (kargs->flags &
-	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+	      CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
 		return false;
 
 	/*
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
 				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
 				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
-				CLONE_NEWTIME))
+				CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
+	if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+		unshare_flags |= CLONE_NEWNS;
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 259c4b4f1eeb..1bdc5be2dd20 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);
 
-	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+				      user_ns, new_fs);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
@@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
 {
 	struct user_namespace *user_ns;
+	u64 flags = unshare_flags;
 	int err = 0;
 
-	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
-			       CLONE_NEWTIME)))
+	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+		       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+		       CLONE_NEWTIME)))
 		return 0;
 
 	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+	/*
+	 * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+	 * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+	 */
+	if (flags & UNSHARE_EMPTY_MNTNS) {
+		flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+		flags |= CLONE_EMPTY_MNTNS;
+	}
+
+	*new_nsp = create_new_namespaces(flags, current, user_ns,
 					 new_fs ? new_fs : current->fs);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);

From 32f54f2bbccfdeff81d930d18ccf3161a1c203b9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Mar 2026 17:28:38 +0100
Subject: [PATCH 08/18] selftests/filesystems: add tests for empty mount
 namespaces

Add a test suite for the UNSHARE_EMPTY_MNTNS and CLONE_EMPTY_MNTNS
flags exercising the empty mount namespace functionality through the
kselftest harness.

The tests cover:

- basic functionality: unshare succeeds, exactly one mount exists in
  the new namespace, root and cwd point to the same mount
- flag interactions: UNSHARE_EMPTY_MNTNS works standalone without
  explicit CLONE_NEWNS, combines correctly with CLONE_NEWUSER and
  other namespace flags (CLONE_NEWUTS, CLONE_NEWIPC)
- edge cases: EPERM without capabilities, works from a user namespace,
  many source mounts still result in one mount, cwd on a different
  mount gets reset to root
- error paths: invalid flags return EINVAL
- regression: plain CLONE_NEWNS still copies the full mount tree,
  other namespace unshares are unaffected
- mount properties: the root mount has the expected statmount
  properties, is its own parent, and is the only entry returned by
  listmount
- repeated unshare: consecutive UNSHARE_EMPTY_MNTNS calls each
  produce a new namespace with a distinct mount ID
- overmount workflow: verifies the intended usage pattern of creating
  an empty mount namespace with a nullfs root and then mounting tmpfs
  over it to build a writable filesystem from scratch

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-2-6eb30529bbb0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/empty_mntns/.gitignore        |   3 +
 .../filesystems/empty_mntns/Makefile          |  11 +
 .../filesystems/empty_mntns/empty_mntns.h     |  50 ++
 .../empty_mntns/empty_mntns_test.c            | 725 ++++++++++++++++++
 .../empty_mntns/overmount_chroot_test.c       | 225 ++++++
 tools/testing/selftests/filesystems/utils.c   |   4 +-
 tools/testing/selftests/filesystems/utils.h   |   2 +
 7 files changed, 1018 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/Makefile
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c

diff --git a/tools/testing/selftests/filesystems/empty_mntns/.gitignore b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
new file mode 100644
index 000000000000..48054440b7e1
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+empty_mntns_test
+overmount_chroot_test
diff --git a/tools/testing/selftests/filesystems/empty_mntns/Makefile b/tools/testing/selftests/filesystems/empty_mntns/Makefile
new file mode 100644
index 000000000000..5d4cffa4c4ae
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test
+
+include ../../lib.mk
+
+$(OUTPUT)/empty_mntns_test: ../utils.c
+$(OUTPUT)/overmount_chroot_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
new file mode 100644
index 000000000000..dfd24c88eec8
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef EMPTY_MNTNS_H
+#define EMPTY_MNTNS_H
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "../statmount/statmount.h"
+
+#ifndef UNSHARE_EMPTY_MNTNS
+#define UNSHARE_EMPTY_MNTNS	0x00100000
+#endif
+
+#ifndef CLONE_EMPTY_MNTNS
+#define CLONE_EMPTY_MNTNS	(1ULL << 37)
+#endif
+
+static inline ssize_t count_mounts(void)
+{
+	uint64_t list[4096];
+
+	return listmount(LSMT_ROOT, 0, 0, list, sizeof(list) / sizeof(list[0]), 0);
+}
+
+static inline struct statmount *statmount_alloc(uint64_t mnt_id,
+						uint64_t mnt_ns_id,
+						uint64_t mask)
+{
+	size_t bufsize = 1 << 15;
+	struct statmount *buf;
+	int ret;
+
+	for (;;) {
+		buf = malloc(bufsize);
+		if (!buf)
+			return NULL;
+
+		ret = statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, 0);
+		if (ret == 0)
+			return buf;
+
+		free(buf);
+		if (errno != EOVERFLOW)
+			return NULL;
+
+		bufsize <<= 1;
+	}
+}
+
+#endif /* EMPTY_MNTNS_H */
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
new file mode 100644
index 000000000000..733aad83dbbf
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via UNSHARE_EMPTY_MNTNS
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+static bool unshare_empty_mntns_supported(void)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	if (pid < 0)
+		return false;
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) && errno == EINVAL)
+			_exit(1);
+		_exit(0);
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return false;
+
+	if (!WIFEXITED(status))
+		return false;
+
+	return WEXITSTATUS(status) == 0;
+}
+
+
+FIXTURE(empty_mntns) {};
+
+FIXTURE_SETUP(empty_mntns)
+{
+	if (!unshare_empty_mntns_supported())
+		SKIP(return, "UNSHARE_EMPTY_MNTNS not supported");
+}
+
+FIXTURE_TEARDOWN(empty_mntns) {}
+
+/* Verify unshare succeeds, produces exactly 1 mount, and root == cwd */
+TEST_F(empty_mntns, basic)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t root_id, cwd_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		root_id = get_unique_mnt_id("/");
+		cwd_id = get_unique_mnt_id(".");
+		if (root_id == 0 || cwd_id == 0)
+			_exit(4);
+
+		if (root_id != cwd_id)
+			_exit(5);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * UNSHARE_EMPTY_MNTNS combined with CLONE_NEWUSER.
+ *
+ * The user namespace must be created first so /proc is still accessible
+ * for writing uid_map/gid_map.  The empty mount namespace is created
+ * afterwards.
+ */
+TEST_F(empty_mntns, with_clone_newuser)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uid_t uid = getuid();
+		gid_t gid = getgid();
+		char map[100];
+
+		if (unshare(CLONE_NEWUSER))
+			_exit(1);
+		/* uid_t/gid_t are unsigned: format them as such in the maps. */
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)uid);
+		if (write_file("/proc/self/uid_map", map))
+			_exit(2);
+
+		if (write_file("/proc/self/setgroups", "deny"))
+			_exit(3);
+
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)gid);
+		if (write_file("/proc/self/gid_map", map))
+			_exit(4);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(5);
+
+		if (count_mounts() != 1)
+			_exit(6);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* UNSHARE_EMPTY_MNTNS combined with other namespace flags */
+TEST_F(empty_mntns, with_other_ns_flags)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS | CLONE_NEWUTS | CLONE_NEWIPC))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* EPERM without proper capabilities */
+TEST_F(empty_mntns, eperm_without_caps)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Skip if already root */
+		if (getuid() == 0)
+			_exit(0);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) == 0)
+			_exit(1);
+
+		if (errno != EPERM)
+			_exit(2);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Many source mounts still result in exactly 1 mount */
+TEST_F(empty_mntns, many_source_mounts)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char tmpdir[] = "/tmp/empty_mntns_test.XXXXXX";
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		for (i = 0; i < 5; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(6);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(7);
+		}
+
+		if (count_mounts() < 5)
+			_exit(8);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(9);
+
+		if (count_mounts() != 1)
+			_exit(10);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* CWD on a different mount gets reset to root */
+TEST_F(empty_mntns, cwd_reset)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char tmpdir[] = "/tmp/empty_mntns_cwd.XXXXXX";
+		uint64_t root_id, cwd_id;
+		struct statmount *sm;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		if (chdir(tmpdir))
+			_exit(6);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(7);
+
+		root_id = get_unique_mnt_id("/");
+		cwd_id = get_unique_mnt_id(".");
+		if (root_id == 0 || cwd_id == 0)
+			_exit(8);
+
+		if (root_id != cwd_id)
+			_exit(9);
+		/* sm->mnt_point is only meaningful if the mask reports it. */
+		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
+		if (!sm || !(sm->mask & STATMOUNT_MNT_POINT))
+			_exit(10);
+
+		if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+			_exit(11);
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Verify statmount properties of the root mount */
+TEST_F(empty_mntns, mount_properties)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct statmount *sm;
+		uint64_t root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(3);
+
+		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE);
+		if (!sm)
+			_exit(4);
+
+		if (!(sm->mask & STATMOUNT_MNT_POINT))
+			_exit(5);
+
+		if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+			_exit(6);
+
+		if (!(sm->mask & STATMOUNT_MNT_BASIC))
+			_exit(7);
+
+		if (sm->mnt_id != root_id)
+			_exit(8);
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Consecutive UNSHARE_EMPTY_MNTNS calls produce new namespaces */
+TEST_F(empty_mntns, repeated_unshare)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t first_root_id, second_root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		first_root_id = get_unique_mnt_id("/");
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(4);
+
+		if (count_mounts() != 1)
+			_exit(5);
+
+		second_root_id = get_unique_mnt_id("/");
+		/* A zero id is a failed lookup and must not count as "distinct". */
+		if (!first_root_id || !second_root_id || first_root_id == second_root_id)
+			_exit(6);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Root mount's parent is itself */
+TEST_F(empty_mntns, root_is_own_parent)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct statmount sm;
+		uint64_t root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(3);
+
+		if (statmount(root_id, 0, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0) < 0)
+			_exit(4);
+
+		if (!(sm.mask & STATMOUNT_MNT_BASIC))
+			_exit(5);
+
+		if (sm.mnt_parent_id != sm.mnt_id)
+			_exit(6);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Listmount returns only the root mount */
+TEST_F(empty_mntns, listmount_single_entry)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t list[16];
+		ssize_t nr_mounts;
+		uint64_t root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0);
+		if (nr_mounts != 1)
+			_exit(3);
+
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(4);
+
+		if (list[0] != root_id)
+			_exit(5);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Mount tmpfs over nullfs root to build a writable filesystem from scratch.
+ * This exercises the intended usage pattern: create an empty mount namespace
+ * (which has a nullfs root), then mount a real filesystem over it.
+ *
+ * Because resolving "/" returns the process root directly (via nd_jump_root)
+ * without following overmounts, we use the new mount API (fsopen/fsmount)
+ * to obtain a mount fd, then fchdir + chroot to enter the new filesystem.
+ */
+TEST_F(empty_mntns, overmount_tmpfs)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct statmount *sm;
+		uint64_t root_id, cwd_id;
+		int fd, fsfd, mntfd;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(4);
+
+		/* Verify root is nullfs */
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+		if (!sm)
+			_exit(5);
+
+		if (!(sm->mask & STATMOUNT_FS_TYPE))
+			_exit(6);
+
+		if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+			_exit(7);
+
+		free(sm);
+
+		cwd_id = get_unique_mnt_id(".");
+		if (!cwd_id || root_id != cwd_id)
+			_exit(8);
+
+		/*
+		 * nullfs root is immutable.  open(O_CREAT) returns ENOENT
+		 * because empty_dir_lookup() returns -ENOENT before the
+		 * IS_IMMUTABLE permission check in may_o_create() is reached.
+		 */
+		fd = open("/test", O_CREAT | O_RDWR, 0644);
+		if (fd >= 0) {
+			close(fd);
+			_exit(9);
+		}
+		if (errno != ENOENT)
+			_exit(10);
+
+		/*
+		 * Use the new mount API to create tmpfs and get a mount fd.
+		 * We need the fd because after attaching the tmpfs on top of
+		 * "/", path resolution of "/" still returns the process root
+		 * (nullfs) without following the overmount.  The mount fd
+		 * lets us fchdir + chroot into the tmpfs.
+		 */
+		fsfd = sys_fsopen("tmpfs", 0);
+		if (fsfd < 0)
+			_exit(11);
+
+		if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "1M", 0)) {
+			close(fsfd);
+			_exit(12);
+		}
+
+		if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
+			close(fsfd);
+			_exit(13);
+		}
+
+		mntfd = sys_fsmount(fsfd, 0, 0);
+		close(fsfd);
+		if (mntfd < 0)
+			_exit(14);
+
+		if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+				   MOVE_MOUNT_F_EMPTY_PATH)) {
+			close(mntfd);
+			_exit(15);
+		}
+
+		if (count_mounts() != 2) {
+			close(mntfd);
+			_exit(16);
+		}
+
+		/* Enter the tmpfs via the mount fd */
+		if (fchdir(mntfd)) {
+			close(mntfd);
+			_exit(17);
+		}
+
+		if (chroot(".")) {
+			close(mntfd);
+			_exit(18);
+		}
+
+		close(mntfd);
+
+		/* Verify "/" now resolves to tmpfs */
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(19);
+
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+		if (!sm)
+			_exit(20);
+
+		if (!(sm->mask & STATMOUNT_FS_TYPE))
+			_exit(21);
+
+		if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+			_exit(22);
+
+		free(sm);
+
+		/* Verify tmpfs is writable */
+		fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+		if (fd < 0)
+			_exit(23);
+
+		if (write(fd, "test", 4) != 4) {
+			close(fd);
+			_exit(24);
+		}
+
+		close(fd);
+
+		if (access("/testfile", F_OK))
+			_exit(25);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require UNSHARE_EMPTY_MNTNS support.
+ */
+
+/* Invalid unshare flags return EINVAL */
+TEST(invalid_flags)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(0x80000000) == 0)
+			_exit(2);
+
+		if (errno != EINVAL)
+			_exit(3);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Regular CLONE_NEWNS still copies the full mount tree */
+TEST(clone_newns_full_copy)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_mounts_before, nr_mounts_after;
+		char tmpdir[] = "/tmp/empty_mntns_regr.XXXXXX";
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		for (i = 0; i < 3; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(6);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(7);
+		}
+
+		nr_mounts_before = count_mounts();
+		if (nr_mounts_before < 3)
+			_exit(8);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(9);
+
+		nr_mounts_after = count_mounts();
+		if (nr_mounts_after < nr_mounts_before)
+			_exit(10);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Other namespace unshares are unaffected */
+TEST(other_ns_unaffected)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char hostname[256];
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWUTS))
+			_exit(2);
+
+		if (sethostname("test-empty-mntns", 16))
+			_exit(3);
+
+		if (gethostname(hostname, sizeof(hostname)))
+			_exit(4);
+
+		if (strcmp(hostname, "test-empty-mntns") != 0)
+			_exit(5);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
new file mode 100644
index 000000000000..0b623d0c6bb9
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test: rootfs overmounted multiple times with chroot into topmost
+ *
+ * This test creates a scenario where:
+ * 1. A new mount namespace is created with a tmpfs root (via pivot_root)
+ * 2. A mountpoint is created and overmounted multiple times
+ * 3. The caller chroots into the topmost mount layer
+ *
+ * The test verifies that:
+ * - Multiple overmounts create separate mount layers
+ * - Each layer's files are isolated
+ * - chroot correctly sets the process's root to the topmost layer
+ * - After chroot, only the topmost layer's files are visible
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+#define NR_OVERMOUNTS 5
+
+/*
+ * Setup a proper root filesystem using pivot_root.
+ * This ensures we own the root directory in our user namespace.
+ */
+static int setup_root(void)
+{
+	char tmpdir[] = "/tmp/overmount_test.XXXXXX";
+	char oldroot[256];
+
+	if (!mkdtemp(tmpdir))
+		return -1;
+
+	/* Mount tmpfs at the temporary directory */
+	if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=10M"))
+		return -1;
+
+	/* Create directory for old root */
+	snprintf(oldroot, sizeof(oldroot), "%s/oldroot", tmpdir);
+	if (mkdir(oldroot, 0755))
+		return -1;
+
+	/* pivot_root to use the tmpfs as new root */
+	if (syscall(SYS_pivot_root, tmpdir, oldroot))
+		return -1;
+
+	if (chdir("/"))
+		return -1;
+
+	/* Unmount old root */
+	if (umount2("/oldroot", MNT_DETACH))
+		return -1;
+
+	/* Remove oldroot directory */
+	if (rmdir("/oldroot"))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Test scenario:
+ * 1. Enter a user namespace to gain CAP_SYS_ADMIN
+ * 2. Create a new mount namespace
+ * 3. Make the mount tree private
+ * 4. Setup a tmpfs root via pivot_root
+ * 5. Mount /newroot and overmount it, dropping a marker file in each layer
+ * 6. Chroot into /newroot (the topmost overmount)
+ * 7. Verify we're in the topmost layer (only topmost marker visible)
+ */
+TEST(overmount_chroot)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_mounts;
+		uint64_t mnt_ids[NR_OVERMOUNTS + 1];
+		uint64_t root_id_before, root_id_after;
+		struct statmount *sm;
+		char marker[64];
+		int fd, i;
+
+		/* Step 1: Enter user namespace for privileges */
+		if (enter_userns())
+			_exit(1);
+
+		/* Step 2: Create a new mount namespace */
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Step 3: Make the mount tree private */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		/* Step 4: Setup a proper tmpfs root via pivot_root */
+		if (setup_root())
+			_exit(4);
+
+		/* Create the base mount point for overmounting */
+		if (mkdir("/newroot", 0755))
+			_exit(5);
+
+		/* Mount base tmpfs on /newroot */
+		if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+			_exit(6);
+
+		/* Record base mount ID */
+		mnt_ids[0] = get_unique_mnt_id("/newroot");
+		if (!mnt_ids[0])
+			_exit(7);
+
+		/* Create marker in base layer */
+		fd = open("/newroot/layer_0", O_CREAT | O_RDWR, 0644);
+		if (fd < 0)
+			_exit(8);
+		if (write(fd, "layer_0", 7) != 7) {
+			close(fd);
+			_exit(9);
+		}
+		close(fd);
+
+		/* Step 5: Overmount /newroot multiple times with tmpfs */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+				_exit(10);
+
+			/* Record mount ID for this layer */
+			mnt_ids[i + 1] = get_unique_mnt_id("/newroot");
+			if (!mnt_ids[i + 1])
+				_exit(11);
+
+			/* Create a marker file in each layer */
+			snprintf(marker, sizeof(marker), "/newroot/layer_%d", i + 1);
+			fd = open(marker, O_CREAT | O_RDWR, 0644);
+			if (fd < 0)
+				_exit(12);
+
+			if (write(fd, marker, strlen(marker)) != (ssize_t)strlen(marker)) {
+				close(fd);
+				_exit(13);
+			}
+			close(fd);
+		}
+
+		/* Verify mount count increased */
+		nr_mounts = count_mounts();
+		if (nr_mounts < NR_OVERMOUNTS + 2)
+			_exit(14);
+
+		/* Record root mount ID before chroot */
+		root_id_before = get_unique_mnt_id("/newroot");
+
+		/* Verify this is the topmost layer's mount */
+		if (root_id_before != mnt_ids[NR_OVERMOUNTS])
+			_exit(15);
+
+		/* Step 6: Chroot into /newroot (the topmost overmount) */
+		if (chroot("/newroot"))
+			_exit(16);
+
+		/* Change to root directory within the chroot */
+		if (chdir("/"))
+			_exit(17);
+
+		/* Step 7: Verify we're in the topmost layer */
+		root_id_after = get_unique_mnt_id("/");
+
+		/* The mount ID should be the same as the topmost layer */
+		if (root_id_after != mnt_ids[NR_OVERMOUNTS])
+			_exit(18);
+
+		/* Verify the topmost layer's marker file exists */
+		snprintf(marker, sizeof(marker), "/layer_%d", NR_OVERMOUNTS);
+		if (access(marker, F_OK))
+			_exit(19);
+
+		/* Verify we cannot see markers from lower layers (they're hidden) */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			snprintf(marker, sizeof(marker), "/layer_%d", i);
+			if (access(marker, F_OK) == 0)
+				_exit(20);
+		}
+
+		/* Verify the root mount is tmpfs */
+		sm = statmount_alloc(root_id_after, 0,
+				     STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE);
+		if (!sm)
+			_exit(21);
+
+		/* A reply lacking the fs type cannot confirm tmpfs: fail. */
+		if (!(sm->mask & STATMOUNT_FS_TYPE) ||
+		    strcmp(sm->str + sm->fs_type, "tmpfs") != 0) {
+			free(sm);
+			_exit(22);
+		}
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index d6f26f849053..d73d7d8171db 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -158,7 +158,7 @@ static int get_userns_fd_cb(void *data)
 	_exit(0);
 }
 
-static int wait_for_pid(pid_t pid)
+int wait_for_pid(pid_t pid)
 {
 	int status, ret;
 
@@ -450,7 +450,7 @@ out_close:
 	return fret;
 }
 
-static int write_file(const char *path, const char *val)
+int write_file(const char *path, const char *val)
 {
 	int fd = open(path, O_WRONLY);
 	size_t len = strlen(val);
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
index 0bccfed666a9..d03085cef5cb 100644
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -44,6 +44,8 @@ static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
 	return true;
 }
 
+extern int wait_for_pid(pid_t pid);
+extern int write_file(const char *path, const char *val);
 extern uint64_t get_unique_mnt_id(const char *path);
 
 #endif /* __IDMAP_UTILS_H */

From 5b8ffd63fbd94fe71f1baf50a55e31be54a97ca9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Mar 2026 17:28:39 +0100
Subject: [PATCH 09/18] selftests/filesystems: add clone3 tests for empty mount
 namespaces

Add a test suite for the CLONE_EMPTY_MNTNS flag exercising the empty
mount namespace functionality through the clone3() syscall.

The clone3() code path is distinct from the unshare() path already
tested in empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS
(1ULL << 37) is a 64-bit flag that implies CLONE_NEWNS.  The
implication happens in kernel_clone() before copy_process(), unlike
unshare() where it goes through UNSHARE_EMPTY_MNTNS to
CLONE_EMPTY_MNTNS conversion in unshare_nsproxy_namespaces().

The tests cover:

- basic functionality: clone3 child gets empty mount namespace with
  exactly one mount, root and cwd point to the same mount
- CLONE_NEWNS implication: CLONE_EMPTY_MNTNS works without explicit
  CLONE_NEWNS, also works with redundant CLONE_NEWNS
- flag interactions: combines correctly with CLONE_NEWUSER,
  CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_PIDFD
- mutual exclusion: CLONE_EMPTY_MNTNS | CLONE_FS returns EINVAL
  because the implied CLONE_NEWNS conflicts with CLONE_FS
- error paths: EPERM without capabilities, unknown 64-bit flags
  rejected
- parent isolation: parent mount namespace is unchanged after clone
- many parent mounts: child still gets exactly one mount
- mount properties: root mount is nullfs, is its own parent, is the
  only listmount entry
- overmount workflow: child can mount tmpfs over nullfs root to build
  a writable filesystem from scratch
- repeated clone3: each child gets a distinct mount namespace
- setns: parent can join child's empty mount namespace via setns()
- regression: plain CLONE_NEWNS via clone3 still copies the full
  mount tree

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-3-6eb30529bbb0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/empty_mntns/.gitignore        |   1 +
 .../filesystems/empty_mntns/Makefile          |   3 +-
 .../empty_mntns/clone3_empty_mntns_test.c     | 938 ++++++++++++++++++
 3 files changed, 941 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c

diff --git a/tools/testing/selftests/filesystems/empty_mntns/.gitignore b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
index 48054440b7e1..99f89d329db2 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/.gitignore
+++ b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+clone3_empty_mntns_test
 empty_mntns_test
 overmount_chroot_test
diff --git a/tools/testing/selftests/filesystems/empty_mntns/Makefile b/tools/testing/selftests/filesystems/empty_mntns/Makefile
index 5d4cffa4c4ae..22e3fb915e81 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/Makefile
+++ b/tools/testing/selftests/filesystems/empty_mntns/Makefile
@@ -3,9 +3,10 @@
 CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS += -lcap
 
-TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test
+TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test
 
 include ../../lib.mk
 
 $(OUTPUT)/empty_mntns_test: ../utils.c
 $(OUTPUT)/overmount_chroot_test: ../utils.c
+$(OUTPUT)/clone3_empty_mntns_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
new file mode 100644
index 000000000000..130cc1a1b407
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS
+ *
+ * These tests exercise the clone3() code path for creating empty mount
+ * namespaces, which is distinct from the unshare() path tested in
+ * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (1ULL << 37)
+ * is a 64-bit flag that implies CLONE_NEWNS.  The implication happens in
+ * kernel_clone() before copy_process(), unlike unshare() where it goes
+ * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in
+ * unshare_nsproxy_namespaces().
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "clone3/clone3_selftests.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+static pid_t clone3_empty_mntns(uint64_t extra_flags)
+{
+	struct __clone_args args = {
+		.flags		= CLONE_EMPTY_MNTNS | extra_flags,
+		.exit_signal	= SIGCHLD,
+	};
+
+	return sys_clone3(&args, sizeof(args));
+}
+
+static bool clone3_empty_mntns_supported(void)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	if (pid < 0)
+		return false;
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		pid = clone3_empty_mntns(0);
+		if (pid < 0)
+			_exit(1);
+
+		if (pid == 0)
+			_exit(0);
+
+		_exit(wait_for_pid(pid) != 0);
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return false;
+
+	if (!WIFEXITED(status))
+		return false;
+
+	return WEXITSTATUS(status) == 0;
+}
+
+FIXTURE(clone3_empty_mntns) {};
+
+FIXTURE_SETUP(clone3_empty_mntns)
+{
+	if (!clone3_empty_mntns_supported())
+		SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported");
+}
+
+FIXTURE_TEARDOWN(clone3_empty_mntns) {}
+
+/*
+ * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace
+ * with exactly 1 mount and root == cwd.
+ */
+TEST_F(clone3_empty_mntns, basic)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			uint64_t root_id, cwd_id;
+
+			if (count_mounts() != 1)
+				_exit(3);
+
+			root_id = get_unique_mnt_id("/");
+			cwd_id = get_unique_mnt_id(".");
+			if (root_id == 0 || cwd_id == 0)
+				_exit(4);
+
+			if (root_id != cwd_id)
+				_exit(5);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS implies CLONE_NEWNS.  Verify that it works without
+ * explicitly setting CLONE_NEWNS (the implication is added in kernel_clone()).
+ */
+TEST_F(clone3_empty_mntns, implies_newns)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t parent_mounts;
+
+		if (enter_userns())
+			_exit(1);
+
+		/* Verify we have mounts in our current namespace. */
+		parent_mounts = count_mounts();
+		if (parent_mounts < 1)
+			_exit(2);
+
+		/* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(4);
+
+			_exit(0);
+		}
+
+		/* Parent still has its mounts. */
+		if (count_mounts() != parent_mounts)
+			_exit(5);
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS |
+ * @extra_flags and verifies the child has exactly one mount.
+ */
+#define TEST_CLONE3_FLAGS(test_name, extra_flags)			\
+TEST_F(clone3_empty_mntns, test_name)					\
+{									\
+	pid_t pid, inner;						\
+									\
+	pid = fork();							\
+	ASSERT_GE(pid, 0);						\
+									\
+	if (pid == 0) {							\
+		if (enter_userns())					\
+			_exit(1);					\
+									\
+		inner = clone3_empty_mntns(extra_flags);		\
+		if (inner < 0)						\
+			_exit(2);					\
+									\
+		if (inner == 0) {					\
+			if (count_mounts() != 1)			\
+				_exit(3);				\
+			_exit(0);					\
+		}							\
+									\
+		_exit(wait_for_pid(inner));				\
+	}								\
+									\
+	ASSERT_EQ(wait_for_pid(pid), 0);				\
+}
+
+/* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */
+TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS)
+
+/* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */
+TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER)
+
+/* CLONE_EMPTY_MNTNS combined with other namespace flags. */
+TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC)
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID.
+ */
+TEST_F(clone3_empty_mntns, with_newpid)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(CLONE_NEWPID);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(3);
+
+			/* In a new PID namespace, getpid() returns 1. */
+			if (getpid() != 1)
+				_exit(4);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS
+ * and CLONE_FS are mutually exclusive (fork.c:1981).
+ */
+TEST_F(clone3_empty_mntns, with_clone_fs_fails)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_EMPTY_MNTNS | CLONE_FS,
+			.exit_signal	= SIGCHLD,
+		};
+		pid_t ret;
+
+		if (enter_userns())
+			_exit(1);
+
+		ret = sys_clone3(&args, sizeof(args));
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);
+			wait_for_pid(ret);
+			_exit(2);
+		}
+
+		if (errno != EINVAL)
+			_exit(3);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd.
+ */
+TEST_F(clone3_empty_mntns, with_pidfd)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_EMPTY_MNTNS | CLONE_PIDFD,
+			.exit_signal	= SIGCHLD,
+		};
+		int pidfd = -1;
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		args.pidfd = (uintptr_t)&pidfd;
+
+		inner = sys_clone3(&args, sizeof(args));
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(3);
+
+			_exit(0);
+		}
+
+		/* Verify we got a valid pidfd. */
+		if (pidfd < 0)
+			_exit(4);
+
+		close(pidfd);
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * clone3 without CAP_SYS_ADMIN must fail with EPERM.
+ */
+TEST_F(clone3_empty_mntns, eperm_without_caps)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t ret;
+
+		/* Skip if already root. */
+		if (getuid() == 0)
+			_exit(0);
+
+		ret = clone3_empty_mntns(0);
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);
+			wait_for_pid(ret);
+			_exit(1);
+		}
+
+		if (errno != EPERM)
+			_exit(2);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS.
+ */
+TEST_F(clone3_empty_mntns, parent_unchanged)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_before, nr_after;
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		nr_before = count_mounts();
+		if (nr_before < 1)
+			_exit(2);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0)
+			_exit(0);
+
+		if (wait_for_pid(inner) != 0)
+			_exit(4);
+
+		nr_after = count_mounts();
+		if (nr_after != nr_before)
+			_exit(5);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent with many mounts: child still gets exactly 1 mount.
+ */
+TEST_F(clone3_empty_mntns, many_parent_mounts)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX";
+		pid_t inner;
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		for (i = 0; i < 5; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(6);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(7);
+		}
+
+		if (count_mounts() < 5)
+			_exit(8);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(9);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(10);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify the child's root mount is nullfs with expected statmount properties.
+ */
+TEST_F(clone3_empty_mntns, mount_properties)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			struct statmount *sm;
+			uint64_t root_id;
+
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(3);
+
+			sm = statmount_alloc(root_id, 0,
+					     STATMOUNT_MNT_BASIC |
+					     STATMOUNT_MNT_POINT |
+					     STATMOUNT_FS_TYPE);
+			if (!sm)
+				_exit(4);
+
+			/* Root mount point is "/". */
+			if (!(sm->mask & STATMOUNT_MNT_POINT))
+				_exit(5);
+			if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+				_exit(6);
+
+			/* Filesystem type is nullfs. */
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(7);
+			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+				_exit(8);
+
+			/* Root mount is its own parent. */
+			if (!(sm->mask & STATMOUNT_MNT_BASIC))
+				_exit(9);
+			if (sm->mnt_parent_id != sm->mnt_id)
+				_exit(10);
+
+			free(sm);
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Listmount returns only the root mount in the child's empty namespace.
+ */
+TEST_F(clone3_empty_mntns, listmount_single_entry)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			uint64_t list[16];
+			ssize_t nr_mounts;
+			uint64_t root_id;
+
+			nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0);
+			if (nr_mounts != 1)
+				_exit(3);
+
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(4);
+
+			if (list[0] != root_id)
+				_exit(5);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Child can mount tmpfs over nullfs root (the primary container use case).
+ *
+ * Uses the new mount API (fsopen/fsmount/move_mount) because resolving
+ * "/" returns the process root directly without following overmounts.
+ * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs.
+ */
+TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			struct statmount *sm;
+			uint64_t root_id;
+			int fd, fsfd, mntfd;
+
+			if (count_mounts() != 1)
+				_exit(3);
+
+			/* Verify root is nullfs. */
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(4);
+
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+			if (!sm)
+				_exit(5);
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(6);
+			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+				_exit(7);
+			free(sm);
+
+			/* Create tmpfs via the new mount API. */
+			fsfd = sys_fsopen("tmpfs", 0);
+			if (fsfd < 0)
+				_exit(8);
+
+			if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING,
+					 "size", "1M", 0)) {
+				close(fsfd);
+				_exit(9);
+			}
+
+			if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE,
+					 NULL, NULL, 0)) {
+				close(fsfd);
+				_exit(10);
+			}
+
+			mntfd = sys_fsmount(fsfd, 0, 0);
+			close(fsfd);
+			if (mntfd < 0)
+				_exit(11);
+
+			/* Attach tmpfs to "/". */
+			if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+					   MOVE_MOUNT_F_EMPTY_PATH)) {
+				close(mntfd);
+				_exit(12);
+			}
+
+			if (count_mounts() != 2) {
+				close(mntfd);
+				_exit(13);
+			}
+
+			/* Enter the tmpfs. */
+			if (fchdir(mntfd)) {
+				close(mntfd);
+				_exit(14);
+			}
+
+			if (chroot(".")) {
+				close(mntfd);
+				_exit(15);
+			}
+
+			close(mntfd);
+
+			/* Verify "/" is now tmpfs. */
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(16);
+
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+			if (!sm)
+				_exit(17);
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(18);
+			if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+				_exit(19);
+			free(sm);
+
+			/* Verify tmpfs is writable. */
+			fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+			if (fd < 0)
+				_exit(20);
+
+			if (write(fd, "test", 4) != 4) {
+				close(fd);
+				_exit(21);
+			}
+			close(fd);
+
+			if (access("/testfile", F_OK))
+				_exit(22);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with
+ * distinct mount namespace root mount IDs.
+ */
+TEST_F(clone3_empty_mntns, repeated)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int pipe1[2], pipe2[2];
+		uint64_t id1 = 0, id2 = 0;
+		pid_t inner1, inner2;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (pipe(pipe1) || pipe(pipe2))
+			_exit(2);
+
+		inner1 = clone3_empty_mntns(0);
+		if (inner1 < 0)
+			_exit(3);
+
+		if (inner1 == 0) {
+			uint64_t root_id;
+
+			close(pipe1[0]);
+			root_id = get_unique_mnt_id("/");
+			if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+				_exit(1);
+			close(pipe1[1]);
+			_exit(0);
+		}
+
+		inner2 = clone3_empty_mntns(0);
+		if (inner2 < 0)
+			_exit(4);
+
+		if (inner2 == 0) {
+			uint64_t root_id;
+
+			close(pipe2[0]);
+			root_id = get_unique_mnt_id("/");
+			if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id))
+				_exit(1);
+			close(pipe2[1]);
+			_exit(0);
+		}
+
+		close(pipe1[1]);
+		close(pipe2[1]);
+
+		if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1))
+			_exit(5);
+		if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2))
+			_exit(6);
+
+		close(pipe1[0]);
+		close(pipe2[0]);
+
+		if (wait_for_pid(inner1) || wait_for_pid(inner2))
+			_exit(7);
+
+		/* Each child must have a distinct root mount ID. */
+		if (id1 == 0 || id2 == 0)
+			_exit(8);
+		if (id1 == id2)
+			_exit(9);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify setns() into a child's empty mount namespace works.
+ */
+TEST_F(clone3_empty_mntns, setns_into_child_mntns)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int pipe_fd[2], done_fd[2];
+		pid_t inner;
+		char c;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (pipe(pipe_fd) || pipe(done_fd))
+			_exit(2);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			/* Signal parent we're ready. */
+			close(pipe_fd[0]);
+			close(done_fd[1]);
+			if (write(pipe_fd[1], "r", 1) != 1)
+				_exit(1);
+
+			/* Stay alive until the parent closes done_fd[1]. */
+			if (read(done_fd[0], &c, 1) < 0)
+				_exit(2);
+			close(pipe_fd[1]);
+			_exit(0);
+		}
+
+		close(pipe_fd[1]);
+		close(done_fd[0]);
+
+		/* Wait for child to be ready. */
+		if (read(pipe_fd[0], &c, 1) != 1)
+			_exit(4);
+
+		/* Open child's mount namespace. */
+		{
+			char path[64];
+			int mntns_fd;
+
+			snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner);
+			mntns_fd = open(path, O_RDONLY);
+			if (mntns_fd < 0)
+				_exit(5);
+
+			if (setns(mntns_fd, CLONE_NEWNS))
+				_exit(6);
+
+			close(mntns_fd);
+		}
+
+		/* Now we should be in the child's empty mntns. */
+		if (count_mounts() != 1)
+			_exit(7);
+
+		/* EOF on done_fd releases the child; then reap it. */
+		close(pipe_fd[0]);
+		close(done_fd[1]);
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require CLONE_EMPTY_MNTNS support.
+ */
+
+/*
+ * Unknown 64-bit flags beyond the known set are rejected.
+ */
+TEST(unknown_flags_rejected)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= 0x800000000ULL,
+			.exit_signal	= SIGCHLD,
+		};
+		pid_t ret;
+
+		ret = sys_clone3(&args, sizeof(args));
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);
+			wait_for_pid(ret);
+			_exit(1);
+		}
+
+		if (errno != EINVAL)
+			_exit(2);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still
+ * copies the full mount tree.
+ */
+TEST(clone3_newns_full_copy)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_NEWNS,
+			.exit_signal	= SIGCHLD,
+		};
+		ssize_t parent_mounts;
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		parent_mounts = count_mounts();
+		if (parent_mounts < 1)
+			_exit(2);
+
+		inner = sys_clone3(&args, sizeof(args));
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			/* Full copy should have at least as many mounts. */
+			if (count_mounts() < parent_mounts)
+				_exit(1);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN

From c62a4766937edec2962d52e583276b459b739f2d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Feb 2026 01:40:26 +0100
Subject: [PATCH 10/18] move_mount: transfer MNT_LOCKED

When performing a mount-beneath operation the target mount can often be
locked:

  unshare(CLONE_NEWUSER | CLONE_NEWNS);
  mount --beneath -t tmpfs tmpfs /proc

will fail because the procfs mount on /proc became locked when the mount
namespace was created from the parent mount namespace. Same logic for:

  unshare(CLONE_NEWUSER | CLONE_NEWNS);
  mount --beneath -t tmpfs tmpfs /

MNT_LOCKED is raised to prevent an unprivileged mount namespace from
revealing whatever is under a given mount. To replace the rootfs we need
to handle that case though.

We can simply transfer the locked mount property from the top mount to
the mount beneath. The new mount we mounted beneath the top mount takes
over the job of the top mount in protecting the parent mount from being
revealed. This leaves us free to allow the top mount to be unmounted.

This also works during mount propagation and also works for the
non-MOVE_MOUNT_BENEATH case:

(1) move_mount(MOVE_MOUNT_BENEATH): @source_mnt->overmount always NULL
(2) move_mount():                   @source_mnt->overmount maybe !NULL

For (1) can_move_mount_beneath() rejects overmounted @source_mnt (We
could allow this but whatever it's not really a use-case and it's fugly
to move an overmounted mount stack around. What are you even doing? So
let's keep that restriction.)

For (2) we can have @source_mnt overmounted (Someone overmounted us
while we locked the target mount.). Both are fine. @source_mnt will be
mounted on whatever @q was mounted on and @q will be mounted on the top
of the @source_mnt mount stack. Even in such cases we can unlock @q and
lock @source_mnt if @q was locked.

This effectively makes mount propagation useful in cases where a mount
namespace has a locked mount somewhere and we propagate a new mount
beneath it but the mount namespace could never get at it because the old
top mount remains locked. Again, we just let the newly propagated mount
take over the protection and unlock the top mount.

Link: https://patch.msgid.link/20260224-work-mount-beneath-rootfs-v1-1-8c58bf08488f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 555f0a10de9a..0aa87e55cad9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2646,6 +2646,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 
 			if (unlikely(shorter) && child != source_mnt)
 				mp = shorter;
+			/*
+			 * If @q was locked it was meant to hide
+			 * whatever was under it. Let @child take over
+			 * that job and lock it, then we can unlock @q.
+			 * That'll allow another namespace to shed @q
+			 * and reveal @child. Clearly, that mounter
+			 * consented to this by not severing the mount
+			 * relationship. Otherwise, what's the point.
+			 */
+			if (IS_MNT_LOCKED(q)) {
+				child->mnt.mnt_flags |= MNT_LOCKED;
+				q->mnt.mnt_flags &= ~MNT_LOCKED;
+			}
 			mnt_change_mountpoint(r, mp, q);
 		}
 	}
@@ -3534,9 +3547,6 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 {
 	struct mount *parent_mnt_to = mnt_to->mnt_parent;
 
-	if (IS_MNT_LOCKED(mnt_to))
-		return -EINVAL;
-
 	/* Avoid creating shadow mounts during mount propagation. */
 	if (mnt_from->overmount)
 		return -EINVAL;

From ccfac16e0be52b674ac04fb5ba88c643f76ae0e1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Feb 2026 01:40:27 +0100
Subject: [PATCH 11/18] move_mount: allow MOVE_MOUNT_BENEATH on the rootfs

Allow MOVE_MOUNT_BENEATH to target the caller's rootfs. When the target
of a mount-beneath operation is the caller's root mount, verify that:

(1) The caller is located at the root of the mount, as enforced by
    path_mounted() in do_lock_mount().
(2) Propagation from the parent mount would not overmount the target,
    to avoid propagating beneath the rootfs of other mount namespaces.

The root-switching is decomposed into individually atomic, locally-scoped
steps: mount-beneath inserts the new root under the old one, chroot(".")
switches the caller's root, and umount2(".", MNT_DETACH) removes the old
root. Since each step only modifies the caller's own state, this avoids
cross-namespace vulnerabilities and inherent fork/unshare/setns races
that a chroot_fs_refs()-based approach would have.

Userspace can use the following workflow to switch roots:

    fd_tree = open_tree(-EBADF, "/newroot",
                        OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
    fchdir(fd_tree);
    move_mount(fd_tree, "", AT_FDCWD, "/",
               MOVE_MOUNT_BENEATH | MOVE_MOUNT_F_EMPTY_PATH);
    chroot(".");
    umount2(".", MNT_DETACH);

Link: https://patch.msgid.link/20260224-work-mount-beneath-rootfs-v1-2-8c58bf08488f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 0aa87e55cad9..5e57ed9800f5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2735,7 +2735,7 @@ static inline struct mount *where_to_mount(const struct path *path,
  * In all cases the location must not have been unmounted and the
  * chosen mountpoint must be allowed to be mounted on.  For "beneath"
  * case we also require the location to be at the root of a mount
- * that has a parent (i.e. is not a root of some namespace).
+ * that has something mounted on top of it (i.e. has an overmount).
  */
 static void do_lock_mount(const struct path *path,
 			  struct pinned_mountpoint *res,
@@ -3528,8 +3528,6 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
  * @mnt_to:   mount under which to mount
  * @mp:   mountpoint of @mnt_to
  *
- * - Make sure that nothing can be mounted beneath the caller's current
- *   root or the rootfs of the namespace.
  * - Make sure that the caller can unmount the topmost mount ensuring
  *   that the caller could reveal the underlying mountpoint.
  * - Ensure that nothing has been mounted on top of @mnt_from before we
@@ -3543,7 +3541,7 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
  */
 static int can_move_mount_beneath(const struct mount *mnt_from,
 				  const struct mount *mnt_to,
-				  const struct mountpoint *mp)
+				  struct pinned_mountpoint *mp)
 {
 	struct mount *parent_mnt_to = mnt_to->mnt_parent;
 
@@ -3551,15 +3549,6 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 	if (mnt_from->overmount)
 		return -EINVAL;
 
-	/*
-	 * Mounting beneath the rootfs only makes sense when the
-	 * semantics of pivot_root(".", ".") are used.
-	 */
-	if (&mnt_to->mnt == current->fs->root.mnt)
-		return -EINVAL;
-	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
-		return -EINVAL;
-
 	if (mount_is_ancestor(mnt_to, mnt_from))
 		return -EINVAL;
 
@@ -3569,7 +3558,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 	 * propagating a copy @c of @mnt_from on top of @mnt_to. This
 	 * defeats the whole purpose of mounting beneath another mount.
 	 */
-	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
+	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp))
 		return -EINVAL;
 
 	/*
@@ -3585,7 +3574,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 	 * @mnt_from beneath @mnt_to.
 	 */
 	if (check_mnt(mnt_from) &&
-	    propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+	    propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp))
 		return -EINVAL;
 
 	return 0;
@@ -3694,7 +3683,7 @@ static int do_move_mount(const struct path *old_path,
 
 		if (mp.parent != over->mnt_parent)
 			over = mp.parent->overmount;
-		err = can_move_mount_beneath(old, over, mp.mp);
+		err = can_move_mount_beneath(old, over, &mp);
 		if (err)
 			return err;
 	}

From bb5c17bc863d1ac9ee0d51d300d5399d632fe69f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Feb 2026 01:40:28 +0100
Subject: [PATCH 12/18] selftests/filesystems: add MOVE_MOUNT_BENEATH rootfs
 tests

Add tests for mounting beneath the rootfs using MOVE_MOUNT_BENEATH:

- beneath_rootfs_success: mount beneath /, fchdir, chroot, umount2
  MNT_DETACH -- verify root changed
- beneath_rootfs_old_root_stacked: after mount-beneath, verify old root
  parent is clone via statmount
- beneath_rootfs_in_chroot_fail: chroot into subdir of same mount,
  mount-beneath fails (dentry != mnt_root)
- beneath_rootfs_in_chroot_success: chroot into separate tmpfs mount,
  mount-beneath succeeds
- beneath_rootfs_locked_transfer: in user+mount ns: mount-beneath
  rootfs succeeds, MNT_LOCKED transfers, old root unmountable
- beneath_rootfs_locked_containment: in user+mount ns: after full
  root-switch workflow, new root is MNT_LOCKED (containment preserved)
- beneath_non_rootfs_locked_transfer: mounts created before
  unshare(CLONE_NEWUSER | CLONE_NEWNS) become locked; mount-beneath
  transfers MNT_LOCKED, displaced mount can be unmounted
- beneath_non_rootfs_locked_containment: same setup, verify new mount
  is MNT_LOCKED (containment preserved)

Link: https://patch.msgid.link/20260224-work-mount-beneath-rootfs-v1-3-8c58bf08488f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/Makefile              |   1 +
 .../filesystems/move_mount/.gitignore         |   2 +
 .../selftests/filesystems/move_mount/Makefile |  10 +
 .../filesystems/move_mount/move_mount_test.c  | 492 ++++++++++++++++++
 4 files changed, 505 insertions(+)
 create mode 100644 tools/testing/selftests/filesystems/move_mount/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/move_mount/Makefile
 create mode 100644 tools/testing/selftests/filesystems/move_mount/move_mount_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 450f13ba4cca..2d05b3e1a26e 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -38,6 +38,7 @@ TARGETS += filesystems/overlayfs
 TARGETS += filesystems/statmount
 TARGETS += filesystems/mount-notify
 TARGETS += filesystems/fuse
+TARGETS += filesystems/move_mount
 TARGETS += firmware
 TARGETS += fpu
 TARGETS += ftrace
diff --git a/tools/testing/selftests/filesystems/move_mount/.gitignore b/tools/testing/selftests/filesystems/move_mount/.gitignore
new file mode 100644
index 000000000000..c7557db30671
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+move_mount_test
diff --git a/tools/testing/selftests/filesystems/move_mount/Makefile b/tools/testing/selftests/filesystems/move_mount/Makefile
new file mode 100644
index 000000000000..5c5b199b464b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := move_mount_test
+
+include ../../lib.mk
+
+$(OUTPUT)/move_mount_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/move_mount/move_mount_test.c b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c
new file mode 100644
index 000000000000..f08f94b1f0ec
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c
@@ -0,0 +1,492 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../wrappers.h"
+#include "../utils.h"
+#include "../statmount/statmount.h"
+#include "../../kselftest_harness.h"
+
+#include <linux/stat.h>
+
+#ifndef MOVE_MOUNT_BENEATH
+#define MOVE_MOUNT_BENEATH 0x00000200
+#endif
+
+static uint64_t get_unique_mnt_id_fd(int fd)
+{
+	struct statx sx;
+	int ret;
+
+	ret = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &sx);
+	if (ret)
+		return 0;
+
+	if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE))
+		return 0;
+
+	return sx.stx_mnt_id;
+}
+
+/*
+ * Create a locked overmount stack at /mnt_dir for testing MNT_LOCKED
+ * transfer on non-rootfs mounts.
+ *
+ * Mounts tmpfs A at /mnt_dir, overmounts with tmpfs B, then enters a
+ * new user+mount namespace where both become locked. Returns the exit
+ * code to use on failure, or 0 on success.
+ */
+static int setup_locked_overmount(void)
+{
+	/* Isolate so mounts don't leak. */
+	if (unshare(CLONE_NEWNS))
+		return 1;
+	if (mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL))
+		return 2;
+
+	/*
+	 * Create mounts while still in the initial user namespace so
+	 * they become locked after the subsequent user namespace
+	 * unshare.
+	 */
+	rmdir("/mnt_dir");
+	if (mkdir("/mnt_dir", 0755))
+		return 3;
+
+	/* Mount tmpfs A */
+	if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL))
+		return 4;
+
+	/* Overmount with tmpfs B */
+	if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL))
+		return 5;
+
+	/*
+	 * Create user+mount namespace. Mounts A and B become locked
+	 * because they might be covering something that is not supposed
+	 * to be revealed.
+	 */
+	if (setup_userns())
+		return 6;
+
+	/* Sanity check: B must be locked */
+	if (!umount2("/mnt_dir", MNT_DETACH) || errno != EINVAL)
+		return 7;
+
+	return 0;
+}
+
+/*
+ * Create a detached tmpfs mount and return its fd, or -1 on failure.
+ */
+static int create_detached_tmpfs(void)
+{
+	int fs_fd, mnt_fd;
+
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	if (fs_fd < 0)
+		return -1;
+
+	if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
+		close(fs_fd);
+		return -1;
+	}
+
+	mnt_fd = sys_fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
+	close(fs_fd);
+	return mnt_fd;
+}
+
+FIXTURE(move_mount) {
+	uint64_t orig_root_id;
+};
+
+FIXTURE_SETUP(move_mount)
+{
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	self->orig_root_id = get_unique_mnt_id("/");
+	ASSERT_NE(self->orig_root_id, 0);
+}
+
+FIXTURE_TEARDOWN(move_mount)
+{
+}
+
+/*
+ * Test successful MOVE_MOUNT_BENEATH on the rootfs.
+ * Mount a clone beneath /, fchdir to the clone, chroot to switch root,
+ * then detach the old root.
+ */
+TEST_F(move_mount, beneath_rootfs_success)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	/* Switch root to the clone */
+	ASSERT_EQ(chroot("."), 0);
+
+	/* Verify "/" is now the clone */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that after MOVE_MOUNT_BENEATH on the rootfs the old root is
+ * stacked on top of the clone. Verify via statmount that the old
+ * root's parent is the clone.
+ */
+TEST_F(move_mount, beneath_rootfs_old_root_stacked)
+{
+	int fd_tree, ret;
+	uint64_t clone_id;
+	struct statmount sm;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	/* Old root's parent should now be the clone */
+	ASSERT_EQ(statmount(self->orig_root_id, 0, 0,
+			     STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id);
+
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs fails when chroot'd into a
+ * subdirectory of the same mount. The caller's fs->root.dentry doesn't
+ * match mnt->mnt_root so the kernel rejects it.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_fail)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id;
+
+	rmdir("/chroot_dir");
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+
+	chroot_id = get_unique_mnt_id("/chroot_dir");
+	ASSERT_NE(chroot_id, 0);
+	ASSERT_EQ(self->orig_root_id, chroot_id);
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	/*
+	 * Should fail: fs->root.dentry (/chroot_dir) doesn't match
+	 * the mount's mnt_root (/).
+	 */
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	close(fd_tree);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs succeeds when chroot'd into a
+ * separate tmpfs mount. The caller's root dentry matches the mount's
+ * mnt_root since it's a dedicated mount.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_success)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id, root_id;
+	struct statmount sm;
+
+	rmdir("/chroot_dir");
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+	ASSERT_EQ(mount("tmpfs", "/chroot_dir", "tmpfs", 0, NULL), 0);
+
+	chroot_id = get_unique_mnt_id("/chroot_dir");
+	ASSERT_NE(chroot_id, 0);
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+
+	ASSERT_EQ(get_unique_mnt_id("/"), chroot_id);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+
+	ASSERT_EQ(statmount(chroot_id, 0, 0,
+			     STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id);
+
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath rootfs in a user+mount
+ * namespace. After mount-beneath the new root gets MNT_LOCKED and the
+ * old root has MNT_LOCKED cleared so it can be unmounted.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_transfer)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0);
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * The old root should be unmountable (MNT_LOCKED was
+	 * transferred to the clone). If MNT_LOCKED wasn't
+	 * cleared, this would fail with EINVAL.
+	 */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+}
+
+/*
+ * Test containment invariant: after mount-beneath rootfs in a user+mount
+ * namespace, the new root must be MNT_LOCKED. The lock transfer from the
+ * old root preserves containment -- the process cannot unmount the new root
+ * to escape the namespace.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_containment)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0);
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	/* Sanity: rootfs must be locked in the new userns */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root (MNT_LOCKED was cleared from it) */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * The new root must be locked (MNT_LOCKED was transferred
+	 * from the old root). Attempting to unmount it must fail
+	 * with EINVAL, preserving the containment invariant.
+	 */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath a non-rootfs locked mount.
+ * Mounts created before unshare(CLONE_NEWUSER | CLONE_NEWNS) become locked
+ * in the new namespace. Mount-beneath transfers the lock from the displaced
+ * mount to the new mount, so the displaced mount can be unmounted.
+ */
+TEST_F(move_mount, beneath_non_rootfs_locked_transfer)
+{
+	int mnt_fd, ret;
+	uint64_t mnt_new_id, mnt_visible_id;
+
+	ASSERT_EQ(setup_locked_overmount(), 0);
+
+	mnt_fd = create_detached_tmpfs();
+	ASSERT_GE(mnt_fd, 0);
+
+	mnt_new_id = get_unique_mnt_id_fd(mnt_fd);
+	ASSERT_NE(mnt_new_id, 0);
+
+	/* Move mount beneath B (which is locked) */
+	ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(mnt_fd);
+
+	/*
+	 * B should now be unmountable (MNT_LOCKED was transferred
+	 * to the new mount beneath it). If MNT_LOCKED wasn't
+	 * cleared from B, this would fail with EINVAL.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0);
+
+	/* Verify the new mount is now visible */
+	mnt_visible_id = get_unique_mnt_id("/mnt_dir");
+	ASSERT_EQ(mnt_visible_id, mnt_new_id);
+}
+
+/*
+ * Test MNT_LOCKED containment when mounting beneath a non-rootfs mount
+ * that was locked during unshare(CLONE_NEWUSER | CLONE_NEWNS).
+ * Mounts created before unshare become locked in the new namespace.
+ * Mount-beneath transfers the lock, preserving containment: the new
+ * mount cannot be unmounted, but the displaced mount can.
+ */
+TEST_F(move_mount, beneath_non_rootfs_locked_containment)
+{
+	int mnt_fd, ret;
+	uint64_t mnt_new_id, mnt_visible_id;
+
+	ASSERT_EQ(setup_locked_overmount(), 0);
+
+	mnt_fd = create_detached_tmpfs();
+	ASSERT_GE(mnt_fd, 0);
+
+	mnt_new_id = get_unique_mnt_id_fd(mnt_fd);
+	ASSERT_NE(mnt_new_id, 0);
+
+	/*
+	 * Move new tmpfs beneath B at /mnt_dir.
+	 * Stack becomes: A -> new -> B
+	 * Lock transfers from B to new.
+	 */
+	ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(mnt_fd);
+
+	/*
+	 * B lost MNT_LOCKED -- unmounting it must succeed.
+	 * This reveals the new mount at /mnt_dir.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0);
+
+	/* Verify the new mount is now visible */
+	mnt_visible_id = get_unique_mnt_id("/mnt_dir");
+	ASSERT_EQ(mnt_visible_id, mnt_new_id);
+
+	/*
+	 * The new mount gained MNT_LOCKED -- unmounting it must
+	 * fail with EINVAL, preserving the containment invariant.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+TEST_HARNESS_MAIN

From ad4999496e73923adb524b24c2f448c9498476b5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 23 Mar 2026 15:05:07 +0100
Subject: [PATCH 13/18] mount: always duplicate mount

In the OPEN_TREE_NAMESPACE path, vfs_open_tree() resolves a path via
filename_lookup() without holding namespace_lock. Between the lookup
and create_new_namespace() acquiring namespace_lock via
LOCK_MOUNT_EXACT_COPY(), another thread can unmount the mount, setting
mnt->mnt_ns to NULL.

When create_new_namespace() then checks !mnt->mnt_ns it incorrectly
takes the swap-and-mntget path that was designed for fsmount()'s
detached mounts. This reuses a mount whose mnt_mp_list is in an
inconsistent state from the concurrent unmount, causing a general
protection fault in __umount_mnt() -> hlist_del_init(&mnt->mnt_mp_list)
during namespace teardown.

Remove the !mnt->mnt_ns special case entirely. Instead, always
duplicate the mount:

 - For OPEN_TREE_NAMESPACE, use __do_loopback(), which will properly
   clone the mount or reject it via may_copy_tree() if it was
   unmounted in the race window.
 - For fsmount(), use clone_mnt() directly (via the new MOUNT_COPY_NEW
   flag): the mount is freshly created by vfs_create_mount() and is
   not in any namespace, so __do_loopback()'s IS_MNT_UNBINDABLE,
   may_copy_tree(), and __has_locked_children() checks don't apply.

Reported-by: syzbot+e4470cc28308f2081ec8@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5e57ed9800f5..fe919abd2f01 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3086,8 +3086,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 	return file;
 }
 
+enum mount_copy_flags_t {
+	MOUNT_COPY_RECURSIVE    = (1 << 0),
+	MOUNT_COPY_NEW		= (1 << 1),
+};
+
 static struct mnt_namespace *create_new_namespace(struct path *path,
-						  bool recurse)
+						  enum mount_copy_flags_t flags)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct user_namespace *user_ns = current_user_ns();
@@ -3096,7 +3101,7 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	struct path to_path;
 	struct mount *mnt;
 	unsigned int copy_flags = 0;
-	bool locked = false;
+	bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE;
 
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
@@ -3135,22 +3140,10 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	 * the restrictions of creating detached bind-mounts. It has a
 	 * lot saner and simpler semantics.
 	 */
-	mnt = real_mount(path->mnt);
-	if (!mnt->mnt_ns) {
-		/*
-		 * If we're moving into a new mount namespace via
-		 * fsmount() swap the mount ids so the nullfs mount id
-		 * is the lowest in the mount namespace avoiding another
-		 * useless copy. This is fine we're not attached to any
-		 * mount namespace so the mount ids are pure decoration
-		 * at that point.
-		 */
-		swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique);
-		swap(mnt->mnt_id, new_ns_root->mnt_id);
-		mntget(&mnt->mnt);
-	} else {
+	if (flags & MOUNT_COPY_NEW)
+		mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags);
+	else
 		mnt = __do_loopback(path, recurse, copy_flags);
-	}
 	scoped_guard(mount_writer) {
 		if (IS_ERR(mnt)) {
 			emptied_ns = new_ns;
@@ -3179,11 +3172,12 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	return new_ns;
 }
 
-static struct file *open_new_namespace(struct path *path, bool recurse)
+static struct file *open_new_namespace(struct path *path,
+				       enum mount_copy_flags_t flags)
 {
 	struct mnt_namespace *new_ns;
 
-	new_ns = create_new_namespace(path, recurse);
+	new_ns = create_new_namespace(path, flags);
 	if (IS_ERR(new_ns))
 		return ERR_CAST(new_ns);
 	return open_namespace_file(to_ns_common(new_ns));
@@ -3232,7 +3226,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 		return ERR_PTR(ret);
 
 	if (flags & OPEN_TREE_NAMESPACE)
-		return open_new_namespace(&path, (flags & AT_RECURSIVE));
+		return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0);
 
 	if (flags & OPEN_TREE_CLONE)
 		return open_detached_copy(&path, flags);
@@ -4519,7 +4513,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 
 	if (flags & FSMOUNT_NAMESPACE)
 		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
-			      open_new_namespace(&new_path, 0));
+			      open_new_namespace(&new_path, MOUNT_COPY_NEW));
 
 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
 	if (IS_ERR(ns))

From a27e4642629381ed36d7e22d5b6fff5792ec31f6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 23 Mar 2026 16:31:30 +0100
Subject: [PATCH 14/18] selftests/statmount: remove duplicate wait_for_pid()

Remove the local static wait_for_pid() definition from
statmount_test_ns.c as it conflicts with the extern declaration in
utils.h. The identical function is already provided by utils.c.

Fixes: 3ac7ea91f3d0 ("selftests: add FSMOUNT_NAMESPACE tests")
Cc: <stable@kernel.org> # mainline only
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../filesystems/statmount/statmount_test_ns.c | 25 -------------------
 1 file changed, 25 deletions(-)

diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
index 063d9de46431..e500905e4c07 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -34,31 +34,6 @@ static void handle_result(int ret, const char *testname)
 		ksft_test_result_skip("%s\n", testname);
 }
 
-static inline int wait_for_pid(pid_t pid)
-{
-	int status, ret;
-
-again:
-	ret = waitpid(pid, &status, 0);
-	if (ret == -1) {
-		if (errno == EINTR)
-			goto again;
-
-		ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
-		return -1;
-	}
-
-	if (!WIFEXITED(status)) {
-		ksft_print_msg(
-		       "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
-		       WIFSIGNALED(status), WTERMSIG(status));
-		return -1;
-	}
-
-	ret = WEXITSTATUS(status);
-	return ret;
-}
-
 static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id)
 {
 	int fd = open(mnt_ns, O_RDONLY);

From 1a398a23787506360b4c766270de00abf51b27c8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 23 Mar 2026 17:00:09 +0100
Subject: [PATCH 15/18] selftests/empty_mntns: fix statmount_alloc() signature
 mismatch

empty_mntns.h includes ../statmount/statmount.h which provides a
4-argument statmount_alloc(mnt_id, mnt_ns_id, mask, flags), but then
redefines its own 3-argument version without the flags parameter. This
causes a build failure due to conflicting types.

Remove the duplicate definition from empty_mntns.h and update all
callers to pass 0 for the flags argument.

Fixes: 32f54f2bbccf ("selftests/filesystems: add tests for empty mount namespaces")
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../empty_mntns/clone3_empty_mntns_test.c     |  6 ++---
 .../filesystems/empty_mntns/empty_mntns.h     | 25 -------------------
 .../empty_mntns/empty_mntns_test.c            |  8 +++---
 .../empty_mntns/overmount_chroot_test.c       |  2 +-
 4 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
index 130cc1a1b407..d800f661cb7b 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
+++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
@@ -493,7 +493,7 @@ TEST_F(clone3_empty_mntns, mount_properties)
 			sm = statmount_alloc(root_id, 0,
 					     STATMOUNT_MNT_BASIC |
 					     STATMOUNT_MNT_POINT |
-					     STATMOUNT_FS_TYPE);
+					     STATMOUNT_FS_TYPE, 0);
 			if (!sm)
 				_exit(4);
 
@@ -607,7 +607,7 @@ TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
 			if (!root_id)
 				_exit(4);
 
-			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 			if (!sm)
 				_exit(5);
 			if (!(sm->mask & STATMOUNT_FS_TYPE))
@@ -668,7 +668,7 @@ TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
 			if (!root_id)
 				_exit(16);
 
-			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 			if (!sm)
 				_exit(17);
 			if (!(sm->mask & STATMOUNT_FS_TYPE))
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
index dfd24c88eec8..3d9c6b14bbef 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
@@ -22,29 +22,4 @@ static inline ssize_t count_mounts(void)
 	return listmount(LSMT_ROOT, 0, 0, list, sizeof(list) / sizeof(list[0]), 0);
 }
 
-static inline struct statmount *statmount_alloc(uint64_t mnt_id,
-						uint64_t mnt_ns_id,
-						uint64_t mask)
-{
-	size_t bufsize = 1 << 15;
-	struct statmount *buf;
-	int ret;
-
-	for (;;) {
-		buf = malloc(bufsize);
-		if (!buf)
-			return NULL;
-
-		ret = statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, 0);
-		if (ret == 0)
-			return buf;
-
-		free(buf);
-		if (errno != EOVERFLOW)
-			return NULL;
-
-		bufsize <<= 1;
-	}
-}
-
 #endif /* EMPTY_MNTNS_H */
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
index 733aad83dbbf..43e296b97d84 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
@@ -283,7 +283,7 @@ TEST_F(empty_mntns, cwd_reset)
 		if (root_id != cwd_id)
 			_exit(9);
 
-		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
+		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, 0);
 		if (!sm)
 			_exit(10);
 
@@ -320,7 +320,7 @@ TEST_F(empty_mntns, mount_properties)
 			_exit(3);
 
 		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
-				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE);
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0);
 		if (!sm)
 			_exit(4);
 
@@ -490,7 +490,7 @@ TEST_F(empty_mntns, overmount_tmpfs)
 			_exit(4);
 
 		/* Verify root is nullfs */
-		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 		if (!sm)
 			_exit(5);
 
@@ -574,7 +574,7 @@ TEST_F(empty_mntns, overmount_tmpfs)
 		if (!root_id)
 			_exit(19);
 
-		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE);
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 		if (!sm)
 			_exit(20);
 
diff --git a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
index 0b623d0c6bb9..6e21c58258c3 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
+++ b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
@@ -204,7 +204,7 @@ TEST(overmount_chroot)
 		/* Verify the root mount is tmpfs */
 		sm = statmount_alloc(root_id_after, 0,
 				     STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
-				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE);
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0);
 		if (!sm)
 			_exit(21);
 

From d38aa6cdee8e09d77ce3a6c5b04800fb3b146d69 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 23 Mar 2026 17:41:00 +0100
Subject: [PATCH 16/18] selftests/empty_mntns: fix wrong CLONE_EMPTY_MNTNS hex
 value in comment

CLONE_EMPTY_MNTNS is (1ULL << 37) = 0x2000000000ULL, not 0x400000000ULL.

Fixes: 5b8ffd63fbd9 ("selftests/filesystems: add clone3 tests for empty mount namespaces")
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
index d800f661cb7b..6370086f886d 100644
--- a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
+++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
@@ -4,7 +4,7 @@
  *
  * These tests exercise the clone3() code path for creating empty mount
  * namespaces, which is distinct from the unshare() path tested in
- * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (0x400000000ULL)
+ * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (0x2000000000ULL)
  * is a 64-bit flag that implies CLONE_NEWNS.  The implication happens in
  * kernel_clone() before copy_process(), unlike unshare() where it goes
  * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in

From 660c09404cdabfe969d58375e990d2955af59797 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 23 Mar 2026 17:41:08 +0100
Subject: [PATCH 17/18] selftests/fsmount_ns: add missing TARGETS and fix cap
 test

Add missing top-level kselftest TARGETS entries for empty_mntns and
fsmount_ns so that 'make kselftest' discovers and runs these tests.

Fix the requires_cap_sys_admin test, which always SKIPped because
fsopen() was called after enter_userns(), where CAP_SYS_ADMIN in the
mount namespace's user_ns is unavailable. Move fsopen()/fsconfig()
before fork() so the configured fs_fd is inherited by the child, which
then only needs to call fsmount() after dropping privileges.

Fixes: 3ac7ea91f3d0 ("selftests: add FSMOUNT_NAMESPACE tests")
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/Makefile              |  2 ++
 .../filesystems/fsmount_ns/fsmount_ns_test.c  | 29 +++++++++----------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 2d05b3e1a26e..5677dd8b2a76 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -39,6 +39,8 @@ TARGETS += filesystems/statmount
 TARGETS += filesystems/mount-notify
 TARGETS += filesystems/fuse
 TARGETS += filesystems/move_mount
+TARGETS += filesystems/empty_mntns
+TARGETS += filesystems/fsmount_ns
 TARGETS += firmware
 TARGETS += fpu
 TARGETS += ftrace
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
index 239abbe63814..b70b3051eed4 100644
--- a/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
+++ b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
@@ -388,12 +388,23 @@ TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
 {
 	pid_t pid;
 	int status;
+	int fs_fd;
+
+	/*
+	 * Prepare the configured filesystem fd as root before forking.
+	 * fsopen() requires CAP_SYS_ADMIN in the mount namespace's
+	 * user_ns, which won't be available after enter_userns().
+	 */
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	ASSERT_GE(fs_fd, 0);
+
+	ASSERT_EQ(sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
 
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
 	if (pid == 0) {
-		int fs_fd, fd;
+		int fd;
 
 		/* Child: drop privileges using utils.h helper */
 		if (enter_userns() != 0)
@@ -403,15 +414,6 @@ TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
 		if (caps_down() == 0)
 			_exit(3);
 
-		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
-		if (fs_fd < 0)
-			_exit(4);
-
-		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
-			close(fs_fd);
-			_exit(5);
-		}
-
 		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
 		close(fs_fd);
 
@@ -432,6 +434,7 @@ TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
 		_exit(7);
 	}
 
+	close(fs_fd);
 	ASSERT_EQ(waitpid(pid, &status, 0), pid);
 	ASSERT_TRUE(WIFEXITED(status));
 
@@ -448,12 +451,6 @@ TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
 	case 3:
 		SKIP(return, "caps_down failed");
 		break;
-	case 4:
-		SKIP(return, "fsopen failed in userns");
-		break;
-	case 5:
-		SKIP(return, "fsconfig CMD_CREATE failed in userns");
-		break;
 	case 6:
 		SKIP(return, "FSMOUNT_NAMESPACE not supported");
 		break;

From cad3bf1c330274d11f25f1b7afae9b9dba13fbd3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 31 Mar 2026 10:58:58 +0200
Subject: [PATCH 18/18] selftests/namespaces: remove unused utils.h include
 from listns_efault_test

Remove the inclusion of ../filesystems/utils.h from listns_efault_test.c.
The test doesn't use any symbols from that header. Including it alongside
../pidfd/pidfd.h causes a build failure because the two headers provide
wait_for_pid() with conflicting linkage (an extern declaration versus a
static inline definition):

  ../filesystems/utils.h:  extern int wait_for_pid(pid_t pid);
  ../pidfd/pidfd.h:        static inline int wait_for_pid(pid_t pid)

All symbols the test actually uses (create_child, read_nointr,
write_nointr, sys_pidfd_send_signal) come from pidfd.h.

Reported-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/all/acPV19IY3Gna6Ira@sirena.org.uk
Fixes: 07d7ad46dad4 ("selftests/namespaces: test for efault")
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 tools/testing/selftests/namespaces/listns_efault_test.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
index c7ed4023d7a8..b570746e917c 100644
--- a/tools/testing/selftests/namespaces/listns_efault_test.c
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -19,7 +19,6 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include "../kselftest_harness.h"
-#include "../filesystems/utils.h"
 #include "../pidfd/pidfd.h"
 #include "wrappers.h"