mount: add FSMOUNT_NAMESPACE

Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount namespace with the newly created filesystem attached to a copy of the real rootfs. This returns a namespace file descriptor instead of an O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for open_tree(). This allows creating a new filesystem and immediately placing it in a new mount namespace in a single operation, which is useful for container runtimes and other namespace-based isolation mechanisms. The rootfs mount is created before copying the real rootfs for the new namespace meaning that the mount namespace id for the mount of the root of the namespace is bigger than the child mounted on top of it. We've never explicitly given the guarantee for such ordering and I doubt anyone relies on it. Accepting that lets us avoid copying the mount again and also avoids having to massage may_copy_tree() to grant an exception for fsmount->mnt->mnt_ns being NULL. Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-3-5ef0a886e646@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2026-04-18 06:44:00 -04:00 · 2026-01-22 11:48:48 +01:00
parent ad4a3599e5
commit 5e8969bd19
2 changed files with 31 additions and 7 deletions
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3118,11 +3118,26 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	}

 	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick
-	 * to the restrictions of creating detached bind-mounts. It
-	 * has a lot saner and simpler semantics.
+	 * We don't emulate unshare()ing a mount namespace. We stick to
+	 * the restrictions of creating detached bind-mounts. It has a
+	 * lot saner and simpler semantics.
 	 */
-	mnt = __do_loopback(path, recurse, copy_flags);
+	mnt = real_mount(path->mnt);
+	if (!mnt->mnt_ns) {
+		/*
+		 * If we're moving into a new mount namespace via
+		 * fsmount() swap the mount ids so the nullfs mount id
+		 * is the lowest in the mount namespace avoiding another
+		 * useless copy. This is fine we're not attached to any
+		 * mount namespace so the mount ids are pure decoration
+		 * at that point.
+		 */
+		swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique);
+		swap(mnt->mnt_id, new_ns_root->mnt_id);
+		mntget(&mnt->mnt);
+	} else {
+		mnt = __do_loopback(path, recurse, copy_flags);
+	}
 	scoped_guard(mount_writer) {
 		if (IS_ERR(mnt)) {
 			emptied_ns = new_ns;
@@ -4401,11 +4416,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	unsigned int mnt_flags = 0;
 	long ret;

-	if (!may_mount())
+	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
+		return -EINVAL;
+
+	if ((flags & FSMOUNT_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;

-	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
-		return -EINVAL;
+	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
+		return -EPERM;

 	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
 		return -EINVAL;
@@ -4472,6 +4491,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	 */
 	vfs_clean_context(fc);

+	if (flags & FSMOUNT_NAMESPACE)
+		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+			      open_new_namespace(&new_path, 0));
+
 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
 	if (IS_ERR(ns))
 		return PTR_ERR(ns);