mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
Previously, missing time namespace support in the vDSO meant that time namespaces needed to be disabled globally. This was expressed in a hard dependency on the generic vDSO library. This also meant that architectures without any vDSO or only a stub vDSO could not enable time namespaces. Now that all architectures using a real vDSO are using the generic library, that dependency is not necessary anymore. Remove the dependency and let all architectures enable time namespaces. Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de> Signed-off-by: Thomas Gleixner <tglx@kernel.org> Link: https://patch.msgid.link/20260326-vdso-timens-decoupling-v2-2-c82693a7775f@linutronix.de
161 lines
4.2 KiB
C
161 lines
4.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Author: Andrei Vagin <avagin@openvz.org>
|
|
* Author: Dmitry Safonov <dima@arista.com>
|
|
*/
|
|
|
|
#include <linux/cleanup.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/time_namespace.h>
|
|
#include <linux/time.h>
|
|
#include <linux/vdso_datastore.h>
|
|
|
|
#include <vdso/clocksource.h>
|
|
#include <vdso/datapage.h>
|
|
|
|
#include "namespace_internal.h"
|
|
|
|
static struct timens_offset offset_from_ts(struct timespec64 off)
|
|
{
|
|
struct timens_offset ret;
|
|
|
|
ret.sec = off.tv_sec;
|
|
ret.nsec = off.tv_nsec;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* A time namespace VVAR page has the same layout as the VVAR page which
|
|
* contains the system wide VDSO data.
|
|
*
|
|
* For a normal task the VVAR pages are installed in the normal ordering:
|
|
* VVAR
|
|
* PVCLOCK
|
|
* HVCLOCK
|
|
* TIMENS <- Not really required
|
|
*
|
|
* Now for a timens task the pages are installed in the following order:
|
|
* TIMENS
|
|
* PVCLOCK
|
|
* HVCLOCK
|
|
* VVAR
|
|
*
|
|
* The check for vdso_clock->clock_mode is in the unlikely path of
|
|
* the seq begin magic. So for the non-timens case most of the time
|
|
* 'seq' is even, so the branch is not taken.
|
|
*
|
|
* If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
|
|
* for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
|
|
* update to finish and for 'seq' to become even anyway.
|
|
*
|
|
* Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
|
|
* enforces the time namespace handling path.
|
|
*/
|
|
static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
|
|
struct time_namespace *ns)
|
|
{
|
|
struct timens_offset *offset = vc->offset;
|
|
struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
|
|
struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
|
|
|
|
vc->seq = 1;
|
|
vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
|
|
offset[CLOCK_MONOTONIC] = monotonic;
|
|
offset[CLOCK_MONOTONIC_RAW] = monotonic;
|
|
offset[CLOCK_MONOTONIC_COARSE] = monotonic;
|
|
offset[CLOCK_BOOTTIME] = boottime;
|
|
offset[CLOCK_BOOTTIME_ALARM] = boottime;
|
|
}
|
|
|
|
struct page *find_timens_vvar_page(struct vm_area_struct *vma)
|
|
{
|
|
if (likely(vma->vm_mm == current->mm))
|
|
return current->nsproxy->time_ns->vvar_page;
|
|
|
|
/*
|
|
* VM_PFNMAP | VM_IO protect .fault() handler from being called
|
|
* through interfaces like /proc/$pid/mem or
|
|
* process_vm_{readv,writev}() as long as there's no .access()
|
|
* in special_mapping_vmops().
|
|
* For more details check_vma_flags() and __access_remote_vm()
|
|
*/
|
|
|
|
WARN(1, "vvar_page accessed remotely");
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void timens_set_vvar_page(struct task_struct *task,
|
|
struct time_namespace *ns)
|
|
{
|
|
struct vdso_time_data *vdata;
|
|
struct vdso_clock *vc;
|
|
unsigned int i;
|
|
|
|
if (ns == &init_time_ns)
|
|
return;
|
|
|
|
/* Fast-path, taken by every task in namespace except the first. */
|
|
if (likely(ns->frozen_offsets))
|
|
return;
|
|
|
|
guard(mutex)(&timens_offset_lock);
|
|
/* Nothing to-do: vvar_page has been already initialized. */
|
|
if (ns->frozen_offsets)
|
|
return;
|
|
|
|
ns->frozen_offsets = true;
|
|
vdata = page_address(ns->vvar_page);
|
|
vc = vdata->clock_data;
|
|
|
|
for (i = 0; i < CS_BASES; i++)
|
|
timens_setup_vdso_clock_data(&vc[i], ns);
|
|
|
|
if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
|
|
for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
|
|
timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The vvar page layout depends on whether a task belongs to the root or
|
|
* non-root time namespace. Whenever a task changes its namespace, the VVAR
|
|
* page tables are cleared and then they will be re-faulted with a
|
|
* corresponding layout.
|
|
* See also the comment near timens_setup_vdso_clock_data() for details.
|
|
*/
|
|
static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|
{
|
|
struct mm_struct *mm = task->mm;
|
|
struct vm_area_struct *vma;
|
|
VMA_ITERATOR(vmi, mm, 0);
|
|
|
|
guard(mmap_read_lock)(mm);
|
|
for_each_vma(vmi, vma) {
|
|
if (vma_is_special_mapping(vma, &vdso_vvar_mapping))
|
|
zap_vma_pages(vma);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
|
|
{
|
|
timens_set_vvar_page(tsk, ns);
|
|
vdso_join_timens(tsk, ns);
|
|
}
|
|
|
|
int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
|
|
{
|
|
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
|
if (!ns->vvar_page)
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
}
|
|
|
|
void timens_vdso_free_vvar_page(struct time_namespace *ns)
|
|
{
|
|
__free_page(ns->vvar_page);
|
|
}
|