Files
linux/kernel/hung_task.c
Aaron Tomlin 73d40c42f6 hung_task: explicitly report I/O wait state in log output
Currently, the hung task reporting mechanism indiscriminately labels all
TASK_UNINTERRUPTIBLE (D) tasks as "blocked", irrespective of whether they
are waiting for I/O completion or on a kernel locking primitive.  This
ambiguity forces system administrators to inspect stack traces manually to
discern whether the delay stems from an I/O wait (typically indicative of
hardware or filesystem problems) or from lock contention.  Such detailed
analysis is not always immediately accessible to administrators or support
engineers.

To address this, the patch utilises the existing in_iowait field within
struct task_struct to augment the failure report.  If the task is blocked
on I/O (i.e. in_iowait was set, e.g. via io_schedule_prepare()), the log
message is updated to explicitly state "blocked in I/O wait".
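
The wording is selected at print time from the task's in_iowait flag; the
relevant report line in hung_task_info() (shown in full in the source
below) becomes:

        pr_err("INFO: task %s:%d blocked%s for more than %ld seconds.\n",
               t->comm, t->pid, t->in_iowait ? " in I/O wait" : "",
               (jiffies - t->last_switch_time) / HZ);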

Examples:
        - Standard Block: "INFO: task bash:123 blocked for more than 120
          seconds".

        - I/O Block: "INFO: task dd:456 blocked in I/O wait for more than
          120 seconds".

Theoretically, a concurrent io_schedule_finish() could change in_iowait
between the moment the flag is read and the moment the backtrace is
printed, so the reported state may not precisely correlate with the
backtrace.  However, this limitation is deemed acceptable in practice.
The entire reporting mechanism is inherently racy by design; nevertheless,
it remains highly reliable in the vast majority of cases, particularly
because it primarily captures protracted stalls.  Consequently,
introducing additional synchronisation to mitigate this minor inaccuracy
would be entirely disproportionate to the problem.
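
For context, in_iowait is set and restored around an I/O sleep roughly as
follows (a simplified paraphrase of io_schedule() and its helpers in
kernel/sched/core.c, not part of this patch, with plug flushing omitted).
The race described above is simply the window between schedule() returning
and io_schedule_finish() restoring the flag:

        int io_schedule_prepare(void)
        {
                int old_iowait = current->in_iowait;

                current->in_iowait = 1;         /* observed by khungtaskd */
                return old_iowait;
        }

        void io_schedule_finish(int token)
        {
                current->in_iowait = token;     /* restored after the wait */
        }

        void __sched io_schedule(void)
        {
                int token = io_schedule_prepare();

                schedule();
                io_schedule_finish(token);
        }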

Link: https://lkml.kernel.org/r/20260303221324.4106917-1-atomlin@atomlin.com
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Lance Yang <lance.yang@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-03-27 21:19:40 -07:00

584 lines
15 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Detect Hung Task
*
* kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
*
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
#include <linux/rwsem.h>
#include <linux/sys_info.h>
#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Total number of tasks detected as hung since boot:
*/
static atomic_long_t sysctl_hung_task_detect_count = ATOMIC_LONG_INIT(0);
/*
* Limit number of tasks checked in a batch.
*
* This value controls the preemptibility of khungtaskd since preemption
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bounded.
*/
#define HUNG_TASK_LOCK_BREAK (HZ / 10)
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
*/
static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_call_panic;
static struct task_struct *watchdog_task;
/*
* A bitmask controlling what kinds of system info are printed when a
* hung task is detected: task, memory, lock, etc.  Refer to
* include/linux/sys_info.h for the detailed bit definitions.
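* The mask can be set at run time via /proc/sys/kernel/hung_task_sys_info
* (see the sysctl table below).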
*/
static unsigned long hung_task_si_mask;
#ifdef CONFIG_SMP
/*
* Should we dump all CPUs' backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
static unsigned int __read_mostly sysctl_hung_task_panic =
CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
return NOTIFY_DONE;
}
static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
static bool task_is_hung(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
unsigned int state = READ_ONCE(t->__state);
/*
* skip the TASK_KILLABLE tasks -- these can be killed
* skip the TASK_IDLE tasks -- those are genuinely idle
* skip the TASK_FROZEN tasks -- those are legitimately stopped by the freezer
*/
if (!(state & TASK_UNINTERRUPTIBLE) ||
(state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
return false;
/*
* When a freshly created task is scheduled once and changes its state to
* TASK_UNINTERRUPTIBLE without ever having been switched out, it
* mustn't be checked.
*/
if (unlikely(!switch_count))
return false;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
t->last_switch_time = jiffies;
return false;
}
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
return false;
return true;
}
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
const char *rwsem_blocked_by, *rwsem_blocked_as;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
blocker = READ_ONCE(task->blocker);
if (!blocker)
return;
blocker_type = hung_task_get_blocker_type(blocker);
switch (blocker_type) {
case BLOCKER_TYPE_MUTEX:
owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
break;
case BLOCKER_TYPE_SEM:
owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
break;
case BLOCKER_TYPE_RWSEM_READER:
case BLOCKER_TYPE_RWSEM_WRITER:
owner = (unsigned long)rwsem_owner(
hung_task_blocker_to_lock(blocker));
rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
"reader" : "writer";
rwsem_blocked_by = is_rwsem_reader_owned(
hung_task_blocker_to_lock(blocker)) ?
"reader" : "writer";
break;
default:
WARN_ON_ONCE(1);
return;
}
if (unlikely(!owner)) {
switch (blocker_type) {
case BLOCKER_TYPE_MUTEX:
pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
task->comm, task->pid);
break;
case BLOCKER_TYPE_SEM:
pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
task->comm, task->pid);
break;
case BLOCKER_TYPE_RWSEM_READER:
case BLOCKER_TYPE_RWSEM_WRITER:
pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
task->comm, task->pid);
break;
}
return;
}
/* Ensure the owner information is correct. */
for_each_process_thread(g, t) {
if ((unsigned long)t != owner)
continue;
switch (blocker_type) {
case BLOCKER_TYPE_MUTEX:
pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
task->comm, task->pid, t->comm, t->pid);
break;
case BLOCKER_TYPE_SEM:
pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
task->comm, task->pid, t->comm, t->pid);
break;
case BLOCKER_TYPE_RWSEM_READER:
case BLOCKER_TYPE_RWSEM_WRITER:
pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
task->comm, task->pid, rwsem_blocked_as, t->comm,
t->pid, rwsem_blocked_by);
break;
}
/* Avoid duplicated task dump, skip if the task is also hung. */
if (!task_is_hung(t, timeout))
sched_show_task(t);
return;
}
}
#else
static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
}
#endif
/**
* hung_task_info - Print diagnostic details for a hung task
* @t: Pointer to the detected hung task.
* @timeout: Timeout threshold for detecting hung tasks
* @this_round_count: Count of hung tasks detected in the current iteration
*
* Print structured information about the specified hung task, if warnings
* are enabled or if the panic batch threshold is exceeded.
*/
static void hung_task_info(struct task_struct *t, unsigned long timeout,
unsigned long this_round_count)
{
trace_sched_process_hang(t);
if (sysctl_hung_task_panic && this_round_count >= sysctl_hung_task_panic) {
console_verbose();
hung_task_call_panic = true;
}
/*
* The given task did not get scheduled for more than the hung task
* timeout. Therefore, complain accordingly.
*/
if (sysctl_hung_task_warnings || hung_task_call_panic) {
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked%s for more than %ld seconds.\n",
t->comm, t->pid, t->in_iowait ? " in I/O wait" : "",
(jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
if (t->flags & PF_POSTCOREDUMP)
pr_err(" Blocked by coredump.\n");
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
debug_show_blocker(t, timeout);
if (!sysctl_hung_task_warnings)
pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
}
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
*
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* to exit the grace period. For classic RCU, a reschedule is required.
*/
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
bool can_cont;
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
can_cont = pid_alive(g) && pid_alive(t);
put_task_struct(t);
put_task_struct(g);
return can_cont;
}
/*
* Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
* a really long time. If that happens, print out a warning.
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
unsigned long this_round_count;
int need_warning = sysctl_hung_task_warnings;
unsigned long si_mask = hung_task_si_mask;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if (test_taint(TAINT_DIE) || did_panic)
return;
this_round_count = 0;
rcu_read_lock();
for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
last_break = jiffies;
}
if (task_is_hung(t, timeout)) {
/*
* Increment the global counter so that userspace could
* start migrating tasks ASAP. But count the current
* round separately because userspace could reset
* the global counter at any time.
*/
atomic_long_inc(&sysctl_hung_task_detect_count);
this_round_count++;
hung_task_info(t, timeout, this_round_count);
}
}
unlock:
rcu_read_unlock();
if (!this_round_count)
return;
if (need_warning || hung_task_call_panic) {
si_mask |= SYS_INFO_LOCKS;
if (sysctl_hung_task_all_cpu_backtrace)
si_mask |= SYS_INFO_ALL_BT;
}
sys_info(si_mask);
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
}
static long hung_timeout_jiffies(unsigned long last_checked,
unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
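/*
* e.g. if the last check ran 30 s ago with a 120 s timeout, this returns
* 90 * HZ, i.e. khungtaskd will sleep for another 90 seconds.
*/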
return timeout ? last_checked - jiffies + timeout * HZ :
MAX_SCHEDULE_TIMEOUT;
}
#ifdef CONFIG_SYSCTL
/**
* proc_dohung_task_detect_count - proc handler for hung_task_detect_count
* @table: Pointer to the struct ctl_table definition for this proc entry
* @dir: Flag indicating the operation
* @buffer: User space buffer for data transfer
* @lenp: Pointer to the length of the data being transferred
* @ppos: Pointer to the current file offset
*
* This handler is used for reading the current hung task detection count
* and for resetting it to zero.  A write is accepted only if the value
* written is zero.
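*
* Example usage from user space:
*   cat /proc/sys/kernel/hung_task_detect_count       # read the count
*   echo 0 > /proc/sys/kernel/hung_task_detect_count  # reset it to zero
* Writing any non-zero value is rejected with -EINVAL.
*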
* Return: 0 on success, or a negative error code on failure.
*/
static int proc_dohung_task_detect_count(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long detect_count;
struct ctl_table proxy_table;
int err;
proxy_table = *table;
proxy_table.data = &detect_count;
if (SYSCTL_KERN_TO_USER(dir))
detect_count = atomic_long_read(&sysctl_hung_task_detect_count);
err = proc_doulongvec_minmax(&proxy_table, dir, buffer, lenp, ppos);
if (err < 0)
return err;
if (SYSCTL_USER_TO_KERN(dir)) {
if (detect_count)
return -EINVAL;
atomic_long_set(&sysctl_hung_task_detect_count, 0);
}
return 0;
}
/*
* Process updating of timeout sysctl
*/
static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto out;
wake_up_process(watchdog_task);
out:
return ret;
}
/*
* This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
* and hung_task_check_interval_secs
*/
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
{
.procname = "hung_task_all_cpu_backtrace",
.data = &sysctl_hung_task_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
{
.procname = "hung_task_panic",
.data = &sysctl_hung_task_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
.procname = "hung_task_timeout_secs",
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
.extra2 = (void *)&hung_task_timeout_max,
},
{
.procname = "hung_task_check_interval_secs",
.data = &sysctl_hung_task_check_interval_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
.extra2 = (void *)&hung_task_timeout_max,
},
{
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_NEG_ONE,
},
{
.procname = "hung_task_detect_count",
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_detect_count,
},
{
.procname = "hung_task_sys_info",
.data = &hung_task_si_mask,
.maxlen = sizeof(hung_task_si_mask),
.mode = 0644,
.proc_handler = sysctl_sys_info_handler,
},
};
static void __init hung_task_sysctl_init(void)
{
register_sysctl_init("kernel", hung_task_sysctls);
}
#else
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
{
atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
static bool hung_detector_suspended;
static int hungtask_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
switch (action) {
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
case PM_RESTORE_PREPARE:
hung_detector_suspended = true;
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
case PM_POST_RESTORE:
hung_detector_suspended = false;
break;
default:
break;
}
return NOTIFY_OK;
}
/*
* kthread which checks for tasks stuck in D state
*/
static int watchdog(void *dummy)
{
unsigned long hung_last_checked = jiffies;
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
unsigned long interval = sysctl_hung_task_check_interval_secs;
long t;
if (interval == 0)
interval = timeout;
interval = min_t(unsigned long, interval, timeout);
t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
if (!atomic_xchg(&reset_hung_task, 0) &&
!hung_detector_suspended)
check_hung_uninterruptible_tasks(timeout);
hung_last_checked = jiffies;
continue;
}
schedule_timeout_interruptible(t);
}
return 0;
}
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
/* Disable hung task detector on suspend */
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
hung_task_sysctl_init();
return 0;
}
subsys_initcall(hung_task_init);