/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

14 15 16 17 18 19
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
20
#include <linux/smpboot.h>
21
#include <linux/sched/rt.h>
22
#include <linux/tick.h>
23 24

#include <asm/irq_regs.h>
25
#include <linux/kvm_para.h>
26
#include <linux/perf_event.h>
27
#include <linux/kthread.h>
28

29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
/*
 * The run state of the lockup detectors is controlled by the content of the
 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
 *
 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
 * are variables that are only used as an 'interface' between the parameters
 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
 * 'watchdog_thresh' variable is handled differently because its value is not
 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
 * is equal to zero.
 */
#define NMI_WATCHDOG_ENABLED_BIT   0
#define SOFT_WATCHDOG_ENABLED_BIT  1
#define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)

/*
 * Taken by the /proc/sys/kernel handlers and by the lockup detector
 * suspend/resume functions in this file.
 */
static DEFINE_MUTEX(watchdog_proc_mutex);

48 49 50 51 52 53 54 55
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
int __read_mostly watchdog_user_enabled;
56
int __read_mostly watchdog_thresh = 10;
57

58 59
/*
 * When non-zero, a detected soft/hard lockup additionally triggers a
 * backtrace on all other CPUs. Compiled out to a constant 0 on !SMP.
 */
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
65 66 67 68 69 70
/* CPUs the watchdog is allowed to run on; filled in lockup_detector_init() */
static struct cpumask watchdog_cpumask __read_mostly;
/* Raw bitmap view of 'watchdog_cpumask' */
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

/*
 * Helper for online, unparked cpus: iterates over the CPUs that are set in
 * both cpu_online_mask and 'watchdog_cpumask'.
 */
#define for_each_watchdog_cpu(cpu) \
	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
71

72 73 74 75 76
/*
 * The 'watchdog_running' variable is set to 1 when the watchdog threads
 * are registered/started and is set to 0 when the watchdog threads are
 * unregistered/stopped, so it is an indicator whether the threads exist.
 */
77
static int __read_mostly watchdog_running;
78 79 80 81 82 83 84 85 86 87 88 89 90 91
/*
 * If a subsystem has a need to deactivate the watchdog temporarily, it
 * can use the suspend/resume interface to achieve this. The content of
 * the 'watchdog_suspended' variable reflects this state. Existing threads
 * are parked/unparked by the lockup_detector_{suspend|resume} functions
 * (see comment blocks pertaining to those functions for further details).
 *
 * 'watchdog_suspended' also prevents threads from being registered/started
 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
 * of 'watchdog_running' cannot change while the watchdog is deactivated
 * temporarily (see related code in 'proc' handlers).
 */
static int __read_mostly watchdog_suspended;

92
static u64 __read_mostly sample_period;
93 94 95 96 97 98

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
99 100
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
101
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
102
#ifdef CONFIG_HARDLOCKUP_DETECTOR
103 104
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
105 106 107
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
108
static unsigned long soft_lockup_nmi_warn;
109 110 111 112 113

/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
#ifdef CONFIG_HARDLOCKUP_DETECTOR
unsigned int __read_mostly hardlockup_panic =
			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
/* bit 0 is set once the all-CPU hard lockup backtrace has been triggered */
static unsigned long hardlockup_allcpu_dumped;
118 119 120 121 122 123 124 125
/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 */
126
void hardlockup_detector_disable(void)
127
{
128
	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
129 130
}

131 132 133 134
/*
 * Parse the "nmi_watchdog=" boot parameter.
 *
 * Only the first matching prefix is honoured:
 *   "panic"   - panic on a hard lockup
 *   "nopanic" - do not panic on a hard lockup
 *   "0"       - disable the hard lockup detector
 *   "1"       - enable the hard lockup detector
 * Anything else is silently ignored. Always returns 1.
 */
static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
	else if (!strncmp(str, "1", 1))
		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif

/* Non-zero: panic when a soft lockup is detected (see watchdog_timer_fn()) */
unsigned int __read_mostly softlockup_panic =
			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

/*
 * Parse the "softlockup_panic=" boot parameter: the number given on the
 * command line (base auto-detected) becomes the new 'softlockup_panic'.
 * Always returns 1.
 */
static int __init softlockup_panic_setup(char *str)
{
	unsigned long val = simple_strtoul(str, NULL, 0);

	softlockup_panic = val;
	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

/* "nowatchdog" boot parameter: disable both lockup detectors. */
static int __init nowatchdog_setup(char *str)
{
	watchdog_enabled = 0;
	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/* "nosoftlockup" boot parameter: disable only the soft lockup detector. */
static int __init nosoftlockup_setup(char *str)
{
	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
170

171 172 173 174 175 176 177 178
#ifdef CONFIG_SMP
/*
 * "softlockup_all_cpu_backtrace=" boot parameter: any non-zero number
 * turns the all-CPU backtrace on soft lockup on, zero turns it off.
 */
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
	long val = simple_strtol(str, NULL, 0);

	sysctl_softlockup_all_cpu_backtrace = (val != 0);
	return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
179 180 181 182 183 184 185
/*
 * "hardlockup_all_cpu_backtrace=" boot parameter: any non-zero number
 * turns the all-CPU backtrace on hard lockup on, zero turns it off.
 */
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
{
	long val = simple_strtol(str, NULL, 0);

	sysctl_hardlockup_all_cpu_backtrace = (val != 0);
	return 1;
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
186
#endif
187

188 189 190 191 192 193 194
/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we couple
 * the thresholds with a factor: we make the soft threshold twice the amount of
 * time the hard threshold is.
 */
195
static int get_softlockup_thresh(void)
196 197 198
{
	return watchdog_thresh * 2;
}
199 200 201 202 203 204

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */
static unsigned long get_timestamp(void)
{
	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

210
static void set_sample_period(void)
211 212
{
	/*
213
	 * convert watchdog_thresh from seconds to ns
214 215 216 217
	 * the divide by 5 is to give hrtimer several chances (two
	 * or three with the current relation between the soft
	 * and hard thresholds) to increment before the
	 * hardlockup detector generates a warning
218
	 */
219
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
220 221 222 223 224
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
225
	__this_cpu_write(watchdog_touch_ts, get_timestamp());
226 227
}

228
void touch_softlockup_watchdog(void)
229
{
230 231 232 233 234
	/*
	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp
	 * gets zeroed here, so use the raw_ operation.
	 */
	raw_cpu_write(watchdog_touch_ts, 0);
235
}
236
EXPORT_SYMBOL(touch_softlockup_watchdog);
237

238
void touch_all_softlockup_watchdogs(void)
239 240 241 242 243 244 245 246
{
	int cpu;

	/*
	 * this is done lockless
	 * do we care if a 0 races with a timestamp?
	 * all it means is the softlock check starts one cycle later
	 */
247
	for_each_watchdog_cpu(cpu)
248 249 250
		per_cpu(watchdog_touch_ts, cpu) = 0;
}

251
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
 * Make the next hard lockup check on this CPU a no-op (see
 * watchdog_overflow_callback()) and touch the soft lockup detector too.
 */
void touch_nmi_watchdog(void)
{
	/*
	 * Using __raw here because some code paths have
	 * preemption enabled.  If preemption is enabled
	 * then interrupts should be enabled too, in which
	 * case we shouldn't have to worry about the watchdog
	 * going off.
	 */
	raw_cpu_write(watchdog_nmi_touch, true);
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

#endif

268 269
void touch_softlockup_watchdog_sync(void)
{
270 271
	__this_cpu_write(softlockup_touch_sync, true);
	__this_cpu_write(watchdog_touch_ts, 0);
272 273
}

274
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */

/*
 * Returns true if this CPU's hrtimer interrupt count has not changed since
 * the previous call; otherwise remembers the new count and returns false.
 */
static bool is_hardlockup(void)
{
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
		return true;

	__this_cpu_write(hrtimer_interrupts_saved, hrint);
	return false;
}
#endif

288
static int is_softlockup(unsigned long touch_ts)
289
{
290
	unsigned long now = get_timestamp();
291

292 293 294 295 296
	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
		/* Warn about unreasonable delays. */
		if (time_after(now, touch_ts + get_softlockup_thresh()))
			return now - touch_ts;
	}
297 298 299
	return 0;
}

300
#ifdef CONFIG_HARDLOCKUP_DETECTOR
301

302 303 304 305 306 307 308 309 310
/*
 * Template for the per-CPU hard lockup perf event: a pinned hardware
 * CPU-cycles counter that is created disabled. The sample period is filled
 * in by watchdog_nmi_enable() before the event is created.
 */
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
311
static void watchdog_overflow_callback(struct perf_event *event,
312 313 314
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
315 316 317
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

318 319
	if (__this_cpu_read(watchdog_nmi_touch) == true) {
		__this_cpu_write(watchdog_nmi_touch, false);
320 321 322 323 324 325 326 327 328
		return;
	}

	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */
329 330
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();
331
		struct pt_regs *regs = get_irq_regs();
332

333
		/* only print hardlockups once */
334
		if (__this_cpu_read(hard_watchdog_warn) == true)
335 336
			return;

337 338 339 340 341
		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
342
		else
343 344 345 346 347 348 349 350 351 352 353 354
			dump_stack();

		/*
		 * Perform all-CPU dump only once to avoid multiple hardlockups
		 * generating interleaving traces
		 */
		if (sysctl_hardlockup_all_cpu_backtrace &&
				!test_and_set_bit(0, &hardlockup_allcpu_dumped))
			trigger_allbutself_cpu_backtrace();

		if (hardlockup_panic)
			panic("Hard LOCKUP");
355

356
		__this_cpu_write(hard_watchdog_warn, true);
357 358 359
		return;
	}

360
	__this_cpu_write(hard_watchdog_warn, false);
361 362
	return;
}
363 364
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

365 366
static void watchdog_interrupt_count(void)
{
367
	__this_cpu_inc(hrtimer_interrupts);
368
}
369 370 371

/* Forward declarations; the definitions follow further down in this file. */
static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);

static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);

376 377 378
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
379
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
380 381
	struct pt_regs *regs = get_irq_regs();
	int duration;
382
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
383 384 385 386 387

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
388
	wake_up_process(__this_cpu_read(softlockup_watchdog));
389 390

	/* .. and repeat */
391
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
392 393

	if (touch_ts == 0) {
394
		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
395 396 397 398
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
399
			__this_cpu_write(softlockup_touch_sync, false);
400 401
			sched_clock_tick();
		}
402 403 404

		/* Clear the guest paused flag on watchdog reset */
		kvm_check_and_clear_guest_paused();
405 406 407 408 409 410 411 412 413 414
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/* check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */
415
	duration = is_softlockup(touch_ts);
416
	if (unlikely(duration)) {
417 418 419 420 421 422 423 424
		/*
		 * If a virtual machine is stopped by the host it can look to
		 * the watchdog like a soft lockup, check to see if the host
		 * stopped the vm before we issue the warning
		 */
		if (kvm_check_and_clear_guest_paused())
			return HRTIMER_RESTART;

425
		/* only warn once */
426 427 428 429 430 431 432 433 434 435 436 437 438 439
		if (__this_cpu_read(soft_watchdog_warn) == true) {
			/*
			 * When multiple processes are causing softlockups the
			 * softlockup detector only warns on the first one
			 * because the code relies on a full quiet cycle to
			 * re-arm.  The second process prevents the quiet cycle
			 * and never gets reported.  Use task pointers to detect
			 * this.
			 */
			if (__this_cpu_read(softlockup_task_ptr_saved) !=
			    current) {
				__this_cpu_write(soft_watchdog_warn, false);
				__touch_watchdog();
			}
440
			return HRTIMER_RESTART;
441
		}
442

443 444 445 446 447 448 449 450 451 452 453
		if (softlockup_all_cpu_backtrace) {
			/* Prevent multiple soft-lockup reports if one cpu is already
			 * engaged in dumping cpu back traces
			 */
			if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
				/* Someone else will report us. Let's give up */
				__this_cpu_write(soft_watchdog_warn, true);
				return HRTIMER_RESTART;
			}
		}

454
		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
455
			smp_processor_id(), duration,
456
			current->comm, task_pid_nr(current));
457
		__this_cpu_write(softlockup_task_ptr_saved, current);
458 459 460 461 462 463 464
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

465 466 467 468 469 470 471 472 473 474 475
		if (softlockup_all_cpu_backtrace) {
			/* Avoid generating two back traces for current
			 * given that one is already made above
			 */
			trigger_allbutself_cpu_backtrace();

			clear_bit(0, &soft_lockup_nmi_warn);
			/* Barrier to sync with other cpus */
			smp_mb__after_atomic();
		}

J
Josh Hunt 已提交
476
		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
477 478
		if (softlockup_panic)
			panic("softlockup: hung tasks");
479
		__this_cpu_write(soft_watchdog_warn, true);
480
	} else
481
		__this_cpu_write(soft_watchdog_warn, false);
482 483 484 485

	return HRTIMER_RESTART;
}

486 487 488
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
	struct sched_param param = { .sched_priority = prio };
489

490 491 492 493
	sched_setscheduler(current, policy, &param);
}

static void watchdog_enable(unsigned int cpu)
494
{
495
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
496

497 498 499 500
	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;

501 502
	/* Enable the perf event */
	watchdog_nmi_enable(cpu);
503 504

	/* done here because hrtimer_start can only pin to smp_processor_id() */
505
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
506 507
		      HRTIMER_MODE_REL_PINNED);

508 509 510 511
	/* initialize timestamp */
	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
	__touch_watchdog();
}
512

513 514
static void watchdog_disable(unsigned int cpu)
{
515
	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
516

517 518 519 520
	watchdog_set_prio(SCHED_NORMAL, 0);
	hrtimer_cancel(hrtimer);
	/* disable the perf event */
	watchdog_nmi_disable(cpu);
521 522
}

523 524 525 526 527
/*
 * .cleanup callback of 'watchdog_threads': stops the detectors on @cpu.
 * The @online argument is not used.
 */
static void watchdog_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}

528 529 530 531 532 533 534 535 536
/*
 * .thread_should_run callback of 'watchdog_threads': non-zero when the
 * hrtimer has fired on this CPU since the watchdog thread last recorded
 * the interrupt count in watchdog(). @cpu is not used.
 */
static int watchdog_should_run(unsigned int cpu)
{
	unsigned long timer_ticks = __this_cpu_read(hrtimer_interrupts);
	unsigned long seen_ticks = __this_cpu_read(soft_lockup_hrtimer_cnt);

	return timer_ticks != seen_ticks;
}

/*
 * The watchdog thread function - touches the timestamp.
 *
537
 * It only runs once every sample_period seconds (4 seconds by
538 539 540 541 542 543 544 545 546
 * default) to reset the softlockup timestamp. If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 */
static void watchdog(unsigned int cpu)
{
	__this_cpu_write(soft_lockup_hrtimer_cnt,
			 __this_cpu_read(hrtimer_interrupts));
	__touch_watchdog();
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561

	/*
	 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
	 * failure path. Check for failures that can occur asynchronously -
	 * for example, when CPUs are on-lined - and shut down the hardware
	 * perf event on each CPU accordingly.
	 *
	 * The only non-obvious place this bit can be cleared is through
	 * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a
	 * pr_info here would be too noisy as it would result in a message
	 * every few seconds if the hardlockup was disabled but the softlockup
	 * enabled.
	 */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		watchdog_nmi_disable(cpu);
562
}
563

564
#ifdef CONFIG_HARDLOCKUP_DETECTOR
565 566 567 568 569 570 571
/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 *
 * Holds the PTR_ERR() value of a failed perf event creation on cpu 0;
 * reset to 0 by watchdog_nmi_disable(0).
 */
static unsigned long cpu0_err;

572
static int watchdog_nmi_enable(unsigned int cpu)
573 574 575 576
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

577 578 579
	/* nothing to do if the hard lockup detector is disabled */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		goto out;
580

581 582 583 584 585 586 587 588 589
	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	wd_attr = &wd_hw_attr;
590
	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
591 592

	/* Try to register using hardware perf events */
593
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
594 595 596 597 598

	/* save cpu0 error for future comparision */
	if (cpu == 0 && IS_ERR(event))
		cpu0_err = PTR_ERR(event);

599
	if (!IS_ERR(event)) {
600 601 602
		/* only print for cpu0 or different than cpu0 */
		if (cpu == 0 || cpu0_err)
			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
603 604 605
		goto out_save;
	}

606 607 608 609 610 611 612 613 614 615 616 617
	/*
	 * Disable the hard lockup detector if _any_ CPU fails to set up
	 * set up the hardware perf event. The watchdog() function checks
	 * the NMI_WATCHDOG_ENABLED bit periodically.
	 *
	 * The barriers are for syncing up watchdog_enabled across all the
	 * cpus, as clear_bit() does not use barriers.
	 */
	smp_mb__before_atomic();
	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
	smp_mb__after_atomic();

618 619 620
	/* skip displaying the same error again */
	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
		return PTR_ERR(event);
621 622 623

	/* vary the KERN level based on the returned errno */
	if (PTR_ERR(event) == -EOPNOTSUPP)
624
		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
625
	else if (PTR_ERR(event) == -ENOENT)
626
		pr_warn("disabled (cpu%i): hardware events not enabled\n",
627
			 cpu);
628
	else
629 630
		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
			cpu, PTR_ERR(event));
631 632 633

	pr_info("Shutting down hard lockup detector on all cpus\n");

634
	return PTR_ERR(event);
635 636 637 638 639 640 641 642 643 644

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

645
static void watchdog_nmi_disable(unsigned int cpu)
646 647 648 649 650 651 652 653 654 655
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* should be in cleanup, but blocks oprofile */
		perf_event_release_kernel(event);
	}
656 657 658 659
	if (cpu == 0) {
		/* watchdog_nmi_enable() expects this to be zero initially. */
		cpu0_err = 0;
	}
660
}
661

662
#else
663 664
/* No hard lockup detector configured: enabling trivially succeeds ... */
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
/* ... and disabling is a no-op. */
static void watchdog_nmi_disable(unsigned int cpu) { return; }
665
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
666

667 668 669 670 671 672 673 674 675 676 677
/*
 * Descriptor of the per-CPU "watchdog/%u" threads. It is registered in
 * watchdog_enable_all_cpus() and unregistered in watchdog_disable_all_cpus().
 * Setup and unpark both start the detectors via watchdog_enable(); park and
 * cleanup both stop them via watchdog_disable().
 */
static struct smp_hotplug_thread watchdog_threads = {
	.store			= &softlockup_watchdog,
	.thread_should_run	= watchdog_should_run,
	.thread_fn		= watchdog,
	.thread_comm		= "watchdog/%u",
	.setup			= watchdog_enable,
	.cleanup		= watchdog_cleanup,
	.park			= watchdog_disable,
	.unpark			= watchdog_enable,
};

678 679
/*
 * park all watchdog threads that are specified in 'watchdog_cpumask'
 *
 * This function returns an error if kthread_park() of a watchdog thread
 * fails. In this situation, the watchdog threads of some CPUs can already
 * be parked and the watchdog threads of other CPUs can still be runnable.
 * Callers are expected to handle this special condition as appropriate in
 * their context.
 */
static int watchdog_park_threads(void)
{
	int cpu, ret = 0;

	get_online_cpus();
	for_each_watchdog_cpu(cpu) {
		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
		if (ret)
			break;
	}
	put_online_cpus();

	return ret;
}

/*
 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
 */
static void watchdog_unpark_threads(void)
{
	int cpu;

	get_online_cpus();
	for_each_watchdog_cpu(cpu)
		kthread_unpark(per_cpu(softlockup_watchdog, cpu));
	put_online_cpus();
}

715 716 717
/*
 * Suspend the hard and soft lockup detector by parking the watchdog threads.
 */
718
int lockup_detector_suspend(void)
719 720 721
{
	int ret = 0;

722
	get_online_cpus();
723 724 725 726 727 728
	mutex_lock(&watchdog_proc_mutex);
	/*
	 * Multiple suspend requests can be active in parallel (counted by
	 * the 'watchdog_suspended' variable). If the watchdog threads are
	 * running, the first caller takes care that they will be parked.
	 * The state of 'watchdog_running' cannot change while a suspend
729
	 * request is active (see related code in 'proc' handlers).
730 731 732 733 734 735
	 */
	if (watchdog_running && !watchdog_suspended)
		ret = watchdog_park_threads();

	if (ret == 0)
		watchdog_suspended++;
736 737 738 739 740
	else {
		watchdog_disable_all_cpus();
		pr_err("Failed to suspend lockup detectors, disabled\n");
		watchdog_enabled = 0;
	}
741 742 743 744 745 746 747 748 749

	mutex_unlock(&watchdog_proc_mutex);

	return ret;
}

/*
 * Resume the hard and soft lockup detector by unparking the watchdog threads.
 */
750
void lockup_detector_resume(void)
751 752 753 754 755 756 757 758 759 760 761 762
{
	mutex_lock(&watchdog_proc_mutex);

	watchdog_suspended--;
	/*
	 * The watchdog threads are unparked if they were previously running
	 * and if there is no more active suspend request.
	 */
	if (watchdog_running && !watchdog_suspended)
		watchdog_unpark_threads();

	mutex_unlock(&watchdog_proc_mutex);
763
	put_online_cpus();
764 765
}

766
/*
 * Park and then unpark all watchdog threads, so that each of them runs
 * watchdog_disable() followed by watchdog_enable() with the current
 * settings. Returns 0 on success or the error from watchdog_park_threads(),
 * in which case the threads are not unparked here.
 */
static int update_watchdog_all_cpus(void)
{
	int ret;

	ret = watchdog_park_threads();
	if (ret)
		return ret;

	watchdog_unpark_threads();

	return 0;
}

779
static int watchdog_enable_all_cpus(void)
780
{
781
	int err = 0;
782

783
	if (!watchdog_running) {
784 785
		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
							     &watchdog_cpumask);
786 787
		if (err)
			pr_err("Failed to create watchdog threads, disabled\n");
788
		else
789
			watchdog_running = 1;
790 791 792 793 794
	} else {
		/*
		 * Enable/disable the lockup detectors or
		 * change the sample period 'on the fly'.
		 */
795 796 797 798 799 800
		err = update_watchdog_all_cpus();

		if (err) {
			watchdog_disable_all_cpus();
			pr_err("Failed to update lockup detectors, disabled\n");
		}
801
	}
802

803 804 805
	if (err)
		watchdog_enabled = 0;

806
	return err;
807 808 809 810
}

static void watchdog_disable_all_cpus(void)
{
811 812
	if (watchdog_running) {
		watchdog_running = 0;
813
		smpboot_unregister_percpu_thread(&watchdog_threads);
814
	}
815 816
}

817 818
#ifdef CONFIG_SYSCTL

819
/*
 * Update the run state of the lockup detectors.
 *
 * Returns 0, or the error from watchdog_enable_all_cpus().
 */
static int proc_watchdog_update(void)
{
	int err = 0;

	/*
	 * Watchdog threads won't be started if they are already active.
	 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
	 * care of this. If those threads are already active, the sample
	 * period will be updated and the lockup detectors will be enabled
	 * or disabled 'on the fly'.
	 */
	if (watchdog_enabled && watchdog_thresh)
		err = watchdog_enable_all_cpus();
	else
		watchdog_disable_all_cpus();

	return err;

}

842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861
/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to | 'which' contains the flag(s)
 * -------------------|-----------------------|-----------------------------
 * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
 *                    |                       | with SOFT_WATCHDOG_ENABLED
 * -------------------|-----------------------|-----------------------------
 * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
 * -------------------|-----------------------|-----------------------------
 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
 *
 * Returns 0 on success, -EAGAIN while the watchdog is suspended, or the
 * error from proc_dointvec_minmax() / proc_watchdog_update().
 */
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old, new;
	int *watchdog_param = (int *)table->data;

	mutex_lock(&watchdog_proc_mutex);

	if (watchdog_suspended) {
		/* no parameter changes allowed while watchdog is suspended */
		err = -EAGAIN;
		goto out;
	}

	/*
	 * If the parameter is being read return the state of the corresponding
	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
	 * run state of the lockup detectors.
	 */
	if (!write) {
		*watchdog_param = (watchdog_enabled & which) != 0;
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	} else {
		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
		if (err)
			goto out;

		/*
		 * There is a race window between fetching the current value
		 * from 'watchdog_enabled' and storing the new value. During
		 * this race window, watchdog_nmi_enable() can sneak in and
		 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
		 * The 'cmpxchg' detects this race and the loop retries.
		 */
		do {
			old = watchdog_enabled;
			/*
			 * If the parameter value is not zero set the
			 * corresponding bit(s), else clear it(them).
			 */
			if (*watchdog_param)
				new = old | which;
			else
				new = old & ~which;
		} while (cmpxchg(&watchdog_enabled, old, new) != old);

		/*
		 * Update the run state of the lockup detectors. There is _no_
		 * need to check the value returned by proc_watchdog_update()
		 * and to restore the previous value of 'watchdog_enabled' as
		 * both lockup detectors are disabled if proc_watchdog_update()
		 * returns an error.
		 */
		err = proc_watchdog_update();
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}

914 915 916 917 918 919 920 921 922 923 924 925
/*
 * /proc/sys/kernel/watchdog
 *
 * Reads and writes act on both lockup detector bits at once.
 */
int proc_watchdog(struct ctl_table *table, int write,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	const int which = NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED;

	return proc_watchdog_common(which, table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/nmi_watchdog
926
 */
927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942
int proc_nmi_watchdog(struct ctl_table *table, int write,
		      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
				    table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/soft_watchdog
 *
 * Reads and writes act on the soft lockup detector bit only.
 */
int proc_soft_watchdog(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	const int which = SOFT_WATCHDOG_ENABLED;

	return proc_watchdog_common(which, table, write, buffer, lenp, ppos);
}
943

944 945 946 947 948
/*
 * /proc/sys/kernel/watchdog_thresh
 *
 * On a successful write the sample period is recomputed and the run state
 * of the lockup detectors is updated; if that update fails the previous
 * threshold and sample period are restored. Returns 0 on success, -EAGAIN
 * while the watchdog is suspended, or the error from
 * proc_dointvec_minmax() / proc_watchdog_update().
 */
int proc_watchdog_thresh(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old;

	mutex_lock(&watchdog_proc_mutex);

	if (watchdog_suspended) {
		/* no parameter changes allowed while watchdog is suspended */
		err = -EAGAIN;
		goto out;
	}

	/* remember the current threshold so it can be restored on failure */
	old = ACCESS_ONCE(watchdog_thresh);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (err || !write)
		goto out;

	/*
	 * Update the sample period. Restore on failure.
	 */
	set_sample_period();
	err = proc_watchdog_update();
	if (err) {
		watchdog_thresh = old;
		set_sample_period();
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}
979 980 981 982 983 984 985 986 987 988 989 990 991

/*
 * The cpumask is the mask of possible cpus that the watchdog can run
 * on, not the mask of cpus it is actually running on.  This allows the
 * user to specify a mask that will include cpus that have not yet
 * been brought online, if desired.
 *
 * Returns 0 on success, -EAGAIN while the watchdog is suspended, or the
 * error from proc_do_large_bitmap(). A failure to propagate the new mask
 * to the running threads is only logged.
 */
int proc_watchdog_cpumask(struct ctl_table *table, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err;

	mutex_lock(&watchdog_proc_mutex);

	if (watchdog_suspended) {
		/* no parameter changes allowed while watchdog is suspended */
		err = -EAGAIN;
		goto out;
	}

	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
	if (!err && write) {
		/* Remove impossible cpus to keep sysctl output cleaner. */
		cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
			    cpu_possible_mask);

		if (watchdog_running) {
			/*
			 * Failure would be due to being unable to allocate
			 * a temporary cpumask, so we are likely not in a
			 * position to do much else to make things better.
			 */
			if (smpboot_update_cpumask_percpu_thread(
				    &watchdog_threads, &watchdog_cpumask) != 0)
				pr_err("cpumask update failed\n");
		}
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}

1021 1022
#endif /* CONFIG_SYSCTL */

1023
/*
 * Boot-time initialization: compute the sample period, pick the default
 * 'watchdog_cpumask' (housekeeping CPUs only when nohz_full is active, all
 * possible CPUs otherwise) and start the watchdog threads if any lockup
 * detector is enabled.
 */
void __init lockup_detector_init(void)
{
	set_sample_period();

#ifdef CONFIG_NO_HZ_FULL
	if (tick_nohz_full_enabled()) {
		pr_info("Disabling watchdog on nohz_full cores by default\n");
		cpumask_copy(&watchdog_cpumask, housekeeping_mask);
	} else
		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif

	if (watchdog_enabled)
		watchdog_enable_all_cpus();
}