/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>

#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>

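/*
 * watchdog_user_enabled mirrors the user's enable/disable request
 * (boot parameters or sysctl); watchdog_thresh is the hard-lockup
 * threshold in seconds, and the soft-lockup threshold is derived from
 * it in get_softlockup_thresh().
 */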
int watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
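
/*
 * When non-zero, the first soft-lockup report also triggers a backtrace
 * dump on all other CPUs (see watchdog_timer_fn()).
 */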
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif

static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
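
/* Bit 0 serializes the all-CPU backtrace dump across CPUs. */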
static unsigned long soft_lockup_nmi_warn;

/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;

static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_user_enabled = 0;
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif

unsigned int __read_mostly softlockup_panic =
			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

static int __init softlockup_panic_setup(char *str)
{
	softlockup_panic = simple_strtoul(str, NULL, 0);

	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

static int __init nowatchdog_setup(char *str)
{
	watchdog_user_enabled = 0;
	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
	watchdog_user_enabled = 0;
	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
/*  */
#ifdef CONFIG_SMP
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
	sysctl_softlockup_all_cpu_backtrace =
		!!simple_strtol(str, NULL, 0);
	return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
#endif

/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions, so we generally
 * want a higher threshold for soft lockups than for hard lockups. We couple
 * the two thresholds with a factor: the soft-lockup threshold is twice the
 * hard-lockup threshold.
 */
static int get_softlockup_thresh(void)
{
	return watchdog_thresh * 2;
}

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */
static unsigned long get_timestamp(void)
{
	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

static void set_sample_period(void)
{
	/*
	 * Convert watchdog_thresh from seconds to ns.
	 *
	 * The divide by 5 gives the hrtimer several chances (two or
	 * three with the current relation between the soft and hard
	 * thresholds) to increment hrtimer_interrupts before the
	 * hardlockup detector generates a warning.
	 */
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
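	/*
	 * With the defaults: watchdog_thresh = 10 gives a soft-lockup
	 * threshold of 20s and a sample_period of 20 * (1e9 / 5) ns = 4s,
	 * so the hard-lockup check sees two or three hrtimer ticks within
	 * its 10s window.
	 */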
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
	__this_cpu_write(watchdog_touch_ts, get_timestamp());
}

void touch_softlockup_watchdog(void)
{
	/*
	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp
	 * gets zeroed here, so use the raw_ operation.
	 */
	raw_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

void touch_all_softlockup_watchdogs(void)
{
	int cpu;

	/*
	 * This is done locklessly. Do we care if a 0 races with a
	 * timestamp? All it means is that the softlockup check starts
	 * one cycle later.
	 */
	for_each_online_cpu(cpu)
		per_cpu(watchdog_touch_ts, cpu) = 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
	/*
	 * Using __raw here because some code paths have
	 * preemption enabled.  If preemption is enabled
	 * then interrupts should be enabled too, in which
	 * case we shouldn't have to worry about the watchdog
	 * going off.
	 */
	__raw_get_cpu_var(watchdog_nmi_touch) = true;
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

#endif

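/*
 * Like touch_softlockup_watchdog(), but additionally asks the timer
 * callback to resynchronize the scheduler clock (sched_clock_tick())
 * before re-arming the timestamp.
 */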
void touch_softlockup_watchdog_sync(void)
{
	__raw_get_cpu_var(softlockup_touch_sync) = true;
	__raw_get_cpu_var(watchdog_touch_ts) = 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
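/*
 * A CPU is considered hard-locked when the per-CPU hrtimer interrupt
 * count has not advanced since the previous perf NMI came in.
 */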
static int is_hardlockup(void)
{
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
		return 1;

	__this_cpu_write(hrtimer_interrupts_saved, hrint);
	return 0;
}
#endif

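/*
 * Returns the stall time in seconds if the soft-lockup threshold has
 * been exceeded, 0 otherwise.
 */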
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();

	/* Warn about unreasonable delays: */
	if (time_after(now, touch_ts + get_softlockup_thresh()))
		return now - touch_ts;

	return 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR

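/*
 * Hard-lockup detection rides on a pinned HW_CPU_CYCLES perf event; its
 * sample period is sized in watchdog_nmi_enable() so that it overflows
 * roughly every watchdog_thresh seconds.
 */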
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	if (__this_cpu_read(watchdog_nmi_touch) == true) {
		__this_cpu_write(watchdog_nmi_touch, false);
		return;
	}

	/* Check for a hardlockup.
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before the perf counter overflowed.
	 * If it hasn't, this is a good indication the CPU is stuck.
	 */
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();

		/* only print hardlockups once */
		if (__this_cpu_read(hard_watchdog_warn) == true)
			return;

		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %d",
			      this_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
			     this_cpu);

		__this_cpu_write(hard_watchdog_warn, true);
		return;
	}

	__this_cpu_write(hard_watchdog_warn, false);
	return;
}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

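/*
 * Proof of life for the hard-lockup detector: the hrtimer callback bumps
 * this per-CPU counter and the perf overflow callback checks, via
 * is_hardlockup(), that it is still moving.
 */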
static void watchdog_interrupt_count(void)
{
	__this_cpu_inc(hrtimer_interrupts);
}

static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
	struct pt_regs *regs = get_irq_regs();
	int duration;
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
	wake_up_process(__this_cpu_read(softlockup_watchdog));

	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

	if (touch_ts == 0) {
		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__this_cpu_write(softlockup_touch_sync, false);
			sched_clock_tick();
		}

		/* Clear the guest paused flag on watchdog reset */
		kvm_check_and_clear_guest_paused();
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/* Check for a softlockup.
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't, this is a
	 * good indication some task is hogging the cpu.
	 */
	duration = is_softlockup(touch_ts);
	if (unlikely(duration)) {
		/*
		 * If a virtual machine is stopped by the host it can look to
		 * the watchdog like a soft lockup. Check to see if the host
		 * stopped the vm before we issue the warning.
		 */
		if (kvm_check_and_clear_guest_paused())
			return HRTIMER_RESTART;

		/* only warn once */
		if (__this_cpu_read(soft_watchdog_warn) == true) {
			/*
			 * When multiple processes are causing softlockups the
			 * softlockup detector only warns on the first one
			 * because the code relies on a full quiet cycle to
			 * re-arm.  The second process prevents the quiet cycle
			 * and never gets reported.  Use task pointers to detect
			 * this.
			 */
			if (__this_cpu_read(softlockup_task_ptr_saved) !=
			    current) {
				__this_cpu_write(soft_watchdog_warn, false);
				__touch_watchdog();
			}
			return HRTIMER_RESTART;
		}

		if (softlockup_all_cpu_backtrace) {
			/* Prevent multiple soft-lockup reports if one cpu is already
			 * engaged in dumping cpu back traces
			 */
			if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
				/* Someone else will report us. Let's give up */
				__this_cpu_write(soft_watchdog_warn, true);
				return HRTIMER_RESTART;
			}
		}

		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
			smp_processor_id(), duration,
			current->comm, task_pid_nr(current));
		__this_cpu_write(softlockup_task_ptr_saved, current);
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

		if (softlockup_all_cpu_backtrace) {
			/* Avoid generating two back traces for current
			 * given that one is already made above
			 */
			trigger_allbutself_cpu_backtrace();

			clear_bit(0, &soft_lockup_nmi_warn);
			/* Barrier to sync with other cpus */
			smp_mb__after_atomic();
		}

		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
		if (softlockup_panic)
			panic("softlockup: hung tasks");
		__this_cpu_write(soft_watchdog_warn, true);
	} else
		__this_cpu_write(soft_watchdog_warn, false);

	return HRTIMER_RESTART;
}

static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
	struct sched_param param = { .sched_priority = prio };

	sched_setscheduler(current, policy, &param);
}

static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;

	/* Enable the perf event */
	watchdog_nmi_enable(cpu);

	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);

	/* initialize timestamp */
	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
	__touch_watchdog();
}

static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	watchdog_set_prio(SCHED_NORMAL, 0);
	hrtimer_cancel(hrtimer);
	/* disable the perf event */
	watchdog_nmi_disable(cpu);
}

static void watchdog_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}

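/*
 * The per-CPU watchdog thread only needs to run when the hrtimer has
 * fired since the thread last ran, i.e. when there is a fresh interrupt
 * count to fold into soft_lockup_hrtimer_cnt.
 */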
static int watchdog_should_run(unsigned int cpu)
{
	return __this_cpu_read(hrtimer_interrupts) !=
		__this_cpu_read(soft_lockup_hrtimer_cnt);
}

/*
 * The watchdog thread function - touches the timestamp.
 *
 * It only runs once every sample_period (4 seconds by default) to
 * reset the softlockup timestamp. If this gets delayed for more than
 * 2*watchdog_thresh seconds then the debug-printout triggers in
 * watchdog_timer_fn().
 */
static void watchdog(unsigned int cpu)
{
	__this_cpu_write(soft_lockup_hrtimer_cnt,
			 __this_cpu_read(hrtimer_interrupts));
	__touch_watchdog();
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 */
static unsigned long cpu0_err;

static int watchdog_nmi_enable(unsigned int cpu)
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	wd_attr = &wd_hw_attr;
	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

	/* Try to register using hardware perf events */
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);

	/* save cpu0 error for future comparison */
	if (cpu == 0 && IS_ERR(event))
		cpu0_err = PTR_ERR(event);

	if (!IS_ERR(event)) {
		/* only print for cpu0, or for other cpus if cpu0 reported an error */
		if (cpu == 0 || cpu0_err)
			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
		goto out_save;
	}

	/* skip displaying the same error again */
	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
		return PTR_ERR(event);

	/* vary the KERN level based on the returned errno */
	if (PTR_ERR(event) == -EOPNOTSUPP)
		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
	else if (PTR_ERR(event) == -ENOENT)
		pr_warn("disabled (cpu%i): hardware events not enabled\n",
			 cpu);
	else
		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
			cpu, PTR_ERR(event));
	return PTR_ERR(event);

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

static void watchdog_nmi_disable(unsigned int cpu)
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* should be in cleanup, but blocks oprofile */
		perf_event_release_kernel(event);
	}
	if (cpu == 0) {
		/* watchdog_nmi_enable() expects this to be zero initially. */
		cpu0_err = 0;
	}
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

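/*
 * One "watchdog/%u" thread per CPU, managed by the smpboot
 * infrastructure: parked and unparked in step with CPU hotplug.
 */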
static struct smp_hotplug_thread watchdog_threads = {
	.store			= &softlockup_watchdog,
	.thread_should_run	= watchdog_should_run,
	.thread_fn		= watchdog,
	.thread_comm		= "watchdog/%u",
	.setup			= watchdog_enable,
	.cleanup		= watchdog_cleanup,
	.park			= watchdog_disable,
	.unpark			= watchdog_enable,
};

static void restart_watchdog_hrtimer(void *info)
{
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
	int ret;

	/*
	 * No need to cancel and restart hrtimer if it is currently executing
	 * because it will reprogram itself with the new period now.
	 * We should never see it unqueued here because we are running per-cpu
	 * with interrupts disabled.
	 */
	ret = hrtimer_try_to_cancel(hrtimer);
	if (ret == 1)
		hrtimer_start(hrtimer, ns_to_ktime(sample_period),
				HRTIMER_MODE_REL_PINNED);
}

static void update_timers(int cpu)
{
	/*
	 * Make sure that the perf event counter adapts to the new
	 * sampling period. Updating the sampling period directly would
	 * be much nicer, but we do not have an API for that now, so
	 * let's use a big hammer.
	 * The hrtimer will pick up the new period on its next tick, but
	 * that might already be too late, so restart the timer as well.
	 */
	watchdog_nmi_disable(cpu);
	smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
	watchdog_nmi_enable(cpu);
}

static void update_timers_all_cpus(void)
{
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		update_timers(cpu);
	put_online_cpus();
}

static int watchdog_enable_all_cpus(bool sample_period_changed)
{
	int err = 0;

	if (!watchdog_running) {
		err = smpboot_register_percpu_thread(&watchdog_threads);
		if (err)
			pr_err("Failed to create watchdog threads, disabled\n");
		else
			watchdog_running = 1;
	} else if (sample_period_changed) {
		update_timers_all_cpus();
	}

	return err;
}

/* prepare/enable/disable routines */
/* sysctl functions */
#ifdef CONFIG_SYSCTL
static void watchdog_disable_all_cpus(void)
{
	if (watchdog_running) {
		watchdog_running = 0;
		smpboot_unregister_percpu_thread(&watchdog_threads);
	}
}

/*
 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
 */

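/*
 * Writing e.g. "20" to /proc/sys/kernel/watchdog_thresh ends up here and
 * reprograms both the hrtimer period and the perf sample period on all
 * online CPUs via update_timers_all_cpus().
 */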
int proc_dowatchdog(struct ctl_table *table, int write,
		    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old_thresh, old_enabled;
	static DEFINE_MUTEX(watchdog_proc_mutex);

	mutex_lock(&watchdog_proc_mutex);
	old_thresh = ACCESS_ONCE(watchdog_thresh);
	old_enabled = ACCESS_ONCE(watchdog_user_enabled);

	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (err || !write)
		goto out;

	set_sample_period();
	/*
	 * The watchdog threads must not be registered or unregistered
	 * more than once; the 'watchdog_running' check inside the
	 * watchdog_*_all_cpus() functions takes care of this.
	 */
	if (watchdog_user_enabled && watchdog_thresh)
		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
	else
		watchdog_disable_all_cpus();

	/* Restore old values on failure */
	if (err) {
		watchdog_thresh = old_thresh;
		watchdog_user_enabled = old_enabled;
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}
#endif /* CONFIG_SYSCTL */

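/*
 * Boot-time entry point: compute the sample period and, unless the
 * watchdog was disabled on the command line, start the per-CPU
 * watchdog threads.
 */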
void __init lockup_detector_init(void)
{
	set_sample_period();

	if (watchdog_user_enabled)
		watchdog_enable_all_cpus(false);
}