watchdog.c 15.7 KB
Newer Older
1 2 3 4 5
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
6 7 8
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 10 11
 * to those contributors as well.
 */

12 13
#define pr_fmt(fmt) "NMI watchdog: " fmt

14 15 16 17 18 19 20 21 22 23 24
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
25
#include <linux/smpboot.h>
26
#include <linux/sched/rt.h>
27 28

#include <asm/irq_regs.h>
29
#include <linux/kvm_para.h>
30 31
#include <linux/perf_event.h>

32
int watchdog_user_enabled = 1;
33
int __read_mostly watchdog_thresh = 10;
34
static int __read_mostly watchdog_running;
35
static u64 __read_mostly sample_period;
36 37 38 39 40 41

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
42 43
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
44
#ifdef CONFIG_HARDLOCKUP_DETECTOR
45 46
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
47 48 49 50 51 52 53 54
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif

/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
55
#ifdef CONFIG_HARDLOCKUP_DETECTOR
56 57
static int hardlockup_panic =
			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
58 59 60 61 62

static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
63 64
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
65
	else if (!strncmp(str, "0", 1))
66
		watchdog_user_enabled = 0;
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif

/* Non-zero: panic when a soft lockup is detected. */
unsigned int __read_mostly softlockup_panic =
	CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

/*
 * Parse the "softlockup_panic=" boot parameter as a number (base
 * auto-detected) and store it in softlockup_panic.  Always returns 1.
 */
static int __init softlockup_panic_setup(char *str)
{
	unsigned long value = simple_strtoul(str, NULL, 0);

	softlockup_panic = value;
	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

/*
 * "nowatchdog" boot parameter: clear watchdog_user_enabled so that
 * lockup_detector_init() does not start the watchdog.  Always returns 1.
 */
static int __init nowatchdog_setup(char *str)
{
	watchdog_user_enabled = 0;

	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/* deprecated */
/*
 * "nosoftlockup" boot parameter: has the same effect as "nowatchdog",
 * i.e. it clears watchdog_user_enabled.  Always returns 1.
 */
static int __init nosoftlockup_setup(char *str)
{
	watchdog_user_enabled = 0;

	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
/*  */

99 100 101 102 103 104 105
/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we couple
 * the thresholds with a factor: we make the soft threshold twice the amount of
 * time the hard threshold is.
 */
106
static int get_softlockup_thresh(void)
107 108 109
{
	return watchdog_thresh * 2;
}
110 111 112 113 114 115

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */
116
/* Current local_clock() value scaled down by 2^30, i.e. roughly seconds. */
static unsigned long get_timestamp(void)
{
	/* 2^30 ~= 10^9, so the shift stands in for a divide by NSEC_PER_SEC */
	return local_clock() >> 30;
}

121
static void set_sample_period(void)
122 123
{
	/*
124
	 * convert watchdog_thresh from seconds to ns
125 126 127 128
	 * the divide by 5 is to give hrtimer several chances (two
	 * or three with the current relation between the soft
	 * and hard thresholds) to increment before the
	 * hardlockup detector generates a warning
129
	 */
130
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
131 132 133 134 135
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
136
	__this_cpu_write(watchdog_touch_ts, get_timestamp());
137 138
}

139
void touch_softlockup_watchdog(void)
140
{
141
	__this_cpu_write(watchdog_touch_ts, 0);
142
}
143
EXPORT_SYMBOL(touch_softlockup_watchdog);
144

145
void touch_all_softlockup_watchdogs(void)
146 147 148 149 150 151 152 153 154 155 156 157
{
	int cpu;

	/*
	 * this is done lockless
	 * do we care if a 0 races with a timestamp?
	 * all it means is the softlock check starts one cycle later
	 */
	for_each_online_cpu(cpu)
		per_cpu(watchdog_touch_ts, cpu) = 0;
}

158
#ifdef CONFIG_HARDLOCKUP_DETECTOR
159 160
void touch_nmi_watchdog(void)
{
161 162 163 164 165 166 167 168
	/*
	 * Using __raw here because some code paths have
	 * preemption enabled.  If preemption is enabled
	 * then interrupts should be enabled too, in which
	 * case we shouldn't have to worry about the watchdog
	 * going off.
	 */
	__raw_get_cpu_var(watchdog_nmi_touch) = true;
169
	touch_softlockup_watchdog();
170 171 172
}
EXPORT_SYMBOL(touch_nmi_watchdog);

173 174
#endif

175 176 177 178 179 180
/*
 * Like touch_softlockup_watchdog(), but additionally request that the
 * timer callback calls sched_clock_tick() before taking the new timestamp.
 */
void touch_softlockup_watchdog_sync(void)
{
	/* flag first, then the zero timestamp that triggers the re-stamp */
	__raw_get_cpu_var(softlockup_touch_sync) = true;
	__raw_get_cpu_var(watchdog_touch_ts) = 0;
}

181
#ifdef CONFIG_HARDLOCKUP_DETECTOR
182
/* watchdog detector functions */
183
static int is_hardlockup(void)
184
{
185
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
186

187
	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
188 189
		return 1;

190
	__this_cpu_write(hrtimer_interrupts_saved, hrint);
191 192 193 194
	return 0;
}
#endif

195
/*
 * Returns the time elapsed since @touch_ts (get_timestamp() units) if it
 * exceeds the soft-lockup threshold, 0 otherwise.
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();
	unsigned long deadline = touch_ts + get_softlockup_thresh();

	/* Still within the allowed delay: nothing to report. */
	if (!time_after(now, deadline))
		return 0;

	return now - touch_ts;
}

206
#ifdef CONFIG_HARDLOCKUP_DETECTOR
207

208 209 210 211 212 213 214 215 216
/*
 * Attributes of the hard-lockup perf event: a pinned hardware cpu-cycle
 * counter that is created disabled.  The sample period is filled in by
 * watchdog_nmi_enable() before the event is created.
 */
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
217
static void watchdog_overflow_callback(struct perf_event *event,
218 219 220
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
221 222 223
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

224 225
	if (__this_cpu_read(watchdog_nmi_touch) == true) {
		__this_cpu_write(watchdog_nmi_touch, false);
226 227 228 229 230 231 232 233 234
		return;
	}

	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */
235 236 237
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();

238
		/* only print hardlockups once */
239
		if (__this_cpu_read(hard_watchdog_warn) == true)
240 241 242 243 244 245 246
			return;

		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);

247
		__this_cpu_write(hard_watchdog_warn, true);
248 249 250
		return;
	}

251
	__this_cpu_write(hard_watchdog_warn, false);
252 253
	return;
}
254 255
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

256 257
static void watchdog_interrupt_count(void)
{
258
	__this_cpu_inc(hrtimer_interrupts);
259
}
260 261 262

static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
263 264 265 266

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
267
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
268 269 270 271 272 273 274
	struct pt_regs *regs = get_irq_regs();
	int duration;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
275
	wake_up_process(__this_cpu_read(softlockup_watchdog));
276 277

	/* .. and repeat */
278
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 280

	if (touch_ts == 0) {
281
		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
282 283 284 285
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
286
			__this_cpu_write(softlockup_touch_sync, false);
287 288
			sched_clock_tick();
		}
289 290 291

		/* Clear the guest paused flag on watchdog reset */
		kvm_check_and_clear_guest_paused();
292 293 294 295 296 297 298 299 300 301
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/* check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */
302
	duration = is_softlockup(touch_ts);
303
	if (unlikely(duration)) {
304 305 306 307 308 309 310 311
		/*
		 * If a virtual machine is stopped by the host it can look to
		 * the watchdog like a soft lockup, check to see if the host
		 * stopped the vm before we issue the warning
		 */
		if (kvm_check_and_clear_guest_paused())
			return HRTIMER_RESTART;

312
		/* only warn once */
313
		if (__this_cpu_read(soft_watchdog_warn) == true)
314 315
			return HRTIMER_RESTART;

316
		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
317
			smp_processor_id(), duration,
318 319 320 321 322 323 324 325 326 327
			current->comm, task_pid_nr(current));
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

		if (softlockup_panic)
			panic("softlockup: hung tasks");
328
		__this_cpu_write(soft_watchdog_warn, true);
329
	} else
330
		__this_cpu_write(soft_watchdog_warn, false);
331 332 333 334

	return HRTIMER_RESTART;
}

335 336 337
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
	struct sched_param param = { .sched_priority = prio };
338

339 340 341 342
	sched_setscheduler(current, policy, &param);
}

static void watchdog_enable(unsigned int cpu)
343
{
344
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
345

346 347 348 349
	/* kick off the timer for the hardlockup detector */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;

350 351
	/* Enable the perf event */
	watchdog_nmi_enable(cpu);
352 353

	/* done here because hrtimer_start can only pin to smp_processor_id() */
354
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
355 356
		      HRTIMER_MODE_REL_PINNED);

357 358 359 360
	/* initialize timestamp */
	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
	__touch_watchdog();
}
361

362 363 364
static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
365

366 367 368 369
	watchdog_set_prio(SCHED_NORMAL, 0);
	hrtimer_cancel(hrtimer);
	/* disable the perf event */
	watchdog_nmi_disable(cpu);
370 371
}

372 373 374 375 376
/* .cleanup callback of watchdog_threads; @online is not used. */
static void watchdog_cleanup(unsigned int cpu, bool online)
{
	watchdog_disable(cpu);
}

377 378 379 380 381 382 383 384 385
/*
 * Non-zero if the hrtimer has fired since the watchdog thread last ran on
 * this cpu, i.e. the thread has work to do.
 */
static int watchdog_should_run(unsigned int cpu)
{
	unsigned long fired = __this_cpu_read(hrtimer_interrupts);
	unsigned long handled = __this_cpu_read(soft_lockup_hrtimer_cnt);

	return fired != handled;
}

/*
 * The watchdog thread function - touches the timestamp.
 *
386
 * It only runs once every sample_period nanoseconds (4 seconds by
387 388 389 390 391 392 393 394 395 396
 * default) to reset the softlockup timestamp. If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 */
static void watchdog(unsigned int cpu)
{
	unsigned long ticks = __this_cpu_read(hrtimer_interrupts);

	/* remember which timer tick we handled, then refresh the timestamp */
	__this_cpu_write(soft_lockup_hrtimer_cnt, ticks);
	__touch_watchdog();
}
397

398
#ifdef CONFIG_HARDLOCKUP_DETECTOR
399 400 401 402 403 404 405
/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 */
static unsigned long cpu0_err;

406
static int watchdog_nmi_enable(unsigned int cpu)
407 408 409 410 411 412 413 414 415 416 417 418 419
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	wd_attr = &wd_hw_attr;
420
	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
421 422

	/* Try to register using hardware perf events */
423
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
424 425 426 427 428

	/* save cpu0 error for future comparision */
	if (cpu == 0 && IS_ERR(event))
		cpu0_err = PTR_ERR(event);

429
	if (!IS_ERR(event)) {
430 431 432
		/* only print for cpu0 or different than cpu0 */
		if (cpu == 0 || cpu0_err)
			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
433 434 435
		goto out_save;
	}

436 437 438
	/* skip displaying the same error again */
	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
		return PTR_ERR(event);
439 440 441

	/* vary the KERN level based on the returned errno */
	if (PTR_ERR(event) == -EOPNOTSUPP)
442
		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
443
	else if (PTR_ERR(event) == -ENOENT)
444 445
		pr_warning("disabled (cpu%i): hardware events not enabled\n",
			 cpu);
446
	else
447 448
		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
			cpu, PTR_ERR(event));
449
	return PTR_ERR(event);
450 451 452 453 454 455 456 457 458 459

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

460
static void watchdog_nmi_disable(unsigned int cpu)
461 462 463 464 465 466 467 468 469 470 471 472 473
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* should be in cleanup, but blocks oprofile */
		perf_event_release_kernel(event);
	}
	return;
}
#else
474 475
/* Without CONFIG_HARDLOCKUP_DETECTOR there is no perf event to manage. */
static int watchdog_nmi_enable(unsigned int cpu)
{
	return 0;
}

static void watchdog_nmi_disable(unsigned int cpu)
{
}
476
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
477

478 479 480 481 482 483 484 485 486 487 488
/*
 * Descriptor of the per-cpu "watchdog/%u" threads handed to the smpboot
 * infrastructure.  Setup/unpark start the watchdog on a cpu, park/cleanup
 * stop it, and the thread function refreshes the soft-lockup timestamp.
 */
static struct smp_hotplug_thread watchdog_threads = {
	.store			= &softlockup_watchdog,
	.thread_should_run	= watchdog_should_run,
	.thread_fn		= watchdog,
	.thread_comm		= "watchdog/%u",
	.setup			= watchdog_enable,
	.cleanup		= watchdog_cleanup,
	.park			= watchdog_disable,
	.unpark			= watchdog_enable,
};

489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516
/*
 * Restart this cpu's watchdog hrtimer with the current sample_period.
 * Called through smp_call_function_single(); @info is unused.
 */
static void restart_watchdog_hrtimer(void *info)
{
	struct hrtimer *timer = &__raw_get_cpu_var(watchdog_hrtimer);

	/*
	 * Only a timer that was queued and has now been cancelled needs a
	 * restart.  If it is currently executing it will reprogram itself
	 * with the new period.  We should never see it unqueued here
	 * because we are running per-cpu with interrupts disabled.
	 */
	if (hrtimer_try_to_cancel(timer) != 1)
		return;

	hrtimer_start(timer, ns_to_ktime(sample_period),
			HRTIMER_MODE_REL_PINNED);
}

static void update_timers(int cpu)
{
	/*
	 * Make sure that perf event counter will adopt to a new
	 * sampling period. Updating the sampling period directly would
	 * be much nicer but we do not have an API for that now so
	 * let's use a big hammer.
	 * Hrtimer will adopt the new period on the next tick but this
	 * might be late already so we have to restart the timer as well.
	 */
	watchdog_nmi_disable(cpu);
517
	smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
	watchdog_nmi_enable(cpu);
}

/*
 * Apply a changed sample period to every online cpu.
 *
 * The cpu hotplug lock keeps the set of online cpus stable during the walk.
 * Preemption must stay enabled here: update_timers() releases and
 * re-creates the perf event through perf_event_release_kernel() and
 * perf_event_create_kernel_counter(), both of which may sleep.
 */
static void update_timers_all_cpus(void)
{
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		update_timers(cpu);
	put_online_cpus();
}

/*
 * Make sure the watchdog runs on all cpus.
 *
 * Registers the per-cpu threads if the watchdog is not running yet.  If it
 * is running and @sample_period_changed is true, the timers of all online
 * cpus are updated instead.  Returns 0 or the error from
 * smpboot_register_percpu_thread().
 */
static int watchdog_enable_all_cpus(bool sample_period_changed)
{
	int err;

	if (watchdog_running) {
		if (sample_period_changed)
			update_timers_all_cpus();
		return 0;
	}

	err = smpboot_register_percpu_thread(&watchdog_threads);
	if (err) {
		pr_err("Failed to create watchdog threads, disabled\n");
		return err;
	}

	watchdog_running = 1;
	return 0;
}

550 551 552
/* prepare/enable/disable routines */
/* sysctl functions */
#ifdef CONFIG_SYSCTL
553 554
static void watchdog_disable_all_cpus(void)
{
555 556
	if (watchdog_running) {
		watchdog_running = 0;
557
		smpboot_unregister_percpu_thread(&watchdog_threads);
558
	}
559 560 561
}

/*
562
 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
563 564
 */

565 566
/*
 * Sysctl handler: parses the value with proc_dointvec_minmax() and, on a
 * successful write, recomputes the sample period and starts, updates or
 * stops the watchdog to match the new settings.
 *
 * Calls are serialized by a function-local mutex.  If enabling the watchdog
 * fails, watchdog_thresh and watchdog_user_enabled are rolled back and the
 * sample period is recomputed so that it matches the restored threshold.
 *
 * Returns 0 on success, otherwise the error from proc_dointvec_minmax() or
 * watchdog_enable_all_cpus().
 */
int proc_dowatchdog(struct ctl_table *table, int write,
		    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old_thresh, old_enabled;
	static DEFINE_MUTEX(watchdog_proc_mutex);

	mutex_lock(&watchdog_proc_mutex);
	old_thresh = ACCESS_ONCE(watchdog_thresh);
	old_enabled = ACCESS_ONCE(watchdog_user_enabled);

	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (err || !write)
		goto out;

	set_sample_period();
	/*
	 * The 'watchdog_running' check inside the watchdog_*_all_cpus()
	 * functions makes enabling an already running watchdog, or
	 * disabling a stopped one, harmless.
	 */
	if (watchdog_user_enabled && watchdog_thresh)
		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
	else
		watchdog_disable_all_cpus();

	/* Restore old values on failure */
	if (err) {
		watchdog_thresh = old_thresh;
		watchdog_user_enabled = old_enabled;
		/* keep the derived period in sync with the restored threshold */
		set_sample_period();
	}
out:
	mutex_unlock(&watchdog_proc_mutex);
	return err;
}
#endif /* CONFIG_SYSCTL */

601
/*
 * Boot-time initialization: compute the sample period and, unless the
 * watchdog was disabled, start it on all cpus.
 */
void __init lockup_detector_init(void)
{
	set_sample_period();

	if (!watchdog_user_enabled)
		return;

	watchdog_enable_all_cpus(false);
}