/*
 * Thermal throttle event support code (such as syslog messaging and rate
 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
 *
 * This allows consistent reporting of CPU thermal throttle events.
 *
 * Maintains a counter in /sys that keeps track of the number of thermal
 * events, such that the user knows how bad the thermal problem might be
 * (since the logging to syslog and mcelog is rate limited).
 *
 * Author: Dmitriy Zavin (dmitriyz@google.com)
 *
 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
 *          Inspired by Ross Biro's and Al Borchers' counter code.
 */
16
#include <linux/interrupt.h>
I
Ingo Molnar 已提交
17 18
#include <linux/notifier.h>
#include <linux/jiffies.h>
19
#include <linux/kernel.h>
20
#include <linux/percpu.h>
21
#include <linux/export.h>
22 23 24
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
25
#include <linux/cpu.h>
I
Ingo Molnar 已提交
26

27 28
#include <asm/processor.h>
#include <asm/apic.h>
29 30
#include <asm/idle.h>
#include <asm/mce.h>
31
#include <asm/msr.h>
32 33

/* How long to wait between reporting thermal events */
I
Ingo Molnar 已提交
34
/*
 * Rate-limit window, in jiffies: added to the current get_jiffies_64()
 * value to compute the next time an event of the same kind may be reported.
 */
#define CHECK_INTERVAL		(300 * HZ)
35

36 37 38
/* Event kinds accepted by therm_throt_process() as its 'event' argument: */
#define THERMAL_THROTTLING_EVENT	0
#define POWER_LIMIT_EVENT		1

39
/*
40
 * Current thermal event state:
41
 */
42
struct _thermal_state {
43 44
	bool			new_event;
	int			event;
45
	u64			next_check;
46 47
	unsigned long		count;
	unsigned long		last_count;
48
};
I
Ingo Molnar 已提交
49

50
struct thermal_state {
51 52 53 54
	struct _thermal_state core_throttle;
	struct _thermal_state core_power_limit;
	struct _thermal_state package_throttle;
	struct _thermal_state package_power_limit;
55 56
	struct _thermal_state core_thresh0;
	struct _thermal_state core_thresh1;
57 58
	struct _thermal_state pkg_thresh0;
	struct _thermal_state pkg_thresh1;
59 60
};

61 62
/*
 * Callback to handle core threshold interrupts.
 *
 * Optional (may be NULL). notify_thresholds() calls it with the value read
 * from MSR_IA32_THERM_STATUS when a threshold log bit is set and the
 * per-threshold rate limit allows it. The return value is not used by the
 * callers in this file.
 */
int (*platform_thermal_notify)(__u64 msr_val);
63
EXPORT_SYMBOL(platform_thermal_notify);
64

65 66 67 68 69 70 71 72 73 74
/*
 * Callback to handle package threshold interrupts.
 *
 * Optional (may be NULL). notify_package_thresholds() calls it with the
 * value read from MSR_IA32_PACKAGE_THERM_STATUS.
 */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);

/*
 * Optional rate-control query. When it is set and returns true, the package
 * notify callback does its own rate control, so notify_package_thresholds()
 * calls it without applying thresh_event_valid().
 */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);


75 76 77
/* Per-CPU event bookkeeping, accessed via per_cpu(thermal_state, cpu). */
static DEFINE_PER_CPU(struct thermal_state, thermal_state);

/*
 * Set to 1 at the end of intel_init_thermal() once the thermal interrupt
 * has been set up; thermal_throttle_init_device() skips the sysfs setup
 * while it is still 0.
 */
static atomic_t therm_throt_en	= ATOMIC_INIT(0);
78

79 80
/*
 * APIC_LVTTHMR value saved by mcheck_intel_therm_init(); consulted (and
 * possibly written back to the local APIC) by intel_init_thermal().
 */
static u32 lvtthmr_init __read_mostly;

81
#ifdef CONFIG_SYSFS
82 83 84
/*
 * Declare the read-only (0444) device attribute dev_attr_<_name>, backed by
 * the show handler therm_throt_device_show_<_name>() and no store handler.
 */
#define define_therm_throt_device_one_ro(_name)				\
	static DEVICE_ATTR(_name, 0444,					\
			   therm_throt_device_show_##_name, NULL)
I
Ingo Molnar 已提交
86

87
/*
 * Generate therm_throt_device_show_<event>_<name>(): a sysfs show handler
 * that prints per_cpu(thermal_state, cpu).<event>.<name> as "%lu\n" for the
 * CPU identified by dev->id. Returns the number of characters written, or 0
 * (nothing written) when that CPU is not online.
 */
#define define_therm_throt_device_show_func(event, name)		\
									\
static ssize_t therm_throt_device_show_##event##_##name(		\
			struct device *dev,				\
			struct device_attribute *attr,			\
			char *buf)					\
{									\
	unsigned int cpu = dev->id;					\
	ssize_t len = 0;						\
									\
	preempt_disable();	/* CPU hotplug */			\
	if (cpu_online(cpu))						\
		len = sprintf(buf, "%lu\n",				\
			      per_cpu(thermal_state, cpu).event.name);	\
	preempt_enable();						\
									\
	return len;							\
}

108 109
/* Core-level event counters: */
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);

define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);

/* Package-level event counters: */
define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);

define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
119

120
static struct attribute *thermal_throttle_attrs[] = {
121
	&dev_attr_core_throttle_count.attr,
122 123 124
	NULL
};

125
static struct attribute_group thermal_attr_group = {
I
Ingo Molnar 已提交
126 127
	.attrs	= thermal_throttle_attrs,
	.name	= "thermal_throttle"
128 129
};
#endif /* CONFIG_SYSFS */
130

131 132 133
/*
 * Scope of an event, passed as the 'level' argument of
 * therm_throt_process() and thresh_event_valid():
 */
#define CORE_LEVEL	0
#define PACKAGE_LEVEL	1

134
/***
135
 * therm_throt_process - Process thermal throttling event from interrupt
136 137 138 139
 * @curr: Whether the condition is current or not (boolean), since the
 *        thermal interrupt normally gets called both when the thermal
 *        event begins and once the event has ended.
 *
140
 * This function is called by the thermal interrupt after the
141 142 143 144 145 146 147 148 149
 * IRQ has been acknowledged.
 *
 * It will take care of rate limiting and printing messages to the syslog.
 *
 * Returns: 0 : Event should NOT be further logged, i.e. still in
 *              "timeout" from previous log message.
 *          1 : Event should be logged further, and a message has been
 *              printed to the syslog.
 */
150
static int therm_throt_process(bool new_event, int event, int level)
151
{
152
	struct _thermal_state *state;
153 154
	unsigned int this_cpu = smp_processor_id();
	bool old_event;
155
	u64 now;
156
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
157 158

	now = get_jiffies_64();
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
	if (level == CORE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->core_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->core_power_limit;
		else
			 return 0;
	} else if (level == PACKAGE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->package_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->package_power_limit;
		else
			return 0;
	} else
		return 0;
175

176 177
	old_event = state->new_event;
	state->new_event = new_event;
178

179 180
	if (new_event)
		state->count++;
181

182
	if (time_before64(now, state->next_check) &&
183
			state->count != state->last_count)
184 185
		return 0;

186
	state->next_check = now + CHECK_INTERVAL;
187
	state->last_count = state->count;
188 189

	/* if we just entered the thermal event */
190 191 192 193 194 195 196 197 198 199 200
	if (new_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package",
				state->count);
		else
			printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package",
				state->count);
201 202
		return 1;
	}
203 204 205 206 207 208 209 210 211
	if (old_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package");
		else
			printk(KERN_INFO "CPU%d: %s power limit normal\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package");
212
		return 1;
213 214
	}

215
	return 0;
216
}
217

218
static int thresh_event_valid(int level, int event)
219 220 221 222 223 224
{
	struct _thermal_state *state;
	unsigned int this_cpu = smp_processor_id();
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
	u64 now = get_jiffies_64();

225 226 227 228 229 230
	if (level == PACKAGE_LEVEL)
		state = (event == 0) ? &pstate->pkg_thresh0 :
						&pstate->pkg_thresh1;
	else
		state = (event == 0) ? &pstate->core_thresh0 :
						&pstate->core_thresh1;
231 232 233 234 235

	if (time_before64(now, state->next_check))
		return 0;

	state->next_check = now + CHECK_INTERVAL;
236

237 238 239
	return 1;
}

240
#ifdef CONFIG_SYSFS
I
Ingo Molnar 已提交
241
/* Add/Remove thermal_throttle interface for CPU device: */
242
static __cpuinit int thermal_throttle_add_dev(struct device *dev,
243
				unsigned int cpu)
244
{
245
	int err;
246
	struct cpuinfo_x86 *c = &cpu_data(cpu);
247

248
	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
249 250 251
	if (err)
		return err;

252
	if (cpu_has(c, X86_FEATURE_PLN))
253 254
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_core_power_limit_count.attr,
255
					      thermal_attr_group.name);
256
	if (cpu_has(c, X86_FEATURE_PTS)) {
257 258
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_count.attr,
259 260
					      thermal_attr_group.name);
		if (cpu_has(c, X86_FEATURE_PLN))
261 262
			err = sysfs_add_file_to_group(&dev->kobj,
					&dev_attr_package_power_limit_count.attr,
263
					thermal_attr_group.name);
264
	}
265 266

	return err;
267 268
}

269
static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
270
{
271
	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
272 273
}

I
Ingo Molnar 已提交
274
/* Mutex protecting device creation against CPU hotplug: */
275 276 277
/*
 * Held around thermal_throttle_add_dev()/thermal_throttle_remove_dev() in
 * the hotplug callback, and (under CONFIG_HOTPLUG_CPU) around the initial
 * sysfs setup in thermal_throttle_init_device().
 */
static DEFINE_MUTEX(therm_cpu_lock);

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
I
Ingo Molnar 已提交
278 279 280 281
static __cpuinit int
thermal_throttle_cpu_callback(struct notifier_block *nfb,
			      unsigned long action,
			      void *hcpu)
282 283
{
	unsigned int cpu = (unsigned long)hcpu;
284
	struct device *dev;
285
	int err = 0;
286

287
	dev = get_cpu_device(cpu);
I
Ingo Molnar 已提交
288

289
	switch (action) {
290 291
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
292
		mutex_lock(&therm_cpu_lock);
293
		err = thermal_throttle_add_dev(dev, cpu);
294
		mutex_unlock(&therm_cpu_lock);
295
		WARN_ON(err);
296
		break;
297 298
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
299
	case CPU_DEAD:
300
	case CPU_DEAD_FROZEN:
301
		mutex_lock(&therm_cpu_lock);
302
		thermal_throttle_remove_dev(dev);
303
		mutex_unlock(&therm_cpu_lock);
304 305
		break;
	}
306
	return notifier_from_errno(err);
307 308
}

S
Satyam Sharma 已提交
309
/* Hotplug notifier registered by thermal_throttle_init_device(). */
static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = {
	.notifier_call	= thermal_throttle_cpu_callback,
};

static __init int thermal_throttle_init_device(void)
{
	unsigned int cpu = 0;
317
	int err;
318 319 320 321 322 323 324 325 326 327

	if (!atomic_read(&therm_throt_en))
		return 0;

	register_hotcpu_notifier(&thermal_throttle_cpu_notifier);

#ifdef CONFIG_HOTPLUG_CPU
	mutex_lock(&therm_cpu_lock);
#endif
	/* connect live CPUs to sysfs */
328
	for_each_online_cpu(cpu) {
329
		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
330 331
		WARN_ON(err);
	}
332 333 334 335 336 337 338
#ifdef CONFIG_HOTPLUG_CPU
	mutex_unlock(&therm_cpu_lock);
#endif

	return 0;
}
device_initcall(thermal_throttle_init_device);
339

340
#endif /* CONFIG_SYSFS */
341

342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
/*
 * Forward package threshold events to platform_thermal_package_notify().
 *
 * @msr_val is the package thermal status value read by the caller. Nothing
 * happens when no callback is installed or when neither threshold log bit is
 * set. If the platform reports (via platform_thermal_package_rate_control())
 * that it rate-limits itself, the callback is invoked once, unconditionally.
 * Otherwise it is invoked once per threshold whose log bit is set and whose
 * rate-limit window (thresh_event_valid()) has expired.
 */
static void notify_package_thresholds(__u64 msr_val)
{
	bool low_hit, high_hit;

	if (!platform_thermal_package_notify)
		return;

	low_hit = !!(msr_val & THERM_LOG_THRESHOLD0);	/* lower threshold */
	high_hit = !!(msr_val & THERM_LOG_THRESHOLD1);	/* higher threshold */
	if (!(low_hit || high_hit))
		return;

	if (platform_thermal_package_rate_control &&
	    platform_thermal_package_rate_control()) {
		/* Rate control is implemented in callback */
		platform_thermal_package_notify(msr_val);
		return;
	}

	if (low_hit && thresh_event_valid(PACKAGE_LEVEL, 0))
		platform_thermal_package_notify(msr_val);

	if (high_hit && thresh_event_valid(PACKAGE_LEVEL, 1))
		platform_thermal_package_notify(msr_val);
}

375 376 377 378 379 380 381 382 383
/*
 * Forward core threshold events to platform_thermal_notify().
 *
 * Does nothing when no callback is installed. Otherwise the callback is
 * invoked with @msr_val once for each threshold whose log bit is set and
 * whose rate-limit window (thresh_event_valid()) has expired.
 */
static void notify_thresholds(__u64 msr_val)
{
	if (!platform_thermal_notify)
		return;

	/* lower threshold reached */
	if (msr_val & THERM_LOG_THRESHOLD0) {
		if (thresh_event_valid(CORE_LEVEL, 0))
			platform_thermal_notify(msr_val);
	}

	/* higher threshold reached */
	if (msr_val & THERM_LOG_THRESHOLD1) {
		if (thresh_event_valid(CORE_LEVEL, 1))
			platform_thermal_notify(msr_val);
	}
}

393
/* Thermal transition interrupt handler */
394
static void intel_thermal_interrupt(void)
395 396 397 398
{
	__u64 msr_val;

	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
399

400 401 402
	/* Check for violation of core thermal thresholds*/
	notify_thresholds(msr_val);

403
	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
404
				THERMAL_THROTTLING_EVENT,
405
				CORE_LEVEL) != 0)
406
		mce_log_therm_throt_event(msr_val);
407

408
	if (this_cpu_has(X86_FEATURE_PLN))
409
		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
410
					POWER_LIMIT_EVENT,
411
					CORE_LEVEL);
412

413
	if (this_cpu_has(X86_FEATURE_PTS)) {
414
		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
415 416
		/* check violations of package thermal thresholds */
		notify_package_thresholds(msr_val);
417
		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
418
					THERMAL_THROTTLING_EVENT,
419
					PACKAGE_LEVEL);
420
		if (this_cpu_has(X86_FEATURE_PLN))
421
			therm_throt_process(msr_val &
422 423
					PACKAGE_THERM_STATUS_POWER_LIMIT,
					POWER_LIMIT_EVENT,
424
					PACKAGE_LEVEL);
425
	}
426 427 428 429
}

static void unexpected_thermal_interrupt(void)
{
430
	printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
431 432 433 434 435 436 437 438
			smp_processor_id());
}

/*
 * Handler invoked by smp_thermal_interrupt(). Defaults to the "unexpected
 * interrupt" reporter; intel_init_thermal() replaces it with
 * intel_thermal_interrupt() once thermal monitoring is set up.
 */
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;

asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
{
	irq_enter();
439
	exit_idle();
440 441 442 443 444 445 446
	inc_irq_stat(irq_thermal_count);
	smp_thermal_vector();
	irq_exit();
	/* Ack only at the end to avoid potential reentry */
	ack_APIC_irq();
}

447 448 449 450 451 452 453 454 455 456
/*
 * Thermal monitoring depends on APIC, ACPI and clock modulation.
 * Returns 1 when all three are available (cpu_has_apic, X86_FEATURE_ACPI
 * and X86_FEATURE_ACC on @c), 0 otherwise.
 */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
	return cpu_has_apic &&
	       cpu_has(c, X86_FEATURE_ACPI) &&
	       cpu_has(c, X86_FEATURE_ACC);
}

457
/*
 * Snapshot the thermal LVT entry into lvtthmr_init.
 *
 * This function is only called on boot CPU. Save the init thermal
 * LVT value on BSP and use that value to restore APs' thermal LVT
 * entry BIOS programmed later. Nothing is saved when thermal monitoring
 * is not supported on the boot CPU.
 */
void __init mcheck_intel_therm_init(void)
{
	if (!intel_thermal_supported(&boot_cpu_data))
		return;

	lvtthmr_init = apic_read(APIC_LVTTHMR);
}

H
Hidetoshi Seto 已提交
468
void intel_init_thermal(struct cpuinfo_x86 *c)
469 470 471 472 473
{
	unsigned int cpu = smp_processor_id();
	int tm2 = 0;
	u32 l, h;

474
	if (!intel_thermal_supported(c))
475 476 477 478 479 480 481 482
		return;

	/*
	 * First check if its enabled already, in which case there might
	 * be some SMM goo which handles it, so we can't even put a handler
	 * since it might be delivered via SMI already:
	 */
	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
483

484
	h = lvtthmr_init;
485 486 487 488 489
	/*
	 * The initial value of thermal LVT entries on all APs always reads
	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
	 * sequence to them and LVT registers are reset to 0s except for
	 * the mask bits which are set to 1s when APs receive INIT IPI.
490 491 492 493
	 * If BIOS takes over the thermal interrupt and sets its interrupt
	 * delivery mode to SMI (not fixed), it restores the value that the
	 * BIOS has programmed on AP based on BSP's info we saved since BIOS
	 * is always setting the same value for all threads/cores.
494
	 */
495 496
	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
		apic_write(APIC_LVTTHMR, lvtthmr_init);
497 498


499 500 501 502 503 504 505 506 507 508 509 510 511 512
	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
		printk(KERN_DEBUG
		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
		return;
	}

	/* Check whether a vector already exists */
	if (h & APIC_VECTOR_MASK) {
		printk(KERN_DEBUG
		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
		       cpu, (h & APIC_VECTOR_MASK));
		return;
	}

513 514 515 516 517 518 519 520 521 522
	/* early Pentium M models use different method for enabling TM2 */
	if (cpu_has(c, X86_FEATURE_TM2)) {
		if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
			rdmsr(MSR_THERM2_CTL, l, h);
			if (l & MSR_THERM2_CTL_TM_SELECT)
				tm2 = 1;
		} else if (l & MSR_IA32_MISC_ENABLE_TM2)
			tm2 = 1;
	}

523 524 525 526 527
	/* We'll mask the thermal vector in the lapic till we're ready: */
	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
	apic_write(APIC_LVTTHMR, h);

	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
528 529 530 531 532 533 534
	if (cpu_has(c, X86_FEATURE_PLN))
		wrmsr(MSR_IA32_THERM_INTERRUPT,
		      l | (THERM_INT_LOW_ENABLE
			| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
	else
		wrmsr(MSR_IA32_THERM_INTERRUPT,
		      l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
535

536 537
	if (cpu_has(c, X86_FEATURE_PTS)) {
		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
538 539 540 541 542 543 544 545 546
		if (cpu_has(c, X86_FEATURE_PLN))
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
			      l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE
				| PACKAGE_THERM_INT_PLN_ENABLE), h);
		else
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
			      l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
547 548
	}

549
	smp_thermal_vector = intel_thermal_interrupt;
550 551 552 553 554 555 556 557

	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);

	/* Unmask the thermal vector: */
	l = apic_read(APIC_LVTTHMR);
	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);

558 559
	printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
		       tm2 ? "TM2" : "TM1");
560 561 562 563

	/* enable thermal throttle processing */
	atomic_set(&therm_throt_en, 1);
}