/*
 * Thermal throttle event support code (such as syslog messaging and rate
 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
 *
 * This allows consistent reporting of CPU thermal throttle events.
 *
 * Maintains a counter in /sys that keeps track of the number of thermal
 * events, such that the user knows how bad the thermal problem might be
 * (since the logging to syslog and mcelog is rate limited).
 *
 * Author: Dmitriy Zavin (dmitriyz@google.com)
 *
 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
 *          Inspired by Ross Biro's and Al Borchers' counter code.
 */
#include <linux/interrupt.h>
I
Ingo Molnar 已提交
17 18
#include <linux/notifier.h>
#include <linux/jiffies.h>
19
#include <linux/kernel.h>
20
#include <linux/percpu.h>
21
#include <linux/export.h>
22 23 24
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
25
#include <linux/cpu.h>
I
Ingo Molnar 已提交
26

27 28
#include <asm/processor.h>
#include <asm/apic.h>
29 30
#include <asm/idle.h>
#include <asm/mce.h>
31
#include <asm/msr.h>
32
#include <asm/trace/irq_vectors.h>
33 34

/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL		(300 * HZ)

/* Event kinds accepted by therm_throt_process() */
#define THERMAL_THROTTLING_EVENT	0
#define POWER_LIMIT_EVENT		1

40
/*
41
 * Current thermal event state:
42
 */
43
struct _thermal_state {
44 45
	bool			new_event;
	int			event;
46
	u64			next_check;
47 48
	unsigned long		count;
	unsigned long		last_count;
49
};
I
Ingo Molnar 已提交
50

51
struct thermal_state {
52 53 54 55
	struct _thermal_state core_throttle;
	struct _thermal_state core_power_limit;
	struct _thermal_state package_throttle;
	struct _thermal_state package_power_limit;
56 57
	struct _thermal_state core_thresh0;
	struct _thermal_state core_thresh1;
58 59
	struct _thermal_state pkg_thresh0;
	struct _thermal_state pkg_thresh1;
60 61
};

62 63
/*
 * Callback to handle core threshold interrupts; invoked from
 * notify_thresholds() with the thermal status MSR value.
 */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);
65

66 67 68 69 70 71 72 73 74 75
/*
 * Callback to handle package threshold interrupts; invoked from
 * notify_package_thresholds() with the package thermal status MSR value.
 */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);

/*
 * Optional callback reporting whether the package notify callback does its
 * own rate control. When it returns true, notify_package_thresholds() calls
 * platform_thermal_package_notify() without applying thresh_event_valid().
 */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);


76 77 78
static DEFINE_PER_CPU(struct thermal_state, thermal_state);

static atomic_t therm_throt_en	= ATOMIC_INIT(0);
79

80 81
static u32 lvtthmr_init __read_mostly;

82
#ifdef CONFIG_SYSFS
83 84 85
/*
 * Declare a read-only (0444) device attribute named _name whose show
 * routine is therm_throt_device_show_<_name>().
 */
#define define_therm_throt_device_one_ro(_name)				\
	static DEVICE_ATTR(_name, 0444,					\
			   therm_throt_device_show_##_name,		\
				   NULL)
I
Ingo Molnar 已提交
87

88
/*
 * Generate therm_throt_device_show_<event>_<name>(), a sysfs show routine
 * that prints the per-CPU thermal_state.<event>.<name> value (as %lu) for
 * the CPU identified by dev->id. Nothing is written (0 is returned) when
 * that CPU is offline.
 */
#define define_therm_throt_device_show_func(event, name)		\
									\
static ssize_t therm_throt_device_show_##event##_##name(		\
			struct device *dev,				\
			struct device_attribute *attr,			\
			char *buf)					\
{									\
	unsigned int cpu = dev->id;					\
	ssize_t ret;							\
									\
	preempt_disable();	/* CPU hotplug */			\
	if (cpu_online(cpu)) {						\
		ret = sprintf(buf, "%lu\n",				\
			      per_cpu(thermal_state, cpu).event.name);	\
	} else {							\
		ret = 0;						\
	}								\
	preempt_enable();						\
									\
	return ret;							\
}

109 110
/* Show routines and read-only attributes for each per-CPU event counter */
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);

define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);

define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);

define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
120

121
static struct attribute *thermal_throttle_attrs[] = {
122
	&dev_attr_core_throttle_count.attr,
123 124 125
	NULL
};

126
static struct attribute_group thermal_attr_group = {
I
Ingo Molnar 已提交
127 128
	.attrs	= thermal_throttle_attrs,
	.name	= "thermal_throttle"
129 130
};
#endif /* CONFIG_SYSFS */
131

132 133 134
/* Values for the level argument of therm_throt_process() and thresh_event_valid() */
#define CORE_LEVEL	0
#define PACKAGE_LEVEL	1

135
/***
136
 * therm_throt_process - Process thermal throttling event from interrupt
137 138 139 140
 * @curr: Whether the condition is current or not (boolean), since the
 *        thermal interrupt normally gets called both when the thermal
 *        event begins and once the event has ended.
 *
141
 * This function is called by the thermal interrupt after the
142 143 144 145 146 147 148 149 150
 * IRQ has been acknowledged.
 *
 * It will take care of rate limiting and printing messages to the syslog.
 *
 * Returns: 0 : Event should NOT be further logged, i.e. still in
 *              "timeout" from previous log message.
 *          1 : Event should be logged further, and a message has been
 *              printed to the syslog.
 */
151
static int therm_throt_process(bool new_event, int event, int level)
152
{
153
	struct _thermal_state *state;
154 155
	unsigned int this_cpu = smp_processor_id();
	bool old_event;
156
	u64 now;
157
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
158 159

	now = get_jiffies_64();
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
	if (level == CORE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->core_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->core_power_limit;
		else
			 return 0;
	} else if (level == PACKAGE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->package_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->package_power_limit;
		else
			return 0;
	} else
		return 0;
176

177 178
	old_event = state->new_event;
	state->new_event = new_event;
179

180 181
	if (new_event)
		state->count++;
182

183
	if (time_before64(now, state->next_check) &&
184
			state->count != state->last_count)
185 186
		return 0;

187
	state->next_check = now + CHECK_INTERVAL;
188
	state->last_count = state->count;
189 190

	/* if we just entered the thermal event */
191 192 193 194 195 196
	if (new_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package",
				state->count);
197 198
		return 1;
	}
199 200 201 202 203
	if (old_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package");
204
		return 1;
205 206
	}

207
	return 0;
208
}
209

210
static int thresh_event_valid(int level, int event)
211 212 213 214 215 216
{
	struct _thermal_state *state;
	unsigned int this_cpu = smp_processor_id();
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
	u64 now = get_jiffies_64();

217 218 219 220 221 222
	if (level == PACKAGE_LEVEL)
		state = (event == 0) ? &pstate->pkg_thresh0 :
						&pstate->pkg_thresh1;
	else
		state = (event == 0) ? &pstate->core_thresh0 :
						&pstate->core_thresh1;
223 224 225 226 227

	if (time_before64(now, state->next_check))
		return 0;

	state->next_check = now + CHECK_INTERVAL;
228

229 230 231
	return 1;
}

232 233 234 235 236 237 238 239 240
/*
 * Set when "int_pln_enable" is given on the kernel command line. Gates the
 * power limit notification handling, sysfs counters and interrupt enable bits
 * in this file.
 */
static bool int_pln_enable;
static int __init int_pln_enable_setup(char *s)
{
	int_pln_enable = true;

	return 1;
}
__setup("int_pln_enable", int_pln_enable_setup);

241
#ifdef CONFIG_SYSFS
I
Ingo Molnar 已提交
242
/* Add/Remove thermal_throttle interface for CPU device: */
243
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
244
{
245
	int err;
246
	struct cpuinfo_x86 *c = &cpu_data(cpu);
247

248
	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
249 250 251
	if (err)
		return err;

252
	if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
253 254
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_core_power_limit_count.attr,
255
					      thermal_attr_group.name);
256
	if (cpu_has(c, X86_FEATURE_PTS)) {
257 258
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_count.attr,
259
					      thermal_attr_group.name);
260
		if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
261 262
			err = sysfs_add_file_to_group(&dev->kobj,
					&dev_attr_package_power_limit_count.attr,
263
					thermal_attr_group.name);
264
	}
265 266

	return err;
267 268
}

269
static void thermal_throttle_remove_dev(struct device *dev)
270
{
271
	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
272 273 274
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
275
static int
I
Ingo Molnar 已提交
276 277 278
thermal_throttle_cpu_callback(struct notifier_block *nfb,
			      unsigned long action,
			      void *hcpu)
279 280
{
	unsigned int cpu = (unsigned long)hcpu;
281
	struct device *dev;
282
	int err = 0;
283

284
	dev = get_cpu_device(cpu);
I
Ingo Molnar 已提交
285

286
	switch (action) {
287 288
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
289
		err = thermal_throttle_add_dev(dev, cpu);
290
		WARN_ON(err);
291
		break;
292 293
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
294
	case CPU_DEAD:
295
	case CPU_DEAD_FROZEN:
296
		thermal_throttle_remove_dev(dev);
297 298
		break;
	}
299
	return notifier_from_errno(err);
300 301
}

302
static struct notifier_block thermal_throttle_cpu_notifier =
303 304 305 306 307 308 309
{
	.notifier_call = thermal_throttle_cpu_callback,
};

static __init int thermal_throttle_init_device(void)
{
	unsigned int cpu = 0;
310
	int err;
311 312 313 314

	if (!atomic_read(&therm_throt_en))
		return 0;

315
	cpu_notifier_register_begin();
316 317

	/* connect live CPUs to sysfs */
318
	for_each_online_cpu(cpu) {
319
		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
320 321
		WARN_ON(err);
	}
322

323 324 325
	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
	cpu_notifier_register_done();

326 327 328
	return 0;
}
device_initcall(thermal_throttle_init_device);
329

330
#endif /* CONFIG_SYSFS */
331

332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
/*
 * Forward package threshold crossings reported in @msr_val to the platform
 * callback, if one is installed. When the platform reports that it does its
 * own rate control the callback is invoked once; otherwise each threshold is
 * rate limited separately through thresh_event_valid().
 */
static void notify_package_thresholds(__u64 msr_val)
{
	bool low_hit, high_hit;

	if (!platform_thermal_package_notify)
		return;

	/* lower and higher threshold log bits */
	low_hit = !!(msr_val & THERM_LOG_THRESHOLD0);
	high_hit = !!(msr_val & THERM_LOG_THRESHOLD1);

	if (!(low_hit || high_hit))
		return;

	/* Rate control is implemented in callback */
	if (platform_thermal_package_rate_control &&
	    platform_thermal_package_rate_control()) {
		platform_thermal_package_notify(msr_val);
		return;
	}

	/* lower threshold reached */
	if (low_hit && thresh_event_valid(PACKAGE_LEVEL, 0))
		platform_thermal_package_notify(msr_val);

	/* higher threshold reached */
	if (high_hit && thresh_event_valid(PACKAGE_LEVEL, 1))
		platform_thermal_package_notify(msr_val);
}

365 366 367 368 369 370 371 372 373
/*
 * Forward core threshold crossings reported in @msr_val to the platform
 * callback, rate limiting each threshold through thresh_event_valid().
 */
static void notify_thresholds(__u64 msr_val)
{
	/* check whether the interrupt handler is defined;
	 * otherwise simply return
	 */
	if (!platform_thermal_notify)
		return;

	/* lower threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD0) &&
			thresh_event_valid(CORE_LEVEL, 0))
		platform_thermal_notify(msr_val);
	/* higher threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD1) &&
			thresh_event_valid(CORE_LEVEL, 1))
		platform_thermal_notify(msr_val);
}

383
/* Thermal transition interrupt handler */
384
static void intel_thermal_interrupt(void)
385 386 387 388
{
	__u64 msr_val;

	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
389

390 391 392
	/* Check for violation of core thermal thresholds*/
	notify_thresholds(msr_val);

393
	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
394
				THERMAL_THROTTLING_EVENT,
395
				CORE_LEVEL) != 0)
396
		mce_log_therm_throt_event(msr_val);
397

398
	if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
399
		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
400
					POWER_LIMIT_EVENT,
401
					CORE_LEVEL);
402

403
	if (this_cpu_has(X86_FEATURE_PTS)) {
404
		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
405 406
		/* check violations of package thermal thresholds */
		notify_package_thresholds(msr_val);
407
		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
408
					THERMAL_THROTTLING_EVENT,
409
					PACKAGE_LEVEL);
410
		if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
411
			therm_throt_process(msr_val &
412 413
					PACKAGE_THERM_STATUS_POWER_LIMIT,
					POWER_LIMIT_EVENT,
414
					PACKAGE_LEVEL);
415
	}
416 417 418 419
}

static void unexpected_thermal_interrupt(void)
{
420
	printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
421 422 423 424 425
			smp_processor_id());
}

/* Handler called by __smp_thermal_interrupt(); intel_init_thermal() installs the real one */
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;

426
static inline void __smp_thermal_interrupt(void)
427 428 429
{
	inc_irq_stat(irq_thermal_count);
	smp_thermal_vector();
430 431
}

432
/* Thermal interrupt entry point: handle the event between IRQ entry and ack. */
asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
	entering_irq();
	__smp_thermal_interrupt();
	exiting_ack_irq();
}

439
asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
440 441 442 443 444 445 446 447
{
	entering_irq();
	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
	__smp_thermal_interrupt();
	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
	exiting_ack_irq();
}

448 449 450 451 452 453 454 455 456 457
/*
 * Thermal monitoring depends on APIC, ACPI and clock modulation.
 * Returns 1 when all three are available on @c, 0 otherwise.
 */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
	return cpu_has_apic &&
	       cpu_has(c, X86_FEATURE_ACPI) &&
	       cpu_has(c, X86_FEATURE_ACC);
}

458
/* Record the boot CPU's thermal LVT entry in lvtthmr_init for later reuse. */
void __init mcheck_intel_therm_init(void)
{
	/*
	 * This function is only called on boot CPU. Save the init thermal
	 * LVT value on BSP and use that value to restore APs' thermal LVT
	 * entry BIOS programmed later
	 */
	if (intel_thermal_supported(&boot_cpu_data))
		lvtthmr_init = apic_read(APIC_LVTTHMR);
}

H
Hidetoshi Seto 已提交
469
void intel_init_thermal(struct cpuinfo_x86 *c)
470 471 472 473 474
{
	unsigned int cpu = smp_processor_id();
	int tm2 = 0;
	u32 l, h;

475
	if (!intel_thermal_supported(c))
476 477 478 479 480 481 482 483
		return;

	/*
	 * First check if its enabled already, in which case there might
	 * be some SMM goo which handles it, so we can't even put a handler
	 * since it might be delivered via SMI already:
	 */
	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
484

485
	h = lvtthmr_init;
486 487 488 489 490
	/*
	 * The initial value of thermal LVT entries on all APs always reads
	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
	 * sequence to them and LVT registers are reset to 0s except for
	 * the mask bits which are set to 1s when APs receive INIT IPI.
491 492 493 494
	 * If BIOS takes over the thermal interrupt and sets its interrupt
	 * delivery mode to SMI (not fixed), it restores the value that the
	 * BIOS has programmed on AP based on BSP's info we saved since BIOS
	 * is always setting the same value for all threads/cores.
495
	 */
496 497
	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
		apic_write(APIC_LVTTHMR, lvtthmr_init);
498 499


500
	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
501 502
		if (system_state == SYSTEM_BOOTING)
			printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
503 504 505
		return;
	}

506 507 508 509 510 511 512 513 514 515
	/* early Pentium M models use different method for enabling TM2 */
	if (cpu_has(c, X86_FEATURE_TM2)) {
		if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
			rdmsr(MSR_THERM2_CTL, l, h);
			if (l & MSR_THERM2_CTL_TM_SELECT)
				tm2 = 1;
		} else if (l & MSR_IA32_MISC_ENABLE_TM2)
			tm2 = 1;
	}

516 517 518 519 520
	/* We'll mask the thermal vector in the lapic till we're ready: */
	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
	apic_write(APIC_LVTTHMR, h);

	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
521
	if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
522
		wrmsr(MSR_IA32_THERM_INTERRUPT,
523 524 525
			(l | (THERM_INT_LOW_ENABLE
			| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
	else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
526
		wrmsr(MSR_IA32_THERM_INTERRUPT,
527
			l | (THERM_INT_LOW_ENABLE
528 529 530 531
			| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
	else
		wrmsr(MSR_IA32_THERM_INTERRUPT,
		      l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
532

533 534
	if (cpu_has(c, X86_FEATURE_PTS)) {
		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
535
		if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
536
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
537 538 539 540 541 542
				(l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE))
				& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
		else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
				l | (PACKAGE_THERM_INT_LOW_ENABLE
543 544 545 546 547 548
				| PACKAGE_THERM_INT_HIGH_ENABLE
				| PACKAGE_THERM_INT_PLN_ENABLE), h);
		else
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
			      l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
549 550
	}

551
	smp_thermal_vector = intel_thermal_interrupt;
552 553 554 555 556 557 558 559

	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);

	/* Unmask the thermal vector: */
	l = apic_read(APIC_LVTTHMR);
	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);

560 561
	printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
		       tm2 ? "TM2" : "TM1");
562 563 564 565

	/* enable thermal throttle processing */
	atomic_set(&therm_throt_en, 1);
}