therm_throt.c 15.5 KB
Newer Older
1
/*
 * Thermal throttle event support code (such as syslog messaging and rate
 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
 *
 * This allows consistent reporting of CPU thermal throttle events.
 *
 * Maintains a counter in /sys that keeps track of the number of thermal
 * events, such that the user knows how bad the thermal problem might be
 * (since the logging to syslog and mcelog is rate limited).
 *
 * Author: Dmitriy Zavin (dmitriyz@google.com)
 *
 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
 *          Inspired by Ross Biro's and Al Borchers' counter code.
 */
16
#include <linux/interrupt.h>
I
Ingo Molnar 已提交
17 18
#include <linux/notifier.h>
#include <linux/jiffies.h>
19
#include <linux/kernel.h>
20
#include <linux/percpu.h>
21
#include <linux/export.h>
22 23 24
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
25
#include <linux/cpu.h>
I
Ingo Molnar 已提交
26

27 28
#include <asm/processor.h>
#include <asm/apic.h>
29
#include <asm/mce.h>
30
#include <asm/msr.h>
31
#include <asm/trace/irq_vectors.h>
32 33

/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL		(300 * HZ)

/* Event selectors accepted by therm_throt_process() */
#define THERMAL_THROTTLING_EVENT	0
#define POWER_LIMIT_EVENT		1

39
/*
40
 * Current thermal event state:
41
 */
42
struct _thermal_state {
43 44
	bool			new_event;
	int			event;
45
	u64			next_check;
46 47
	unsigned long		count;
	unsigned long		last_count;
48
};
I
Ingo Molnar 已提交
49

50
struct thermal_state {
51 52 53 54
	struct _thermal_state core_throttle;
	struct _thermal_state core_power_limit;
	struct _thermal_state package_throttle;
	struct _thermal_state package_power_limit;
55 56
	struct _thermal_state core_thresh0;
	struct _thermal_state core_thresh1;
57 58
	struct _thermal_state pkg_thresh0;
	struct _thermal_state pkg_thresh1;
59 60
};

61 62
/*
 * Callback to handle core threshold interrupts.
 * May be NULL; notify_thresholds() checks before calling it.
 */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);

/*
 * Callback to handle package threshold interrupts.
 * May be NULL; notify_package_thresholds() checks before calling it.
 */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);

/*
 * Optional callback reporting whether the package notify callback does
 * its own rate control. When it returns true, notify_package_thresholds()
 * skips the local rate limiting.
 */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);


75 76 77
static DEFINE_PER_CPU(struct thermal_state, thermal_state);

static atomic_t therm_throt_en	= ATOMIC_INIT(0);
78

79 80
static u32 lvtthmr_init __read_mostly;

81
#ifdef CONFIG_SYSFS
/*
 * Declare a read-only (0444) device attribute named _name whose show
 * handler is therm_throt_device_show_<_name>.
 */
#define define_therm_throt_device_one_ro(_name)				\
	static DEVICE_ATTR(_name, 0444,					\
			   therm_throt_device_show_##_name,		\
				   NULL)
I
Ingo Molnar 已提交
86

87
/*
 * Generate the sysfs show handler therm_throt_device_show_<event>_<name>.
 *
 * The handler prints the per-CPU value thermal_state.<event>.<name> of the
 * CPU identified by dev->id as an unsigned long followed by a newline.
 * If that CPU is not online, nothing is written and 0 is returned.
 */
#define define_therm_throt_device_show_func(event, name)		\
									\
static ssize_t therm_throt_device_show_##event##_##name(		\
			struct device *dev,				\
			struct device_attribute *attr,			\
			char *buf)					\
{									\
	unsigned int cpu = dev->id;					\
	ssize_t ret;							\
									\
	preempt_disable();	/* CPU hotplug */			\
	if (cpu_online(cpu)) {						\
		ret = sprintf(buf, "%lu\n",				\
			      per_cpu(thermal_state, cpu).event.name);	\
	} else {							\
		ret = 0;						\
	}								\
	preempt_enable();						\
									\
	return ret;							\
}

108 109
/* Show handlers and read-only attributes for the four event counters */
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);

define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);

define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);

define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
119

120
static struct attribute *thermal_throttle_attrs[] = {
121
	&dev_attr_core_throttle_count.attr,
122 123 124
	NULL
};

125
static struct attribute_group thermal_attr_group = {
I
Ingo Molnar 已提交
126 127
	.attrs	= thermal_throttle_attrs,
	.name	= "thermal_throttle"
128 129
};
#endif /* CONFIG_SYSFS */
130

131 132 133
/* Level selectors accepted by therm_throt_process() and thresh_event_valid() */
#define CORE_LEVEL	0
#define PACKAGE_LEVEL	1

134
/***
135
 * therm_throt_process - Process thermal throttling event from interrupt
136 137 138 139
 * @curr: Whether the condition is current or not (boolean), since the
 *        thermal interrupt normally gets called both when the thermal
 *        event begins and once the event has ended.
 *
140
 * This function is called by the thermal interrupt after the
141 142 143 144 145 146 147 148 149
 * IRQ has been acknowledged.
 *
 * It will take care of rate limiting and printing messages to the syslog.
 *
 * Returns: 0 : Event should NOT be further logged, i.e. still in
 *              "timeout" from previous log message.
 *          1 : Event should be logged further, and a message has been
 *              printed to the syslog.
 */
150
static int therm_throt_process(bool new_event, int event, int level)
151
{
152
	struct _thermal_state *state;
153 154
	unsigned int this_cpu = smp_processor_id();
	bool old_event;
155
	u64 now;
156
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
157 158

	now = get_jiffies_64();
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
	if (level == CORE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->core_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->core_power_limit;
		else
			 return 0;
	} else if (level == PACKAGE_LEVEL) {
		if (event == THERMAL_THROTTLING_EVENT)
			state = &pstate->package_throttle;
		else if (event == POWER_LIMIT_EVENT)
			state = &pstate->package_power_limit;
		else
			return 0;
	} else
		return 0;
175

176 177
	old_event = state->new_event;
	state->new_event = new_event;
178

179 180
	if (new_event)
		state->count++;
181

182
	if (time_before64(now, state->next_check) &&
183
			state->count != state->last_count)
184 185
		return 0;

186
	state->next_check = now + CHECK_INTERVAL;
187
	state->last_count = state->count;
188 189

	/* if we just entered the thermal event */
190 191
	if (new_event) {
		if (event == THERMAL_THROTTLING_EVENT)
192
			pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
193 194 195
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package",
				state->count);
196 197
		return 1;
	}
198 199
	if (old_event) {
		if (event == THERMAL_THROTTLING_EVENT)
200
			pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
201
				level == CORE_LEVEL ? "Core" : "Package");
202
		return 1;
203 204
	}

205
	return 0;
206
}
207

208
static int thresh_event_valid(int level, int event)
209 210 211 212 213 214
{
	struct _thermal_state *state;
	unsigned int this_cpu = smp_processor_id();
	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
	u64 now = get_jiffies_64();

215 216 217 218 219 220
	if (level == PACKAGE_LEVEL)
		state = (event == 0) ? &pstate->pkg_thresh0 :
						&pstate->pkg_thresh1;
	else
		state = (event == 0) ? &pstate->core_thresh0 :
						&pstate->core_thresh1;
221 222 223 224 225

	if (time_before64(now, state->next_check))
		return 0;

	state->next_check = now + CHECK_INTERVAL;
226

227 228 229
	return 1;
}

230 231 232 233 234 235 236 237 238
/*
 * Set when "int_pln_enable" is given on the kernel command line. Gates
 * the power limit notification handling and sysfs counters in this file.
 */
static bool int_pln_enable;

/* Command line handler for "int_pln_enable"; the argument string is ignored */
static int __init int_pln_enable_setup(char *s)
{
	int_pln_enable = true;

	return 1;
}
__setup("int_pln_enable", int_pln_enable_setup);

239
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */

/*
 * Create the "thermal_throttle" sysfs group for @dev and add the optional
 * counters supported by CPU @cpu:
 *   - core_power_limit_count     if PLN is present and int_pln_enable is set
 *   - package_throttle_count     if PTS is present
 *   - package_power_limit_count  if PTS and PLN are present and
 *                                int_pln_enable is set
 *
 * Returns 0 on success. On the first failure the whole group is removed
 * again and that error is returned, so no partially populated group is
 * left behind and no earlier error is masked by a later successful call.
 */
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
{
	int err;
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
	if (err)
		return err;

	if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_core_power_limit_count.attr,
					      thermal_attr_group.name);
		if (err)
			goto del_group;
	}

	if (cpu_has(c, X86_FEATURE_PTS)) {
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_count.attr,
					      thermal_attr_group.name);
		if (err)
			goto del_group;

		if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
			err = sysfs_add_file_to_group(&dev->kobj,
					&dev_attr_package_power_limit_count.attr,
					thermal_attr_group.name);
			if (err)
				goto del_group;
		}
	}

	return 0;

del_group:
	sysfs_remove_group(&dev->kobj, &thermal_attr_group);

	return err;
}

267
static void thermal_throttle_remove_dev(struct device *dev)
268
{
269
	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
270 271 272
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
273
static int
I
Ingo Molnar 已提交
274 275 276
thermal_throttle_cpu_callback(struct notifier_block *nfb,
			      unsigned long action,
			      void *hcpu)
277 278
{
	unsigned int cpu = (unsigned long)hcpu;
279
	struct device *dev;
280
	int err = 0;
281

282
	dev = get_cpu_device(cpu);
I
Ingo Molnar 已提交
283

284
	switch (action) {
285 286
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
287
		err = thermal_throttle_add_dev(dev, cpu);
288
		WARN_ON(err);
289
		break;
290 291
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
292
	case CPU_DEAD:
293
	case CPU_DEAD_FROZEN:
294
		thermal_throttle_remove_dev(dev);
295 296
		break;
	}
297
	return notifier_from_errno(err);
298 299
}

300
static struct notifier_block thermal_throttle_cpu_notifier =
301 302 303 304 305 306 307
{
	.notifier_call = thermal_throttle_cpu_callback,
};

static __init int thermal_throttle_init_device(void)
{
	unsigned int cpu = 0;
308
	int err;
309 310 311 312

	if (!atomic_read(&therm_throt_en))
		return 0;

313
	cpu_notifier_register_begin();
314 315

	/* connect live CPUs to sysfs */
316
	for_each_online_cpu(cpu) {
317
		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
318 319
		WARN_ON(err);
	}
320

321 322 323
	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
	cpu_notifier_register_done();

324 325 326
	return 0;
}
device_initcall(thermal_throttle_init_device);
327

328
#endif /* CONFIG_SYSFS */
329

330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
/*
 * Forward package threshold log bits in @msr_val to the registered
 * platform callback, if there is one.
 *
 * When the platform reports that it does its own rate control, the
 * callback is invoked once without local rate limiting. Otherwise each
 * logged threshold is rate limited through thresh_event_valid() and the
 * callback is invoked once per threshold that passes.
 */
static void notify_package_thresholds(__u64 msr_val)
{
	bool low_logged;
	bool high_logged;

	/* nothing to do without a registered handler */
	if (!platform_thermal_package_notify)
		return;

	low_logged = (msr_val & THERM_LOG_THRESHOLD0) ? true : false;
	high_logged = (msr_val & THERM_LOG_THRESHOLD1) ? true : false;

	if (!(low_logged || high_logged))
		return;

	if (platform_thermal_package_rate_control &&
	    platform_thermal_package_rate_control()) {
		/* the callback takes care of rate control itself */
		platform_thermal_package_notify(msr_val);
		return;
	}

	/* lower threshold, subject to local rate limiting */
	if (low_logged && thresh_event_valid(PACKAGE_LEVEL, 0))
		platform_thermal_package_notify(msr_val);

	/* higher threshold, subject to local rate limiting */
	if (high_logged && thresh_event_valid(PACKAGE_LEVEL, 1))
		platform_thermal_package_notify(msr_val);
}

363 364 365 366 367 368 369 370 371
/*
 * Forward core threshold log bits in @msr_val to the registered platform
 * callback, rate limited per threshold through thresh_event_valid().
 */
static void notify_thresholds(__u64 msr_val)
{
	/* check whether the interrupt handler is defined;
	 * otherwise simply return
	 */
	if (!platform_thermal_notify)
		return;

	/* lower threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD0) &&
			thresh_event_valid(CORE_LEVEL, 0))
		platform_thermal_notify(msr_val);
	/* higher threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD1) &&
			thresh_event_valid(CORE_LEVEL, 1))
		platform_thermal_notify(msr_val);
}

381
/* Thermal transition interrupt handler */
382
static void intel_thermal_interrupt(void)
383 384 385
{
	__u64 msr_val;

386 387 388
	if (static_cpu_has(X86_FEATURE_HWP))
		wrmsrl_safe(MSR_HWP_STATUS, 0);

389
	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
390

391 392 393
	/* Check for violation of core thermal thresholds*/
	notify_thresholds(msr_val);

394
	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
395
				THERMAL_THROTTLING_EVENT,
396
				CORE_LEVEL) != 0)
397
		mce_log_therm_throt_event(msr_val);
398

399
	if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
400
		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
401
					POWER_LIMIT_EVENT,
402
					CORE_LEVEL);
403

404
	if (this_cpu_has(X86_FEATURE_PTS)) {
405
		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
406 407
		/* check violations of package thermal thresholds */
		notify_package_thresholds(msr_val);
408
		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
409
					THERMAL_THROTTLING_EVENT,
410
					PACKAGE_LEVEL);
411
		if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
412
			therm_throt_process(msr_val &
413 414
					PACKAGE_THERM_STATUS_POWER_LIMIT,
					POWER_LIMIT_EVENT,
415
					PACKAGE_LEVEL);
416
	}
417 418 419 420
}

/* Default handler: report a thermal interrupt that nobody set up */
static void unexpected_thermal_interrupt(void)
{
	pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
		smp_processor_id());
}

/*
 * Active thermal interrupt handler; switched to intel_thermal_interrupt()
 * by intel_init_thermal().
 */
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;

427
static inline void __smp_thermal_interrupt(void)
428 429 430
{
	inc_irq_stat(irq_thermal_count);
	smp_thermal_vector();
431 432
}

433
/* Thermal interrupt entry point; @regs is unused here */
asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
	entering_irq();
	__smp_thermal_interrupt();
	exiting_ack_irq();
}

440
asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
441 442 443 444 445 446 447 448
{
	entering_irq();
	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
	__smp_thermal_interrupt();
	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
	exiting_ack_irq();
}

449 450 451
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
452
	if (!boot_cpu_has(X86_FEATURE_APIC))
453 454 455 456 457 458
		return 0;
	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
		return 0;
	return 1;
}

459
/* Save the BIOS-programmed thermal LVT value for later use on the APs */
void __init mcheck_intel_therm_init(void)
{
	/*
	 * This function is only called on boot CPU. Save the init thermal
	 * LVT value on BSP and use that value to restore APs' thermal LVT
	 * entry BIOS programmed later
	 */
	if (intel_thermal_supported(&boot_cpu_data))
		lvtthmr_init = apic_read(APIC_LVTTHMR);
}

H
Hidetoshi Seto 已提交
470
void intel_init_thermal(struct cpuinfo_x86 *c)
471 472 473 474 475
{
	unsigned int cpu = smp_processor_id();
	int tm2 = 0;
	u32 l, h;

476
	if (!intel_thermal_supported(c))
477 478 479 480 481 482 483 484
		return;

	/*
	 * First check if its enabled already, in which case there might
	 * be some SMM goo which handles it, so we can't even put a handler
	 * since it might be delivered via SMI already:
	 */
	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
485

486
	h = lvtthmr_init;
487 488 489 490 491
	/*
	 * The initial value of thermal LVT entries on all APs always reads
	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
	 * sequence to them and LVT registers are reset to 0s except for
	 * the mask bits which are set to 1s when APs receive INIT IPI.
492 493 494 495
	 * If BIOS takes over the thermal interrupt and sets its interrupt
	 * delivery mode to SMI (not fixed), it restores the value that the
	 * BIOS has programmed on AP based on BSP's info we saved since BIOS
	 * is always setting the same value for all threads/cores.
496
	 */
497 498
	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
		apic_write(APIC_LVTTHMR, lvtthmr_init);
499 500


501
	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
502
		if (system_state == SYSTEM_BOOTING)
503
			pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
504 505 506
		return;
	}

507 508 509 510 511 512 513 514 515 516
	/* early Pentium M models use different method for enabling TM2 */
	if (cpu_has(c, X86_FEATURE_TM2)) {
		if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
			rdmsr(MSR_THERM2_CTL, l, h);
			if (l & MSR_THERM2_CTL_TM_SELECT)
				tm2 = 1;
		} else if (l & MSR_IA32_MISC_ENABLE_TM2)
			tm2 = 1;
	}

517 518 519 520 521
	/* We'll mask the thermal vector in the lapic till we're ready: */
	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
	apic_write(APIC_LVTTHMR, h);

	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
522
	if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
523
		wrmsr(MSR_IA32_THERM_INTERRUPT,
524 525 526
			(l | (THERM_INT_LOW_ENABLE
			| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
	else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
527
		wrmsr(MSR_IA32_THERM_INTERRUPT,
528
			l | (THERM_INT_LOW_ENABLE
529 530 531 532
			| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
	else
		wrmsr(MSR_IA32_THERM_INTERRUPT,
		      l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
533

534 535
	if (cpu_has(c, X86_FEATURE_PTS)) {
		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
536
		if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
537
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
538 539 540 541 542 543
				(l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE))
				& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
		else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
				l | (PACKAGE_THERM_INT_LOW_ENABLE
544 545 546 547 548 549
				| PACKAGE_THERM_INT_HIGH_ENABLE
				| PACKAGE_THERM_INT_PLN_ENABLE), h);
		else
			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
			      l | (PACKAGE_THERM_INT_LOW_ENABLE
				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
550 551
	}

552
	smp_thermal_vector = intel_thermal_interrupt;
553 554 555 556 557 558 559 560

	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);

	/* Unmask the thermal vector: */
	l = apic_read(APIC_LVTTHMR);
	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);

561 562
	pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
		      tm2 ? "TM2" : "TM1");
563 564 565 566

	/* enable thermal throttle processing */
	atomic_set(&therm_throt_en, 1);
}