/*
 * POWERNV cpufreq driver for the IBM POWER processors
 *
 * (C) Copyright IBM 2014
 *
 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#define pr_fmt(fmt)	"powernv-cpufreq: " fmt

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/hashtable.h>
#include <trace/events/power.h>

#include <asm/cputhreads.h>
#include <asm/firmware.h>
#include <asm/reg.h>
#include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
#include <asm/opal.h>
#include <linux/timer.h>

#define POWERNV_MAX_PSTATES_ORDER  8
#define POWERNV_MAX_PSTATES	(1UL << (POWERNV_MAX_PSTATES_ORDER))
#define PMSR_PSAFE_ENABLE	(1UL << 30)
#define PMSR_SPR_EM_DISABLE	(1UL << 31)
#define MAX_PSTATE_SHIFT	32
#define LPSTATE_SHIFT		48
#define GPSTATE_SHIFT		56

#define MAX_RAMP_DOWN_TIME				5120
/*
 * On an idle system we want the global pstate to ramp-down from max value to
 * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
 * then ramp-down rapidly later on.
 *
 * This gives a percentage rampdown for time elapsed in milliseconds.
 * ramp_down_percentage = ((ms * ms) >> 18)
 *			~= 3.8 * (sec * sec)
 *
 * At 0 ms	ramp_down_percent = 0
 * At 5120 ms	ramp_down_percent = 100
 */
#define ramp_down_percent(time)		((time * time) >> 18)

/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL				2000

/**
 * struct global_pstate_info -	Per policy data structure to maintain history of
 *				global pstates
 * @highest_lpstate_idx:	The local pstate index from which we are
 *				ramping down
 * @elapsed_time:		Time in ms spent in ramping down from
 *				highest_lpstate_idx
 * @last_sampled_time:		Time from boot in ms when global pstates were
 *				last set
 * @last_lpstate_idx:		Last set value of local pstate in terms of
 *				cpufreq table index
 * @last_gpstate_idx:		Last set value of global pstate in terms of
 *				cpufreq table index
 * @timer:			Is used for ramping down if cpu goes idle for
 *				a long time with global pstate held high
 * @gpstate_lock:		A spinlock to maintain synchronization between
 *				routines called by the timer handler and
 *				governor's target_index calls
 * @policy:			Associated cpufreq policy
 */
struct global_pstate_info {
	int highest_lpstate_idx;
	unsigned int elapsed_time;
	unsigned int last_sampled_time;
	int last_lpstate_idx;
	int last_gpstate_idx;
	spinlock_t gpstate_lock;
	struct timer_list timer;
	struct cpufreq_policy *policy;
};

static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];

DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);
/**
 * struct pstate_idx_revmap_data: Entry in the hashmap pstate_revmap
 *				  indexed by a function of pstate id.
 *
 * @pstate_id: pstate id for this entry.
 *
 * @cpufreq_table_idx: Index into the powernv_freqs
 *		       cpufreq_frequency_table for frequency
 *		       corresponding to pstate_id.
 *
 * @hentry: hlist_node that hooks this entry into the pstate_revmap
 *	    hashtable
 */
struct pstate_idx_revmap_data {
	u8 pstate_id;
	unsigned int cpufreq_table_idx;
	struct hlist_node hentry;
};

static bool rebooting, throttled, occ_reset;

static const char * const throttle_reason[] = {
	"No throttling",
	"Power Cap",
	"Processor Over Temperature",
	"Power Supply Failure",
	"Over Current",
	"OCC Reset"
};

enum throttle_reason_type {
	NO_THROTTLE = 0,
	POWERCAP,
	CPU_OVERTEMP,
	POWER_SUPPLY_FAILURE,
	OVERCURRENT,
	OCC_RESET_THROTTLE,
	OCC_MAX_REASON
};

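/**
 * struct chip - Store per-chip throttle state and statistics
 * @id:			Chip id as reported by the firmware
 * @throttled:		True if Pmax on this chip is currently capped
 * @restore:		True if frequencies must be restored once unthrottled
 * @throttle_reason:	Most recent OCC throttle reason for this chip
 * @mask:		CPUs belonging to this chip
 * @throttle:		Work item used to re-check the throttle state
 * @throttle_turbo:	Count of Pmax-capping events in the turbo range
 * @throttle_sub_turbo:	Count of Pmax-capping events below nominal
 * @reason:		Per-reason throttle event counts
 */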
static struct chip {
	unsigned int id;
	bool throttled;
	bool restore;
	u8 throttle_reason;
	cpumask_t mask;
	struct work_struct throttle;
	int throttle_turbo;
	int throttle_sub_turbo;
	int reason[OCC_MAX_REASON];
} *chips;

static int nr_chips;
static DEFINE_PER_CPU(struct chip *, chip_info);

/*
 * Note:
 * The set of pstates consists of contiguous integers.
 * powernv_pstate_info stores the index of the frequency table for
 * max, min and nominal frequencies. It also stores number of
 * available frequencies.
 *
 * powernv_pstate_info.nominal indicates the index to the highest
 * non-turbo frequency.
 */
static struct powernv_pstate_info {
	unsigned int min;
	unsigned int max;
	unsigned int nominal;
	unsigned int nr_pstates;
	bool wof_enabled;
} powernv_pstate_info;

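/* Extract an 8-bit pstate field from a PMSR/PMCR value at the given shift */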
static inline u8 extract_pstate(u64 pmsr_val, unsigned int shift)
{
	return ((pmsr_val >> shift) & 0xFF);
}

#define extract_local_pstate(x) extract_pstate(x, LPSTATE_SHIFT)
#define extract_global_pstate(x) extract_pstate(x, GPSTATE_SHIFT)
#define extract_max_pstate(x)  extract_pstate(x, MAX_PSTATE_SHIFT)

/* Use following functions for conversions between pstate_id and index */

/**
 * idx_to_pstate : Returns the pstate id corresponding to the
 *		   frequency in the cpufreq frequency table
 *		   powernv_freqs indexed by @i.
 *
 *		   If @i is out of bounds, this will return the pstate
 *		   corresponding to the nominal frequency.
 */
static inline u8 idx_to_pstate(unsigned int i)
{
	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
		pr_warn_once("idx_to_pstate: index %u is out of bounds\n", i);
		return powernv_freqs[powernv_pstate_info.nominal].driver_data;
	}

	return powernv_freqs[i].driver_data;
}

/**
 * pstate_to_idx : Returns the index in the cpufreq frequency table
 *		   powernv_freqs for the frequency whose corresponding
 *		   pstate id is @pstate.
 *
 *		   If no frequency corresponding to @pstate is found,
 *		   this will return the index of the nominal
 *		   frequency.
 */
static unsigned int pstate_to_idx(u8 pstate)
{
	unsigned int key = pstate % POWERNV_MAX_PSTATES;
	struct pstate_idx_revmap_data *revmap_data;

	hash_for_each_possible(pstate_revmap, revmap_data, hentry, key) {
		if (revmap_data->pstate_id == pstate)
			return revmap_data->cpufreq_table_idx;
	}

	pr_warn_once("pstate_to_idx: pstate 0x%x not found\n", pstate);
	return powernv_pstate_info.nominal;
}

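/* Reset the ramp-down history tracked in the policy's global_pstate_info */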
static inline void reset_gpstates(struct cpufreq_policy *policy)
{
	struct global_pstate_info *gpstates = policy->driver_data;

	gpstates->highest_lpstate_idx = 0;
	gpstates->elapsed_time = 0;
	gpstates->last_sampled_time = 0;
	gpstates->last_lpstate_idx = 0;
	gpstates->last_gpstate_idx = 0;
}

/*
 * Initialize the freq table based on data obtained
 * from the firmware passed via device-tree
 */
static int init_powernv_pstates(void)
{
	struct device_node *power_mgt;
	int i, nr_pstates = 0;
	const __be32 *pstate_ids, *pstate_freqs;
	u32 len_ids, len_freqs;
	u32 pstate_min, pstate_max, pstate_nominal;
	u32 pstate_turbo, pstate_ultra_turbo;

	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
	if (!power_mgt) {
		pr_warn("power-mgt node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
		pr_warn("ibm,pstate-min node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
		pr_warn("ibm,pstate-max node not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
				 &pstate_nominal)) {
		pr_warn("ibm,pstate-nominal not found\n");
		return -ENODEV;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
				 &pstate_ultra_turbo)) {
		powernv_pstate_info.wof_enabled = false;
		goto next;
	}

	if (of_property_read_u32(power_mgt, "ibm,pstate-turbo",
				 &pstate_turbo)) {
		powernv_pstate_info.wof_enabled = false;
		goto next;
	}

	if (pstate_turbo == pstate_ultra_turbo)
		powernv_pstate_info.wof_enabled = false;
	else
		powernv_pstate_info.wof_enabled = true;

next:
	pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
		pstate_nominal, pstate_max);
	pr_info("Workload Optimized Frequency is %s in the platform\n",
		(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");

	pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
	if (!pstate_ids) {
		pr_warn("ibm,pstate-ids not found\n");
		return -ENODEV;
	}

	pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
				      &len_freqs);
	if (!pstate_freqs) {
		pr_warn("ibm,pstate-frequencies-mhz not found\n");
		return -ENODEV;
	}

	if (len_ids != len_freqs) {
		pr_warn("Entries in ibm,pstate-ids and "
			"ibm,pstate-frequencies-mhz do not match\n");
	}

	nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
	if (!nr_pstates) {
		pr_warn("No PStates found\n");
		return -ENODEV;
	}

	powernv_pstate_info.nr_pstates = nr_pstates;
	pr_debug("NR PStates %d\n", nr_pstates);

	for (i = 0; i < nr_pstates; i++) {
		u32 id = be32_to_cpu(pstate_ids[i]);
		u32 freq = be32_to_cpu(pstate_freqs[i]);
		struct pstate_idx_revmap_data *revmap_data;
		unsigned int key;

		pr_debug("PState id %d freq %d MHz\n", id, freq);
		powernv_freqs[i].frequency = freq * 1000; /* kHz */
		powernv_freqs[i].driver_data = id & 0xFF;

		revmap_data = kmalloc(sizeof(*revmap_data), GFP_KERNEL);
		if (!revmap_data)
			return -ENOMEM;

		revmap_data->pstate_id = id & 0xFF;
		revmap_data->cpufreq_table_idx = i;
		key = (revmap_data->pstate_id) % POWERNV_MAX_PSTATES;
		hash_add(pstate_revmap, &revmap_data->hentry, key);

		if (id == pstate_max)
			powernv_pstate_info.max = i;
		else if (id == pstate_nominal)
			powernv_pstate_info.nominal = i;
		else if (id == pstate_min)
			powernv_pstate_info.min = i;

		if (powernv_pstate_info.wof_enabled && id == pstate_turbo) {
			int j;

			for (j = i - 1; j >= (int)powernv_pstate_info.max; j--)
				powernv_freqs[j].flags = CPUFREQ_BOOST_FREQ;
		}
	}

	/* End of list marker entry */
	powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
	return 0;
}

/* Returns the CPU frequency corresponding to the pstate_id. */
static unsigned int pstate_id_to_freq(u8 pstate_id)
{
	int i;

	i = pstate_to_idx(pstate_id);
	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
		pr_warn("PState id 0x%x outside of PState table, reporting nominal id 0x%x instead\n",
			pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
		i = powernv_pstate_info.nominal;
	}

	return powernv_freqs[i].frequency;
}

/*
 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 * the firmware
 */
static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
					char *buf)
{
	return sprintf(buf, "%u\n",
		powernv_freqs[powernv_pstate_info.nominal].frequency);
}

struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
	__ATTR_RO(cpuinfo_nominal_freq);

#define SCALING_BOOST_FREQS_ATTR_INDEX		2

static struct freq_attr *powernv_cpu_freq_attr[] = {
	&cpufreq_freq_attr_scaling_available_freqs,
	&cpufreq_freq_attr_cpuinfo_nominal_freq,
	&cpufreq_freq_attr_scaling_boost_freqs,
	NULL,
};

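/*
 * throttle_attr: Generates show() routines for the per-chip throttle
 * statistics exposed via the "throttle_stats" sysfs group below.
 */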
#define throttle_attr(name, member)					\
static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)	\
{									\
	struct chip *chip = per_cpu(chip_info, policy->cpu);		\
									\
	return sprintf(buf, "%u\n", chip->member);			\
}									\
									\
static struct freq_attr throttle_attr_##name = __ATTR_RO(name)		\

throttle_attr(unthrottle, reason[NO_THROTTLE]);
throttle_attr(powercap, reason[POWERCAP]);
throttle_attr(overtemp, reason[CPU_OVERTEMP]);
throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
throttle_attr(overcurrent, reason[OVERCURRENT]);
throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
throttle_attr(turbo_stat, throttle_turbo);
throttle_attr(sub_turbo_stat, throttle_sub_turbo);

static struct attribute *throttle_attrs[] = {
	&throttle_attr_unthrottle.attr,
	&throttle_attr_powercap.attr,
	&throttle_attr_overtemp.attr,
	&throttle_attr_supply_fault.attr,
	&throttle_attr_overcurrent.attr,
	&throttle_attr_occ_reset.attr,
	&throttle_attr_turbo_stat.attr,
	&throttle_attr_sub_turbo_stat.attr,
	NULL,
};

static const struct attribute_group throttle_attr_grp = {
	.name	= "throttle_stats",
	.attrs	= throttle_attrs,
};

/* Helper routines */

/* Access helpers to power mgt SPR */

static inline unsigned long get_pmspr(unsigned long sprn)
{
	switch (sprn) {
	case SPRN_PMCR:
		return mfspr(SPRN_PMCR);

	case SPRN_PMICR:
		return mfspr(SPRN_PMICR);

	case SPRN_PMSR:
		return mfspr(SPRN_PMSR);
	}
	BUG();
}

static inline void set_pmspr(unsigned long sprn, unsigned long val)
{
	switch (sprn) {
	case SPRN_PMCR:
		mtspr(SPRN_PMCR, val);
		return;

	case SPRN_PMICR:
		mtspr(SPRN_PMICR, val);
		return;
	}
	BUG();
}

/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 */
struct powernv_smp_call_data {
	unsigned int freq;
	u8 pstate_id;
	u8 gpstate_id;
};

/*
 * powernv_read_cpu_freq: Reads the current frequency on this CPU.
 *
 * Called via smp_call_function.
 *
 * Note: The caller of the smp_call_function should pass an argument of
 * the type 'struct powernv_smp_call_data *' along with this function.
 *
 * The current frequency on this CPU will be returned via
 * ((struct powernv_smp_call_data *)arg)->freq;
 */
static void powernv_read_cpu_freq(void *arg)
{
	unsigned long pmspr_val;
	struct powernv_smp_call_data *freq_data = arg;

	pmspr_val = get_pmspr(SPRN_PMSR);
	freq_data->pstate_id = extract_local_pstate(pmspr_val);
	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);

	pr_debug("cpu %d pmsr %016lX pstate_id 0x%x frequency %d kHz\n",
		 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
		 freq_data->freq);
}

/*
 * powernv_cpufreq_get: Returns the CPU frequency as reported by the
 * firmware for CPU 'cpu'. This value is reported through the sysfs
 * file cpuinfo_cur_freq.
 */
static unsigned int powernv_cpufreq_get(unsigned int cpu)
{
	struct powernv_smp_call_data freq_data;

	smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
			&freq_data, 1);

	return freq_data.freq;
}

/*
 * set_pstate: Sets the pstate on this CPU.
 *
 * This is called via an smp_call_function.
 *
 * The caller must ensure that freq_data is of the type
 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 * on this CPU should be present in freq_data->pstate_id.
 */
static void set_pstate(void *data)
{
	unsigned long val;
	struct powernv_smp_call_data *freq_data = data;
	unsigned long pstate_ul = freq_data->pstate_id;
	unsigned long gpstate_ul = freq_data->gpstate_id;

	val = get_pmspr(SPRN_PMCR);
	val = val & 0x0000FFFFFFFFFFFFULL;

	pstate_ul = pstate_ul & 0xFF;
	gpstate_ul = gpstate_ul & 0xFF;

	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
	val = val | (gpstate_ul << 56) | (pstate_ul << 48);

	pr_debug("Setting cpu %d pmcr to %016lX\n",
			raw_smp_processor_id(), val);
	set_pmspr(SPRN_PMCR, val);
}

/*
 * get_nominal_index: Returns the index corresponding to the nominal
 * pstate in the cpufreq table
 */
static inline unsigned int get_nominal_index(void)
{
	return powernv_pstate_info.nominal;
}

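/*
 * powernv_cpufreq_throttle_check: Detect Pmax capping and Psafe/SPR_EM
 * conditions from the PMSR and record them in the per-chip state.
 * Must run on the CPU being checked, either via smp_call_function or
 * with preemption disabled.
 */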
static void powernv_cpufreq_throttle_check(void *data)
{
	struct chip *chip;
	unsigned int cpu = smp_processor_id();
	unsigned long pmsr;
	u8 pmsr_pmax;
	unsigned int pmsr_pmax_idx;

	pmsr = get_pmspr(SPRN_PMSR);
	chip = this_cpu_read(chip_info);

	/* Check for Pmax Capping */
	pmsr_pmax = extract_max_pstate(pmsr);
	pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
	if (pmsr_pmax_idx != powernv_pstate_info.max) {
		if (chip->throttled)
			goto next;
		chip->throttled = true;
		if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
			pr_warn_once("CPU %d on Chip %u has Pmax(0x%x) reduced below that of nominal frequency(0x%x)\n",
				     cpu, chip->id, pmsr_pmax,
				     idx_to_pstate(powernv_pstate_info.nominal));
			chip->throttle_sub_turbo++;
		} else {
			chip->throttle_turbo++;
		}
		trace_powernv_throttle(chip->id,
				      throttle_reason[chip->throttle_reason],
				      pmsr_pmax);
	} else if (chip->throttled) {
		chip->throttled = false;
		trace_powernv_throttle(chip->id,
				      throttle_reason[chip->throttle_reason],
				      pmsr_pmax);
	}

	/* Check if Psafe_mode_active is set in PMSR. */
next:
	if (pmsr & PMSR_PSAFE_ENABLE) {
		throttled = true;
		pr_info("Pstate set to safe frequency\n");
	}

	/* Check if SPR_EM_DISABLE is set in PMSR */
	if (pmsr & PMSR_SPR_EM_DISABLE) {
		throttled = true;
		pr_info("Frequency Control disabled from OS\n");
	}

	if (throttled) {
		pr_info("PMSR = %16lx\n", pmsr);
		pr_warn("CPU Frequency could be throttled\n");
	}
}

/**
 * calc_global_pstate - Calculate global pstate
 * @elapsed_time:		Elapsed time in milliseconds
 * @highest_lpstate_idx:	pstate from which it is ramping down
 * @local_pstate_idx:		New local pstate
 *
 * Finds the appropriate global pstate based on the pstate from which it is
 * ramping down and the time elapsed in ramping down. It follows a quadratic
 * equation which ensures that it reaches ramping down to pmin in 5sec.
 */
static inline int calc_global_pstate(unsigned int elapsed_time,
				     int highest_lpstate_idx,
				     int local_pstate_idx)
{
	int index_diff;

	/*
	 * Using ramp_down_percent we get the percentage of rampdown
	 * that we are expecting to be dropping. Difference between
	 * highest_lpstate_idx and powernv_pstate_info.min will give an
	 * absolute number of how many pstates we will drop eventually
	 * by the end of 5 seconds, then just scale it to get the number
	 * of pstates to be dropped.
	 */
	index_diff =  ((int)ramp_down_percent(elapsed_time) *
			(powernv_pstate_info.min - highest_lpstate_idx)) / 100;

	/* Ensure that global pstate is >= to local pstate */
	if (highest_lpstate_idx + index_diff >= local_pstate_idx)
		return local_pstate_idx;
	else
		return highest_lpstate_idx + index_diff;
}

static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
{
	unsigned int timer_interval;

	/*
	 * Set the timer to fire after GPSTATE_TIMER_INTERVAL ms, but
	 * clamp the interval so that the total ramp-down time never
	 * exceeds MAX_RAMP_DOWN_TIME ms.
	 */
	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
	     > MAX_RAMP_DOWN_TIME)
		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
	else
		timer_interval = GPSTATE_TIMER_INTERVAL;

	mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
}

/**
 * gpstate_timer_handler
 *
 * @t: Timer context used to fetch global pstate info
 *
 * This handler brings down the global pstate closer to the local pstate
 * according to a quadratic equation. Queues a new timer if the global
 * pstate is still not equal to the local pstate.
 */
void gpstate_timer_handler(struct timer_list *t)
{
	struct global_pstate_info *gpstates = from_timer(gpstates, t, timer);
	struct cpufreq_policy *policy = gpstates->policy;
	int gpstate_idx, lpstate_idx;
	unsigned long val;
	unsigned int time_diff = jiffies_to_msecs(jiffies)
					- gpstates->last_sampled_time;
	struct powernv_smp_call_data freq_data;

	if (!spin_trylock(&gpstates->gpstate_lock))
		return;

	/*
	 * If the PMCR was last updated via fast_switch(),
	 * gpstates->last_lpstate_idx may be stale. Hence, read
	 * from the PMCR to get the correct data.
	 */
	val = get_pmspr(SPRN_PMCR);
	freq_data.gpstate_id = extract_global_pstate(val);
	freq_data.pstate_id = extract_local_pstate(val);
	if (freq_data.gpstate_id == freq_data.pstate_id) {
		reset_gpstates(policy);
		spin_unlock(&gpstates->gpstate_lock);
		return;
	}

	gpstates->last_sampled_time += time_diff;
	gpstates->elapsed_time += time_diff;

	if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
		gpstate_idx = pstate_to_idx(freq_data.pstate_id);
		lpstate_idx = gpstate_idx;
		reset_gpstates(policy);
		gpstates->highest_lpstate_idx = gpstate_idx;
	} else {
		lpstate_idx = pstate_to_idx(freq_data.pstate_id);
		gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
						 gpstates->highest_lpstate_idx,
						 lpstate_idx);
	}
	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
	gpstates->last_gpstate_idx = gpstate_idx;
	gpstates->last_lpstate_idx = lpstate_idx;
	/*
	 * If the local pstate is equal to the global pstate, the ramp-down
	 * is over, so the timer need not be queued again.
	 */
	if (gpstate_idx != gpstates->last_lpstate_idx)
		queue_gpstate_timer(gpstates);

	spin_unlock(&gpstates->gpstate_lock);

	/* Timer may get migrated to a different cpu on cpu hot unplug */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
}

/*
 * powernv_cpufreq_target_index: Sets the frequency corresponding to
 * the cpufreq table entry indexed by new_index on the cpus in the
 * mask policy->cpus
 */
static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
					unsigned int new_index)
{
	struct powernv_smp_call_data freq_data;
	unsigned int cur_msec, gpstate_idx;
	struct global_pstate_info *gpstates = policy->driver_data;

	if (unlikely(rebooting) && new_index != get_nominal_index())
		return 0;

	if (!throttled) {
		/* we don't want to be preempted while
		 * checking if the CPU frequency has been throttled
		 */
		preempt_disable();
		powernv_cpufreq_throttle_check(NULL);
		preempt_enable();
	}

	cur_msec = jiffies_to_msecs(get_jiffies_64());

	spin_lock(&gpstates->gpstate_lock);
	freq_data.pstate_id = idx_to_pstate(new_index);

	if (!gpstates->last_sampled_time) {
		gpstate_idx = new_index;
		gpstates->highest_lpstate_idx = new_index;
		goto gpstates_done;
	}

	if (gpstates->last_gpstate_idx < new_index) {
		gpstates->elapsed_time += cur_msec -
						 gpstates->last_sampled_time;

		/*
		 * If it has been ramping down for more than MAX_RAMP_DOWN_TIME
		 * we should reset all global pstate related data. Set it
		 * equal to the local pstate to start fresh.
		 */
		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
			reset_gpstates(policy);
			gpstates->highest_lpstate_idx = new_index;
			gpstate_idx = new_index;
		} else {
		/* Elapsed time is less than 5 seconds, continue to rampdown */
			gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
							 gpstates->highest_lpstate_idx,
							 new_index);
		}
	} else {
		reset_gpstates(policy);
		gpstates->highest_lpstate_idx = new_index;
		gpstate_idx = new_index;
	}

	/*
	 * If the local pstate is equal to the global pstate, the ramp-down
	 * is over, so the timer need not be queued again.
	 */
	if (gpstate_idx != new_index)
		queue_gpstate_timer(gpstates);
	else
		del_timer_sync(&gpstates->timer);

gpstates_done:
	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
	gpstates->last_sampled_time = cur_msec;
	gpstates->last_gpstate_idx = gpstate_idx;
	gpstates->last_lpstate_idx = new_index;

	spin_unlock(&gpstates->gpstate_lock);

	/*
	 * Use smp_call_function to send IPI and execute the
	 * mtspr on target CPU.  We could do that without IPI
	 * if current CPU is within policy->cpus (core)
	 */
	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
	return 0;
}

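/*
 * powernv_cpufreq_cpu_init: Share the policy among all threads of the
 * core, create the throttle_stats sysfs group once per policy and set
 * up the global pstate ramp-down state and timer.
 */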
static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
	int base, i, ret;
	struct kernfs_node *kn;
	struct global_pstate_info *gpstates;

	base = cpu_first_thread_sibling(policy->cpu);

	for (i = 0; i < threads_per_core; i++)
		cpumask_set_cpu(base + i, policy->cpus);

	kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
	if (!kn) {
		ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
		if (ret) {
			pr_info("Failed to create throttle stats directory for cpu %d\n",
				policy->cpu);
			return ret;
		}
	} else {
		kernfs_put(kn);
	}

	gpstates = kzalloc(sizeof(*gpstates), GFP_KERNEL);
	if (!gpstates)
		return -ENOMEM;

	policy->driver_data = gpstates;

	/* initialize timer */
	gpstates->policy = policy;
	timer_setup(&gpstates->timer, gpstate_timer_handler,
		    TIMER_PINNED | TIMER_DEFERRABLE);
	gpstates->timer.expires = jiffies +
				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
	spin_lock_init(&gpstates->gpstate_lock);
	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);

	if (ret < 0) {
		kfree(policy->driver_data);
		return ret;
	}

	policy->fast_switch_possible = true;
	return ret;
}

static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
	/* timer is deleted in cpufreq_cpu_stop() */
	kfree(policy->driver_data);

	return 0;
}

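/*
 * powernv_cpufreq_reboot_notifier: On reboot, pin every online CPU to
 * the nominal frequency and block further frequency changes.
 */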
static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
				unsigned long action, void *unused)
{
	int cpu;
	struct cpufreq_policy cpu_policy;

	rebooting = true;
	for_each_online_cpu(cpu) {
		cpufreq_get_policy(&cpu_policy, cpu);
		powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
	}

	return NOTIFY_DONE;
}

static struct notifier_block powernv_cpufreq_reboot_nb = {
	.notifier_call = powernv_cpufreq_reboot_notifier,
};

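/*
 * powernv_cpufreq_work_fn: Per-chip work function. Re-checks the throttle
 * state on one of the chip's online CPUs and, when requested, restores
 * each policy on the chip to its current frequency.
 */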
void powernv_cpufreq_work_fn(struct work_struct *work)
{
	struct chip *chip = container_of(work, struct chip, throttle);
	unsigned int cpu;
	cpumask_t mask;

	get_online_cpus();
	cpumask_and(&mask, &chip->mask, cpu_online_mask);
	smp_call_function_any(&mask,
			      powernv_cpufreq_throttle_check, NULL, 0);

	if (!chip->restore)
		goto out;

	chip->restore = false;
	for_each_cpu(cpu, &mask) {
		int index;
		struct cpufreq_policy policy;

		cpufreq_get_policy(&policy, cpu);
		index = cpufreq_table_find_index_c(&policy, policy.cur);
		powernv_cpufreq_target_index(&policy, index);
		cpumask_andnot(&mask, &mask, policy.cpus);
	}
out:
	put_online_cpus();
}

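/*
 * powernv_cpufreq_occ_msg: OPAL notifier callback for OCC messages.
 * Tracks OCC reset/load events and per-chip throttle status updates,
 * kicking the per-chip work function where needed.
 */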
static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
				   unsigned long msg_type, void *_msg)
{
	struct opal_msg *msg = _msg;
	struct opal_occ_msg omsg;
	int i;

	if (msg_type != OPAL_MSG_OCC)
		return 0;

	omsg.type = be64_to_cpu(msg->params[0]);

	switch (omsg.type) {
	case OCC_RESET:
		occ_reset = true;
		pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
		/*
		 * powernv_cpufreq_throttle_check() is called in
		 * target() callback which can detect the throttle state
		 * for governors like ondemand.
		 * But static governors will not call target() often thus
		 * report throttling here.
		 */
		if (!throttled) {
			throttled = true;
			pr_warn("CPU frequency is throttled for duration\n");
		}

		break;
	case OCC_LOAD:
		pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
		break;
	case OCC_THROTTLE:
		omsg.chip = be64_to_cpu(msg->params[1]);
		omsg.throttle_status = be64_to_cpu(msg->params[2]);

		if (occ_reset) {
			occ_reset = false;
			throttled = false;
			pr_info("OCC Active, CPU frequency is no longer throttled\n");

			for (i = 0; i < nr_chips; i++) {
				chips[i].restore = true;
				schedule_work(&chips[i].throttle);
			}

			return 0;
		}

		for (i = 0; i < nr_chips; i++)
			if (chips[i].id == omsg.chip)
				break;

		if (omsg.throttle_status >= 0 &&
		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
			chips[i].throttle_reason = omsg.throttle_status;
			chips[i].reason[omsg.throttle_status]++;
		}

		if (!omsg.throttle_status)
			chips[i].restore = true;

		schedule_work(&chips[i].throttle);
	}
	return 0;
}

static struct notifier_block powernv_cpufreq_opal_nb = {
	.notifier_call	= powernv_cpufreq_occ_msg,
	.next		= NULL,
	.priority	= 0,
};

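/*
 * powernv_cpufreq_stop_cpu: Drop the policy's CPU to the minimum pstate
 * and stop the global pstate ramp-down timer when the policy is stopped.
 */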
static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
{
	struct powernv_smp_call_data freq_data;
	struct global_pstate_info *gpstates = policy->driver_data;

	freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
	freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
	del_timer_sync(&gpstates->timer);
}

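/*
 * powernv_fast_switch: Fast switching entry point; programs the local and
 * global pstate directly on the calling CPU, bypassing the ramp-down logic
 * (gpstate_timer_handler later reconciles its state from the PMCR).
 */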
static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
					unsigned int target_freq)
{
	int index;
	struct powernv_smp_call_data freq_data;

	index = cpufreq_table_find_index_dl(policy, target_freq);
	freq_data.pstate_id = powernv_freqs[index].driver_data;
	freq_data.gpstate_id = powernv_freqs[index].driver_data;
	set_pstate(&freq_data);

	return powernv_freqs[index].frequency;
}

static struct cpufreq_driver powernv_cpufreq_driver = {
	.name		= "powernv-cpufreq",
	.flags		= CPUFREQ_CONST_LOOPS,
	.init		= powernv_cpufreq_cpu_init,
	.exit		= powernv_cpufreq_cpu_exit,
	.verify		= cpufreq_generic_frequency_table_verify,
	.target_index	= powernv_cpufreq_target_index,
	.fast_switch	= powernv_fast_switch,
	.get		= powernv_cpufreq_get,
	.stop_cpu	= powernv_cpufreq_stop_cpu,
	.attr		= powernv_cpu_freq_attr,
};

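/*
 * init_chip_info: Enumerate the distinct chips among the possible CPUs,
 * allocate per-chip state and point each CPU's chip_info at its chip.
 */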
static int init_chip_info(void)
{
	unsigned int chip[256];
	unsigned int cpu, i;
	unsigned int prev_chip_id = UINT_MAX;
	for_each_possible_cpu(cpu) {
		unsigned int id = cpu_to_chip_id(cpu);

		if (prev_chip_id != id) {
			prev_chip_id = id;
			chip[nr_chips++] = id;
		}
	}

	chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
	if (!chips)
		return -ENOMEM;

	for (i = 0; i < nr_chips; i++) {
		chips[i].id = chip[i];
		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
		for_each_cpu(cpu, &chips[i].mask)
			per_cpu(chip_info, cpu) = &chips[i];
	}

	return 0;
}

static inline void clean_chip_info(void)
{
	kfree(chips);
}

static inline void unregister_all_notifiers(void)
{
	opal_message_notifier_unregister(OPAL_MSG_OCC,
					 &powernv_cpufreq_opal_nb);
	unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}

static int __init powernv_cpufreq_init(void)
{
	int rc = 0;

	/* Don't probe on pseries (guest) platforms */
	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	/* Discover pstates from device tree and init */
	rc = init_powernv_pstates();
	if (rc)
		goto out;

	/* Populate chip info */
	rc = init_chip_info();
	if (rc)
		goto out;

	register_reboot_notifier(&powernv_cpufreq_reboot_nb);
	opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);

	if (powernv_pstate_info.wof_enabled)
		powernv_cpufreq_driver.boost_enabled = true;
	else
		powernv_cpu_freq_attr[SCALING_BOOST_FREQS_ATTR_INDEX] = NULL;

	rc = cpufreq_register_driver(&powernv_cpufreq_driver);
	if (rc) {
		pr_info("Failed to register the cpufreq driver (%d)\n", rc);
		goto cleanup_notifiers;
	}

	if (powernv_pstate_info.wof_enabled)
		cpufreq_enable_boost_support();

	return 0;
cleanup_notifiers:
	unregister_all_notifiers();
	clean_chip_info();
out:
	pr_info("Platform driver disabled. System does not support PState control\n");
	return rc;
}
module_init(powernv_cpufreq_init);

static void __exit powernv_cpufreq_exit(void)
{
	cpufreq_unregister_driver(&powernv_cpufreq_driver);
	unregister_all_notifiers();
	clean_chip_info();
}
module_exit(powernv_cpufreq_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");