cpufreq_ondemand.c 19.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
A
Andrew Morton 已提交
17
#include <linux/cpu.h>
L
Linus Torvalds 已提交
18 19
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
20
#include <linux/mutex.h>
21 22 23
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
24
#include <linux/sched.h>
L
Linus Torvalds 已提交
25 26 27 28 29 30

/*
 * dbs is used in this file as a short form for demand-based switching.
 * It helps to keep variable names smaller, simpler
 */

31
#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
L
Linus Torvalds 已提交
32
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
33 34
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
35
#define MIN_FREQUENCY_UP_THRESHOLD		(11)
L
Linus Torvalds 已提交
36 37
#define MAX_FREQUENCY_UP_THRESHOLD		(100)

38 39
/*
 * The polling frequency of this governor depends on the capability of
L
Linus Torvalds 已提交
40
 * the processor. Default polling frequency is 1000 times the transition
41 42
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10mS, using appropriate sampling
L
Linus Torvalds 已提交
43 44 45 46 47
 * rate.
 * For CPUs with transition latency > 10mS (mostly drivers with CPUFREQ_ETERNAL)
 * this governor will not work.
 * All times here are in uS.
 */
48
static unsigned int def_sampling_rate;
49 50
#define MIN_SAMPLING_RATE_RATIO			(2)
/* for correct statistics, we need at least 10 ticks between each measure */
51 52 53 54
#define MIN_STAT_SAMPLING_RATE 			\
			(MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
#define MIN_SAMPLING_RATE			\
			(def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
55 56 57 58 59 60 61 62 63 64 65 66
/* Above MIN_SAMPLING_RATE will vanish with its sysfs file soon
 * Define the minimal settable sampling rate to the greater of:
 *   - "HW transition latency" * 100 (same as default sampling / 10)
 *   - MIN_STAT_SAMPLING_RATE
 * This keeps userspace from shooting itself in the foot.
*/
static unsigned int minimum_sampling_rate(void)
{
	/* One tenth of the default sampling rate. */
	unsigned int latency_floor = def_sampling_rate / 10;
	/* Smallest rate that still yields usable statistics. */
	unsigned int stat_floor = MIN_STAT_SAMPLING_RATE;

	/* The larger of the two floors is the smallest settable rate. */
	return (latency_floor > stat_floor) ? latency_floor : stat_floor;
}

/* This will also vanish soon with removing sampling_rate_max */
L
Linus Torvalds 已提交
67
#define MAX_SAMPLING_RATE			(500 * def_sampling_rate)
68
#define LATENCY_MULTIPLIER			(1000)
69
#define TRANSITION_LATENCY_LIMIT		(10 * 1000 * 1000)
L
Linus Torvalds 已提交
70

D
David Howells 已提交
71 72 73
static void do_dbs_timer(struct work_struct *work);

/* Sampling types */
74
enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
L
Linus Torvalds 已提交
75 76

struct cpu_dbs_info_s {
77 78
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_wall;
79
	cputime64_t prev_cpu_nice;
80
	struct cpufreq_policy *cur_policy;
81
	struct delayed_work work;
82 83 84 85
	struct cpufreq_frequency_table *freq_table;
	unsigned int freq_lo;
	unsigned int freq_lo_jiffies;
	unsigned int freq_hi_jiffies;
86 87
	int cpu;
	unsigned int enable:1,
88
		sample_type:1;
L
Linus Torvalds 已提交
89 90 91 92 93
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);

static unsigned int dbs_enable;	/* number of CPUs using this policy */

94 95 96 97 98 99 100
/*
 * DEADLOCK ALERT! There is an ordering requirement between cpu_hotplug
 * lock and dbs_mutex. cpu_hotplug lock should always be held before
 * dbs_mutex. If any function that can potentially take cpu_hotplug lock
 * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
 * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
 * is recursive for the same process. -Venki
101 102 103
 * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
 * would deadlock with cancel_delayed_work_sync(), which is needed for proper
 * raceless workqueue teardown.
104
 */
105
static DEFINE_MUTEX(dbs_mutex);
L
Linus Torvalds 已提交
106

107
static struct workqueue_struct	*kondemand_wq;
108

109
/* Governor tunables, exposed through the "ondemand" sysfs group. */
static struct dbs_tuners {
	/* Sampling period in uS; set when the governor is first started. */
	unsigned int sampling_rate;
	/* Load threshold above which the frequency is raised. */
	unsigned int up_threshold;
	/* Subtracted from up_threshold to get the scale-down threshold. */
	unsigned int down_differential;
	/* 0 or 1; when set, nice time is added to idle time. */
	unsigned int ignore_nice;
	/* 0..1000; requested frequency is reduced by this many 1/1000ths. */
	unsigned int powersave_bias;
} dbs_tuners_ins = {
	.up_threshold		= DEF_FREQUENCY_UP_THRESHOLD,
	.down_differential	= DEF_FREQUENCY_DOWN_DIFFERENTIAL,
	.ignore_nice		= 0,
	.powersave_bias		= 0,
};

122 123
/*
 * Fallback idle-time accounting built from the per-CPU kstat counters.
 *
 * Busy time is the sum of user, system, irq, softirq, steal and nice time;
 * idle time is the wall time minus that sum.
 *
 * Both the return value and *wall are reported in microseconds, the same
 * unit get_cpu_idle_time_us() uses. Callers add nice time converted with
 * jiffies_to_usecs() to the idle delta, so returning raw jiffies here would
 * mix units and make idle time appear larger than wall time.
 *
 * @cpu:  CPU whose counters are read.
 * @wall: if non-NULL, receives the current wall time in microseconds.
 *
 * NOTE(review): the conversion goes through unsigned long / unsigned int,
 * so the values may wrap; this assumes callers only use differences between
 * two samples - confirm if a new caller needs absolute values.
 */
static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
							cputime64_t *wall)
{
	cputime64_t idle_time;
	cputime64_t cur_wall_time;
	cputime64_t busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
			kstat_cpu(cpu).cpustat.system);

	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

	idle_time = cputime64_sub(cur_wall_time, busy_time);
	if (wall)
		*wall = (cputime64_t)jiffies_to_usecs((unsigned long)
				cputime64_to_jiffies64(cur_wall_time));

	return (cputime64_t)jiffies_to_usecs((unsigned long)
			cputime64_to_jiffies64(idle_time));
}

145 146 147 148 149 150 151 152 153 154
/*
 * Return the idle time of @cpu and store the matching wall time in @wall.
 * get_cpu_idle_time_us() is tried first; when it reports -1ULL the
 * kstat-based get_cpu_idle_time_jiffy() is used instead.
 */
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	u64 idle_us = get_cpu_idle_time_us(cpu, wall);

	if (idle_us != -1ULL)
		return idle_us;

	return get_cpu_idle_time_jiffy(cpu, wall);
}

155 156 157 158 159
/*
 * Find right freq to be set now with powersave_bias on.
 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
 */
160 161 162
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
					  unsigned int freq_next,
					  unsigned int relation)
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
{
	unsigned int freq_req, freq_reduc, freq_avg;
	unsigned int freq_hi, freq_lo;
	unsigned int index = 0;
	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
	struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);

	if (!dbs_info->freq_table) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_next;
	}

	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
			relation, &index);
	freq_req = dbs_info->freq_table[index].frequency;
	freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
	freq_avg = freq_req - freq_reduc;

	/* Find freq bounds for freq_avg in freq_table */
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_H, &index);
	freq_lo = dbs_info->freq_table[index].frequency;
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_L, &index);
	freq_hi = dbs_info->freq_table[index].frequency;

	/* Find out how long we have to be in hi and lo freqs */
	if (freq_hi == freq_lo) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_lo;
	}
	jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
	jiffies_hi += ((freq_hi - freq_lo) / 2);
	jiffies_hi /= (freq_hi - freq_lo);
	jiffies_lo = jiffies_total - jiffies_hi;
	dbs_info->freq_lo = freq_lo;
	dbs_info->freq_lo_jiffies = jiffies_lo;
	dbs_info->freq_hi_jiffies = jiffies_hi;
	return freq_hi;
}

/*
 * For every online CPU, look up its frequency table again and clear
 * freq_lo.
 */
static void ondemand_powersave_bias_init(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct cpu_dbs_info_s *info = &per_cpu(cpu_dbs_info, cpu);

		info->freq_table = cpufreq_frequency_get_table(cpu);
		info->freq_lo = 0;
	}
}

L
Linus Torvalds 已提交
219 220 221
/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_max(struct cpufreq_policy *policy, char *buf)
{
222 223 224 225 226 227 228 229
	static int print_once;

	if (!print_once) {
		printk(KERN_INFO "CPUFREQ: ondemand sampling_rate_max "
		       "sysfs file is deprecated - used by: %s\n",
		       current->comm);
		print_once = 1;
	}
230
	return sprintf(buf, "%u\n", MAX_SAMPLING_RATE);
L
Linus Torvalds 已提交
231 232 233 234
}

static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
{
235 236 237 238 239 240 241 242
	static int print_once;

	if (!print_once) {
		printk(KERN_INFO "CPUFREQ: ondemand sampling_rate_min "
		       "sysfs file is deprecated - used by: %s\n",
		       current->comm);
		print_once = 1;
	}
243
	return sprintf(buf, "%u\n", MIN_SAMPLING_RATE);
L
Linus Torvalds 已提交
244 245
}

246 247
#define define_one_ro(_name)		\
static struct freq_attr _name =		\
L
Linus Torvalds 已提交
248 249 250 251 252 253 254 255 256 257 258 259 260 261
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
/*
 * show_one(file_name, object) defines show_<file_name>(), a sysfs show
 * handler that prints dbs_tuners_ins.<object> as an unsigned decimal
 * followed by a newline. The policy argument is not used.
 */
#define show_one(file_name, object)					\
static ssize_t show_##file_name						\
(struct cpufreq_policy *unused, char *buf)				\
{									\
	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
}
show_one(sampling_rate, sampling_rate);
show_one(up_threshold, up_threshold);
262
show_one(ignore_nice_load, ignore_nice);
263
show_one(powersave_bias, powersave_bias);
L
Linus Torvalds 已提交
264

265
/*
 * sysfs store handler for "sampling_rate".
 *
 * Parses an unsigned decimal from @buf and stores it in
 * dbs_tuners_ins.sampling_rate, raised to minimum_sampling_rate() when the
 * requested value is smaller.
 *
 * The input is validated before dbs_mutex is taken, as the other store
 * handlers in this file do, so a malformed write never touches the lock.
 *
 * Returns @count on success, -EINVAL when no number could be parsed.
 */
static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.sampling_rate = max(input, minimum_sampling_rate());
	mutex_unlock(&dbs_mutex);

	return count;
}

283
/*
 * sysfs store handler for "up_threshold".
 *
 * Accepts an unsigned decimal in the range
 * [MIN_FREQUENCY_UP_THRESHOLD, MAX_FREQUENCY_UP_THRESHOLD] and stores it in
 * dbs_tuners_ins.up_threshold.
 *
 * The input is validated before dbs_mutex is taken, as the other store
 * handlers in this file do, so a rejected write never touches the lock.
 *
 * Returns @count on success, -EINVAL for unparsable or out-of-range input.
 */
static ssize_t store_up_threshold(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
			input < MIN_FREQUENCY_UP_THRESHOLD)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.up_threshold = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

303
/*
 * sysfs store handler for "ignore_nice_load".
 *
 * Any value greater than 1 is treated as 1. When the setting actually
 * changes, the idle/wall (and, if enabled, nice) snapshots of every online
 * CPU are refreshed.
 *
 * Returns @count on success, -EINVAL when no number could be parsed.
 */
static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
		const char *buf, size_t count)
{
	unsigned int input;
	unsigned int cpu;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	/* Clamp to a boolean. */
	input = input ? 1 : 0;

	mutex_lock(&dbs_mutex);
	if (input != dbs_tuners_ins.ignore_nice) {
		dbs_tuners_ins.ignore_nice = input;

		/* we need to re-evaluate prev_cpu_idle */
		for_each_online_cpu(cpu) {
			struct cpu_dbs_info_s *info =
					&per_cpu(cpu_dbs_info, cpu);

			info->prev_cpu_idle = get_cpu_idle_time(cpu,
						&info->prev_cpu_wall);
			if (dbs_tuners_ins.ignore_nice)
				info->prev_cpu_nice =
						kstat_cpu(cpu).cpustat.nice;
		}
	}
	mutex_unlock(&dbs_mutex);

	return count;
}

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
/*
 * sysfs store handler for "powersave_bias".
 *
 * Values above 1000 are clamped to 1000. After storing the new bias the
 * per-CPU powersave state is re-initialised.
 *
 * Returns @count on success, -EINVAL when no number could be parsed.
 */
static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.powersave_bias = (input > 1000) ? 1000 : input;
	ondemand_powersave_bias_init();
	mutex_unlock(&dbs_mutex);

	return count;
}

L
Linus Torvalds 已提交
361 362 363 364 365 366
/*
 * define_one_rw(_name) declares a mode-0644 freq_attr called _name that is
 * wired to show__name() and store__name().
 */
#define define_one_rw(_name) \
static struct freq_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(up_threshold);
367
define_one_rw(ignore_nice_load);
368
define_one_rw(powersave_bias);
L
Linus Torvalds 已提交
369

370
static struct attribute *dbs_attributes[] = {
L
Linus Torvalds 已提交
371 372 373 374
	&sampling_rate_max.attr,
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&up_threshold.attr,
375
	&ignore_nice_load.attr,
376
	&powersave_bias.attr,
L
Linus Torvalds 已提交
377 378 379 380 381 382 383 384 385 386
	NULL
};

/*
 * Attribute group created on the policy kobject at CPUFREQ_GOV_START and
 * removed again at CPUFREQ_GOV_STOP.
 */
static struct attribute_group dbs_attr_group = {
	.attrs = dbs_attributes,
	.name = "ondemand",
};

/************************** sysfs end ************************/

387
static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
L
Linus Torvalds 已提交
388
{
389
	unsigned int max_load_freq;
L
Linus Torvalds 已提交
390 391 392 393 394 395 396

	struct cpufreq_policy *policy;
	unsigned int j;

	if (!this_dbs_info->enable)
		return;

397
	this_dbs_info->freq_lo = 0;
L
Linus Torvalds 已提交
398
	policy = this_dbs_info->cur_policy;
399

400
	/*
401 402
	 * Every sampling_rate, we check, if current idle time is less
	 * than 20% (default), then we try to increase frequency
403
	 * Every sampling_rate, we look for a the lowest
404 405
	 * frequency which can sustain the load while keeping idle time over
	 * 30%. If such a frequency exist, we try to decrease to this frequency.
L
Linus Torvalds 已提交
406
	 *
407 408 409
	 * Any frequency increase takes it to the maximum frequency.
	 * Frequency reduction happens at minimum steps of
	 * 5% (default) of current frequency
L
Linus Torvalds 已提交
410 411
	 */

412 413 414
	/* Get Absolute Load - in terms of freq */
	max_load_freq = 0;

415
	for_each_cpu(j, policy->cpus) {
L
Linus Torvalds 已提交
416
		struct cpu_dbs_info_s *j_dbs_info;
417 418 419 420
		cputime64_t cur_wall_time, cur_idle_time;
		unsigned int idle_time, wall_time;
		unsigned int load, load_freq;
		int freq_avg;
L
Linus Torvalds 已提交
421 422

		j_dbs_info = &per_cpu(cpu_dbs_info, j);
423 424 425

		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);

426 427 428 429 430
		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
				j_dbs_info->prev_cpu_wall);
		j_dbs_info->prev_cpu_wall = cur_wall_time;

		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
431
				j_dbs_info->prev_cpu_idle);
432
		j_dbs_info->prev_cpu_idle = cur_idle_time;
L
Linus Torvalds 已提交
433

434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
		if (dbs_tuners_ins.ignore_nice) {
			cputime64_t cur_nice;
			unsigned long cur_nice_jiffies;

			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
					 j_dbs_info->prev_cpu_nice);
			/*
			 * Assumption: nice time between sampling periods will
			 * be less than 2^32 jiffies for 32 bit sys
			 */
			cur_nice_jiffies = (unsigned long)
					cputime64_to_jiffies64(cur_nice);

			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
			idle_time += jiffies_to_usecs(cur_nice_jiffies);
		}

451
		if (unlikely(!wall_time || wall_time < idle_time))
452 453 454 455 456 457 458 459 460 461 462
			continue;

		load = 100 * (wall_time - idle_time) / wall_time;

		freq_avg = __cpufreq_driver_getavg(policy, j);
		if (freq_avg <= 0)
			freq_avg = policy->cur;

		load_freq = load * freq_avg;
		if (load_freq > max_load_freq)
			max_load_freq = load_freq;
L
Linus Torvalds 已提交
463 464
	}

465
	/* Check for frequency increase */
466
	if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
467
		/* if we are already at full speed then break out early */
468 469 470 471 472 473 474 475 476 477 478 479
		if (!dbs_tuners_ins.powersave_bias) {
			if (policy->cur == policy->max)
				return;

			__cpufreq_driver_target(policy, policy->max,
				CPUFREQ_RELATION_H);
		} else {
			int freq = powersave_bias_target(policy, policy->max,
					CPUFREQ_RELATION_H);
			__cpufreq_driver_target(policy, freq,
				CPUFREQ_RELATION_L);
		}
L
Linus Torvalds 已提交
480 481 482 483
		return;
	}

	/* Check for frequency decrease */
484 485 486
	/* if we cannot reduce the frequency anymore, break out early */
	if (policy->cur == policy->min)
		return;
L
Linus Torvalds 已提交
487

488 489 490 491 492
	/*
	 * The optimal frequency is the frequency that is the lowest that
	 * can support the current CPU usage without triggering the up
	 * policy. To be safe, we focus 10 points under the threshold.
	 */
493 494 495
	if (max_load_freq <
	    (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) *
	     policy->cur) {
496
		unsigned int freq_next;
497 498 499
		freq_next = max_load_freq /
				(dbs_tuners_ins.up_threshold -
				 dbs_tuners_ins.down_differential);
500

501 502 503 504 505 506 507 508 509
		if (!dbs_tuners_ins.powersave_bias) {
			__cpufreq_driver_target(policy, freq_next,
					CPUFREQ_RELATION_L);
		} else {
			int freq = powersave_bias_target(policy, freq_next,
					CPUFREQ_RELATION_L);
			__cpufreq_driver_target(policy, freq,
				CPUFREQ_RELATION_L);
		}
510
	}
L
Linus Torvalds 已提交
511 512
}

D
David Howells 已提交
513
static void do_dbs_timer(struct work_struct *work)
514
{
515 516 517 518 519
	struct cpu_dbs_info_s *dbs_info =
		container_of(work, struct cpu_dbs_info_s, work.work);
	unsigned int cpu = dbs_info->cpu;
	int sample_type = dbs_info->sample_type;

520 521
	/* We want all CPUs to do sampling nearly on same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
D
David Howells 已提交
522

523
	delay -= jiffies % delay;
524

525
	if (lock_policy_rwsem_write(cpu) < 0)
526
		return;
527 528 529 530 531 532

	if (!dbs_info->enable) {
		unlock_policy_rwsem_write(cpu);
		return;
	}

533
	/* Common NORMAL_SAMPLE setup */
D
David Howells 已提交
534
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
535
	if (!dbs_tuners_ins.powersave_bias ||
D
David Howells 已提交
536
	    sample_type == DBS_NORMAL_SAMPLE) {
537 538 539
		dbs_check_cpu(dbs_info);
		if (dbs_info->freq_lo) {
			/* Setup timer for SUB_SAMPLE */
D
David Howells 已提交
540
			dbs_info->sample_type = DBS_SUB_SAMPLE;
541 542 543 544
			delay = dbs_info->freq_hi_jiffies;
		}
	} else {
		__cpufreq_driver_target(dbs_info->cur_policy,
545
			dbs_info->freq_lo, CPUFREQ_RELATION_H);
546
	}
547
	queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
548
	unlock_policy_rwsem_write(cpu);
549
}
L
Linus Torvalds 已提交
550

551
static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
L
Linus Torvalds 已提交
552
{
553 554 555
	/* We want all CPUs to do sampling nearly on same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	delay -= jiffies % delay;
556

D
Dave Jones 已提交
557
	dbs_info->enable = 1;
558
	ondemand_powersave_bias_init();
D
David Howells 已提交
559
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
560
	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
561
	queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
562
		delay);
L
Linus Torvalds 已提交
563 564
}

565
static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
L
Linus Torvalds 已提交
566
{
567
	dbs_info->enable = 0;
568
	cancel_delayed_work_sync(&dbs_info->work);
L
Linus Torvalds 已提交
569 570 571 572 573 574 575 576
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				   unsigned int event)
{
	unsigned int cpu = policy->cpu;
	struct cpu_dbs_info_s *this_dbs_info;
	unsigned int j;
J
Jeff Garzik 已提交
577
	int rc;
L
Linus Torvalds 已提交
578 579 580 581 582

	this_dbs_info = &per_cpu(cpu_dbs_info, cpu);

	switch (event) {
	case CPUFREQ_GOV_START:
583
		if ((!cpu_online(cpu)) || (!policy->cur))
L
Linus Torvalds 已提交
584 585 586 587
			return -EINVAL;

		if (this_dbs_info->enable) /* Already enabled */
			break;
588

589
		mutex_lock(&dbs_mutex);
590
		dbs_enable++;
J
Jeff Garzik 已提交
591 592 593 594 595 596 597 598

		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
		if (rc) {
			dbs_enable--;
			mutex_unlock(&dbs_mutex);
			return rc;
		}

599
		for_each_cpu(j, policy->cpus) {
L
Linus Torvalds 已提交
600 601 602
			struct cpu_dbs_info_s *j_dbs_info;
			j_dbs_info = &per_cpu(cpu_dbs_info, j);
			j_dbs_info->cur_policy = policy;
603

604 605
			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&j_dbs_info->prev_cpu_wall);
606 607 608 609
			if (dbs_tuners_ins.ignore_nice) {
				j_dbs_info->prev_cpu_nice =
						kstat_cpu(j).cpustat.nice;
			}
L
Linus Torvalds 已提交
610
		}
611
		this_dbs_info->cpu = cpu;
L
Linus Torvalds 已提交
612 613 614 615 616 617 618
		/*
		 * Start the timerschedule work, when this governor
		 * is used for first time
		 */
		if (dbs_enable == 1) {
			unsigned int latency;
			/* policy latency is in nS. Convert it to uS first */
619 620 621
			latency = policy->cpuinfo.transition_latency / 1000;
			if (latency == 0)
				latency = 1;
L
Linus Torvalds 已提交
622

623 624 625
			def_sampling_rate =
				max(latency * LATENCY_MULTIPLIER,
				    MIN_STAT_SAMPLING_RATE);
626

L
Linus Torvalds 已提交
627 628
			dbs_tuners_ins.sampling_rate = def_sampling_rate;
		}
629
		dbs_timer_init(this_dbs_info);
630

631
		mutex_unlock(&dbs_mutex);
L
Linus Torvalds 已提交
632 633 634
		break;

	case CPUFREQ_GOV_STOP:
635
		mutex_lock(&dbs_mutex);
636
		dbs_timer_exit(this_dbs_info);
L
Linus Torvalds 已提交
637 638
		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
		dbs_enable--;
639
		mutex_unlock(&dbs_mutex);
L
Linus Torvalds 已提交
640 641 642 643

		break;

	case CPUFREQ_GOV_LIMITS:
644
		mutex_lock(&dbs_mutex);
L
Linus Torvalds 已提交
645
		if (policy->max < this_dbs_info->cur_policy->cur)
646
			__cpufreq_driver_target(this_dbs_info->cur_policy,
647
				policy->max, CPUFREQ_RELATION_H);
L
Linus Torvalds 已提交
648
		else if (policy->min > this_dbs_info->cur_policy->cur)
649
			__cpufreq_driver_target(this_dbs_info->cur_policy,
650
				policy->min, CPUFREQ_RELATION_L);
651
		mutex_unlock(&dbs_mutex);
L
Linus Torvalds 已提交
652 653 654 655 656
		break;
	}
	return 0;
}

657 658 659
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
660 661 662 663 664
struct cpufreq_governor cpufreq_gov_ondemand = {
	.name			= "ondemand",
	.governor		= cpufreq_governor_dbs,
	.max_transition_latency = TRANSITION_LATENCY_LIMIT,
	.owner			= THIS_MODULE,
L
Linus Torvalds 已提交
665 666 667 668
};

/*
 * Module init: pick thresholds, create the "kondemand" workqueue and
 * register the governor.
 *
 * If get_cpu_idle_time_us() reports a value other than -1ULL, idle micro
 * accounting is available and the finer MICRO_* thresholds are used.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or the
 * error from cpufreq_register_governor() (the workqueue is destroyed again
 * in that case).
 */
static int __init cpufreq_gov_dbs_init(void)
{
	int err;
	cputime64_t wall;
	u64 idle_time;
	int cpu = get_cpu();

	idle_time = get_cpu_idle_time_us(cpu, &wall);
	put_cpu();
	if (idle_time != -1ULL) {
		/* Idle micro accounting is supported. Use finer thresholds */
		dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
		dbs_tuners_ins.down_differential =
					MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
	}

	kondemand_wq = create_workqueue("kondemand");
	if (!kondemand_wq) {
		printk(KERN_ERR "Creation of kondemand failed\n");
		/* Resource failure, not a bad address: report -ENOMEM. */
		return -ENOMEM;
	}

	err = cpufreq_register_governor(&cpufreq_gov_ondemand);
	if (err)
		destroy_workqueue(kondemand_wq);

	return err;
}

/*
 * Module exit: unregister the governor first, then destroy the
 * workqueue it used.
 */
static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
	destroy_workqueue(kondemand_wq);
}


702 703 704
MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
705
	"Low Latency Frequency Transition capable processors");
706
MODULE_LICENSE("GPL");
L
Linus Torvalds 已提交
707

708 709 710
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
L
Linus Torvalds 已提交
711
module_init(cpufreq_gov_dbs_init);
712
#endif
L
Linus Torvalds 已提交
713
module_exit(cpufreq_gov_dbs_exit);