/*
 * drivers/cpufreq/cpufreq_governor.c
 *
 * CPUFREQ governors common code
 *
 * Copyright	(C) 2001 Russell King
 *		(C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *		(C) 2003 Jun Nakajima <jun.nakajima@intel.com>
 *		(C) 2009 Alexander Clouter <alex@digriz.org.uk>
 *		(c) 2012 Viresh Kumar <viresh.kumar@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/slab.h>

#include "cpufreq_governor.h"

DEFINE_MUTEX(dbs_data_mutex);
EXPORT_SYMBOL_GPL(dbs_data_mutex);

/* Common sysfs tunables */
/**
 * store_sampling_rate - update sampling rate effective immediately if needed.
 *
 * If the new rate is smaller than the old one, simply updating
 * dbs_data->sampling_rate might not be appropriate. For example, if the
 * original sampling_rate was 1 second and the user requests a new rate of
 * 10 ms because they need an immediate reaction from the ondemand governor,
 * the governor might not notice the change until up to 1 second later.
 * Thus, when the sampling rate is being reduced, the new value has to take
 * effect immediately.
 *
 * This must be called with dbs_data->mutex held, otherwise traversing
 * policy_dbs_list isn't safe.
 */
ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
			    size_t count)
{
	struct policy_dbs_info *policy_dbs;
	unsigned int rate;
	int ret;
	ret = sscanf(buf, "%u", &rate);
	if (ret != 1)
		return -EINVAL;

	dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate);

	/*
	 * We are operating under dbs_data->mutex and so the list and its
	 * entries can't be freed concurrently.
	 */
	list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
		mutex_lock(&policy_dbs->timer_mutex);
		/*
		 * On 32-bit architectures this may race with the
		 * sample_delay_ns read in dbs_update_util_handler(), but that
		 * really doesn't matter.  If the read returns a value that's
		 * too big, the sample will be skipped, but the next invocation
		 * of dbs_update_util_handler() (when the update has been
		 * completed) will take a sample.
		 *
		 * If this runs in parallel with dbs_work_handler(), we may end
		 * up overwriting the sample_delay_ns value that it has just
		 * written, but it will be corrected next time a sample is
		 * taken, so it shouldn't be significant.
		 */
		gov_update_sample_delay(policy_dbs, 0);
		mutex_unlock(&policy_dbs->timer_mutex);
	}

	return count;
}
EXPORT_SYMBOL_GPL(store_sampling_rate);

static inline struct dbs_data *to_dbs_data(struct kobject *kobj)
{
	return container_of(kobj, struct dbs_data, kobj);
}

static inline struct governor_attr *to_gov_attr(struct attribute *attr)
{
	return container_of(attr, struct governor_attr, attr);
}

static ssize_t governor_show(struct kobject *kobj, struct attribute *attr,
			     char *buf)
{
	struct dbs_data *dbs_data = to_dbs_data(kobj);
	struct governor_attr *gattr = to_gov_attr(attr);
	int ret = -EIO;

	if (gattr->show)
		ret = gattr->show(dbs_data, buf);

	return ret;
}

static ssize_t governor_store(struct kobject *kobj, struct attribute *attr,
			      const char *buf, size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(kobj);
	struct governor_attr *gattr = to_gov_attr(attr);
	int ret = -EIO;

	mutex_lock(&dbs_data->mutex);

	if (gattr->store)
		ret = gattr->store(dbs_data, buf, count);

	mutex_unlock(&dbs_data->mutex);

	return ret;
}

/*
 * Sysfs Ops for accessing governor attributes.
 *
 * All show/store requests for governor-specific sysfs attributes are routed
 * through governor_show()/governor_store() above, which in turn invoke the
 * attribute-specific callback.
 */
static const struct sysfs_ops governor_sysfs_ops = {
	.show	= governor_show,
	.store	= governor_store,
};

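/*
 * dbs_update - evaluate the current load of the policy's CPUs.
 *
 * For each CPU sharing the policy, sample the wall and idle time since the
 * previous invocation, turn the difference into a load percentage and return
 * the highest load seen across policy->cpus.
 */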
unsigned int dbs_update(struct cpufreq_policy *policy)
{
	struct dbs_governor *gov = dbs_governor_of(policy);
	struct policy_dbs_info *policy_dbs = policy->governor_data;
	struct dbs_data *dbs_data = policy_dbs->dbs_data;
	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
	unsigned int ignore_nice = dbs_data->ignore_nice_load;
	unsigned int max_load = 0;
	unsigned int sampling_rate, j;

	/*
	 * Sometimes governors may use an additional multiplier to increase
	 * sample delays temporarily.  Apply that multiplier to sampling_rate
	 * so as to keep the wake-up-from-idle detection logic a bit
	 * conservative.
	 */
	sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;

	/* Get Absolute Load */
	for_each_cpu(j, policy->cpus) {
		struct cpu_dbs_info *j_cdbs;
		u64 cur_wall_time, cur_idle_time;
		unsigned int idle_time, wall_time;
		unsigned int load;
		int io_busy = 0;

		j_cdbs = gov->get_cpu_cdbs(j);

		/*
		 * For the purpose of ondemand, waiting for disk IO is
		 * an indication that you're performance critical, and
		 * not that the system is actually idle. So do not add
		 * the iowait time to the cpu idle time.
		 */
		if (gov->governor == GOV_ONDEMAND)
			io_busy = od_tuners->io_is_busy;
		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy);

		wall_time = cur_wall_time - j_cdbs->prev_cpu_wall;
		j_cdbs->prev_cpu_wall = cur_wall_time;

		if (cur_idle_time <= j_cdbs->prev_cpu_idle) {
			idle_time = 0;
		} else {
			idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
			j_cdbs->prev_cpu_idle = cur_idle_time;
		}

		if (ignore_nice) {
			u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];

			idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice);
			j_cdbs->prev_cpu_nice = cur_nice;
		}

		if (unlikely(!wall_time || wall_time < idle_time))
			continue;

		/*
		 * If the CPU had gone completely idle, and a task just woke up
		 * on this CPU now, it would be unfair to calculate 'load' the
		 * usual way for this elapsed time-window, because it will show
		 * near-zero load, irrespective of how CPU intensive that task
		 * actually is. This is undesirable for latency-sensitive bursty
		 * workloads.
		 *
		 * To avoid this, we reuse the 'load' from the previous
		 * time-window and give this task a chance to start with a
		 * reasonably high CPU frequency. (However, we shouldn't over-do
		 * this copy, lest we get stuck at a high load (high frequency)
		 * for too long, even when the current system load has actually
		 * dropped down. So we perform the copy only once, upon the
		 * first wake-up from idle.)
		 *
		 * Detecting this situation is easy: the governor's utilization
		 * update handler would not have run during CPU-idle periods.
		 * Hence, an unusually large 'wall_time' (as compared to the
		 * sampling rate) indicates this scenario.
		 *
		 * prev_load can be zero in two cases and we must recalculate it
		 * for both cases:
		 * - during long idle intervals
		 * - explicitly set to zero
		 */
		if (unlikely(wall_time > (2 * sampling_rate) &&
			     j_cdbs->prev_load)) {
			load = j_cdbs->prev_load;

			/*
			 * Perform a destructive copy, to ensure that we copy
			 * the previous load only once, upon the first wake-up
			 * from idle.
			 */
			j_cdbs->prev_load = 0;
		} else {
			load = 100 * (wall_time - idle_time) / wall_time;
			j_cdbs->prev_load = load;
		}

		if (load > max_load)
			max_load = load;
	}
	return max_load;
}
EXPORT_SYMBOL_GPL(dbs_update);

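/*
 * gov_set_update_util - hook the governor's utilization update handler.
 *
 * Arm the sample delay for the policy and register the per-CPU update_util
 * callbacks so that dbs_update_util_handler() starts getting invoked.
 */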
void gov_set_update_util(struct policy_dbs_info *policy_dbs,
			 unsigned int delay_us)
{
	struct cpufreq_policy *policy = policy_dbs->policy;
	struct dbs_governor *gov = dbs_governor_of(policy);
	int cpu;

	gov_update_sample_delay(policy_dbs, delay_us);
	policy_dbs->last_sample_time = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu);

		cpufreq_set_update_util_data(cpu, &cdbs->update_util);
	}
}
EXPORT_SYMBOL_GPL(gov_set_update_util);

static inline void gov_clear_update_util(struct cpufreq_policy *policy)
{
	int i;

	for_each_cpu(i, policy->cpus)
		cpufreq_set_update_util_data(i, NULL);

	synchronize_rcu();
}

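/*
 * gov_cancel_work - stop load evaluation for the policy.
 *
 * Unregister the utilization update callbacks and wait for any pending
 * irq_work and work item to complete before resetting the bookkeeping.
 */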
static void gov_cancel_work(struct cpufreq_policy *policy)
{
	struct policy_dbs_info *policy_dbs = policy->governor_data;

	gov_clear_update_util(policy_dbs->policy);
	irq_work_sync(&policy_dbs->irq_work);
	cancel_work_sync(&policy_dbs->work);
	atomic_set(&policy_dbs->work_count, 0);
	policy_dbs->work_in_progress = false;
}

static void dbs_work_handler(struct work_struct *work)
{
	struct policy_dbs_info *policy_dbs;
	struct cpufreq_policy *policy;
	struct dbs_governor *gov;

	policy_dbs = container_of(work, struct policy_dbs_info, work);
	policy = policy_dbs->policy;
	gov = dbs_governor_of(policy);

	/*
	 * Make sure cpufreq_governor_limits() isn't evaluating load or the
	 * ondemand governor isn't updating the sampling rate in parallel.
	 */
	mutex_lock(&policy_dbs->timer_mutex);
	gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
	mutex_unlock(&policy_dbs->timer_mutex);

	/* Allow the utilization update handler to queue up more work. */
	atomic_set(&policy_dbs->work_count, 0);
	/*
	 * If the update below is reordered with respect to the sample delay
	 * modification, the utilization update handler may end up using a stale
	 * sample delay value.
	 */
	smp_wmb();
	policy_dbs->work_in_progress = false;
}

static void dbs_irq_work(struct irq_work *irq_work)
{
	struct policy_dbs_info *policy_dbs;

	policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
	schedule_work(&policy_dbs->work);
}

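/*
 * dbs_update_util_handler - per-CPU utilization update callback.
 *
 * Run from the CPU's utilization update hook.  If no work is in progress and
 * the sample delay has elapsed, queue an irq_work which in turn schedules
 * dbs_work_handler() to carry out the actual load evaluation.
 */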
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
				    unsigned long util, unsigned long max)
{
	struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
	struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
	u64 delta_ns;

	/*
	 * The work may not be allowed to be queued up right now.
	 * Possible reasons:
	 * - Work has already been queued up or is in progress.
	 * - It is too early (too little time from the previous sample).
	 */
	if (policy_dbs->work_in_progress)
		return;

	/*
	 * If the reads below are reordered before the check above, the value
	 * of sample_delay_ns used in the computation may be stale.
	 */
	smp_rmb();
	delta_ns = time - policy_dbs->last_sample_time;
	if ((s64)delta_ns < policy_dbs->sample_delay_ns)
		return;

	/*
	 * If the policy is not shared, the irq_work may be queued up right away
	 * at this point.  Otherwise, we need to ensure that only one of the
	 * CPUs sharing the policy will do that.
	 */
	if (policy_dbs->is_shared &&
	    !atomic_add_unless(&policy_dbs->work_count, 1, 1))
		return;

	policy_dbs->last_sample_time = time;
	policy_dbs->work_in_progress = true;
	irq_work_queue(&policy_dbs->irq_work);
}

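/* Allocate and initialize the per-policy data shared by policy->related_cpus. */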
static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
						     struct dbs_governor *gov)
{
	struct policy_dbs_info *policy_dbs;
	int j;

	/* Allocate memory for the common information for policy->cpus */
	policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL);
	if (!policy_dbs)
		return NULL;

	policy_dbs->policy = policy;
	mutex_init(&policy_dbs->timer_mutex);
	atomic_set(&policy_dbs->work_count, 0);
	init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
	INIT_WORK(&policy_dbs->work, dbs_work_handler);

	/* Set policy_dbs for all CPUs, online+offline */
	for_each_cpu(j, policy->related_cpus) {
		struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);

		j_cdbs->policy_dbs = policy_dbs;
		j_cdbs->update_util.func = dbs_update_util_handler;
	}
	return policy_dbs;
}

static void free_policy_dbs_info(struct cpufreq_policy *policy,
				 struct dbs_governor *gov)
{
	struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu);
	struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
	int j;

	mutex_destroy(&policy_dbs->timer_mutex);

	for_each_cpu(j, policy->related_cpus) {
		struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);

		j_cdbs->policy_dbs = NULL;
		j_cdbs->update_util.func = NULL;
	}
	kfree(policy_dbs);
}

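/*
 * cpufreq_governor_init - set up the governor tunables for a policy.
 *
 * Reuse the existing dbs_data when the tunables are global and already
 * allocated; otherwise allocate a new set, derive the sampling rate from the
 * transition latency and expose the tunables via a sysfs kobject.
 */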
static int cpufreq_governor_init(struct cpufreq_policy *policy)
{
	struct dbs_governor *gov = dbs_governor_of(policy);
	struct dbs_data *dbs_data = gov->gdbs_data;
	struct policy_dbs_info *policy_dbs;
	unsigned int latency;
	int ret;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	policy_dbs = alloc_policy_dbs_info(policy, gov);
	if (!policy_dbs)
		return -ENOMEM;

	if (dbs_data) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto free_policy_dbs_info;
		}
		policy_dbs->dbs_data = dbs_data;
		policy->governor_data = policy_dbs;

		mutex_lock(&dbs_data->mutex);
		dbs_data->usage_count++;
		list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
		mutex_unlock(&dbs_data->mutex);

		return 0;
	}

	dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL);
	if (!dbs_data) {
		ret = -ENOMEM;
		goto free_policy_dbs_info;
	}

	INIT_LIST_HEAD(&dbs_data->policy_dbs_list);
	mutex_init(&dbs_data->mutex);

	ret = gov->init(dbs_data, !policy->governor->initialized);
	if (ret)
		goto free_policy_dbs_info;

	/* policy latency is in ns. Convert it to us first */
	latency = policy->cpuinfo.transition_latency / 1000;
	if (latency == 0)
		latency = 1;

	/* Bring kernel and HW constraints together */
	dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
					  MIN_LATENCY_MULTIPLIER * latency);
	dbs_data->sampling_rate = max(dbs_data->min_sampling_rate,
				      LATENCY_MULTIPLIER * latency);

	if (!have_governor_per_policy())
		gov->gdbs_data = dbs_data;

	policy->governor_data = policy_dbs;

	policy_dbs->dbs_data = dbs_data;
	dbs_data->usage_count = 1;
	list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);

	gov->kobj_type.sysfs_ops = &governor_sysfs_ops;
	ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type,
				   get_governor_parent_kobj(policy),
				   "%s", gov->gov.name);
	if (!ret)
		return 0;

	/* Failure, so roll back. */
	pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret);

	policy->governor_data = NULL;

	if (!have_governor_per_policy())
		gov->gdbs_data = NULL;
	gov->exit(dbs_data, !policy->governor->initialized);
	kfree(dbs_data);

free_policy_dbs_info:
	free_policy_dbs_info(policy, gov);
	return ret;
}

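/*
 * cpufreq_governor_exit - release the governor tunables for a policy.
 *
 * Drop the policy from the dbs_data list and free the tunables once the last
 * user is gone.
 */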
static int cpufreq_governor_exit(struct cpufreq_policy *policy)
{
	struct dbs_governor *gov = dbs_governor_of(policy);
	struct policy_dbs_info *policy_dbs = policy->governor_data;
	struct dbs_data *dbs_data = policy_dbs->dbs_data;
	int count;

	mutex_lock(&dbs_data->mutex);
	list_del(&policy_dbs->list);
	count = --dbs_data->usage_count;
	mutex_unlock(&dbs_data->mutex);

	if (!count) {
		kobject_put(&dbs_data->kobj);

		policy->governor_data = NULL;

		if (!have_governor_per_policy())
			gov->gdbs_data = NULL;

		gov->exit(dbs_data, policy->governor->initialized == 1);
		mutex_destroy(&dbs_data->mutex);
		kfree(dbs_data);
	} else {
		policy->governor_data = NULL;
	}

	free_policy_dbs_info(policy, gov);
	return 0;
}

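/*
 * cpufreq_governor_start - begin load monitoring for a policy.
 *
 * Seed the per-CPU idle/wall time statistics, perform governor-specific
 * initialization and install the utilization update hooks.
 */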
static int cpufreq_governor_start(struct cpufreq_policy *policy)
{
	struct dbs_governor *gov = dbs_governor_of(policy);
	struct policy_dbs_info *policy_dbs = policy->governor_data;
	struct dbs_data *dbs_data = policy_dbs->dbs_data;
	unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu;
	int io_busy = 0;

	if (!policy->cur)
		return -EINVAL;

	policy_dbs->is_shared = policy_is_shared(policy);
	policy_dbs->rate_mult = 1;

	sampling_rate = dbs_data->sampling_rate;
	ignore_nice = dbs_data->ignore_nice_load;

	if (gov->governor == GOV_ONDEMAND) {
		struct od_dbs_tuners *od_tuners = dbs_data->tuners;

		io_busy = od_tuners->io_is_busy;
	}

	for_each_cpu(j, policy->cpus) {
		struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);
		unsigned int prev_load;

		j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);

		prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle;
		j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall;

		if (ignore_nice)
			j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
	}

	if (gov->governor == GOV_CONSERVATIVE) {
		struct cs_cpu_dbs_info_s *cs_dbs_info =
			gov->get_cpu_dbs_info_s(cpu);

		cs_dbs_info->down_skip = 0;
		cs_dbs_info->requested_freq = policy->cur;
	} else {
		struct od_ops *od_ops = gov->gov_ops;
		struct od_cpu_dbs_info_s *od_dbs_info = gov->get_cpu_dbs_info_s(cpu);

		od_dbs_info->sample_type = OD_NORMAL_SAMPLE;
		od_ops->powersave_bias_init_cpu(cpu);
	}

	gov_set_update_util(policy_dbs, sampling_rate);
	return 0;
}

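/* cpufreq_governor_stop - stop load monitoring and cancel any pending work. */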
static int cpufreq_governor_stop(struct cpufreq_policy *policy)
{
	gov_cancel_work(policy);

	return 0;
}

static int cpufreq_governor_limits(struct cpufreq_policy *policy)
{
	struct policy_dbs_info *policy_dbs = policy->governor_data;

	mutex_lock(&policy_dbs->timer_mutex);

	if (policy->max < policy->cur)
		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
	else if (policy->min > policy->cur)
		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);

	gov_update_sample_delay(policy_dbs, 0);

	mutex_unlock(&policy_dbs->timer_mutex);

	return 0;
}

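/*
 * cpufreq_governor_dbs - common governor entry point.
 *
 * Dispatch the cpufreq core's governor events to the init/exit/start/stop/
 * limits helpers above, serialized by dbs_data_mutex.
 */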
int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
{
	int ret = -EINVAL;

	/* Lock governor to block concurrent initialization of governor */
	mutex_lock(&dbs_data_mutex);

	if (event == CPUFREQ_GOV_POLICY_INIT) {
		ret = cpufreq_governor_init(policy);
	} else if (policy->governor_data) {
		switch (event) {
		case CPUFREQ_GOV_POLICY_EXIT:
			ret = cpufreq_governor_exit(policy);
			break;
		case CPUFREQ_GOV_START:
			ret = cpufreq_governor_start(policy);
			break;
		case CPUFREQ_GOV_STOP:
			ret = cpufreq_governor_stop(policy);
			break;
		case CPUFREQ_GOV_LIMITS:
			ret = cpufreq_governor_limits(policy);
			break;
		}
	}

	mutex_unlock(&dbs_data_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);