/*
 * linux/kernel/time/clocksource.c
 *
 * This file contains the functions which manage clocksource drivers.
 *
 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * TODO WishList:
 *   o Allow clocksource drivers to be unregistered
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>

#include "tick-internal.h"
#include "timekeeping_internal.h"

/**
 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
 * @mult:	pointer to mult variable
 * @shift:	pointer to shift variable
 * @from:	frequency to convert from
 * @to:		frequency to convert to
 * @maxsec:	guaranteed runtime conversion range in seconds
 *
 * The function evaluates the shift/mult pair for the scaled math
 * operations of clocksources and clockevents.
 *
 * @to and @from are frequency values in HZ. For clock sources @to is
 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
 * event @to is the counter frequency and @from is NSEC_PER_SEC.
 *
 * The @maxsec conversion range argument controls the time frame in
 * seconds which must be covered by the runtime conversion with the
 * calculated mult and shift factors. This guarantees that no 64bit
 * overflow happens when the input value of the conversion is
 * multiplied with the calculated mult factor. Larger ranges may
 * reduce the conversion accuracy by choosing smaller mult and shift
 * factors.
 */
void
clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
{
	u64 tmp;
	u32 sft, sftacc = 32;

	/*
	 * Calculate the shift factor which is limiting the conversion
	 * range:
	 */
	tmp = ((u64)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/*
	 * Find the conversion shift/mult pair which has the best
	 * accuracy and fits the maxsec conversion range:
	 */
	for (sft = 32; sft > 0; sft--) {
		tmp = (u64) to << sft;
		tmp += from / 2;
		do_div(tmp, from);
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = tmp;
	*shift = sft;
}
EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
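
/*
 * Illustrative example (numbers not from the original source): for a
 * hypothetical 10 MHz counter converted to nanoseconds (from = 10000000,
 * to = NSEC_PER_SEC) with maxsec = 600, the loop settles on shift = 24
 * and mult = 100 << 24, so ns = (cycles * mult) >> shift = cycles * 100,
 * which is exact for a 100ns clock period.
 */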

/*[Clocksource internal variables]---------
 * curr_clocksource:
 *	currently selected clocksource.
 * suspend_clocksource:
 *	used to calculate the suspend time.
 * clocksource_list:
 *	linked list with the registered clocksources
 * clocksource_mutex:
 *	protects manipulations to curr_clocksource and the clocksource_list
 * override_name:
 *	Name of the user-specified clocksource.
 */
static struct clocksource *curr_clocksource;
static struct clocksource *suspend_clocksource;
static LIST_HEAD(clocksource_list);
static DEFINE_MUTEX(clocksource_mutex);
static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;

#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);

static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;

static inline void clocksource_watchdog_lock(unsigned long *flags)
{
	spin_lock_irqsave(&watchdog_lock, *flags);
}

static inline void clocksource_watchdog_unlock(unsigned long *flags)
{
	spin_unlock_irqrestore(&watchdog_lock, *flags);
}

/*
 * Interval: 0.5sec Threshold: 0.0625s
 */
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
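
/*
 * HZ >> 1 is half a second's worth of jiffies, so the watchdog timer
 * fires twice per second; NSEC_PER_SEC >> 4 is 62.5ms, the maximum skew
 * tolerated between a watched clocksource and the watchdog before the
 * clocksource is marked unstable.
 */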

static void __clocksource_unstable(struct clocksource *cs)
{
	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
	cs->flags |= CLOCK_SOURCE_UNSTABLE;

	/*
	 * If the clocksource is registered clocksource_watchdog_work() will
	 * re-rate and re-select.
	 */
	if (list_empty(&cs->list)) {
		cs->rating = 0;
		return;
	}

	if (cs->mark_unstable)
		cs->mark_unstable(cs);

	/* kick clocksource_watchdog_work() */
	if (finished_booting)
		schedule_work(&watchdog_work);
}

/**
 * clocksource_mark_unstable - mark clocksource unstable via watchdog
 * @cs:		clocksource to be marked unstable
 *
 * This function is called by the x86 TSC code to mark clocksources as unstable;
 * it defers demotion and re-selection to a work item.
 */
void clocksource_mark_unstable(struct clocksource *cs)
{
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
			list_add(&cs->wd_list, &watchdog_list);
		__clocksource_unstable(cs);
	}
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static void clocksource_watchdog(struct timer_list *unused)
{
	struct clocksource *cs;
	u64 csnow, wdnow, cslast, wdlast, delta;
	int64_t wd_nsec, cs_nsec;
	int next_cpu, reset_pending;

	spin_lock(&watchdog_lock);
	if (!watchdog_running)
		goto out;

	reset_pending = atomic_read(&watchdog_reset_pending);

	list_for_each_entry(cs, &watchdog_list, wd_list) {

		/* Clocksource already marked unstable? */
		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
			if (finished_booting)
				schedule_work(&watchdog_work);
			continue;
		}

		local_irq_disable();
		csnow = cs->read(cs);
		wdnow = watchdog->read(watchdog);
		local_irq_enable();

		/* Clocksource initialized ? */
		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
		    atomic_read(&watchdog_reset_pending)) {
			cs->flags |= CLOCK_SOURCE_WATCHDOG;
			cs->wd_last = wdnow;
			cs->cs_last = csnow;
			continue;
		}

		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
					     watchdog->shift);

		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
		wdlast = cs->wd_last; /* save these in case we print them */
		cslast = cs->cs_last;
		cs->cs_last = csnow;
		cs->wd_last = wdnow;

		if (atomic_read(&watchdog_reset_pending))
			continue;

		/* Check the deviation from the watchdog clocksource. */
		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
				smp_processor_id(), cs->name);
			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
				watchdog->name, wdnow, wdlast, watchdog->mask);
			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
				cs->name, csnow, cslast, cs->mask);
			__clocksource_unstable(cs);
			continue;
		}

		if (cs == curr_clocksource && cs->tick_stable)
			cs->tick_stable(cs);

		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
			/* Mark it valid for high-res. */
			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;

			/*
			 * clocksource_done_booting() will sort it if
			 * finished_booting is not set yet.
			 */
			if (!finished_booting)
				continue;

			/*
			 * If this is not the current clocksource let
			 * the watchdog thread reselect it. Due to the
			 * change to high res this clocksource might
			 * be preferred now. If it is the current
			 * clocksource let the tick code know about
			 * that change.
			 */
			if (cs != curr_clocksource) {
				cs->flags |= CLOCK_SOURCE_RESELECT;
				schedule_work(&watchdog_work);
			} else {
				tick_clock_notify();
			}
		}
	}

	/*
	 * We only clear the watchdog_reset_pending, when we did a
	 * full cycle through all clocksources.
	 */
	if (reset_pending)
		atomic_dec(&watchdog_reset_pending);

	/*
	 * Cycle through CPUs to check if the CPUs stay synchronized
	 * to each other.
	 */
	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(cpu_online_mask);
	watchdog_timer.expires += WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, next_cpu);
out:
	spin_unlock(&watchdog_lock);
}
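
/*
 * Worked example of the skew check above (illustrative numbers): if the
 * watched clocksource advanced cs_nsec = 500,250,000ns over an interval
 * in which the watchdog advanced wd_nsec = 500,000,000ns, the skew is
 * 250us, far below WATCHDOG_THRESHOLD (62.5ms), so the clocksource
 * passes; a skew above the threshold triggers __clocksource_unstable().
 */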

static inline void clocksource_start_watchdog(void)
{
	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
		return;
	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
	watchdog_running = 1;
}

static inline void clocksource_stop_watchdog(void)
{
	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
		return;
	del_timer(&watchdog_timer);
	watchdog_running = 0;
}

static inline void clocksource_reset_watchdog(void)
{
	struct clocksource *cs;

	list_for_each_entry(cs, &watchdog_list, wd_list)
		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}

static void clocksource_resume_watchdog(void)
{
	atomic_inc(&watchdog_reset_pending);
}

static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
	INIT_LIST_HEAD(&cs->wd_list);

	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
		/* cs is a clocksource to be watched. */
		list_add(&cs->wd_list, &watchdog_list);
		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
	} else {
		/* cs is a watchdog. */
		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
	}
}

static void clocksource_select_watchdog(bool fallback)
{
	struct clocksource *cs, *old_wd;
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	/* save current watchdog */
	old_wd = watchdog;
	if (fallback)
		watchdog = NULL;

	list_for_each_entry(cs, &clocksource_list, list) {
		/* cs is a clocksource to be watched. */
		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
			continue;

		/* Skip current if we were requested for a fallback. */
		if (fallback && cs == old_wd)
			continue;

		/* Pick the best watchdog. */
		if (!watchdog || cs->rating > watchdog->rating)
			watchdog = cs;
	}
	/* If we failed to find a fallback restore the old one. */
	if (!watchdog)
		watchdog = old_wd;

	/* If we changed the watchdog we need to reset cycles. */
	if (watchdog != old_wd)
		clocksource_reset_watchdog();

	/* Check if the watchdog timer needs to be started. */
	clocksource_start_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
	if (cs != watchdog) {
		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
			/* cs is a watched clocksource. */
			list_del_init(&cs->wd_list);
			/* Check if the watchdog timer needs to be stopped. */
			clocksource_stop_watchdog();
		}
	}
}

static void __clocksource_change_rating(struct clocksource *cs, int rating);

static int __clocksource_watchdog_work(void)
{
	struct clocksource *cs, *tmp;
	unsigned long flags;
	int select = 0;

	spin_lock_irqsave(&watchdog_lock, flags);
	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
			list_del_init(&cs->wd_list);
			__clocksource_change_rating(cs, 0);
			select = 1;
		}
		if (cs->flags & CLOCK_SOURCE_RESELECT) {
			cs->flags &= ~CLOCK_SOURCE_RESELECT;
			select = 1;
		}
	}
	/* Check if the watchdog timer needs to be stopped. */
	clocksource_stop_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, flags);

	return select;
}

static void clocksource_watchdog_work(struct work_struct *work)
{
	mutex_lock(&clocksource_mutex);
	if (__clocksource_watchdog_work())
		clocksource_select();
	mutex_unlock(&clocksource_mutex);
}

static bool clocksource_is_watchdog(struct clocksource *cs)
{
	return cs == watchdog;
}

#else /* CONFIG_CLOCKSOURCE_WATCHDOG */

static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}

static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_work(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }

static inline void clocksource_watchdog_lock(unsigned long *flags) { }
static inline void clocksource_watchdog_unlock(unsigned long *flags) { }

#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */

static bool clocksource_is_suspend(struct clocksource *cs)
{
	return cs == suspend_clocksource;
}

static void __clocksource_suspend_select(struct clocksource *cs)
{
	/*
	 * Skip the clocksource which will be stopped in suspend state.
	 */
	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
		return;

	/*
	 * The nonstop clocksource can be selected as the suspend clocksource to
	 * calculate the suspend time, so it should not supply suspend/resume
	 * interfaces to suspend the nonstop clocksource when system suspends.
	 */
	if (cs->suspend || cs->resume) {
		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
			cs->name);
	}

	/* Pick the best rating. */
	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
		suspend_clocksource = cs;
}

/**
 * clocksource_suspend_select - Select the best clocksource for suspend timing
 * @fallback:	whether to select a fallback clocksource
 */
static void clocksource_suspend_select(bool fallback)
{
	struct clocksource *cs, *old_suspend;

	old_suspend = suspend_clocksource;
	if (fallback)
		suspend_clocksource = NULL;

	list_for_each_entry(cs, &clocksource_list, list) {
		/* Skip current if we were requested for a fallback. */
		if (fallback && cs == old_suspend)
			continue;

		__clocksource_suspend_select(cs);
	}
}

/**
 * clocksource_start_suspend_timing - Start measuring the suspend timing
 * @cs:			current clocksource from timekeeping
 * @start_cycles:	current cycles from timekeeping
 *
 * This function will save the start cycle values of suspend timer to calculate
 * the suspend time when resuming system.
 *
 * This function is called late in the suspend process from timekeeping_suspend(),
 * which means processes are frozen, non-boot cpus and interrupts are disabled
 * now. It is therefore possible to start the suspend timer without taking the
 * clocksource mutex.
 */
void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
{
	if (!suspend_clocksource)
		return;

	/*
	 * If current clocksource is the suspend timer, we should use the
	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
	 * from suspend timer.
	 */
	if (clocksource_is_suspend(cs)) {
		suspend_start = start_cycles;
		return;
	}

	if (suspend_clocksource->enable &&
	    suspend_clocksource->enable(suspend_clocksource)) {
		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
		return;
	}

	suspend_start = suspend_clocksource->read(suspend_clocksource);
}

/**
 * clocksource_stop_suspend_timing - Stop measuring the suspend timing
 * @cs:		current clocksource from timekeeping
 * @cycle_now:	current cycles from timekeeping
 *
 * This function will calculate the suspend time from suspend timer.
 *
 * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
 *
 * This function is called early in the resume process from timekeeping_resume(),
 * which means there is only one cpu, no processes are running and the interrupts
 * are disabled. It is therefore possible to stop the suspend timer without
 * taking the clocksource mutex.
 */
u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
{
	u64 now, delta, nsec = 0;

	if (!suspend_clocksource)
		return 0;

	/*
	 * If current clocksource is the suspend timer, we should use the
	 * tkr_mono.cycle_last value from timekeeping as current cycle to
	 * avoid same reading from suspend timer.
	 */
	if (clocksource_is_suspend(cs))
		now = cycle_now;
	else
		now = suspend_clocksource->read(suspend_clocksource);

	if (now > suspend_start) {
		delta = clocksource_delta(now, suspend_start,
					  suspend_clocksource->mask);
		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
				       suspend_clocksource->shift);
	}

	/*
	 * Disable the suspend timer to save power if current clocksource is
	 * not the suspend timer.
	 */
	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
		suspend_clocksource->disable(suspend_clocksource);

	return nsec;
}
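
/*
 * Illustrative numbers for the conversion above: with a suspend
 * clocksource ticking at 32768 Hz (a typical always-on RTC-style
 * counter), a 10 minute suspend spans delta = 19,660,800 cycles, which
 * mul_u64_u32_shr() converts back to roughly 600,000,000,000ns of sleep
 * time for timekeeping to inject.
 */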

/**
 * clocksource_suspend - suspend the clocksource(s)
 */
void clocksource_suspend(void)
{
	struct clocksource *cs;

	list_for_each_entry_reverse(cs, &clocksource_list, list)
		if (cs->suspend)
			cs->suspend(cs);
}

/**
 * clocksource_resume - resume the clocksource(s)
 */
void clocksource_resume(void)
{
	struct clocksource *cs;

	list_for_each_entry(cs, &clocksource_list, list)
		if (cs->resume)
			cs->resume(cs);

	clocksource_resume_watchdog();
}

/**
 * clocksource_touch_watchdog - Update watchdog
 *
 * Update the watchdog after exception contexts such as kgdb so as not
 * to incorrectly trip the watchdog. This might fail when the kernel
 * was stopped in code which holds watchdog_lock.
 */
void clocksource_touch_watchdog(void)
{
	clocksource_resume_watchdog();
}

/**
 * clocksource_max_adjustment - Returns max adjustment amount
 * @cs:         Pointer to clocksource
 *
 */
static u32 clocksource_max_adjustment(struct clocksource *cs)
{
	u64 ret;
	/*
	 * We won't try to correct for more than 11% adjustments (110,000 ppm).
	 */
	ret = (u64)cs->mult * 11;
	do_div(ret, 100);
	return (u32)ret;
}

/**
 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
 * @mult:	cycle to nanosecond multiplier
 * @shift:	cycle to nanosecond divisor (power of two)
 * @maxadj:	maximum adjustment value to mult (~11%)
 * @mask:	bitmask for two's complement subtraction of non 64 bit counters
 * @max_cyc:	maximum cycle value before potential overflow (does not include
 *		any safety margin)
 *
 * NOTE: This function includes a safety margin of 50%, in other words, we
 * return half the number of nanoseconds the hardware counter can technically
 * cover. This is done so that we can potentially detect problems caused by
 * delayed timers or bad hardware, which might result in time intervals that
 * are larger than what the math used can handle without overflows.
 */
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
	u64 max_nsecs, max_cycles;

	/*
	 * Calculate the maximum number of cycles that we can pass to the
	 * cyc2ns() function without overflowing a 64-bit result.
	 */
	max_cycles = ULLONG_MAX;
	do_div(max_cycles, mult+maxadj);

	/*
	 * The actual maximum number of cycles we can defer the clocksource is
	 * determined by the minimum of max_cycles and mask.
	 * Note: Here we subtract the maxadj to make sure we don't sleep for
	 * too long if there's a large negative adjustment.
	 */
	max_cycles = min(max_cycles, mask);
	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);

	/* return the max_cycles value as well if requested */
	if (max_cyc)
		*max_cyc = max_cycles;

	/* Return 50% of the actual maximum, so we can detect bad values */
	max_nsecs >>= 1;

	return max_nsecs;
}
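
/*
 * Illustrative example: for mult = 1 << 22, shift = 22 (1ns per cycle),
 * maxadj ~ 11% of mult and a 32-bit mask, max_cycles is capped by the
 * mask at ~4.29e9 cycles, and the result after the maxadj and 50%
 * safety reductions is roughly 1.9 seconds of maximum idle time.
 */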

/**
 * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
 * @cs:         Pointer to clocksource to be updated
 *
 */
static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
						cs->maxadj, cs->mask,
						&cs->max_cycles);
}

#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET

static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
	struct clocksource *cs;

	if (!finished_booting || list_empty(&clocksource_list))
		return NULL;

	/*
	 * We pick the clocksource with the highest rating. If oneshot
	 * mode is active, we pick the highres valid clocksource with
	 * the best rating.
	 */
	list_for_each_entry(cs, &clocksource_list, list) {
		if (skipcur && cs == curr_clocksource)
			continue;
		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
			continue;
		return cs;
	}
	return NULL;
}

static void __clocksource_select(bool skipcur)
{
	bool oneshot = tick_oneshot_mode_active();
	struct clocksource *best, *cs;

	/* Find the best suitable clocksource */
	best = clocksource_find_best(oneshot, skipcur);
	if (!best)
		return;

	if (!strlen(override_name))
		goto found;

	/* Check for the override clocksource. */
	list_for_each_entry(cs, &clocksource_list, list) {
		if (skipcur && cs == curr_clocksource)
			continue;
		if (strcmp(cs->name, override_name) != 0)
			continue;
		/*
		 * Check to make sure we don't switch to a non-highres
		 * capable clocksource if the tick code is in oneshot
		 * mode (highres or nohz)
		 */
		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
			/* Override clocksource cannot be used. */
			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
					cs->name);
				override_name[0] = 0;
			} else {
				/*
				 * The override cannot be currently verified.
				 * Deferring to let the watchdog check.
				 */
				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
					cs->name);
			}
		} else
			/* Override clocksource can be used. */
			best = cs;
		break;
	}

found:
	if (curr_clocksource != best && !timekeeping_notify(best)) {
		pr_info("Switched to clocksource %s\n", best->name);
		curr_clocksource = best;
	}
}

/**
 * clocksource_select - Select the best clocksource available
 *
 * Private function. Must hold clocksource_mutex when called.
 *
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 */
static void clocksource_select(void)
{
	__clocksource_select(false);
}

static void clocksource_select_fallback(void)
{
	__clocksource_select(true);
}

#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
static inline void clocksource_select(void) { }
static inline void clocksource_select_fallback(void) { }

#endif

/*
 * clocksource_done_booting - Called near the end of core bootup
 *
 * Hack to avoid lots of clocksource churn at boot time.
 * We use fs_initcall because we want this to start before
 * device_initcall but after subsys_initcall.
 */
static int __init clocksource_done_booting(void)
{
	mutex_lock(&clocksource_mutex);
	curr_clocksource = clocksource_default_clock();
	finished_booting = 1;
	/*
	 * Run the watchdog first to eliminate unstable clock sources
	 */
	__clocksource_watchdog_work();
	clocksource_select();
	mutex_unlock(&clocksource_mutex);
	return 0;
}
fs_initcall(clocksource_done_booting);

/*
 * Enqueue the clocksource sorted by rating
 */
static void clocksource_enqueue(struct clocksource *cs)
{
	struct list_head *entry = &clocksource_list;
	struct clocksource *tmp;

	list_for_each_entry(tmp, &clocksource_list, list) {
		/* Keep track of the place where to insert */
		if (tmp->rating < cs->rating)
			break;
		entry = &tmp->list;
	}
	list_add(&cs->list, entry);
}
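
/*
 * Illustrative example: with sources rated 300, 250 and 200 already
 * queued, enqueueing a clocksource rated 260 walks past the 300 entry,
 * breaks at the 250 entry and inserts between them, keeping the list
 * sorted in descending rating order.
 */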

/**
 * __clocksource_update_freq_scale - Used to update the clocksource with a new freq
 * @cs:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * This should only be called from the clocksource->enable() method.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
 * functions.
 */
void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
	u64 sec;

	/*
	 * Default clocksources are *special* and self-define their mult/shift.
	 * But, you're not special, so you should specify a freq value.
	 */
	if (freq) {
		/*
		 * Calc the maximum number of seconds which we can run before
		 * wrapping around. For clocksources which have a mask > 32-bit
		 * we need to limit the max sleep time to have a good
		 * conversion precision. 10 minutes is still a reasonable
		 * amount. That results in a shift value of 24 for a
		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
		 * ~ 0.06ppm granularity for NTP.
		 */
		sec = cs->mask;
		do_div(sec, freq);
		do_div(sec, scale);
		if (!sec)
			sec = 1;
		else if (sec > 600 && cs->mask > UINT_MAX)
			sec = 600;

		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
				       NSEC_PER_SEC / scale, sec * scale);
	}
	/*
	 * Ensure clocksources that have large 'mult' values don't overflow
	 * when adjusted.
	 */
	cs->maxadj = clocksource_max_adjustment(cs);
	while (freq && ((cs->mult + cs->maxadj < cs->mult)
		|| (cs->mult - cs->maxadj > cs->mult))) {
		cs->mult >>= 1;
		cs->shift--;
		cs->maxadj = clocksource_max_adjustment(cs);
	}

	/*
	 * Only warn for *special* clocksources that self-define
	 * their mult/shift values and don't specify a freq.
	 */
	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
		cs->name);

	clocksource_update_max_deferment(cs);

	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
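
/*
 * Illustrative walk-through: a hypothetical 32-bit counter registered
 * via clocksource_register_hz() at 19.2 MHz (scale = 1) yields
 * sec = 0xffffffff / 19200000 ~= 223, so the mult/shift pair is sized
 * to convert up to ~223 seconds worth of cycles to nanoseconds without
 * 64-bit overflow.
 */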

/**
 * __clocksource_register_scale - Used to install new clocksources
 * @cs:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * Returns -EBUSY if registration fails, zero otherwise.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * clocksource_register_hz() or clocksource_register_khz() helper functions.
 */
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
	unsigned long flags;

	/* Initialize mult/shift and max_idle_ns */
	__clocksource_update_freq_scale(cs, scale, freq);

	/* Add clocksource to the clocksource list */
	mutex_lock(&clocksource_mutex);

	clocksource_watchdog_lock(&flags);
	clocksource_enqueue(cs);
	clocksource_enqueue_watchdog(cs);
	clocksource_watchdog_unlock(&flags);

	clocksource_select();
	clocksource_select_watchdog(false);
	__clocksource_suspend_select(cs);
	mutex_unlock(&clocksource_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
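
/*
 * Usage sketch (hypothetical driver, not part of this file): a driver
 * with a free-running 24-bit timer at 1 MHz would typically register it
 * through the clocksource_register_hz() helper, which lands here with
 * scale = 1:
 *
 *	static u64 foo_read(struct clocksource *cs)
 *	{
 *		return readl(foo_timer_base) & cs->mask;
 *	}
 *
 *	static struct clocksource foo_cs = {
 *		.name	= "foo-timer",
 *		.rating	= 200,
 *		.read	= foo_read,
 *		.mask	= CLOCKSOURCE_MASK(24),
 *		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 *	};
 *
 *	clocksource_register_hz(&foo_cs, 1000000);
 */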

static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
	list_del(&cs->list);
	cs->rating = rating;
	clocksource_enqueue(cs);
}

/**
 * clocksource_change_rating - Change the rating of a registered clocksource
 * @cs:		clocksource to be changed
 * @rating:	new rating
 */
void clocksource_change_rating(struct clocksource *cs, int rating)
{
	unsigned long flags;

	mutex_lock(&clocksource_mutex);
	clocksource_watchdog_lock(&flags);
	__clocksource_change_rating(cs, rating);
	clocksource_watchdog_unlock(&flags);

	clocksource_select();
	clocksource_select_watchdog(false);
	clocksource_suspend_select(false);
	mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);

/*
 * Unbind clocksource @cs. Called with clocksource_mutex held
 */
static int clocksource_unbind(struct clocksource *cs)
{
	unsigned long flags;

	if (clocksource_is_watchdog(cs)) {
		/* Select and try to install a replacement watchdog. */
		clocksource_select_watchdog(true);
		if (clocksource_is_watchdog(cs))
			return -EBUSY;
	}

	if (cs == curr_clocksource) {
		/* Select and try to install a replacement clock source */
		clocksource_select_fallback();
		if (curr_clocksource == cs)
			return -EBUSY;
	}

	if (clocksource_is_suspend(cs)) {
		/*
		 * Select and try to install a replacement suspend clocksource.
		 * If no replacement suspend clocksource, we will just let the
		 * clocksource go and have no suspend clocksource.
		 */
		clocksource_suspend_select(true);
	}

	clocksource_watchdog_lock(&flags);
	clocksource_dequeue_watchdog(cs);
	list_del_init(&cs->list);
	clocksource_watchdog_unlock(&flags);

	return 0;
}

/**
 * clocksource_unregister - remove a registered clocksource
 * @cs:	clocksource to be unregistered
 */
int clocksource_unregister(struct clocksource *cs)
{
	int ret = 0;

	mutex_lock(&clocksource_mutex);
	if (!list_empty(&cs->list))
		ret = clocksource_unbind(cs);
	mutex_unlock(&clocksource_mutex);
	return ret;
}
EXPORT_SYMBOL(clocksource_unregister);

#ifdef CONFIG_SYSFS
/**
 * current_clocksource_show - sysfs interface for current clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	char buffer to be filled with clocksource list
 *
 * Provides sysfs interface for listing current clocksource.
 */
static ssize_t current_clocksource_show(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	ssize_t count = 0;

	mutex_lock(&clocksource_mutex);
	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
	mutex_unlock(&clocksource_mutex);

	return count;
}

ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
{
	size_t ret = cnt;

	/* strings from sysfs write are not 0 terminated! */
	if (!cnt || cnt >= CS_NAME_LEN)
		return -EINVAL;

	/* strip off \n: */
	if (buf[cnt-1] == '\n')
		cnt--;
	if (cnt > 0)
		memcpy(dst, buf, cnt);
	dst[cnt] = 0;
	return ret;
}

/**
 * current_clocksource_store - interface for manually overriding clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	name of override clocksource
 * @count:	length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
 * clocksource selection.
 */
static ssize_t current_clocksource_store(struct device *dev,
					 struct device_attribute *attr,
					 const char *buf, size_t count)
{
	ssize_t ret;

	mutex_lock(&clocksource_mutex);

	ret = sysfs_get_uname(buf, override_name, count);
	if (ret >= 0)
		clocksource_select();

	mutex_unlock(&clocksource_mutex);

	return ret;
}
static DEVICE_ATTR_RW(current_clocksource);
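
/*
 * From userspace the override is exercised through sysfs, e.g.:
 *
 *	cat /sys/devices/system/clocksource/clocksource0/current_clocksource
 *	echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource
 *
 * ("hpet" is only an example; any registered clocksource name accepted
 * by sysfs_get_uname() works.)
 */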

/**
 * unbind_clocksource_store - interface for manually unbinding clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	name of clocksource to be unbound
 * @count:	length of buffer
 *
 * Takes input from sysfs interface for manually unbinding a clocksource.
 */
static ssize_t unbind_clocksource_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	struct clocksource *cs;
	char name[CS_NAME_LEN];
	ssize_t ret;

	ret = sysfs_get_uname(buf, name, count);
	if (ret < 0)
		return ret;

	ret = -ENODEV;
	mutex_lock(&clocksource_mutex);
	list_for_each_entry(cs, &clocksource_list, list) {
		if (strcmp(cs->name, name))
			continue;
		ret = clocksource_unbind(cs);
		break;
	}
	mutex_unlock(&clocksource_mutex);

	return ret ? ret : count;
}
static DEVICE_ATTR_WO(unbind_clocksource);

/**
 * available_clocksource_show - sysfs interface for listing clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	char buffer to be filled with clocksource list
 *
 * Provides sysfs interface for listing registered clocksources
 */
static ssize_t available_clocksource_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct clocksource *src;
	ssize_t count = 0;

	mutex_lock(&clocksource_mutex);
	list_for_each_entry(src, &clocksource_list, list) {
		/*
		 * Don't show non-HRES clocksource if the tick code is
		 * in one shot mode (highres=on or nohz=on)
		 */
		if (!tick_oneshot_mode_active() ||
		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
			count += snprintf(buf + count,
				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
				  "%s ", src->name);
	}
	mutex_unlock(&clocksource_mutex);

	count += snprintf(buf + count,
			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");

	return count;
}
static DEVICE_ATTR_RO(available_clocksource);

static struct attribute *clocksource_attrs[] = {
	&dev_attr_current_clocksource.attr,
	&dev_attr_unbind_clocksource.attr,
	&dev_attr_available_clocksource.attr,
	NULL
};
ATTRIBUTE_GROUPS(clocksource);

static struct bus_type clocksource_subsys = {
	.name = "clocksource",
	.dev_name = "clocksource",
};

static struct device device_clocksource = {
	.id	= 0,
	.bus	= &clocksource_subsys,
	.groups	= clocksource_groups,
};

static int __init init_clocksource_sysfs(void)
{
	int error = subsys_system_register(&clocksource_subsys, NULL);

	if (!error)
		error = device_register(&device_clocksource);

	return error;
}

device_initcall(init_clocksource_sysfs);
#endif /* CONFIG_SYSFS */

/**
 * boot_override_clocksource - boot clock override
 * @str:	override name
 *
 * Takes a clocksource= boot argument and uses it
 * as the clocksource override name.
 */
static int __init boot_override_clocksource(char* str)
{
	mutex_lock(&clocksource_mutex);
	if (str)
		strlcpy(override_name, str, sizeof(override_name));
	mutex_unlock(&clocksource_mutex);
	return 1;
}

__setup("clocksource=", boot_override_clocksource);

/**
 * boot_override_clock - Compatibility layer for deprecated boot option
 * @str:	override name
 *
 * DEPRECATED! Takes a clock= boot argument and uses it
 * as the clocksource override name
 */
static int __init boot_override_clock(char* str)
{
	if (!strcmp(str, "pmtmr")) {
		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
		return boot_override_clocksource("acpi_pm");
	}
	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
	return boot_override_clocksource(str);
}

__setup("clock=", boot_override_clock);