/*
 * check TSC synchronization.
 *
 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
 *
 * We check whether all boot CPUs have their TSCs synchronized,
 * print a warning if not and turn off the TSC clock-source.
 *
 * The warp-check is point-to-point between two CPUs, the CPU
 * initiating the bootup is the 'source CPU', the freshly booting
 * CPU is the 'target CPU'.
 *
 * Only two CPUs may participate - they can enter in any order.
 * ( The serial nature of the boot logic and the CPU hotplug lock
 *   protects against more than 2 CPUs entering this code. )
 */
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>

struct tsc_adjust {
25 26 27 28
	s64		bootval;
	s64		adjusted;
	unsigned long	nextcheck;
	bool		warned;
29 30 31 32
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);

33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
/*
 * Verify that the TSC_ADJUST MSR of the current CPU still holds the value
 * recorded in the per-CPU state and write the recorded value back if it
 * does not. The MSR is read at most once per HZ jiffies on each CPU and a
 * deviation is only reported while the per-CPU 'warned' flag is clear.
 */
void tsc_verify_tsc_adjust(void)
{
	struct tsc_adjust *state = this_cpu_ptr(&tsc_adjust);
	s64 msr_val;

	/* Nothing to do without the MSR or while the rate limit is active */
	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
	    time_before(jiffies, state->nextcheck))
		return;

	state->nextcheck = jiffies + HZ;

	rdmsrl(MSR_IA32_TSC_ADJUST, msr_val);
	if (msr_val != state->adjusted) {
		/* Put the expected value back */
		wrmsrl(MSR_IA32_TSC_ADJUST, state->adjusted);

		if (!state->warned) {
			pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
				smp_processor_id(), state->adjusted, msr_val);
			state->warned = true;
		}
	}
}

#ifndef CONFIG_SMP
/*
 * Variant built when CONFIG_SMP is not set: record the boot value of the
 * TSC_ADJUST MSR, if the CPU provides it, as both the boot value and the
 * expected value, and arm the rate limit for the periodic verification.
 *
 * Always returns false.
 */
bool __init tsc_store_and_check_tsc_adjust(void)
{
	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
	s64 bootval;

	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		return false;

	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
	cur->bootval = bootval;
	cur->adjusted = bootval;
	cur->nextcheck = jiffies + HZ;
	pr_info("TSC ADJUST: Boot CPU0: %lld\n", bootval);
	return false;
}

#else /* !CONFIG_SMP */

/*
 * Store and check the TSC ADJUST MSR if available
 */
83
bool tsc_store_and_check_tsc_adjust(void)
84 85 86
{
	struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
	unsigned int refcpu, cpu = smp_processor_id();
87
	struct cpumask *mask;
88 89 90
	s64 bootval;

	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
91
		return false;
92 93 94

	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
	cur->bootval = bootval;
95 96
	cur->nextcheck = jiffies + HZ;
	cur->warned = false;
97 98 99 100 101

	/*
	 * Check whether this CPU is the first in a package to come up. In
	 * this case do not check the boot value against another package
	 * because the package might have been physically hotplugged, where
102 103
	 * TSC_ADJUST is expected to be different. When called on the boot
	 * CPU topology_core_cpumask() might not be available yet.
104
	 */
105 106
	mask = topology_core_cpumask(cpu);
	refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
107 108 109 110 111 112 113 114 115 116

	if (refcpu >= nr_cpu_ids) {
		/*
		 * First online CPU in a package stores the boot value in
		 * the adjustment value. This value might change later via
		 * the sync mechanism. If that fails we still can yell
		 * about boot values not being consistent.
		 */
		cur->adjusted = bootval;
		pr_info_once("TSC ADJUST: Boot CPU%u: %lld\n", cpu,  bootval);
117
		return false;
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
	}

	ref = per_cpu_ptr(&tsc_adjust, refcpu);
	/*
	 * Compare the boot value and complain if it differs in the
	 * package.
	 */
	if (bootval != ref->bootval) {
		pr_warn("TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
			refcpu, ref->bootval, cpu, bootval);
	}
	/*
	 * The TSC_ADJUST values in a package must be the same. If the boot
	 * value on this newly upcoming CPU differs from the adjustment
	 * value of the already online CPU in this package, set it to that
	 * adjusted value.
	 */
	if (bootval != ref->adjusted) {
		pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
			refcpu, ref->adjusted, cpu, bootval);
		cur->adjusted = ref->adjusted;
		wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
	}
141 142 143 144 145
	/*
	 * We have the TSCs forced to be in sync on this package. Skip sync
	 * test:
	 */
	return true;
146 147
}

T
Thomas Gleixner 已提交
148 149 150 151
/*
 * Entry/exit counters that make sure that both CPUs
 * run the measurement code at once:
 */
152 153
static atomic_t start_count;
static atomic_t stop_count;
154
static atomic_t skip_test;
155
static atomic_t test_runs;
T
Thomas Gleixner 已提交
156 157 158 159 160 161

/*
 * We use a raw spinlock in this exceptional case, because
 * we want to have the fastest, inlined, non-debug version
 * of a critical section, to be able to prove TSC time-warps:
 */
162
static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
163

164 165 166
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
T
Thomas Gleixner 已提交
167
static int random_warps;
T
Thomas Gleixner 已提交
168 169

/*
170 171
 * TSC-warp measurement loop running on both CPUs.  This is not called
 * if there is no TSC.
T
Thomas Gleixner 已提交
172
 */
173
static cycles_t check_tsc_warp(unsigned int timeout)
T
Thomas Gleixner 已提交
174
{
175
	cycles_t start, now, prev, end, cur_max_warp = 0;
T
Thomas Gleixner 已提交
176
	int i, cur_warps = 0;
T
Thomas Gleixner 已提交
177

178
	start = rdtsc_ordered();
T
Thomas Gleixner 已提交
179
	/*
180
	 * The measurement runs for 'timeout' msecs:
T
Thomas Gleixner 已提交
181
	 */
182
	end = start + (cycles_t) tsc_khz * timeout;
T
Thomas Gleixner 已提交
183 184 185 186 187 188 189 190
	now = start;

	for (i = 0; ; i++) {
		/*
		 * We take the global lock, measure TSC, save the
		 * previous TSC that was measured (possibly on
		 * another CPU) and update the previous TSC timestamp.
		 */
191
		arch_spin_lock(&sync_lock);
T
Thomas Gleixner 已提交
192
		prev = last_tsc;
193
		now = rdtsc_ordered();
T
Thomas Gleixner 已提交
194
		last_tsc = now;
195
		arch_spin_unlock(&sync_lock);
T
Thomas Gleixner 已提交
196 197 198

		/*
		 * Be nice every now and then (and also check whether
I
Ingo Molnar 已提交
199
		 * measurement is done [we also insert a 10 million
T
Thomas Gleixner 已提交
200 201 202 203
		 * loops safety exit, so we dont lock up in case the
		 * TSC readout is totally broken]):
		 */
		if (unlikely(!(i & 7))) {
I
Ingo Molnar 已提交
204
			if (now > end || i > 10000000)
T
Thomas Gleixner 已提交
205 206 207 208 209 210 211 212 213
				break;
			cpu_relax();
			touch_nmi_watchdog();
		}
		/*
		 * Outside the critical section we can now see whether
		 * we saw a time-warp of the TSC going backwards:
		 */
		if (unlikely(prev > now)) {
214
			arch_spin_lock(&sync_lock);
T
Thomas Gleixner 已提交
215
			max_warp = max(max_warp, prev - now);
216
			cur_max_warp = max_warp;
T
Thomas Gleixner 已提交
217 218 219 220 221 222
			/*
			 * Check whether this bounces back and forth. Only
			 * one CPU should observe time going backwards.
			 */
			if (cur_warps != nr_warps)
				random_warps++;
T
Thomas Gleixner 已提交
223
			nr_warps++;
T
Thomas Gleixner 已提交
224
			cur_warps = nr_warps;
225
			arch_spin_unlock(&sync_lock);
T
Thomas Gleixner 已提交
226
		}
227
	}
228 229
	WARN(!(now-start),
		"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
230
			now-start, end-start);
231
	return cur_max_warp;
T
Thomas Gleixner 已提交
232 233
}

234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
/*
 * If the target CPU coming online doesn't have any of its core-siblings
 * online, a timeout of 20msec will be used for the TSC-warp measurement
 * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
 * information about this socket already (and this information grows as we
 * have more and more logical-siblings in that socket).
 *
 * Ideally we should be able to skip the TSC sync check on the other
 * core-siblings, if the first logical CPU in a socket passed the sync test.
 * But as the TSC is per-logical CPU and can potentially be modified wrongly
 * by the bios, a TSC sync test of smaller duration should be able
 * to catch such errors. Also this will catch the condition where all the
 * cores in the socket don't get reset at the same time.
 *
 * Returns the measurement timeout in msecs for @cpu.
 */
static inline unsigned int loop_timeout(int cpu)
{
	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}

/*
 * Source CPU calls into this - it waits for the freshly booted
 * target CPU to arrive and then starts the measurement:
 */
257
void check_tsc_sync_source(int cpu)
T
Thomas Gleixner 已提交
258 259 260 261 262
{
	int cpus = 2;

	/*
	 * No need to check if we already know that the TSC is not
263
	 * synchronized or if we have no TSC.
T
Thomas Gleixner 已提交
264 265 266 267
	 */
	if (unsynchronized_tsc())
		return;

268
	if (tsc_clocksource_reliable) {
269 270 271
		if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
			pr_info(
			"Skipped synchronization checks as TSC is reliable.\n");
272 273 274
		return;
	}

275 276 277 278 279 280 281 282 283 284
	/*
	 * Set the maximum number of test runs to
	 *  1 if the CPU does not provide the TSC_ADJUST MSR
	 *  3 if the MSR is available, so the target can try to adjust
	 */
	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		atomic_set(&test_runs, 1);
	else
		atomic_set(&test_runs, 3);
retry:
T
Thomas Gleixner 已提交
285
	/*
286
	 * Wait for the target to start or to skip the test:
T
Thomas Gleixner 已提交
287
	 */
288 289 290 291 292
	while (atomic_read(&start_count) != cpus - 1) {
		if (atomic_read(&skip_test) > 0) {
			atomic_set(&skip_test, 0);
			return;
		}
T
Thomas Gleixner 已提交
293
		cpu_relax();
294 295
	}

T
Thomas Gleixner 已提交
296 297 298 299 300
	/*
	 * Trigger the target to continue into the measurement too:
	 */
	atomic_inc(&start_count);

301
	check_tsc_warp(loop_timeout(cpu));
T
Thomas Gleixner 已提交
302 303 304 305

	while (atomic_read(&stop_count) != cpus-1)
		cpu_relax();

306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
	/*
	 * If the test was successful set the number of runs to zero and
	 * stop. If not, decrement the number of runs an check if we can
	 * retry. In case of random warps no retry is attempted.
	 */
	if (!nr_warps) {
		atomic_set(&test_runs, 0);

		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
			smp_processor_id(), cpu);

	} else if (atomic_dec_and_test(&test_runs) || random_warps) {
		/* Force it to 0 if random warps brought us here */
		atomic_set(&test_runs, 0);

321 322
		pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
			smp_processor_id(), cpu);
323 324
		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
			   "turning off TSC clock.\n", max_warp);
T
Thomas Gleixner 已提交
325 326
		if (random_warps)
			pr_warning("TSC warped randomly between CPUs\n");
T
Thomas Gleixner 已提交
327 328 329
		mark_tsc_unstable("check_tsc_sync_source failed");
	}

330 331 332 333
	/*
	 * Reset it - just in case we boot another CPU later:
	 */
	atomic_set(&start_count, 0);
T
Thomas Gleixner 已提交
334
	random_warps = 0;
335 336 337 338
	nr_warps = 0;
	max_warp = 0;
	last_tsc = 0;

T
Thomas Gleixner 已提交
339 340 341 342
	/*
	 * Let the target continue with the bootup:
	 */
	atomic_inc(&stop_count);
343 344 345 346 347 348

	/*
	 * Retry, if there is a chance to do so.
	 */
	if (atomic_read(&test_runs) > 0)
		goto retry;
T
Thomas Gleixner 已提交
349 350 351 352 353
}

/*
 * Freshly booted CPUs call into this:
 */
354
void check_tsc_sync_target(void)
T
Thomas Gleixner 已提交
355
{
356 357 358
	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
	unsigned int cpu = smp_processor_id();
	cycles_t cur_max_warp, gbl_max_warp;
T
Thomas Gleixner 已提交
359 360
	int cpus = 2;

361
	/* Also aborts if there is no TSC. */
362
	if (unsynchronized_tsc() || tsc_clocksource_reliable)
T
Thomas Gleixner 已提交
363 364
		return;

365 366 367 368 369 370 371 372
	/*
	 * Store, verify and sanitize the TSC adjust register. If
	 * successful skip the test.
	 */
	if (tsc_store_and_check_tsc_adjust()) {
		atomic_inc(&skip_test);
		return;
	}
373

374
retry:
T
Thomas Gleixner 已提交
375 376 377 378 379 380 381 382
	/*
	 * Register this CPU's participation and wait for the
	 * source CPU to start the measurement:
	 */
	atomic_inc(&start_count);
	while (atomic_read(&start_count) != cpus)
		cpu_relax();

383 384 385 386 387 388
	cur_max_warp = check_tsc_warp(loop_timeout(cpu));

	/*
	 * Store the maximum observed warp value for a potential retry:
	 */
	gbl_max_warp = max_warp;
T
Thomas Gleixner 已提交
389 390 391 392 393 394 395 396 397 398 399

	/*
	 * Ok, we are done:
	 */
	atomic_inc(&stop_count);

	/*
	 * Wait for the source CPU to print stuff:
	 */
	while (atomic_read(&stop_count) != cpus)
		cpu_relax();
400 401 402 403 404

	/*
	 * Reset it for the next sync test:
	 */
	atomic_set(&stop_count, 0);
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440

	/*
	 * Check the number of remaining test runs. If not zero, the test
	 * failed and a retry with adjusted TSC is possible. If zero the
	 * test was either successful or failed terminally.
	 */
	if (!atomic_read(&test_runs))
		return;

	/*
	 * If the warp value of this CPU is 0, then the other CPU
	 * observed time going backwards so this TSC was ahead and
	 * needs to move backwards.
	 */
	if (!cur_max_warp)
		cur_max_warp = -gbl_max_warp;

	/*
	 * Add the result to the previous adjustment value.
	 *
	 * The adjustement value is slightly off by the overhead of the
	 * sync mechanism (observed values are ~200 TSC cycles), but this
	 * really depends on CPU, node distance and frequency. So
	 * compensating for this is hard to get right. Experiments show
	 * that the warp is not longer detectable when the observed warp
	 * value is used. In the worst case the adjustment needs to go
	 * through a 3rd run for fine tuning.
	 */
	cur->adjusted += cur_max_warp;

	pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
		cpu, cur_max_warp, cur->adjusted);

	wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
	goto retry;

T
Thomas Gleixner 已提交
441
}
442 443

#endif /* CONFIG_SMP */