/*
 * arch/x86/mm/tlb.c — x86 TLB flushing and mm context switching.
 */
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
7
#include <linux/export.h>
8
#include <linux/cpu.h>
9
#include <linux/debugfs.h>
G
Glauber Costa 已提交
10 11 12

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
13
#include <asm/nospec-branch.h>
14
#include <asm/cache.h>
T
Tejun Heo 已提交
15
#include <asm/apic.h>
T
Tejun Heo 已提交
16
#include <asm/uv/uv.h>
17

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (Its not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 *
 * Runs on the local CPU only; walks every dynamic ASID slot except
 * the one currently loaded.
 */
void clear_asid_other(void)
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

/*
 * Monotonic allocator for mm context ids.  Starts at 1 so that a ctx_id
 * of 0 can be used as a "never matches, force a flush" sentinel (see
 * clear_asid_other()).
 */
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

66

67 68 69 70 71 72 73 74 75 76 77
/*
 * Pick the ASID (PCID slot) to use for @next on this CPU and report via
 * @need_flush whether the TLB must be flushed for that slot.
 *
 * Without PCID support we always use ASID 0 and always flush.  Otherwise
 * we reuse a slot that already holds this mm (flushing only if its cached
 * tlb_gen is older than @next_tlb_gen), or round-robin allocate a fresh
 * slot, which always requires a flush.
 */
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	/* Honor a pending request to wipe the other ASID slots first. */
	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

D
Dave Hansen 已提交
104 105 106 107 108
/*
 * Load CR3 with @pgdir and PCID @new_asid.  When @need_flush is set, the
 * matching user ASID is invalidated and the flushing CR3 format
 * (build_cr3) is used, so the write also flushes this ASID's TLB
 * entries; otherwise the NOFLUSH form is used.
 */
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
	unsigned long new_mm_cr3;

	if (need_flush) {
		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(new_mm_cr3);
}

G
Glauber Costa 已提交
123 124
/*
 * Detach this CPU from its loaded user mm by switching to init_mm.
 *
 * NOTE(review): the @cpu argument is unused by this implementation;
 * presumably kept for callers' callback signatures — confirm before
 * removing.
 */
void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
G
Glauber Costa 已提交
144

145 146
/*
 * Switch this CPU to @next's address space.  Thin wrapper that simply
 * runs switch_mm_irqs_off() with interrupts disabled around the switch.
 */
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(irqflags);
}

155 156 157 158 159
/*
 * Make sure the top-level page-table entry covering the current stack
 * pointer is present in @mm, copying it from the kernel reference
 * tables (pgd_offset_k) if missing.  Used for vmalloc-based stacks,
 * which may not yet be mapped in a freshly created mm (see the
 * CONFIG_VMAP_STACK caller in switch_mm_irqs_off()).
 */
static void sync_current_stack_to_mm(struct mm_struct *mm)
{
	unsigned long sp = current_stack_pointer;
	pgd_t *pgd = pgd_offset(mm, sp);

	if (pgtable_l5_enabled) {
		/* With 5-level paging the pgd is the real top level. */
		if (unlikely(pgd_none(*pgd))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);

			set_pgd(pgd, *pgd_ref);
		}
	} else {
		/*
		 * "pgd" is faked.  The top level entries are "p4d"s, so sync
		 * the p4d.  This compiles to approximately the same code as
		 * the 5-level case.
		 */
		p4d_t *p4d = p4d_offset(pgd, sp);

		if (unlikely(p4d_none(*p4d))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);
			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);

			set_p4d(p4d, *p4d_ref);
		}
	}
}

183 184
/*
 * Core of the mm switch.  Must be called with IRQs disabled (enforced
 * under CONFIG_PROVE_LOCKING below).  Handles the lazy-TLB fast path
 * (prev == next), chooses a PCID/ASID slot for @next, applies the IBPB
 * Spectre-v2 mitigation when crossing into a non-dumpable user mm, and
 * updates the per-cpu tlbstate bookkeeping to match the new CR3.
 */
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr. Writing to CR3 provides that full
	 * memory barrier and core serializing instruction.
	 */
	if (real_prev == next) {
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		return;
	} else {
		u16 new_asid;
		bool need_flush;
		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

		/*
		 * Avoid user/user BTB poisoning by flushing the branch
		 * predictor when switching between processes. This stops
		 * one process from doing Spectre-v2 attacks on another.
		 *
		 * As an optimization, flush indirect branches only when
		 * switching into processes that disable dumping. This
		 * protects high value processes like gpg, without having
		 * too high performance overhead. IBPB is *expensive*!
		 *
		 * This will not flush branches when switching into kernel
		 * threads. It will also not flush if we switch to idle
		 * thread and back to the same process. It will flush if we
		 * switch to a different non-dumpable process.
		 */
		if (tsk && tsk->mm &&
		    tsk->mm->context.ctx_id != last_ctx_id &&
		    get_dumpable(tsk->mm) != SUID_DUMP_USER)
			indirect_branch_prediction_barrier();

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			sync_current_stack_to_mm(next);
		}

		/* Stop remote flushes for the previous mm */
		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
				real_prev != &init_mm);
		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			load_new_mm_cr3(next->pgd, new_asid, true);

			/*
			 * NB: This gets called via leave_mm() in the idle path
			 * where RCU functions differently.  Tracing normally
			 * uses RCU, so we need to use the _rcuidle variant.
			 *
			 * (There is no good reason for this.  The idle code should
			 *  be rearranged to call this before rcu_idle_enter().)
			 */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			load_new_mm_cr3(next->pgd, new_asid, false);

			/* See above wrt _rcuidle. */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		/*
		 * Record last user mm's context id, so we can avoid
		 * flushing branch buffer with IBPB if we switch back
		 * to the same user.
		 */
		if (next != &init_mm)
			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}

339
/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	/* Already on init_mm: nothing to defer or switch. */
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	if (tlb_defer_switch_to_init_mm()) {
		/*
		 * There's a significant optimization that may be possible
		 * here.  We have accurate enough TLB flush tracking that we
		 * don't need to maintain coherence of TLB per se when we're
		 * lazy.  We do, however, need to maintain coherence of
		 * paging-structure caches.  We could, in principle, leave our
		 * old mm loaded and only switch to init_mm when
		 * tlb_remove_page() happens.
		 */
		this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		switch_mm(NULL, &init_mm, NULL);
	}
}

373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this.
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm->pgd, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	/* Invalidate the bookkeeping for all other dynamic ASID slots. */
	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

418 419 420 421 422 423 424
/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (unlikely(loaded_mm == &init_mm))
		return;

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_one_user(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

540 541 542 543 544 545 546 547 548 549 550 551 552
/*
 * Run a locally-initiated TLB flush request on this CPU.  @info points
 * at a struct flush_tlb_info describing the range and generation.
 */
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	flush_tlb_func_common((const struct flush_tlb_info *)info, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

553
	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
554 555 556 557 558 559
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

560
/*
 * Ask every CPU in @cpumask to run flush_tlb_func_remote() for @info.
 * On UV systems the Broadcast Assist Unit path is used instead (see the
 * comment below for why that is problematic).
 */
void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

599 600 601 602 603 604 605 606 607 608
/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 *
 * Tunable at runtime through the debugfs file created by
 * create_tlb_single_page_flush_ceiling() below.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
610

611 612 613
/*
 * Flush the TLB entries for @mm in [start, end) on every CPU that might
 * hold them.  Falls back to a full flush for huge-page VMAs or when the
 * range exceeds tlb_single_page_flush_ceiling pages.  Bumps the mm's
 * tlb_gen first so concurrent switchers observe the new generation.
 */
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* This is also a barrier that synchronizes with switch_mm(). */
	info.new_tlb_gen = inc_mm_tlb_gen(mm);

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	/* Only send IPIs if some other CPU is in this mm's cpumask. */
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}

649

G
Glauber Costa 已提交
650 651
/* on_each_cpu() callback: account the event and flush this CPU's TLB. */
static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

/* Run do_flush_tlb_all() on every online CPU (full TLB flush everywhere). */
void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}
661

662 663 664 665 666 667
static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* flush range by one by one 'invlpg' */
668
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
669
		__flush_tlb_one_kernel(addr);
670 671 672 673 674 675
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{

	/* Balance as user space task's flush, a bit conservative */
676
	if (end == TLB_FLUSH_ALL ||
677
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
678
		on_each_cpu(do_flush_tlb_all, NULL, 1);
679 680
	} else {
		struct flush_tlb_info info;
681 682
		info.start = start;
		info.end = end;
683 684 685
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}
686

687 688
/*
 * Flush the TLB on every CPU recorded in @batch->cpumask (accumulated by
 * the unmap batching code), then reset the mask.  Always requests a full
 * flush (mm == NULL, end == TLB_FLUSH_ALL).
 */
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	/* Only send IPIs if a CPU other than us is in the batch mask. */
	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
/*
 * debugfs read handler: report tlb_single_page_flush_ceiling as a
 * decimal string.
 */
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
			     size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	/* %lu: tlb_single_page_flush_ceiling is an unsigned long. */
	len = snprintf(buf, sizeof(buf), "%lu\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

/*
 * debugfs write handler: parse a new value for
 * tlb_single_page_flush_ceiling.  Returns @count on success, -EFAULT on
 * a bad user buffer, -EINVAL on malformed input.
 */
static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	unsigned long ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';

	/*
	 * Parse directly into an unsigned long to match the tunable's
	 * type (the old int parse truncated the valid range).  kstrtoul
	 * rejects negative and malformed input, so no separate sign
	 * check is needed.
	 */
	if (kstrtoul(buf, 0, &ceiling))
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

/* debugfs file_operations for the tlb_single_page_flush_ceiling tunable. */
static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

/*
 * Expose the flush ceiling as a root read/write debugfs file under
 * arch_debugfs_dir.  Registered as a late initcall.
 */
static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);