tlb.c 22.5 KB
Newer Older
G
Glauber Costa 已提交
1 2 3 4 5 6
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
7
#include <linux/export.h>
8
#include <linux/cpu.h>
9
#include <linux/debugfs.h>
G
Glauber Costa 已提交
10 11 12

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
13
#include <asm/nospec-branch.h>
14
#include <asm/cache.h>
T
Tejun Heo 已提交
15
#include <asm/apic.h>
T
Tejun Heo 已提交
16
#include <asm/uv/uv.h>
17

G
Glauber Costa 已提交
18
/*
19
 *	TLB flushing, formerly SMP-only
G
Glauber Costa 已提交
20 21 22 23 24 25 26 27 28
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
29
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
G
Glauber Costa 已提交
30 31
 */

32 33 34 35 36 37
/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
38
static void clear_asid_other(void)
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

64 65
/*
 * Global 64-bit atomic, initialised to 1.  This file stores 0 into the
 * per-CPU ctxs[].ctx_id slots to mark them as matching no mm, so real
 * context ids are presumably never 0.
 * NOTE(review): the code that advances this counter is not in this file;
 * confirm how ids are handed out.
 */
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

66

67 68 69 70 71 72 73 74 75 76 77
/*
 * Select the ASID slot that @next will use on this CPU.
 *
 * @next:         mm about to be loaded.
 * @next_tlb_gen: TLB generation of @next, as sampled by the caller.
 * @new_asid:     out - the chosen slot.
 * @need_flush:   out - true when the slot's TLB contents cannot be reused.
 */
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 slot;

	/* Without PCID, always use ASID 0 and always flush. */
	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*need_flush = true;
		*new_asid = 0;
		return;
	}

	/* Service a pending "invalidate the other ASIDs" request first. */
	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	/* Reuse a slot that already belongs to @next, if there is one. */
	for (slot = 0; slot < TLB_NR_DYN_ASIDS; slot++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[slot].ctx_id) ==
		    next->context.ctx_id) {
			*new_asid = slot;
			/* Only flush if the slot lags behind @next's generation. */
			*need_flush = this_cpu_read(cpu_tlbstate.ctxs[slot].tlb_gen) <
				      next_tlb_gen;
			return;
		}
	}

	/*
	 * @next does not currently own an ASID slot on this CPU, so hand
	 * out the next one, wrapping back to slot 0 once the dynamic
	 * range is exhausted.
	 */
	slot = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (slot >= TLB_NR_DYN_ASIDS) {
		slot = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}

	*new_asid = slot;
	*need_flush = true;
}

D
Dave Hansen 已提交
104 105 106 107 108
/*
 * Build the CR3 value for @pgdir / @new_asid and load it.
 *
 * @pgdir:      top-level page table to switch to.
 * @new_asid:   ASID to encode into CR3.
 * @need_flush: when true, the user ASID is invalidated and the flushing
 *              CR3 encoding is used; otherwise the no-flush encoding is.
 */
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
	unsigned long cr3;

	if (!need_flush) {
		cr3 = build_cr3_noflush(pgdir, new_asid);
	} else {
		invalidate_user_asid(new_asid);
		cr3 = build_cr3(pgdir, new_asid);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(cr3);
}

G
Glauber Costa 已提交
123 124
void leave_mm(int cpu)
{
125 126 127 128 129 130 131 132 133 134 135 136 137
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

138
	/* Warn if we're not lazy. */
139
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
140 141

	switch_mm(NULL, &init_mm, NULL);
G
Glauber Costa 已提交
142
}
143
EXPORT_SYMBOL_GPL(leave_mm);
G
Glauber Costa 已提交
144

145 146
/*
 * Interrupt-safe wrapper around switch_mm_irqs_off(): local interrupts
 * are disabled for the duration of the switch and the previous interrupt
 * state is restored afterwards.
 */
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long irq_state;

	local_irq_save(irq_state);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(irq_state);
}

155 156 157 158 159
static void sync_current_stack_to_mm(struct mm_struct *mm)
{
	unsigned long sp = current_stack_pointer;
	pgd_t *pgd = pgd_offset(mm, sp);

160
	if (pgtable_l5_enabled()) {
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
		if (unlikely(pgd_none(*pgd))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);

			set_pgd(pgd, *pgd_ref);
		}
	} else {
		/*
		 * "pgd" is faked.  The top level entries are "p4d"s, so sync
		 * the p4d.  This compiles to approximately the same code as
		 * the 5-level case.
		 */
		p4d_t *p4d = p4d_offset(pgd, sp);

		if (unlikely(p4d_none(*p4d))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);
			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);

			set_p4d(p4d, *p4d_ref);
		}
	}
}

183 184
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
185
{
186
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
187
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
188 189
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
190

191
	/*
192 193 194 195
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
196 197 198
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */
199

200 201 202 203 204 205 206
	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
207 208
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
209 210 211
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
212
	 */
213
#ifdef CONFIG_DEBUG_VM
214
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
230
	this_cpu_write(cpu_tlbstate.is_lazy, false);
231

232
	/*
233 234 235 236
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr. Writing to CR3 provides that full
	 * memory barrier and core serializing instruction.
237
	 */
238
	if (real_prev == next) {
239 240
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);
241

242
		/*
243 244 245 246 247
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
248
		 */
249 250 251 252
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

253
		return;
254
	} else {
255 256
		u16 new_asid;
		bool need_flush;
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

		/*
		 * Avoid user/user BTB poisoning by flushing the branch
		 * predictor when switching between processes. This stops
		 * one process from doing Spectre-v2 attacks on another.
		 *
		 * As an optimization, flush indirect branches only when
		 * switching into processes that disable dumping. This
		 * protects high value processes like gpg, without having
		 * too high performance overhead. IBPB is *expensive*!
		 *
		 * This will not flush branches when switching into kernel
		 * threads. It will also not flush if we switch to idle
		 * thread and back to the same process. It will flush if we
		 * switch to a different non-dumpable process.
		 */
		if (tsk && tsk->mm &&
		    tsk->mm->context.ctx_id != last_ctx_id &&
		    get_dumpable(tsk->mm) != SUID_DUMP_USER)
			indirect_branch_prediction_barrier();
278 279 280 281 282 283 284

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
285
			sync_current_stack_to_mm(next);
286
		}
287

288 289 290 291 292 293 294 295 296 297
		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}
298

299 300 301
		/*
		 * Start remote flushes and then read tlb_gen.
		 */
302 303
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
304
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
305

306
		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
307

308 309 310 311
		/* Let nmi_uaccess_okay() know that we're changing CR3. */
		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
		barrier();

312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			load_new_mm_cr3(next->pgd, new_asid, true);

			/*
			 * NB: This gets called via leave_mm() in the idle path
			 * where RCU functions differently.  Tracing normally
			 * uses RCU, so we need to use the _rcuidle variant.
			 *
			 * (There is no good reason for this.  The idle code should
			 *  be rearranged to call this before rcu_idle_enter().)
			 */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			load_new_mm_cr3(next->pgd, new_asid, false);

			/* See above wrt _rcuidle. */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}
333

334
		/*
335 336 337
		 * Record last user mm's context id, so we can avoid
		 * flushing branch buffer with IBPB if we switch back
		 * to the same user.
338
		 */
339 340
		if (next != &init_mm)
			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
341

342 343 344
		/* Make sure we write CR3 before loaded_mm. */
		barrier();

345 346
		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
347
	}
348 349

	load_mm_cr4(next);
350
	switch_ldt(real_prev, next);
351 352
}

353
/*
354 355 356
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
357 358 359 360 361 362 363 364 365 366 367 368 369 370
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

371 372 373 374 375 376 377 378 379 380 381 382 383 384
	if (tlb_defer_switch_to_init_mm()) {
		/*
		 * There's a significant optimization that may be possible
		 * here.  We have accurate enough TLB flush tracking that we
		 * don't need to maintain coherence of TLB per se when we're
		 * lazy.  We do, however, need to maintain coherence of
		 * paging-structure caches.  We could, in principle, leave our
		 * old mm loaded and only switch to init_mm when
		 * tlb_remove_page() happens.
		 */
		this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		switch_mm(NULL, &init_mm, NULL);
	}
385 386
}

387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this.
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
415
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
416 417 418
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
419
	write_cr3(build_cr3(mm->pgd, 0));
420 421

	/* Reinitialize tlbstate. */
422
	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
423 424 425 426 427 428 429 430 431
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

432 433 434 435 436 437 438
/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
439 440
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
G
Glauber Costa 已提交
441
{
442 443 444 445 446 447 448 449 450 451
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
452
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
453
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
454
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
455

456 457 458
	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

459 460 461
	if (unlikely(loaded_mm == &init_mm))
		return;

462
	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
463 464
		   loaded_mm->context.ctx_id);

465
	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
466
		/*
467 468 469 470
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
471
		 */
472
		switch_mm_irqs_off(NULL, &init_mm, NULL);
473 474
		return;
	}
G
Glauber Costa 已提交
475

476 477 478 479 480 481 482
	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
483
		trace_tlb_flush(reason, 0);
484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
515
	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
	 *    3, we'd be break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimiation.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
531
		unsigned long addr;
532
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
533

534 535
		addr = f->start;
		while (addr < f->end) {
536
			__flush_tlb_one_user(addr);
537 538
			addr += PAGE_SIZE;
		}
539 540 541
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
542 543 544 545 546 547
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
548
	}
549 550

	/* Both paths above update our state to mm_tlb_gen. */
551
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
G
Glauber Costa 已提交
552 553
}

554 555 556 557 558 559 560 561 562 563 564 565 566
/*
 * Run a flush request on the CPU that issued it.  @info points to a
 * struct flush_tlb_info; @reason is forwarded to the common handler.
 */
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	/* The void pointer converts implicitly to the const request type. */
	flush_tlb_func_common(info, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

567
	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
568 569 570 571 572 573
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

574
void native_flush_tlb_others(const struct cpumask *cpumask,
575
			     const struct flush_tlb_info *info)
576
{
577
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
578
	if (info->end == TLB_FLUSH_ALL)
579 580 581
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
582
				(info->end - info->start) >> PAGE_SHIFT);
583

584
	if (is_uv_system()) {
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
600 601
		unsigned int cpu;

602
		cpu = smp_processor_id();
603
		cpumask = uv_flush_tlb_others(cpumask, info);
T
Tejun Heo 已提交
604
		if (cpumask)
605
			smp_call_function_many(cpumask, flush_tlb_func_remote,
606
					       (void *)info, 1);
607
		return;
608
	}
609
	smp_call_function_many(cpumask, flush_tlb_func_remote,
610
			       (void *)info, 1);
G
Glauber Costa 已提交
611 612
}

613 614 615 616 617 618 619 620 621 622
/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
623
/*
 * Compared against the requested range size in flush_tlb_mm_range() and
 * flush_tlb_kernel_range(); rewritten at runtime by tlbflush_write_file().
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
624

625 626 627
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned long vmflag)
{
628
	int cpu;
629

N
Nadav Amit 已提交
630
	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
631 632
		.mm = mm,
	};
633

634
	cpu = get_cpu();
635

636
	/* This is also a barrier that synchronizes with switch_mm(). */
637
	info.new_tlb_gen = inc_mm_tlb_gen(mm);
638

639 640 641 642 643 644
	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
D
Dave Hansen 已提交
645
	} else {
646 647
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
648
	}
649

650 651 652
	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
653
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
654 655 656
		local_irq_enable();
	}

657
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
658
		flush_tlb_others(mm_cpumask(mm), &info);
659

660
	put_cpu();
G
Glauber Costa 已提交
661 662
}

663

G
Glauber Costa 已提交
664 665
static void do_flush_tlb_all(void *info)
{
666
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
G
Glauber Costa 已提交
667 668 669 670 671
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
672
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
673
	on_each_cpu(do_flush_tlb_all, NULL, 1);
G
Glauber Costa 已提交
674
}
675

676 677 678 679 680 681
static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* flush range by one by one 'invlpg' */
682
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
683
		__flush_tlb_one_kernel(addr);
684 685 686 687 688 689
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{

	/* Balance as user space task's flush, a bit conservative */
690
	if (end == TLB_FLUSH_ALL ||
691
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
692
		on_each_cpu(do_flush_tlb_all, NULL, 1);
693 694
	} else {
		struct flush_tlb_info info;
695 696
		info.start = start;
		info.end = end;
697 698 699
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}
700

701 702
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
703 704 705 706 707 708
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

709 710
	int cpu = get_cpu();

711 712 713
	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
714
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
715 716 717
		local_irq_enable();
	}

718
	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
719
		flush_tlb_others(&batch->cpumask, &info);
720

721 722 723 724 725
	cpumask_clear(&batch->cpumask);

	put_cpu();
}

726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770
/*
 * debugfs read handler: report tlb_single_page_flush_ceiling as a
 * decimal number followed by a newline.
 *
 * Returns whatever simple_read_from_buffer() returns for the formatted
 * text.
 */
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
			     size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	/*
	 * The ceiling is an unsigned long, so format it with %lu, and bound
	 * the write to the size of the on-stack buffer.
	 */
	len = scnprintf(buf, sizeof(buf), "%lu\n",
			tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

/*
 * debugfs write handler: parse a non-negative integer from user space
 * and store it in tlb_single_page_flush_ceiling.
 *
 * Returns @count on success, -EFAULT if the user buffer cannot be
 * copied, or -EINVAL if the text is not an integer or is negative.
 */
static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char kbuf[32];
	ssize_t n;
	int val;

	/* Copy at most sizeof(kbuf) - 1 bytes so there is room for the NUL. */
	n = min(count, sizeof(kbuf) - 1);
	if (copy_from_user(kbuf, user_buf, n))
		return -EFAULT;
	kbuf[n] = '\0';

	/* Base 0 lets kstrtoint() pick the radix from the prefix. */
	if (kstrtoint(kbuf, 0, &val) || val < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = val;
	return count;
}

/* File operations backing the tlb_single_page_flush_ceiling debugfs file. */
static const struct file_operations fops_tlbflush = {
	.llseek	= default_llseek,
	.read	= tlbflush_read_file,
	.write	= tlbflush_write_file,
};

/*
 * Late initcall: create the owner-read/write debugfs file
 * "tlb_single_page_flush_ceiling" under arch_debugfs_dir.  Always
 * returns 0.
 */
static int __init create_tlb_single_page_flush_ceiling(void)
{
	/* 0600 == S_IRUSR | S_IWUSR */
	debugfs_create_file("tlb_single_page_flush_ceiling", 0600,
			    arch_debugfs_dir, NULL, &fops_tlbflush);

	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);