tlb.c 11.2 KB
Newer Older
G
Glauber Costa 已提交
1 2 3 4 5 6
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
T
Tejun Heo 已提交
7
#include <linux/module.h>
8
#include <linux/cpu.h>
G
Glauber Costa 已提交
9 10 11

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
12
#include <asm/cache.h>
T
Tejun Heo 已提交
13
#include <asm/apic.h>
T
Tejun Heo 已提交
14
#include <asm/uv/uv.h>
15

16 17 18
/* Per-CPU TLB state: initially the active mm is init_mm and state is 0. */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
	.active_mm	= &init_mm,
	.state		= 0,
};

G
Glauber Costa 已提交
19 20 21 22 23 24 25 26 27 28 29 30 31 32
/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	To avoid global state use 8 different call vectors.
 *	Each CPU uses a specific vector to trigger flushes on other
 *	CPUs. Depending on the received vector the target CPUs look into
33
 *	the right array slot for the flush data.
G
Glauber Costa 已提交
34 35 36 37 38 39 40 41 42 43
 *
 *	With more than 8 CPUs they are hashed to the 8 available
 *	vectors. The limited global vector space forces us to this right now.
 *	In future when interrupts are split into per CPU domains this could be
 *	fixed, at the cost of triggering multiple IPIs in some cases.
 */

union smp_flush_state {
	struct {
		struct mm_struct *flush_mm;
44 45
		unsigned long flush_start;
		unsigned long flush_end;
46
		raw_spinlock_t tlbstate_lock;
47
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
G
Glauber Costa 已提交
48
	};
49
	char pad[INTERNODE_CACHE_BYTES];
50
} ____cacheline_internodealigned_in_smp;
G
Glauber Costa 已提交
51 52 53 54

/* There is one flush-state slot per invalidate vector. Each slot is
   padded to a full cache line because other CPUs can access it and we
   don't want false sharing between slots. */
55
/* Indexed by (vector - INVALIDATE_TLB_VECTOR_START), see smp_invalidate_interrupt(). */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
G
Glauber Costa 已提交
56

57 58
/*
 * Per-CPU index of the flush_state[] slot (and offset from
 * INVALIDATE_TLB_VECTOR_START) this CPU uses when it sends flush IPIs.
 * Written by calculate_tlb_offset().
 */
static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);

G
Glauber Costa 已提交
59 60 61 62 63 64
/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
65
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
66
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
G
Glauber Costa 已提交
67
		BUG();
68 69 70 71
	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
	}
G
Glauber Costa 已提交
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
}
EXPORT_SYMBOL_GPL(leave_mm);

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm. This is not synchronized with
 *	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 *	for the wrong mm, and in the worst case we perform a superfluous
 *	tlb flush.
 * 1a2) set cpu mmu_state to TLBSTATE_OK
 *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu mmu_state to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu mmu_state is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * Interrupts are disabled.
 */

T
Tejun Heo 已提交
123 124 125 126 127 128 129 130 131 132 133
/*
 * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
 * but still used for documentation purpose but the usage is slightly
 * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
 * entry calls in with the first parameter in %eax.  Maybe define
 * intrlinkage?
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void smp_invalidate_interrupt(struct pt_regs *regs)
G
Glauber Costa 已提交
134
{
T
Tejun Heo 已提交
135 136
	unsigned int cpu;
	unsigned int sender;
G
Glauber Costa 已提交
137 138 139 140 141 142 143 144
	union smp_flush_state *f;

	cpu = smp_processor_id();
	/*
	 * orig_rax contains the negated interrupt vector.
	 * Use that to determine where the sender put the data.
	 */
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
145
	f = &flush_state[sender];
G
Glauber Costa 已提交
146

147
	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
G
Glauber Costa 已提交
148 149 150 151 152 153 154 155 156 157
		goto out;
		/*
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * its staying as a return
		 *
		 * BUG();
		 */

158 159
	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
160 161
			if (f->flush_end == TLB_FLUSH_ALL
					|| !cpu_has_invlpg)
G
Glauber Costa 已提交
162
				local_flush_tlb();
163 164 165 166 167 168 169 170 171 172
			else if (!f->flush_end)
				__flush_tlb_single(f->flush_start);
			else {
				unsigned long addr;
				addr = f->flush_start;
				while (addr < f->flush_end) {
					__flush_tlb_single(addr);
					addr += PAGE_SIZE;
				}
			}
G
Glauber Costa 已提交
173 174 175 176 177
		} else
			leave_mm(cpu);
	}
out:
	ack_APIC_irq();
T
Tejun Heo 已提交
178
	smp_mb__before_clear_bit();
179
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
T
Tejun Heo 已提交
180
	smp_mb__after_clear_bit();
181
	inc_irq_stat(irq_tlb_count);
G
Glauber Costa 已提交
182 183
}

184
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
185 186
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
G
Glauber Costa 已提交
187
{
T
Tejun Heo 已提交
188
	unsigned int sender;
G
Glauber Costa 已提交
189
	union smp_flush_state *f;
190

G
Glauber Costa 已提交
191
	/* Caller has disabled preemption */
192
	sender = this_cpu_read(tlb_vector_offset);
193
	f = &flush_state[sender];
G
Glauber Costa 已提交
194

195 196
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_lock(&f->tlbstate_lock);
G
Glauber Costa 已提交
197 198

	f->flush_mm = mm;
199 200
	f->flush_start = start;
	f->flush_end = end;
201 202 203 204 205 206 207
	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
		/*
		 * We have to send the IPI only to
		 * CPUs affected.
		 */
		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
			      INVALIDATE_TLB_VECTOR_START + sender);
G
Glauber Costa 已提交
208

209 210 211
		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
			cpu_relax();
	}
G
Glauber Costa 已提交
212 213

	f->flush_mm = NULL;
214 215
	f->flush_start = 0;
	f->flush_end = 0;
216 217
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_unlock(&f->tlbstate_lock);
G
Glauber Costa 已提交
218 219
}

220
/*
 * Flush TLB entries of @mm on the CPUs in @cpumask.
 *
 * On a UV system uv_flush_tlb_others() is tried first; it returns the
 * mask of CPUs that still need an IPI-based flush, or NULL when nothing
 * is left to do.  Everything else goes through flush_tlb_others_ipi().
 */
void native_flush_tlb_others(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	if (is_uv_system()) {
		unsigned int cpu = smp_processor_id();

		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
		if (!cpumask)
			return;
	}

	flush_tlb_others_ipi(cpumask, mm, start, end);
}

236 237
/*
 * Assign every CPU of every online node its tlb_vector_offset.
 *
 * The vectors are divided evenly between the online nodes (one vector per
 * node when there are more nodes than vectors) and the CPUs of a node are
 * spread round-robin over that node's share.
 */
static void __cpuinit calculate_tlb_offset(void)
{
	int cpu, node, vecs_per_node;
	int node_idx = 0;

	/*
	 * we are changing tlb_vector_offset for each CPU in runtime, but this
	 * will not cause inconsistency, as the write is atomic under X86. we
	 * might see more lock contentions in a short time, but after all CPU's
	 * tlb_vector_offset are changed, everything should go normal
	 *
	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
	 * waste some vectors.
	 */
	vecs_per_node = nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS ?
			1 : NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for_each_online_node(node) {
		int base = (node_idx % NUM_INVALIDATE_TLB_VECTORS) *
			vecs_per_node;
		int slot = 0;

		for_each_cpu(cpu, cpumask_of_node(node)) {
			per_cpu(tlb_vector_offset, cpu) = base + slot;
			slot = (slot + 1) % vecs_per_node;
		}
		node_idx++;
	}
}

267
/*
 * CPU hotplug callback: recompute the per-CPU vector offsets whenever a
 * CPU has come online or has died.  Always returns NOTIFY_OK.
 */
static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
		unsigned long action, void *hcpu)
{
	/* NOTE(review): the 0xf mask presumably strips the *_FROZEN flag - confirm */
	switch (action & 0xf) {
	case CPU_ONLINE:
	case CPU_DEAD:
		calculate_tlb_offset();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

I
Ingo Molnar 已提交
278
/*
 * Boot-time setup: initialize the lock of every flush_state[] slot,
 * compute the initial per-CPU vector offsets and register the hotplug
 * callback that keeps them up to date.  Always returns 0.
 */
static int __cpuinit init_smp_flush(void)
{
	unsigned int slot;

	for (slot = 0; slot < ARRAY_SIZE(flush_state); slot++)
		raw_spin_lock_init(&flush_state[slot].tlbstate_lock);

	calculate_tlb_offset();
	hotcpu_notifier(tlb_cpuhp_notify, 0);
	return 0;
}
core_initcall(init_smp_flush);

void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;

	preempt_disable();

	local_flush_tlb();
298
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
299
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
G
Glauber Costa 已提交
300 301 302 303 304 305 306 307 308 309 310 311 312
	preempt_enable();
}

void flush_tlb_mm(struct mm_struct *mm)
{
	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
			local_flush_tlb();
		else
			leave_mm(smp_processor_id());
	}
313
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
314 315 316 317 318
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);

	preempt_enable();
}

319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Step through the addresses ALIGN(start, HPAGE_SIZE), +HPAGE_SIZE, ...
 * below @end and return the first one whose pgd, pud and pmd entries are
 * all present and whose pmd is a large mapping.  Returns 0 when there is
 * none.
 */
static inline unsigned long has_large_page(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	unsigned long addr = ALIGN(start, HPAGE_SIZE);

	while (addr < end) {
		pgd_t *pgd = pgd_offset(mm, addr);

		if (likely(!pgd_none(*pgd))) {
			pud_t *pud = pud_offset(pgd, addr);

			if (likely(!pud_none(*pud))) {
				pmd_t *pmd = pmd_offset(pud, addr);

				if (likely(!pmd_none(*pmd)) && pmd_large(*pmd))
					return addr;
			}
		}
		addr += HPAGE_SIZE;
	}
	return 0;
}
#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE no large page is ever reported. */
static inline unsigned long has_large_page(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	return 0;
}
#endif
348 349 350 351 352
void flush_tlb_range(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
{
	struct mm_struct *mm;

353
	if (vma->vm_flags & VM_HUGETLB || tlb_flushall_shift == -1) {
354
flush_all:
355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
		flush_tlb_mm(vma->vm_mm);
		return;
	}

	preempt_disable();
	mm = vma->vm_mm;
	if (current->active_mm == mm) {
		if (current->mm) {
			unsigned long addr, vmflag = vma->vm_flags;
			unsigned act_entries, tlb_entries = 0;

			if (vmflag & VM_EXEC)
				tlb_entries = tlb_lli_4k[ENTRIES];
			else
				tlb_entries = tlb_lld_4k[ENTRIES];

			act_entries = tlb_entries > mm->total_vm ?
					mm->total_vm : tlb_entries;
G
Glauber Costa 已提交
373

374 375
			if ((end - start) >> PAGE_SHIFT >
					act_entries >> tlb_flushall_shift)
376 377
				local_flush_tlb();
			else {
378 379 380 381
				if (has_large_page(mm, start, end)) {
					preempt_enable();
					goto flush_all;
				}
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
				for (addr = start; addr < end;
						addr += PAGE_SIZE)
					__flush_tlb_single(addr);

				if (cpumask_any_but(mm_cpumask(mm),
					smp_processor_id()) < nr_cpu_ids)
					flush_tlb_others(mm_cpumask(mm), mm,
								start, end);
				preempt_enable();
				return;
			}
		} else {
			leave_mm(smp_processor_id());
		}
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
G
Glauber Costa 已提交
399 400 401
	preempt_enable();
}

402 403

void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
G
Glauber Costa 已提交
404 405 406 407 408 409 410
{
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
411
			__flush_tlb_one(start);
G
Glauber Costa 已提交
412 413 414 415
		else
			leave_mm(smp_processor_id());
	}

416
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
417
		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
G
Glauber Costa 已提交
418 419 420 421 422 423 424

	preempt_enable();
}

static void do_flush_tlb_all(void *info)
{
	__flush_tlb_all();
425
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
426
		leave_mm(smp_processor_id());
G
Glauber Costa 已提交
427 428 429 430
}

void flush_tlb_all(void)
{
431
	on_each_cpu(do_flush_tlb_all, NULL, 1);
G
Glauber Costa 已提交
432
}