tlb.c 12.1 KB
Newer Older
G
Glauber Costa 已提交
1 2 3 4 5 6
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
T
Tejun Heo 已提交
7
#include <linux/module.h>
8
#include <linux/cpu.h>
G
Glauber Costa 已提交
9 10 11

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
12
#include <asm/cache.h>
T
Tejun Heo 已提交
13
#include <asm/apic.h>
T
Tejun Heo 已提交
14
#include <asm/uv/uv.h>
15
#include <linux/debugfs.h>
16

17 18 19
/*
 * Per-CPU TLB state, initialised positionally with &init_mm and 0.
 * The rest of this file reads it as cpu_tlbstate.active_mm and
 * cpu_tlbstate.state; presumably the two initialisers map to those
 * members in that order -- confirm against struct tlb_state.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
			= { &init_mm, 0, };

G
Glauber Costa 已提交
20 21 22 23 24 25 26 27 28 29 30 31 32 33
/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	To avoid global state use 8 different call vectors.
 *	Each CPU uses a specific vector to trigger flushes on other
 *	CPUs. Depending on the received vector the target CPUs look into
 *	the right array slot for the flush data.
 *
 *	With more than 8 CPUs they are hashed to the 8 available
 *	vectors. The limited global vector space forces us to this right now.
 *	In future when interrupts are split into per CPU domains this could be
 *	fixed, at the cost of triggering multiple IPIs in some cases.
 */

union smp_flush_state {
	struct {
		struct mm_struct *flush_mm;
45 46
		unsigned long flush_start;
		unsigned long flush_end;
47
		raw_spinlock_t tlbstate_lock;
48
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
G
Glauber Costa 已提交
49
	};
50
	char pad[INTERNODE_CACHE_BYTES];
51
} ____cacheline_internodealigned_in_smp;
G
Glauber Costa 已提交
52 53 54 55

/* State is kept in a static array with one slot per invalidate vector.
   Each slot is padded to a full internode cache line because other CPUs
   can access it and we don't want false sharing between slots. */
56
/* Flush request slots, indexed by (vector - INVALIDATE_TLB_VECTOR_START). */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
G
Glauber Costa 已提交
57

58 59
/*
 * Per-CPU index into flush_state[] that this CPU uses when it sends a
 * flush request; assigned by calculate_tlb_offset().
 */
static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);

G
Glauber Costa 已提交
60 61 62 63 64 65
/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
66
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
67
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
G
Glauber Costa 已提交
68
		BUG();
69 70 71 72
	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
	}
G
Glauber Costa 已提交
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
}
EXPORT_SYMBOL_GPL(leave_mm);

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm. This is not synchronized with
 *	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 *	for the wrong mm, and in the worst case we perform a superfluous
 *	tlb flush.
 * 1a2) set cpu mmu_state to TLBSTATE_OK
 *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu mmu_state to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu mmu_state is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * Interrupts are disabled.
 */

T
Tejun Heo 已提交
124 125 126 127 128 129 130 131 132 133 134
/*
 * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
 * but still used for documentation purpose but the usage is slightly
 * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
 * entry calls in with the first parameter in %eax.  Maybe define
 * intrlinkage?
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void smp_invalidate_interrupt(struct pt_regs *regs)
G
Glauber Costa 已提交
135
{
T
Tejun Heo 已提交
136 137
	unsigned int cpu;
	unsigned int sender;
G
Glauber Costa 已提交
138 139 140 141 142 143 144 145
	union smp_flush_state *f;

	cpu = smp_processor_id();
	/*
	 * orig_rax contains the negated interrupt vector.
	 * Use that to determine where the sender put the data.
	 */
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
146
	f = &flush_state[sender];
G
Glauber Costa 已提交
147

148
	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
G
Glauber Costa 已提交
149 150 151 152 153 154 155 156 157 158
		goto out;
		/*
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * its staying as a return
		 *
		 * BUG();
		 */

159 160
	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
161 162
			if (f->flush_end == TLB_FLUSH_ALL
					|| !cpu_has_invlpg)
G
Glauber Costa 已提交
163
				local_flush_tlb();
164 165 166 167 168 169 170 171 172 173
			else if (!f->flush_end)
				__flush_tlb_single(f->flush_start);
			else {
				unsigned long addr;
				addr = f->flush_start;
				while (addr < f->flush_end) {
					__flush_tlb_single(addr);
					addr += PAGE_SIZE;
				}
			}
G
Glauber Costa 已提交
174 175 176 177 178
		} else
			leave_mm(cpu);
	}
out:
	ack_APIC_irq();
T
Tejun Heo 已提交
179
	smp_mb__before_clear_bit();
180
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
T
Tejun Heo 已提交
181
	smp_mb__after_clear_bit();
182
	inc_irq_stat(irq_tlb_count);
G
Glauber Costa 已提交
183 184
}

185
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
186 187
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
G
Glauber Costa 已提交
188
{
T
Tejun Heo 已提交
189
	unsigned int sender;
G
Glauber Costa 已提交
190
	union smp_flush_state *f;
191

G
Glauber Costa 已提交
192
	/* Caller has disabled preemption */
193
	sender = this_cpu_read(tlb_vector_offset);
194
	f = &flush_state[sender];
G
Glauber Costa 已提交
195

196 197
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_lock(&f->tlbstate_lock);
G
Glauber Costa 已提交
198 199

	f->flush_mm = mm;
200 201
	f->flush_start = start;
	f->flush_end = end;
202 203 204 205 206 207 208
	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
		/*
		 * We have to send the IPI only to
		 * CPUs affected.
		 */
		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
			      INVALIDATE_TLB_VECTOR_START + sender);
G
Glauber Costa 已提交
209

210 211 212
		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
			cpu_relax();
	}
G
Glauber Costa 已提交
213 214

	f->flush_mm = NULL;
215 216
	f->flush_start = 0;
	f->flush_end = 0;
217 218
	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
		raw_spin_unlock(&f->tlbstate_lock);
G
Glauber Costa 已提交
219 220
}

221
/*
 * Flush @mm over [@start, @end) on the CPUs in @cpumask.
 *
 * On a UV system uv_flush_tlb_others() is tried first; it returns the
 * mask that still has to be flushed by IPI, or NULL when nothing is
 * left to do.  Everything else goes straight to the IPI path.
 */
void native_flush_tlb_others(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	if (is_uv_system()) {
		cpumask = uv_flush_tlb_others(cpumask, mm, start, end,
					      smp_processor_id());
		if (!cpumask)
			return;
	}

	flush_tlb_others_ipi(cpumask, mm, start, end);
}

237 238
/*
 * Assign every CPU its flush_state[] slot (tlb_vector_offset).
 *
 * The vectors are divided evenly between the online nodes and the CPUs
 * of a node are spread round-robin over that node's share.
 */
static void __cpuinit calculate_tlb_offset(void)
{
	int cpu, node;
	int vecs_per_node;
	int node_idx = 0;

	/*
	 * we are changing tlb_vector_offset for each CPU in runtime, but this
	 * will not cause inconsistency, as the write is atomic under X86. we
	 * might see more lock contentions in a short time, but after all CPU's
	 * tlb_vector_offset are changed, everything should go normal
	 *
	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
	 * waste some vectors.
	 **/
	vecs_per_node = nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS ?
			1 : NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for_each_online_node(node) {
		int base = (node_idx % NUM_INVALIDATE_TLB_VECTORS) *
			   vecs_per_node;
		int next = 0;

		for_each_cpu(cpu, cpumask_of_node(node)) {
			per_cpu(tlb_vector_offset, cpu) = base + next;
			next = (next + 1) % vecs_per_node;
		}
		node_idx++;
	}
}

268
/*
 * CPU hotplug callback: recompute the per-CPU vector offsets whenever a
 * CPU has come online or has died.  Only the low four bits of @action
 * are compared; any other event is ignored.  Always returns NOTIFY_OK.
 */
static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
		unsigned long action, void *hcpu)
{
	unsigned long event = action & 0xf;

	if (event == CPU_ONLINE || event == CPU_DEAD)
		calculate_tlb_offset();

	return NOTIFY_OK;
}

I
Ingo Molnar 已提交
279
/*
 * Boot-time setup of the flush machinery: initialise the lock of every
 * flush_state[] slot, compute the initial per-CPU vector offsets and
 * register the hotplug callback that keeps them up to date.
 */
static int __cpuinit init_smp_flush(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
		raw_spin_lock_init(&flush_state[i].tlbstate_lock);

	calculate_tlb_offset();
	hotcpu_notifier(tlb_cpuhp_notify, 0);

	return 0;
}
core_initcall(init_smp_flush);

void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;

	preempt_disable();

	local_flush_tlb();
299
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
G
Glauber Costa 已提交
301 302 303
	preempt_enable();
}

304 305 306 307
/*
 * It can find out the THP large page, or
 * HUGETLB page in tlb_flush when THP disabled
 */
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
/*
 * Walk @mm's page tables in HPAGE_SIZE steps, starting at @start rounded
 * up to HPAGE_SIZE and stopping before @end.
 *
 * Returns the first address whose pmd entry is present and large, or 0
 * when no such entry is found.
 */
static inline unsigned long has_large_page(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	unsigned long addr = ALIGN(start, HPAGE_SIZE);

	while (addr < end) {
		pgd_t *pgd = pgd_offset(mm, addr);
		pud_t *pud;
		pmd_t *pmd;

		if (likely(!pgd_none(*pgd))) {
			pud = pud_offset(pgd, addr);
			if (likely(!pud_none(*pud))) {
				pmd = pmd_offset(pud, addr);
				if (likely(!pmd_none(*pmd)) &&
				    pmd_large(*pmd))
					return addr;
			}
		}
		addr += HPAGE_SIZE;
	}

	return 0;
}
329

330 331 332 333 334
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned long vmflag)
{
	unsigned long addr;
	unsigned act_entries, tlb_entries = 0;
335 336

	preempt_disable();
337 338
	if (current->active_mm != mm)
		goto flush_all;
339

340 341 342 343
	if (!current->mm) {
		leave_mm(smp_processor_id());
		goto flush_all;
	}
G
Glauber Costa 已提交
344

345 346 347 348 349
	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
					|| vmflag == VM_HUGETLB) {
		local_flush_tlb();
		goto flush_all;
	}
350

351 352 353 354 355 356 357 358 359 360 361 362 363 364 365
	/* In modern CPU, last level tlb used for both data/ins */
	if (vmflag & VM_EXEC)
		tlb_entries = tlb_lli_4k[ENTRIES];
	else
		tlb_entries = tlb_lld_4k[ENTRIES];
	/* Assume all of TLB entries was occupied by this task */
	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;

	/* tlb_flushall_shift is on balance point, details in commit log */
	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
		local_flush_tlb();
	else {
		if (has_large_page(mm, start, end)) {
			local_flush_tlb();
			goto flush_all;
366
		}
367 368 369 370 371 372 373 374 375
		/* flush range by one by one 'invlpg' */
		for (addr = start; addr < end;	addr += PAGE_SIZE)
			__flush_tlb_single(addr);

		if (cpumask_any_but(mm_cpumask(mm),
				smp_processor_id()) < nr_cpu_ids)
			flush_tlb_others(mm_cpumask(mm), mm, start, end);
		preempt_enable();
		return;
376
	}
377 378

flush_all:
379 380
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
G
Glauber Costa 已提交
381 382 383
	preempt_enable();
}

384
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
G
Glauber Costa 已提交
385 386 387 388 389 390 391
{
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
392
			__flush_tlb_one(start);
G
Glauber Costa 已提交
393 394 395 396
		else
			leave_mm(smp_processor_id());
	}

397
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
398
		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
G
Glauber Costa 已提交
399 400 401 402 403 404 405

	preempt_enable();
}

static void do_flush_tlb_all(void *info)
{
	__flush_tlb_all();
406
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
407
		leave_mm(smp_processor_id());
G
Glauber Costa 已提交
408 409 410 411
}

void flush_tlb_all(void)
{
412
	on_each_cpu(do_flush_tlb_all, NULL, 1);
G
Glauber Costa 已提交
413
}
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463

#ifdef CONFIG_DEBUG_TLBFLUSH
/*
 * debugfs read handler: report the current tlb_flushall_shift as a
 * decimal number followed by a newline.
 */
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
			     size_t count, loff_t *ppos)
{
	char text[32];
	unsigned int text_len = sprintf(text, "%hd\n", tlb_flushall_shift);

	return simple_read_from_buffer(user_buf, count, ppos, text, text_len);
}

/*
 * debugfs write handler: parse a signed integer from user space and
 * store it in tlb_flushall_shift.
 *
 * Accepted values are -1 and every valid shift count for an unsigned
 * int.  flush_tlb_mm_range() treats -1 specially and otherwise uses the
 * value to right-shift an unsigned int, so a smaller negative number or
 * a count at or beyond that type's width would be an undefined shift.
 *
 * Returns @count on success, -EFAULT if the user buffer cannot be read
 * and -EINVAL if the input is not a number or is out of range.
 */
static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	s8 shift;

	/* Leave room for the terminating NUL. */
	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtos8(buf, 0, &shift))
		return -EINVAL;

	/* The cast keeps the comparison signed so that -1 is not rejected. */
	if (shift < -1 || shift >= (int)(8 * sizeof(unsigned int)))
		return -EINVAL;

	tlb_flushall_shift = shift;
	return count;
}

/* File operations backing the "tlb_flushall_shift" debugfs entry. */
static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

/*
 * Create the owner-read/write "tlb_flushall_shift" file under
 * arch_debugfs_dir.  The file is only created when the CPU has invlpg.
 * Always returns 0.
 */
static int __cpuinit create_tlb_flushall_shift(void)
{
	if (!cpu_has_invlpg)
		return 0;

	debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_flushall_shift);
#endif