mmu_context.h 9.2 KB
Newer Older
H
H. Peter Anvin 已提交
1 2
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
J
Jeremy Fitzhardinge 已提交
3 4

#include <asm/desc.h>
A
Arun Sharma 已提交
5
#include <linux/atomic.h>
6 7 8 9
#include <linux/mm_types.h>

#include <trace/events/tlb.h>

J
Jeremy Fitzhardinge 已提交
10 11 12
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
13
#include <asm/mpx.h>
J
Jeremy Fitzhardinge 已提交
14 15 16 17 18 19 20
#ifndef CONFIG_PARAVIRT
/*
 * Empty stand-in compiled when CONFIG_PARAVIRT is not set, so callers
 * can invoke paravirt_activate_mm() unconditionally.
 */
static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
}
#endif	/* !CONFIG_PARAVIRT */

21
#ifdef CONFIG_PERF_EVENTS
22 23
extern struct static_key rdpmc_always_available;

24 25
static inline void load_mm_cr4(struct mm_struct *mm)
{
26
	if (static_key_false(&rdpmc_always_available) ||
27
	    atomic_read(&mm->context.perf_rdpmc_allowed))
28 29 30 31 32 33 34 35
		cr4_set_bits(X86_CR4_PCE);
	else
		cr4_clear_bits(X86_CR4_PCE);
}
#else
/* No-op variant compiled when CONFIG_PERF_EVENTS is not set. */
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif

36
#ifdef CONFIG_MODIFY_LDT_SYSCALL
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
/* LDT description reachable through mm->context.ldt. */
struct ldt_struct {
	/*
	 * Xen requires page-aligned LDTs with special permissions.  This is
	 * needed to prevent us from installing evil descriptors such as
	 * call gates.  On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
	struct desc_struct *entries;
	/*
	 * Passed together with @entries to set_ldt().
	 * NOTE(review): presumably a count of entries, not bytes -- confirm.
	 */
	int size;
};

52 53 54 55 56 57 58 59 60 61 62 63 64 65
/*
 * Used for LDT copy/destruction.
 */
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
#else	/* CONFIG_MODIFY_LDT_SYSCALL */
/*
 * Variant compiled when CONFIG_MODIFY_LDT_SYSCALL is not set: does
 * nothing and always returns 0.
 */
static inline int init_new_context(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	return 0;
}
/* No-op variant compiled when CONFIG_MODIFY_LDT_SYSCALL is not set. */
static inline void destroy_context(struct mm_struct *mm) {}
#endif

66 67
/*
 * Load the LDT that belongs to @mm.
 *
 * With CONFIG_MODIFY_LDT_SYSCALL, mm->context.ldt is installed through
 * set_ldt() when it is non-NULL and the LDT is cleared otherwise.
 * Without that option the LDT is always cleared.  In both cases a
 * DEBUG_LOCKS_WARN_ON() fires if the caller is preemptible.
 */
static inline void load_mm_ldt(struct mm_struct *mm)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	struct ldt_struct *ldt;

	/* lockless_dereference synchronizes with smp_store_release */
	ldt = lockless_dereference(mm->context.ldt);

	/*
	 * Any change to mm->context.ldt is followed by an IPI to all
	 * CPUs with the mm active.  The LDT will not be freed until
	 * after the IPI is handled by all such CPUs.  This means that,
	 * if the ldt_struct changes before we return, the values we see
	 * will be safe, and the new values will be loaded before we run
	 * any user code.
	 *
	 * NB: don't try to convert this to use RCU without extreme care.
	 * We would still need IRQs off, because we don't want to change
	 * the local LDT after an IPI loaded a newer value than the one
	 * that we can see.
	 */

	if (unlikely(ldt))
		set_ldt(ldt->entries, ldt->size);
	else
		clear_LDT();
#else
	clear_LDT();
#endif

	DEBUG_LOCKS_WARN_ON(preemptible());
}

B
Brian Gerst 已提交
99 100 101
/*
 * On CONFIG_SMP, move this CPU's cpu_tlbstate.state from TLBSTATE_OK to
 * TLBSTATE_LAZY; any other state is left as it is.  Without CONFIG_SMP
 * this is a no-op.  Neither @mm nor @tsk is used.
 */
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
#ifdef CONFIG_SMP
114 115
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
116
#endif
117
		cpumask_set_cpu(cpu, mm_cpumask(next));
B
Brian Gerst 已提交
118

119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
		/*
		 * Re-load page tables.
		 *
		 * This logic has an ordering constraint:
		 *
		 *  CPU 0: Write to a PTE for 'next'
		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
		 *  CPU 1: set bit 1 in next's mm_cpumask
		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
		 *
		 * We need to prevent an outcome in which CPU 1 observes
		 * the new PTE value and CPU 0 observes bit 1 clear in
		 * mm_cpumask.  (If that occurs, then the IPI will never
		 * be sent, and CPU 0's TLB will contain a stale entry.)
		 *
		 * The bad outcome can occur if either CPU's load is
135
		 * reordered before that CPU's store, so both CPUs must
136 137 138 139
		 * execute full barriers to prevent this from happening.
		 *
		 * Thus, switch_mm needs a full barrier between the
		 * store to mm_cpumask and any operation that could load
140 141 142 143 144
		 * from next->pgd.  TLB fills are special and can happen
		 * due to instruction fetches or for no reason at all,
		 * and neither LOCK nor MFENCE orders them.
		 * Fortunately, load_cr3() is serializing and gives the
		 * ordering guarantee we need.
145 146
		 *
		 */
B
Brian Gerst 已提交
147
		load_cr3(next->pgd);
148

149
		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
B
Brian Gerst 已提交
150

151
		/* Stop flush ipis for the previous mm */
152 153
		cpumask_clear_cpu(cpu, mm_cpumask(prev));

154 155 156
		/* Load per-mm CR4 state */
		load_mm_cr4(next);

157
#ifdef CONFIG_MODIFY_LDT_SYSCALL
158 159 160
		/*
		 * Load the LDT, if the LDT is different.
		 *
161 162 163 164 165
		 * It's possible that prev->context.ldt doesn't match
		 * the LDT register.  This can happen if leave_mm(prev)
		 * was called and then modify_ldt changed
		 * prev->context.ldt but suppressed an IPI to this CPU.
		 * In this case, prev->context.ldt != NULL, because we
166 167 168
		 * never set context.ldt to NULL while the mm still
		 * exists.  That means that next->context.ldt !=
		 * prev->context.ldt, because mms never share an LDT.
169
		 */
B
Brian Gerst 已提交
170
		if (unlikely(prev->context.ldt != next->context.ldt))
171
			load_mm_ldt(next);
172
#endif
B
Brian Gerst 已提交
173 174
	}
#ifdef CONFIG_SMP
175
	  else {
176 177
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
B
Brian Gerst 已提交
178

179 180 181 182 183 184 185 186
		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * On established mms, the mm_cpumask is only changed
			 * from irq context, from ptep_clear_flush() while in
			 * lazy tlb mode, and here. Irqs are blocked during
			 * schedule, protecting us from simultaneous changes.
			 */
			cpumask_set_cpu(cpu, mm_cpumask(next));
187

188 189
			/*
			 * We were in lazy tlb mode and leave_mm disabled
B
Brian Gerst 已提交
190 191
			 * tlb flush IPI delivery. We must reload CR3
			 * to make sure to use no freed page tables.
192
			 *
193 194
			 * As above, load_cr3() is serializing and orders TLB
			 * fills with respect to the mm_cpumask write.
B
Brian Gerst 已提交
195 196
			 */
			load_cr3(next->pgd);
197
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
198
			load_mm_cr4(next);
199
			load_mm_ldt(next);
B
Brian Gerst 已提交
200 201 202 203
		}
	}
#endif
}
J
Jeremy Fitzhardinge 已提交
204 205 206 207 208 209 210

/*
 * Activate @next as the current address space: run the paravirt hook,
 * then switch_mm() with a NULL task.
 *
 * The do/while (0) deliberately has no trailing semicolon so that the
 * macro behaves like a single statement; the caller supplies the ';'
 * (this keeps "if (x) activate_mm(a, b); else ..." valid).
 */
#define activate_mm(prev, next)			\
do {						\
	paravirt_activate_mm((prev), (next));	\
	switch_mm((prev), (next), NULL);	\
} while (0)

B
Brian Gerst 已提交
211 212 213
#ifdef CONFIG_X86_32
/* CONFIG_X86_32 variant: loads 0 into GS through lazy_load_gs(). */
#define deactivate_mm(tsk, mm)			\
do {						\
	lazy_load_gs(0);			\
} while (0)
#else
/*
 * Non-CONFIG_X86_32 variant: loads 0 as the GS index via load_gs_index()
 * and 0 into FS via loadsegment().  Neither macro argument is used.
 */
#define deactivate_mm(tsk, mm)			\
do {						\
	load_gs_index(0);			\
	loadsegment(fs, 0);			\
} while (0)
#endif
J
Jeremy Fitzhardinge 已提交
223

224 225 226 227 228 229 230 231 232 233 234
/* Forwards both mms unchanged to paravirt_arch_dup_mmap(). */
static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	paravirt_arch_dup_mmap(oldmm, mm);
}

/* Forwards @mm unchanged to paravirt_arch_exit_mmap(). */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
	paravirt_arch_exit_mmap(mm);
}

235 236 237 238 239 240 241 242 243 244 245 246 247
#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return	!config_enabled(CONFIG_IA32_EMULATION) ||
		!(mm->context.ia32_compat == TIF_IA32);
}
#else
/* Variant compiled when CONFIG_X86_64 is not set: always returns false. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return false;
}
#endif

248 249 250 251 252 253
/* Calls mpx_mm_init() on @mm; @vma is not used. */
static inline void arch_bprm_mm_init(struct mm_struct *mm,
		struct vm_area_struct *vma)
{
	mpx_mm_init(mm);
}

254 255 256
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
	/*
	 * mpx_notify_unmap() goes and reads a rarely-hot
	 * cacheline in the mm_struct.  That can be expensive
	 * enough to be seen in profiles.
	 *
	 * The mpx_notify_unmap() call and its contents have been
	 * observed to affect munmap() performance on hardware
	 * where MPX is not present.
	 *
	 * The unlikely() optimizes for the fast case: no MPX
	 * in the CPU, or no MPX use in the process.  Even if
	 * we get this wrong (in the unlikely event that MPX
	 * is widely enabled on some system) the overhead of
	 * MPX itself (reading bounds tables) is expected to
	 * overwhelm the overhead of getting this unlikely()
	 * consistently wrong.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
		mpx_notify_unmap(mm, vma, start, end);
276 277
}

278 279 280 281 282 283 284 285 286 287 288
/*
 * Extract the protection key stored in @vma's vm_flags.  Returns 0 when
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS is not set.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	const unsigned long pkey_bits = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
					VM_PKEY_BIT2 | VM_PKEY_BIT3;
	/* Keep the u16 intermediate so the result is narrowed as before. */
	u16 key = (vma->vm_flags & pkey_bits) >> VM_PKEY_SHIFT;

	return key;
#else
	return 0;
#endif
}

289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337
/*
 * Check the value returned by read_pkru() against @pkey: read access
 * must be allowed, and when @write is set write access must be allowed
 * as well.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
	const u32 cur_pkru = read_pkru();

	/* Write permission is consulted only after the read check passes. */
	return __pkru_allows_read(cur_pkru, pkey) &&
	       (!write || __pkru_allows_write(cur_pkru, pkey));
}

/*
 * We only want to enforce protection keys on the current process
 * because we effectively have no access to PKRU for other
 * processes or any way to tell *which* PKRU in a threaded
 * process we could use.
 *
 * So do not enforce things if the VMA is not from the current
 * mm, or if we are in a kernel thread.
 */
/*
 * Return true when @vma should not have PKRU enforced on it: either
 * current has no mm, or the VMA belongs to an mm other than current's.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
	/*
	 * A task without an mm, or a VMA owned by a different mm, means
	 * PKRU has no relevance to this access.
	 */
	return !current->mm || current->mm != vma->vm_mm;
}

/*
 * Decide whether an access (a write when @write is set) to @vma is
 * permitted.  VMAs that vma_is_foreign() reports as foreign are always
 * allowed; otherwise the VMA's pkey is checked via __pkru_allows_pkey().
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
{
	return vma_is_foreign(vma) ||
	       __pkru_allows_pkey(vma_pkey(vma), write);
}

/*
 * Decide whether an access (a write when @write is set) through @pte is
 * permitted, by checking the pkey derived from the PTE's flags with
 * __pkru_allows_pkey().
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	/* Same narrowing to u16 that the callee's parameter applies. */
	u16 key = pte_flags_pkey(pte_flags(pte));

	return __pkru_allows_pkey(key, write);
}

H
H. Peter Anvin 已提交
338
#endif /* _ASM_X86_MMU_CONTEXT_H */