/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
#include <asm/mshyperv.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

void __init init_vdso_image(const struct vdso_image *image)
{
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

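/*
 * Fault handler for the [vdso] mapping: hand out pages straight from the
 * static vdso image blob.
 */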
static int vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);
	return 0;
}

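/*
 * If the 32-bit vdso is moved while a task's saved IP sits at the int80
 * landing pad (see do_fast_syscall_32()), rewrite the IP so it points into
 * the new mapping.
 */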
static void vdso_fix_landing(const struct vdso_image *image,
		struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	if (in_ia32_syscall() && image == &vdso_image_32) {
		struct pt_regs *regs = current_pt_regs();
		unsigned long vdso_land = image->sym_int80_landing_pad;
		unsigned long old_land_addr = vdso_land +
			(unsigned long)current->mm->context.vdso;

		/* Fix the userspace landing address - see do_fast_syscall_32() */
		if (regs->ip == old_land_addr)
			regs->ip = new_vma->vm_start + vdso_land;
	}
#endif
}

static int vdso_mremap(const struct vm_special_mapping *sm,
		struct vm_area_struct *new_vma)
{
	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
	const struct vdso_image *image = current->mm->context.vdso_image;

	if (image->size != new_size)
		return -EINVAL;

	vdso_fix_landing(image, new_vma);
	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}

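/*
 * Fault handler for the [vvar] mapping: depending on which symbol offset
 * faulted, insert the shared vvar page, the pvclock page or the Hyper-V
 * TSC page as a PFN mapping.
 */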
static int vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	long sym_offset;
	int ret = -EFAULT;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping.  This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		ret = vm_insert_pfn(vma, vmf->address,
				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_pvti_cpu0_va();
		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
			ret = vm_insert_pfn(
				vma,
				vmf->address,
				__pa(pvti) >> PAGE_SHIFT);
		}
	} else if (sym_offset == image->sym_hvclock_page) {
		struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();

		if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
			ret = vm_insert_pfn(vma, vmf->address,
					    vmalloc_to_pfn(tsc_pg));
	}

	if (ret == 0 || ret == -EBUSY)
		return VM_FAULT_NOPAGE;

	return VM_FAULT_SIGBUS;
}

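/* The special mappings installed by map_vdso(): [vdso] text and [vvar] data. */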
static const struct vm_special_mapping vdso_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
	.mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
	.name = "[vvar]",
	.fault = vvar_fault,
};

/*
 * Add vdso and vvar mappings to the current process.
 * @image          - blob to map
 * @addr           - request a specific address (zero means map at a free address)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long text_start;
	int ret = 0;

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &vdso_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		do_munmap(mm, text_start, image->size, NULL);
	} else {
		current->mm->context.vdso = (void __user *)text_start;
		current->mm->context.vdso_image = image;
	}

up_fail:
	up_write(&mm->mmap_sem);
	return ret;
}

#ifdef CONFIG_X86_64
/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of the address space.
 * To save memory, make sure it is still in the same PTE as the stack
 * top.  This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
}

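/* Map the vdso at a randomized address above the stack (64-bit and x32 only). */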
static int map_vdso_randomized(const struct vdso_image *image)
{
	unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);

	return map_vdso(image, addr);
}
#endif

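/*
 * Map the vdso at a caller-supplied address, but only if no vdso/vvar
 * mapping exists yet in this mm (e.g. for the ARCH_MAP_VDSO_* prctls).
 */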
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	/*
	 * Check if we have already mapped the vdso blob - fail to prevent
	 * userspace from abusing install_special_mapping(), which may not
	 * do accounting and rlimits right.
	 * We could search the VMA near context.vdso, but it's a slowpath,
	 * so let's explicitly check all VMAs to be completely sure.
	 */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma_is_special_mapping(vma, &vdso_mapping) ||
				vma_is_special_mapping(vma, &vvar_mapping)) {
			up_write(&mm->mmap_sem);
			return -EEXIST;
		}
	}
	up_write(&mm->mmap_sem);

	return map_vdso(image, addr);
}

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, 0);
}
#endif

#ifdef CONFIG_X86_64
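/* Map the 64-bit vdso into a new process at exec time. */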
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso_randomized(&vdso_image_64);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;
		return map_vdso_randomized(&vdso_image_x32);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
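/*
 * Publish the CPU and node numbers for vgetcpu: via the TSC_AUX MSR on
 * RDTSCP-capable CPUs, and via the limit field of the per-CPU GDT segment
 * so user space can recover them with LSL.
 */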
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (static_cpu_has(X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store cpu number in limit so that it can be loaded
	 * quickly in user space in vgetcpu. (12 bits for the CPU
	 * and 8 bits for the node)
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;		/* RO data, expand down, accessed */
	d.dpl = 3;		/* Visible to user code */
	d.s = 1;		/* Not a system segment */
	d.p = 1;		/* Present */
	d.d = 1;		/* 32-bit */

	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

static int vgetcpu_online(unsigned int cpu)
{
	return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
}

static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	/* notifier priority > KVM */
	return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
				 "x86/vdso/vma:online", vgetcpu_online, NULL);
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */