/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

void __init init_vdso_image(const struct vdso_image *image)
{
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

static int vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);
	return 0;
}

static void vdso_fix_landing(const struct vdso_image *image,
		struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	if (in_ia32_syscall() && image == &vdso_image_32) {
		struct pt_regs *regs = current_pt_regs();
		unsigned long vdso_land = image->sym_int80_landing_pad;
		unsigned long old_land_addr = vdso_land +
			(unsigned long)current->mm->context.vdso;

		/* Fix up the userspace landing address - see do_fast_syscall_32() */
		if (regs->ip == old_land_addr)
			regs->ip = new_vma->vm_start + vdso_land;
	}
#endif
}

static int vdso_mremap(const struct vm_special_mapping *sm,
		struct vm_area_struct *new_vma)
{
	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
	const struct vdso_image *image = current->mm->context.vdso_image;

	if (image->size != new_size)
		return -EINVAL;

	if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
		return -EFAULT;

	vdso_fix_landing(image, new_vma);
	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}

static int vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	long sym_offset;
	int ret = -EFAULT;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping.  This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_pvti_cpu0_va();
		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
			ret = vm_insert_pfn(
				vma,
				(unsigned long)vmf->virtual_address,
				__pa(pvti) >> PAGE_SHIFT);
		}
	}

	if (ret == 0 || ret == -EBUSY)
		return VM_FAULT_NOPAGE;

	return VM_FAULT_SIGBUS;
}

static const struct vm_special_mapping vdso_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
	.mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
	.name = "[vvar]",
	.fault = vvar_fault,
};
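
/*
 * For reference: the .name strings above are also how these VMAs show up
 * in /proc/<pid>/maps, i.e. the "[vdso]" and "[vvar]" lines.
 */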

/*
 * Add vdso and vvar mappings to current process.
 * @image          - blob to map
 * @addr           - request a specific address (zero to map at free addr)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long text_start;
	int ret = 0;

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}
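
	/*
	 * Note (added comment): image->sym_vvar_start is negative.  The
	 * vvar/pvclock pages sit just below the vdso text, so the area
	 * reserved above covers both and the text begins -sym_vvar_start
	 * bytes into it.
	 */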

	text_start = addr - image->sym_vvar_start;
	current->mm->context.vdso = (void __user *)text_start;
	current->mm->context.vdso_image = image;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &vdso_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		do_munmap(mm, text_start, image->size);
	}

up_fail:
	if (ret) {
		current->mm->context.vdso = NULL;
		current->mm->context.vdso_image = NULL;
	}

	up_write(&mm->mmap_sem);
	return ret;
}

#ifdef CONFIG_X86_64
/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top.  This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
}
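
/*
 * Worked example for vdso_addr() (hypothetical numbers, for illustration
 * only): with start_stack at 0x7ffc00001234 and len of three pages
 * (0x3000), start rounds up to 0x7ffc00002000, the candidate end rounds
 * up to the 2 MiB (PMD) boundary 0x7ffc00200000 and is then pulled back
 * by len to 0x7ffc001fd000, and the vdso start becomes a random
 * page-aligned address within [start, end].
 */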

static int map_vdso_randomized(const struct vdso_image *image)
{
	unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);

	return map_vdso(image, addr);
}
#endif

int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	/*
	 * Check if we have already mapped the vdso blob - fail to prevent
	 * userspace from abusing install_special_mapping(), which may
	 * not do accounting and rlimit right.
	 * We could search the VMA near context.vdso, but it's a slow path,
	 * so let's explicitly check all VMAs to be completely sure.
	 */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma_is_special_mapping(vma, &vdso_mapping) ||
				vma_is_special_mapping(vma, &vvar_mapping)) {
			up_write(&mm->mmap_sem);
			return -EEXIST;
		}
	}
	up_write(&mm->mmap_sem);

	return map_vdso(image, addr);
}
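
/*
 * Usage sketch (hedged - the caller lives outside this file, and the
 * ARCH_MAP_VDSO_* arch_prctl() codes are assumed to be wired up as in the
 * matching uapi headers): a checkpoint/restore tool can remap the vdso
 * blob at a chosen address with something like
 *
 *	syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
 *
 * which ends up in map_vdso_once() and fails with -EEXIST if a vdso or
 * vvar mapping is already present.
 */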

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, 0);
}
#endif

#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso_randomized(&vdso_image_64);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;
		return map_vdso_randomized(&vdso_image_x32);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (static_cpu_has(X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store cpu number in limit so that it can be loaded
	 * quickly in user space in vgetcpu. (12 bits for the CPU
	 * and 8 bits for the node)
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;		/* RO data, expand down, accessed */
	d.dpl = 3;		/* Visible to user code */
	d.s = 1;		/* Not a system segment */
	d.p = 1;		/* Present */
	d.d = 1;		/* 32-bit */

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
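
/*
 * Userspace counterpart, for reference only (a sketch assuming the
 * __getcpu() helper from asm/vgtod.h is what the vdso's getcpu code
 * uses): the segment limit written above is read back with LSL on the
 * per-CPU segment (or TSC_AUX is read with RDTSCP), roughly:
 *
 *	unsigned int p = __getcpu();			// LSL or RDTSCP
 *	unsigned int cpu  = p & VGETCPU_CPU_MASK;	// low 12 bits
 *	unsigned int node = p >> 12;			// node bits
 */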

static int vgetcpu_online(unsigned int cpu)
{
	return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
}

static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	/* notifier priority > KVM */
	return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
				 "AP_X86_VDSO_VMA_ONLINE", vgetcpu_online, NULL);
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */