/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * Map and lock the PTE page that covers @addr under @pmd.
 *
 * Regular protection changes (!prot_numa) run with mmap_sem held for
 * write, so the PTE lock can be taken directly.  A prot_numa update only
 * holds mmap_sem for read and may race with a fault that found the pmd
 * temporarily none, so the pmd is re-checked under its own lock first.
 *
 * Returns the mapped pte with *@ptl held, or NULL when the pmd turned out
 * to be none or transparent-huge (i.e. the race with a transhuge
 * insertion was lost); in the NULL case no PTE lock is held.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *pmd_ptl;
	pte_t *ptep = NULL;

	/* mmap_sem is held for write: nothing can change the pmd under us */
	if (!prot_numa)
		return pte_offset_map_lock(mm, pmd, addr, ptl);

	pmd_ptl = pmd_lock(mm, pmd);
	/* Only descend when the pmd really points at a page table */
	if (likely(!pmd_trans_huge(*pmd) && !pmd_none(*pmd)))
		ptep = pte_offset_map_lock(mm, pmd, addr, ptl);
	spin_unlock(pmd_ptl);

	return ptep;
}

64
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
65
		unsigned long addr, unsigned long end, pgprot_t newprot,
66
		int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
67
{
68
	struct mm_struct *mm = vma->vm_mm;
69
	pte_t *pte, oldpte;
70
	spinlock_t *ptl;
71
	unsigned long pages = 0;
72
	int target_node = NUMA_NO_NODE;
L
Linus Torvalds 已提交
73

74 75 76 77
	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

78 79 80 81 82
	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

83
	arch_enter_lazy_mmu_mode();
L
Linus Torvalds 已提交
84
	do {
85 86
		oldpte = *pte;
		if (pte_present(oldpte)) {
L
Linus Torvalds 已提交
87
			pte_t ptent;
88
			bool preserve_write = prot_numa && pte_write(oldpte);
L
Linus Torvalds 已提交
89

90 91 92 93 94 95 96 97 98 99
			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;
100 101 102 103

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;
104 105 106 107 108 109 110

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				if (target_node == page_to_nid(page))
					continue;
111 112
			}

113 114
			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
115 116
			if (preserve_write)
				ptent = pte_mkwrite(ptent);
117

118 119 120 121 122
			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
123
			}
124 125
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
126
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
127 128 129
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
130
				pte_t newpte;
131 132 133 134 135
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
136 137 138 139
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);
140 141

				pages++;
142
			}
L
Linus Torvalds 已提交
143 144
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
145
	arch_leave_lazy_mmu_mode();
146
	pte_unmap_unlock(pte - 1, ptl);
147 148

	return pages;
L
Linus Torvalds 已提交
149 150
}

151 152 153
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
154 155
{
	pmd_t *pmd;
156
	struct mm_struct *mm = vma->vm_mm;
L
Linus Torvalds 已提交
157
	unsigned long next;
158
	unsigned long pages = 0;
159
	unsigned long nr_huge_updates = 0;
160
	unsigned long mni_start = 0;
L
Linus Torvalds 已提交
161 162 163

	pmd = pmd_offset(pud, addr);
	do {
164 165
		unsigned long this_pages;

L
Linus Torvalds 已提交
166
		next = pmd_addr_end(addr, end);
167 168
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
169
			continue;
170 171 172 173 174 175 176

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

177
		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
178
			if (next - addr != HPAGE_PMD_SIZE) {
179
				__split_huge_pmd(vma, pmd, addr, false, NULL);
180
				if (pmd_trans_unstable(pmd))
181 182
					continue;
			} else {
183
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
184
						newprot, prot_numa);
185 186

				if (nr_ptes) {
187 188 189 190
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}
191 192

					/* huge pmd was handled */
193 194
					continue;
				}
195
			}
196
			/* fall through, the trans huge pmd just split */
197
		}
198
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
199
				 dirty_accountable, prot_numa);
200
		pages += this_pages;
L
Linus Torvalds 已提交
201
	} while (pmd++, addr = next, addr != end);
202

203 204 205
	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

206 207
	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
208
	return pages;
L
Linus Torvalds 已提交
209 210
}

211 212 213
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
214 215 216
{
	pud_t *pud;
	unsigned long next;
217
	unsigned long pages = 0;
L
Linus Torvalds 已提交
218 219 220 221 222 223

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
224
		pages += change_pmd_range(vma, pud, addr, next, newprot,
225
				 dirty_accountable, prot_numa);
L
Linus Torvalds 已提交
226
	} while (pud++, addr = next, addr != end);
227 228

	return pages;
L
Linus Torvalds 已提交
229 230
}

231
static unsigned long change_protection_range(struct vm_area_struct *vma,
232
		unsigned long addr, unsigned long end, pgprot_t newprot,
233
		int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
234 235 236 237 238
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
239
	unsigned long pages = 0;
L
Linus Torvalds 已提交
240 241 242 243

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
244
	set_tlb_flush_pending(mm);
L
Linus Torvalds 已提交
245 246 247 248
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
249
		pages += change_pud_range(vma, pgd, addr, next, newprot,
250
				 dirty_accountable, prot_numa);
L
Linus Torvalds 已提交
251
	} while (pgd++, addr = next, addr != end);
252

253 254 255
	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
256
	clear_tlb_flush_pending(mm);
257 258 259 260 261 262

	return pages;
}

/*
 * Change the protection of [@start, @end) in @vma to @newprot.
 *
 * hugetlb VMAs are handed to hugetlb_change_protection(), which does not
 * take @dirty_accountable or @prot_numa; everything else goes through
 * the regular page-table walk in change_protection_range().
 *
 * Returns the page count reported by whichever helper was used.
 */
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot,
						dirty_accountable, prot_numa);

	return pages;
}

275
int
L
Linus Torvalds 已提交
276 277 278 279 280 281 282 283 284
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
285
	int dirty_accountable = 0;
L
Linus Torvalds 已提交
286 287 288 289 290 291 292 293 294

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
295 296
	 * make it unwritable again. hugetlb mapping were accounted for
	 * even if read-only so there is no need to account for them here
L
Linus Torvalds 已提交
297 298
	 */
	if (newflags & VM_WRITE) {
299 300 301 302
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
303
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
304
						VM_SHARED|VM_NORESERVE))) {
L
Linus Torvalds 已提交
305
			charged = nrpages;
306
			if (security_vm_enough_memory_mm(mm, charged))
L
Linus Torvalds 已提交
307 308 309 310 311 312 313 314 315 316
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
317 318
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
L
Linus Torvalds 已提交
319 320
	if (*pprev) {
		vma = *pprev;
321
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
L
Linus Torvalds 已提交
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
345
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
346
	vma_set_page_prot(vma);
347

348 349
	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);
350

351 352 353 354 355 356 357 358 359
	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

360 361
	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
362
	perf_event_mmap(vma);
L
Linus Torvalds 已提交
363 364 365 366 367 368 369
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

370 371 372 373 374
/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
L
Linus Torvalds 已提交
375
{
376
	unsigned long nstart, end, tmp, reqprot;
L
Linus Torvalds 已提交
377 378 379
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
380 381 382
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

L
Linus Torvalds 已提交
383 384 385 386 387 388 389 390 391 392 393 394
	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
395
	if (!arch_validate_prot(prot))
L
Linus Torvalds 已提交
396 397 398 399
		return -EINVAL;

	reqprot = prot;

400 401
	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;
L
Linus Torvalds 已提交
402

403 404 405 406 407 408 409 410
	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

411
	vma = find_vma(current->mm, start);
L
Linus Torvalds 已提交
412 413 414
	error = -ENOMEM;
	if (!vma)
		goto out;
415
	prev = vma->vm_prev;
L
Linus Torvalds 已提交
416 417 418 419 420 421 422
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
423
	} else {
L
Linus Torvalds 已提交
424 425 426 427 428 429 430 431 432 433 434 435 436
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
437
		unsigned long mask_off_old_flags;
L
Linus Torvalds 已提交
438
		unsigned long newflags;
439
		int new_vma_pkey;
L
Linus Torvalds 已提交
440

441
		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
L
Linus Torvalds 已提交
442

443 444 445 446
		/* Does the application expect PROT_READ to imply PROT_EXEC */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

447 448 449 450 451 452 453 454
		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					ARCH_VM_PKEY_FLAGS;

455 456
		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
457
		newflags |= (vma->vm_flags & ~mask_off_old_flags);
L
Linus Torvalds 已提交
458

459 460
		/* newflags >> 4 shift VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
L
Linus Torvalds 已提交
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
487
		prot = reqprot;
L
Linus Torvalds 已提交
488 489 490 491 492
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}
493 494 495 496 497 498 499

/*
 * mprotect() system call entry point.
 *
 * Forwards to do_mprotect_pkey() with pkey == -1, which that function
 * treats as a legacy mprotect() without an explicit protection key.
 */
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

502 503 504 505 506
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551

/*
 * pkey_alloc() system call: allocate a protection key for the calling
 * process and give it the initial access rights in @init_val.
 *
 * Returns the new key on success, -EINVAL when @flags is non-zero or
 * @init_val has bits outside PKEY_ACCESS_MASK, -ENOSPC when
 * mm_pkey_alloc() has no key to hand out, or the error from
 * arch_set_user_pkey_access() (the key is released again in that case).
 */
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	struct mm_struct *mm = current->mm;
	int key;
	int err;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;

	/* Reject init values that carry unsupported bits. */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	down_write(&mm->mmap_sem);

	key = mm_pkey_alloc(mm);
	if (key == -1) {
		err = -ENOSPC;
		goto unlock;
	}

	err = arch_set_user_pkey_access(current, key, init_val);
	if (err) {
		/* Could not program the access rights: give the key back */
		mm_pkey_free(mm, key);
		goto unlock;
	}

	err = key;
unlock:
	up_write(&mm->mmap_sem);
	return err;
}

/*
 * pkey_free() system call: release protection key @pkey of the calling
 * process.
 *
 * Runs mm_pkey_free() with mmap_sem held for write and returns its
 * result unchanged.
 */
SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */