/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

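/*
 * Walk the ptes mapped by one pmd entry and rewrite their protection
 * bits to @newprot.  Returns the number of ptes actually updated so
 * the caller can decide whether a TLB flush is needed.
 */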
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;

	/*
	 * Can be called with only the mmap_sem held for reading by
	 * prot_numa, so we must check that the pmd isn't constantly
	 * changing from under us from pmd_none to pmd_trans_huge
	 * and/or the other way around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte, so the pmd can't change
	 * from under us even if the mmap_sem is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;

	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				if (target_node == page_to_nid(page))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

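/*
 * Walk the pmd entries covering [addr, end): split or change transparent
 * huge pmds as needed and hand regular pmds to change_pte_range().
 * Returns the total number of entries updated.
 */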
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

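/*
 * Walk the pud entries covering [addr, end) and apply @newprot to each
 * populated pmd range underneath.  Returns the number of entries updated.
 */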
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		p4d_t *p4d, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

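/*
 * Same walk as change_pud_range(), one level up: iterate the p4d entries
 * covering [addr, end) and recurse into the populated puds.
 */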
static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long pages = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(vma, p4d, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (p4d++, addr = next, addr != end);

	return pages;
}

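/*
 * Top-level page-table walk for change_protection(): flush caches, mark a
 * TLB flush as pending, walk every pgd in [addr, end) and flush the TLB
 * only if any entries were actually modified.
 */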
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

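/*
 * Change protections over [start, end) of @vma to @newprot, dispatching
 * hugetlb VMAs to hugetlb_change_protection().  Returns the number of
 * page table entries that were changed.
 */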
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot,
						dirty_accountable, prot_numa);

	return pages;
}

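/*
 * mprotect_fixup() - apply @newflags to [start, end) of @vma: merge or
 * split VMAs as needed, update vm_flags and vm_page_prot, rewrite the page
 * tables and fix up the accounting.  Caller must hold mmap_sem for writing.
 */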
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again.  hugetlb mappings are accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * Common implementation of mprotect() and pkey_mprotect():
 * pkey == -1 when doing a legacy mprotect().
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC? */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					ARCH_VM_PKEY_FLAGS;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
		prot = reqprot;
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}

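/*
 * mprotect(2): the legacy entry point, equivalent to pkey_mprotect()
 * with pkey == -1 (no protection key is explicitly assigned).
 */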
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

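/*
 * Illustrative userspace usage of the pkey syscalls below -- a sketch,
 * not part of this file, assuming the glibc wrappers pkey_alloc(),
 * pkey_mprotect() and pkey_free() and an existing mapping at addr/len:
 *
 *	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 *	if (pkey < 0)
 *		err(1, "pkey_alloc");
 *	if (pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey))
 *		err(1, "pkey_mprotect");
 *	...
 *	pkey_free(pkey);
 */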
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	down_write(&current->mm->mmap_sem);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	up_write(&current->mm->mmap_sem);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */