/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;

	/*
	 * Can be called with only the mmap_sem for reading by
	 * prot_numa so we must check the pmd isn't constantly
	 * changing from under us from pmd_none to pmd_trans_huge
	 * and/or the other way around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte so the pmd can't change
	 * from under us even if the mmap_sem is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;

	/* Get target node for single-threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				if (target_node == page_to_nid(page))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
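			/* prot_numa path: the pte was writable, so keep that knowledge (savedwrite) for the hinting fault to restore */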
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
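			/* pte not present: a writable migration entry must be downgraded to read-only as well */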
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

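		/* Huge pmd: split it when only part of it changes, otherwise update its protection in one go */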
		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd was just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
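	/* record on the mm that a TLB flush is pending so concurrent fault handlers can observe it */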
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_pud_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

	return pages;
}

int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
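	/* recompute write-notify accounting and the cached vm_page_prot for the new flags */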
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
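	/* rier: the task's personality asks that PROT_READ also imply PROT_EXEC */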
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
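		/* one iteration per VMA: vma covers nstart; clamp the range to end and fix this VMA up */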
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC? */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					ARCH_VM_PKEY_FLAGS;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts the VM_MAY* bits into the VM_* positions */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
		prot = reqprot;
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	down_write(&current->mm->mmap_sem);
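	/* pick an unused protection key for this mm; mmap_sem is held for write here */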
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	up_write(&current->mm->mmap_sem);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */