/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * Map and lock the pte page that covers @addr under @pmd.
 *
 * An ordinary protection change runs with mmap_sem held for write, so
 * the pte lock can be taken directly.  A prot_numa update only holds
 * mmap_sem for read and may race with a fault while the pmd was
 * temporarily none, so the pmd is re-checked under the pmd lock before
 * the pte lock is taken.
 *
 * Returns the mapped pte with *@ptl held, or NULL if the pmd was found
 * to be none or transhuge (i.e. we raced with a transhuge insertion).
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *pmd_ptl;
	pte_t *ptep;

	if (prot_numa) {
		pmd_ptl = pmd_lock(mm, pmd);
		if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
			spin_unlock(pmd_ptl);
			return NULL;
		}

		/* Take the pte lock before letting go of the pmd lock. */
		ptep = pte_offset_map_lock(mm, pmd, addr, ptl);
		spin_unlock(pmd_ptl);
		return ptep;
	}

	/* !prot_numa: mmap_sem is held for write, no pmd re-check needed. */
	return pte_offset_map_lock(mm, pmd, addr, ptl);
}

63
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
64
		unsigned long addr, unsigned long end, pgprot_t newprot,
65
		int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
66
{
67
	struct mm_struct *mm = vma->vm_mm;
68
	pte_t *pte, oldpte;
69
	spinlock_t *ptl;
70
	unsigned long pages = 0;
L
Linus Torvalds 已提交
71

72 73 74 75
	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

76
	arch_enter_lazy_mmu_mode();
L
Linus Torvalds 已提交
77
	do {
78 79
		oldpte = *pte;
		if (pte_present(oldpte)) {
L
Linus Torvalds 已提交
80
			pte_t ptent;
81
			bool preserve_write = prot_numa && pte_write(oldpte);
L
Linus Torvalds 已提交
82

83 84 85 86 87 88 89 90 91 92
			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;
93 94 95 96

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;
97 98
			}

99 100
			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
101 102
			if (preserve_write)
				ptent = pte_mkwrite(ptent);
103

104 105 106 107 108
			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
109
			}
110 111
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
112
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
113 114 115
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
116
				pte_t newpte;
117 118 119 120 121
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
122 123 124 125
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);
126 127

				pages++;
128
			}
L
Linus Torvalds 已提交
129 130
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
131
	arch_leave_lazy_mmu_mode();
132
	pte_unmap_unlock(pte - 1, ptl);
133 134

	return pages;
L
Linus Torvalds 已提交
135 136
}

137 138 139
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
140 141
{
	pmd_t *pmd;
142
	struct mm_struct *mm = vma->vm_mm;
L
Linus Torvalds 已提交
143
	unsigned long next;
144
	unsigned long pages = 0;
145
	unsigned long nr_huge_updates = 0;
146
	unsigned long mni_start = 0;
L
Linus Torvalds 已提交
147 148 149

	pmd = pmd_offset(pud, addr);
	do {
150 151
		unsigned long this_pages;

L
Linus Torvalds 已提交
152
		next = pmd_addr_end(addr, end);
153 154
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
155
			continue;
156 157 158 159 160 161 162

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

163
		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
164
			if (next - addr != HPAGE_PMD_SIZE) {
165
				split_huge_pmd(vma, pmd, addr);
166
				if (pmd_trans_unstable(pmd))
167 168
					continue;
			} else {
169
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
170
						newprot, prot_numa);
171 172

				if (nr_ptes) {
173 174 175 176
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}
177 178

					/* huge pmd was handled */
179 180
					continue;
				}
181
			}
182
			/* fall through, the trans huge pmd just split */
183
		}
184
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
185
				 dirty_accountable, prot_numa);
186
		pages += this_pages;
L
Linus Torvalds 已提交
187
	} while (pmd++, addr = next, addr != end);
188

189 190 191
	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

192 193
	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
194
	return pages;
L
Linus Torvalds 已提交
195 196
}

197 198 199
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
200 201 202
{
	pud_t *pud;
	unsigned long next;
203
	unsigned long pages = 0;
L
Linus Torvalds 已提交
204 205 206 207 208 209

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
210
		pages += change_pmd_range(vma, pud, addr, next, newprot,
211
				 dirty_accountable, prot_numa);
L
Linus Torvalds 已提交
212
	} while (pud++, addr = next, addr != end);
213 214

	return pages;
L
Linus Torvalds 已提交
215 216
}

217
static unsigned long change_protection_range(struct vm_area_struct *vma,
218
		unsigned long addr, unsigned long end, pgprot_t newprot,
219
		int dirty_accountable, int prot_numa)
L
Linus Torvalds 已提交
220 221 222 223 224
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
225
	unsigned long pages = 0;
L
Linus Torvalds 已提交
226 227 228 229

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
230
	set_tlb_flush_pending(mm);
L
Linus Torvalds 已提交
231 232 233 234
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
235
		pages += change_pud_range(vma, pgd, addr, next, newprot,
236
				 dirty_accountable, prot_numa);
L
Linus Torvalds 已提交
237
	} while (pgd++, addr = next, addr != end);
238

239 240 241
	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
242
	clear_tlb_flush_pending(mm);
243 244 245 246 247 248

	return pages;
}

/*
 * Change the protection of [@start, @end) in @vma to @newprot.
 *
 * hugetlb mappings are handed to hugetlb_change_protection(); every
 * other mapping goes through the regular page table walk.
 *
 * Returns the page count reported by whichever helper handled the range.
 */
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	if (is_vm_hugetlb_page(vma))
		return hugetlb_change_protection(vma, start, end, newprot);

	return change_protection_range(vma, start, end, newprot,
				       dirty_accountable, prot_numa);
}

261
int
L
Linus Torvalds 已提交
262 263 264 265 266 267 268 269 270
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
271
	int dirty_accountable = 0;
L
Linus Torvalds 已提交
272 273 274 275 276 277 278 279 280

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
281 282
	 * make it unwritable again. hugetlb mapping were accounted for
	 * even if read-only so there is no need to account for them here
L
Linus Torvalds 已提交
283 284
	 */
	if (newflags & VM_WRITE) {
285 286 287 288
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
289
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
290
						VM_SHARED|VM_NORESERVE))) {
L
Linus Torvalds 已提交
291
			charged = nrpages;
292
			if (security_vm_enough_memory_mm(mm, charged))
L
Linus Torvalds 已提交
293 294 295 296 297 298 299 300 301 302
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
303 304
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
L
Linus Torvalds 已提交
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
	if (*pprev) {
		vma = *pprev;
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
330 331
	dirty_accountable = vma_wants_writenotify(vma);
	vma_set_page_prot(vma);
332

333 334
	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);
335

336 337 338 339 340 341 342 343 344
	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

345 346
	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
347
	perf_event_mmap(vma);
L
Linus Torvalds 已提交
348 349 350 351 352 353 354
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

355 356 357 358 359
/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
L
Linus Torvalds 已提交
360
{
361
	unsigned long nstart, end, tmp, reqprot;
L
Linus Torvalds 已提交
362 363 364
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
365 366
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);
367 368 369 370 371 372
	/*
	 * A temporary safety check since we are not validating
	 * the pkey before we introduce the allocation code.
	 */
	if (pkey != -1)
		return -EINVAL;
373

L
Linus Torvalds 已提交
374 375 376 377 378 379 380 381 382 383 384 385
	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
386
	if (!arch_validate_prot(prot))
L
Linus Torvalds 已提交
387 388 389 390
		return -EINVAL;

	reqprot = prot;

391 392
	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;
L
Linus Torvalds 已提交
393

394
	vma = find_vma(current->mm, start);
L
Linus Torvalds 已提交
395 396 397
	error = -ENOMEM;
	if (!vma)
		goto out;
398
	prev = vma->vm_prev;
L
Linus Torvalds 已提交
399 400 401 402 403 404 405
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
406
	} else {
L
Linus Torvalds 已提交
407 408 409 410 411 412 413 414 415 416 417 418 419 420
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long newflags;
421
		int new_vma_pkey;
L
Linus Torvalds 已提交
422

423
		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
L
Linus Torvalds 已提交
424

425 426 427 428
		/* Does the application expect PROT_READ to imply PROT_EXEC */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

429 430
		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
431
		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
L
Linus Torvalds 已提交
432

433 434
		/* newflags >> 4 shift VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
L
Linus Torvalds 已提交
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
461
		prot = reqprot;
L
Linus Torvalds 已提交
462 463 464 465 466
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}
467 468 469 470 471 472 473 474 475 476 477 478

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

/*
 * pkey_mprotect() system call: hands the caller's @pkey straight to
 * do_mprotect_pkey() together with the usual mprotect() arguments.
 */
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	int ret = do_mprotect_pkey(start, len, prot, pkey);

	return ret;
}