/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * For a prot_numa update we only hold mmap_sem for read so there is a
 * potential race with faulting where a pmd was temporarily none. This
 * function checks for a transhuge pmd under the appropriate lock. It
 * returns a pte if it was successfully locked or NULL if it raced with
 * a transhuge insertion.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	pte_t *pte;
	spinlock_t *pmdl;

	/* !prot_numa is protected by mmap_sem held for write */
	if (!prot_numa)
		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);

	pmdl = pmd_lock(vma->vm_mm, pmd);
	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
		spin_unlock(pmdl);
		return NULL;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
	spin_unlock(pmdl);
	return pte;
}

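/*
 * Walk the ptes covering [addr, end) within one pmd and apply newprot to
 * every present entry.  Write-migration swap entries are downgraded to
 * read-only ones since a protection check is not possible here.  Returns
 * the number of entries updated so callers can decide whether a TLB flush
 * is needed.
 */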
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;

	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			if (preserve_write)
				ptent = pte_mkwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

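/*
 * Walk the pmds covering [addr, end): transparent huge pmds are either
 * changed in place or split and then handled pte by pte; everything else
 * goes through change_pte_range().  The mmu notifier range is only
 * invalidated if at least one populated pmd was found.
 */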
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				split_huge_pmd(vma, pmd, addr);
				if (pmd_none(*pmd))
					continue;
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

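/*
 * Walk the puds covering [addr, end) and hand each populated one to
 * change_pmd_range().  Returns the number of pages whose protection
 * was changed.
 */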
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

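/*
 * Top level of the page table walk for a non-hugetlb VMA: flush the cache
 * for the range, mark a TLB flush as pending, walk the pgds, and finally
 * flush the TLB only if at least one entry was actually modified.
 */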
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_pud_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

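/*
 * Change the protection of the pages in [start, end) to newprot,
 * dispatching to the hugetlb variant where appropriate.  Returns the
 * number of pages that were updated.
 */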
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

	return pages;
}

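/*
 * Apply newflags to the part of @vma between @start and @end: account for
 * a private mapping becoming writable, merge or split the vma as needed,
 * then rewrite the page protections for the range.  On return, *pprev is
 * the vma covering @start.
 */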
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
	if (*pprev) {
		vma = *pprev;
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

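/*
 * mprotect(2): validate the arguments, then walk every vma overlapping
 * [start, start + len) and apply the requested protection through
 * mprotect_fixup(), honouring PROT_GROWSDOWN/PROT_GROWSUP, protection
 * keys and the LSM file_mprotect hook.
 */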
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;
	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC:
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		prot |= PROT_EXEC;

	down_write(&current->mm->mmap_sem);

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long newflags;
		int pkey = arch_override_mprotect_pkey(vma, prot, -1);

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		newflags = calc_vm_prot_bits(prot, pkey);
		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));

		/* newflags >> 4 shift VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}