/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/*
 * For a prot_numa update we only hold mmap_sem for read so there is a
 * potential race with faulting where a pmd was temporarily none. This
 * function checks for a transhuge pmd under the appropriate lock. It
 * returns a pte if it was successfully locked or NULL if it raced with
 * a transhuge insertion.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	pte_t *pte;
	spinlock_t *pmdl;

	/* !prot_numa is protected by mmap_sem held for write */
	if (!prot_numa)
		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);

	pmdl = pmd_lock(vma->vm_mm, pmd);
	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
		spin_unlock(pmdl);
		return NULL;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
	spin_unlock(pmdl);
	return pte;
}

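/*
 * Walk the ptes covered by one pmd and rewrite their protection bits to
 * newprot. Write-protected migration entries are downgraded to read when
 * CONFIG_MIGRATION is enabled. Returns the number of entries updated.
 */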
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;

	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

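/*
 * Walk the pmds under one pud. Transparent huge pmds are either changed in
 * place via change_huge_pmd() or split before falling back to
 * change_pte_range(). The mmu notifier invalidation is started lazily, only
 * once a populated pmd is seen. Returns the number of entries updated.
 */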
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				split_huge_page_pmd(vma, addr, pmd);
			else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

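/*
 * Walk the puds under one pgd and hand each populated range down to
 * change_pmd_range(). Returns the number of entries updated.
 */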
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

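/*
 * Top-level page table walk for a protection change: flush the cache range,
 * mark a TLB flush as pending while the tables are rewritten, and only issue
 * the TLB flush if some entries were actually modified.
 */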
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_pud_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

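/*
 * Apply newprot to the [start, end) range of the vma, dispatching to the
 * hugetlb helper for hugetlb mappings. Returns the number of entries updated.
 */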
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

	return pages;
}

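/*
 * Apply newflags to the [start, end) portion of the vma: charge the commit
 * for newly writable private mappings, merge or split vmas as needed, then
 * rewrite the page tables and update the accounting.
 */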
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
	if (*pprev) {
		vma = *pprev;
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

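/*
 * mprotect(2): validate the request, then walk the vmas covering
 * [start, start + len) and apply mprotect_fixup() to each piece, holding
 * mmap_sem for write for the whole operation.
 */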
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	unsigned long vm_flags, nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;
	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC:
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		prot |= PROT_EXEC;

	vm_flags = calc_vm_prot_bits(prot);

	down_write(&current->mm->mmap_sem);

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long newflags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		newflags = vm_flags;
		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));

		/* newflags >> 4 shifts VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}