// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory: the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Those architectures which don't have a hardware access-flag
	 * feature need to implement their own helper. By default, "true"
	 * means that a page fault will be taken on an old pte.
	 */
	return true;
}
#endif

#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware. By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
	return false;
}
#endif
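/*
 * For example, an architecture with hardware-managed access flags (such as
 * arm64 with hardware AF updates) can override both helpers above, so that
 * prefaulted ptes may be installed 'old' without paying an extra page fault
 * later just to mark them 'young'.
 */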

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
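/*
 * With SPLIT_RSS_COUNTING, per-task rss deltas accumulate in
 * current->rss_stat and are folded back into the mm by sync_mm_rss(),
 * at the latest once a task has handled TASK_RSS_EVENTS_THRESH (64)
 * page faults (see check_sync_rss_stat() above).
 */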

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
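	/*
	 * Put concretely: floor == 0 means "no floor" (free from the bottom
	 * of the address space) and ceiling == 0 means "no ceiling" (free
	 * all the way up to the top).
	 */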

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * Page table pages are added to the mmu_gather with PAGE_SIZE
	 * granularity (see pte_free_tlb()), so flush the TLB if needed.
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	pmd_install(mm, pmd, &new);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		smp_wmb(); /* See comment in pmd_install() */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
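/*
 * As an illustration of the linearity rule above: for a COWable VM_PFNMAP
 * mapping, remap_pfn_range() records the first mapped pfn in vma->vm_pgoff,
 * so a pte whose pfn still satisfies the identity is special, while a COWed
 * replacement page (whose pfn no longer matches) is treated as normal.
 */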
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

static void restore_exclusive_pte(struct vm_area_struct *vma,
				  struct page *page, unsigned long address,
				  pte_t *ptep)
{
	pte_t pte;
	swp_entry_t entry;

	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
	if (pte_swp_soft_dirty(*ptep))
		pte = pte_mksoft_dirty(pte);

	entry = pte_to_swp_entry(*ptep);
	if (pte_swp_uffd_wp(*ptep))
		pte = pte_mkuffd_wp(pte);
	else if (is_writable_device_exclusive_entry(entry))
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);

	/*
	 * No need to take a page reference as one was already
	 * created when the swap entry was made.
	 */
	if (PageAnon(page))
		page_add_anon_rmap(page, vma, address, false);
	else
		/*
		 * Currently device exclusive access only supports anonymous
		 * memory so the entry shouldn't point to a filebacked page.
		 */
		WARN_ON_ONCE(!PageAnon(page));

	set_pte_at(vma->vm_mm, address, ptep, pte);

	if (vma->vm_flags & VM_LOCKED)
		mlock_vma_page(page);

	/*
	 * No need to invalidate - it was non-present before. However
	 * secondary CPUs may have mappings that need invalidating.
	 */
	update_mmu_cache(vma, address, ptep);
}

/*
 * Tries to restore an exclusive pte if the page lock can be acquired without
 * sleeping.
 */
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
			unsigned long addr)
{
	swp_entry_t entry = pte_to_swp_entry(*src_pte);
	struct page *page = pfn_swap_entry_to_page(entry);

	if (trylock_page(page)) {
		restore_exclusive_pte(vma, page, addr, src_pte);
		unlock_page(page);
		return 0;
	}

	return -EBUSY;
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	unsigned long vm_flags = dst_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (likely(!non_swap_entry(entry))) {
		if (swap_duplicate(entry) < 0)
			return -EIO;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		rss[MM_SWAPENTS]++;
	} else if (is_migration_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		rss[mm_counter(page)]++;

		if (is_writable_migration_entry(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both
			 * parent and child to be set to read.
			 */
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*src_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_private_entry(entry)) {
		page = pfn_swap_entry_to_page(entry);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should be treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */
		get_page(page);
		rss[mm_counter(page)]++;
		page_dup_rmap(page, false);

		/*
		 * We do not preserve soft-dirty information, because so
		 * far, checkpoint/restore is the only feature that
		 * requires that. And checkpoint/restore does not work
		 * when a device driver is involved (you cannot easily
		 * save and restore device driver state).
		 */
		if (is_writable_device_private_entry(entry) &&
		    is_cow_mapping(vm_flags)) {
			entry = make_readable_device_private_entry(
							swp_offset(entry));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_uffd_wp(*src_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (is_device_exclusive_entry(entry)) {
		/*
		 * Make device exclusive entries present by restoring the
		 * original entry then copying as for a present pte. Device
		 * exclusive entries currently only support private writable
		 * (ie. COW) mappings.
		 */
		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
			return -EBUSY;
		return -ENOENT;
	}
	if (!userfaultfd_wp(dst_vma))
		pte = pte_swp_clear_uffd_wp(pte);
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		  struct page **prealloc, pte_t pte, struct page *page)
{
	struct page *new_page;

	/*
	 * What we want to do is to check whether this page may
	 * have been pinned by the parent process.  If so,
	 * instead of wrprotect the pte on both sides, we copy
	 * the page immediately so that we'll always guarantee
	 * the pinned page won't be randomly replaced in the
	 * future.
	 *
	 * The page pinning checks are just "has this mm ever
	 * seen pinning", along with the (inexact) check of
	 * the page count. That might give false positives for
	 * pinning, but it will work correctly.
	 */
	if (likely(!page_needs_cow_for_dma(src_vma, page)))
		return 1;

	new_page = *prealloc;
	if (!new_page)
		return -EAGAIN;

	/*
	 * We have a prealloc page, all good!  Take it
	 * over and copy the page & arm it.
	 */
	*prealloc = NULL;
	copy_user_highpage(new_page, page, addr, src_vma);
	__SetPageUptodate(new_page);
	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
	rss[mm_counter(new_page)]++;

	/* All done, just insert the new page copy in the child */
	pte = mk_pte(new_page, dst_vma->vm_page_prot);
	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
	if (userfaultfd_pte_wp(dst_vma, *src_pte))
		/* Uffd-wp needs to be delivered to dest pte as well */
		pte = pte_wrprotect(pte_mkuffd_wp(pte));
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct page **prealloc)
{
	struct mm_struct *src_mm = src_vma->vm_mm;
	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	page = vm_normal_page(src_vma, addr, pte);
	if (page) {
		int retval;

		retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
					   addr, rss, prealloc, pte, page);
		if (retval <= 0)
			return retval;

		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
		   unsigned long addr)
{
	struct page *new_page;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (!new_page)
		return NULL;

	if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
		put_page(new_page);
		return NULL;
	}
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	return new_page;
}

static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress, ret = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};
	struct page *prealloc = NULL;

again:
	progress = 0;
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte) {
		ret = -ENOMEM;
		goto out;
	}
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(*src_pte))) {
			ret = copy_nonpresent_pte(dst_mm, src_mm,
						  dst_pte, src_pte,
						  dst_vma, src_vma,
						  addr, rss);
			if (ret == -EIO) {
				entry = pte_to_swp_entry(*src_pte);
				break;
			} else if (ret == -EBUSY) {
				break;
			} else if (!ret) {
				progress += 8;
				continue;
			}

			/*
			 * Device exclusive entry restored, continue by copying
			 * the now present pte.
			 */
			WARN_ON_ONCE(ret != -ENOENT);
		}
		/* copy_present_pte() will clear `*prealloc' if consumed */
		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
				       addr, rss, &prealloc);
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 */
		if (unlikely(ret == -EAGAIN))
			break;
		if (unlikely(prealloc)) {
			/*
			 * pre-alloc page cannot be reused by next time so as
			 * to strictly follow mempolicy (e.g., alloc_page_vma()
			 * will allocate page according to address).  This
			 * could only happen if one pinned pte changed.
			 */
			put_page(prealloc);
			prealloc = NULL;
		}
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (ret == -EIO) {
		VM_WARN_ON_ONCE(!entry.val);
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
			ret = -ENOMEM;
			goto out;
		}
		entry.val = 0;
	} else if (ret == -EBUSY) {
		goto out;
	} else if (ret ==  -EAGAIN) {
		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
		if (!prealloc)
			return -ENOMEM;
	} else if (ret) {
		VM_WARN_ON_ONCE(1);
	}

	/* We've captured and resolved the error. Reset, try again. */
	ret = 0;

	if (addr != end)
		goto again;
out:
	if (unlikely(prealloc))
		put_page(prealloc);
	return ret;
}

static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
					    addr, dst_vma, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
				   addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
				   addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
				   addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = src_vma->vm_start;
	unsigned long end = src_vma->vm_end;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
	    !src_vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(src_vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(src_vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(src_vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, src_vma, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
		/*
		 * Disabling preemption is not needed for the write side, as
		 * the read side doesn't spin, but goes to the mmap_lock.
		 *
		 * Use the raw variant of the seqcount_t write API to avoid
		 * lockdep complaining about preemptibility.
		 */
		mmap_assert_write_locked(src_mm);
		raw_write_seqcount_begin(&src_mm->write_protect_seq);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow) {
		raw_write_seqcount_end(&src_mm->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	}
	return ret;
}
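/*
 * copy_page_range() is called per-vma from dup_mmap() during fork(); as the
 * comments inside it note, the copy is skipped entirely whenever demand
 * faulting can repopulate the child's page tables more cheaply.
 */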

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
	struct address_space *zap_mapping;	/* Check page->mapping if set */
	struct folio *single_folio;	/* Locked folio to be unmapped */
};
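/*
 * Passing a NULL zap_details (or one with a NULL zap_mapping) means "zap
 * everything", including private COWed pages; see should_zap_cows() below.
 */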

/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
	/* By default, zap all pages */
	if (!details)
		return true;

	/* Or, we zap COWed pages only if the caller wants to */
	return !details->zap_mapping;
}

/*
 * We set details->zap_mapping when we want to unmap shared but keep private
 * pages. Return true if we should zap this page, false otherwise.
 */
static inline bool should_zap_page(struct zap_details *details, struct page *page)
{
	/* If we can make a decision without *page.. */
	if (should_zap_cows(details))
		return true;

	/* E.g. the caller passes NULL for the case of a zero page */
	if (!page)
		return true;

	return details->zap_mapping == page_rmapping(page);
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (need_resched())
			break;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(!should_zap_page(details, page)))
				continue;
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (is_device_private_entry(entry) ||
		    is_device_exclusive_entry(entry)) {
			struct page *page = pfn_swap_entry_to_page(entry);

			if (unlikely(!should_zap_page(details, page)))
				continue;
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;

			if (is_device_private_entry(entry))
				page_remove_rmap(page, false);

			put_page(page);
			continue;
		}

		if (!non_swap_entry(entry)) {
			/* Genuine swap entry, hence a private anon page */
			if (!should_zap_cows(details))
				continue;
			rss[MM_SWAPENTS]--;
		} else if (is_migration_entry(entry)) {
			struct page *page;

			page = pfn_swap_entry_to_page(entry);
			if (!should_zap_page(details, page))
				continue;
			rss[mm_counter(page)]--;
		} else if (is_hwpoison_entry(entry)) {
			if (!should_zap_cows(details))
				continue;
		} else {
			/* We should have covered all the swap entry types */
			WARN_ON_ONCE(1);
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
	}

	if (addr != end) {
		cond_resched();
		goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		} else if (details && details->single_folio &&
			   folio_test_pmd_mappable(details->single_folio) &&
			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
			spin_unlock(ptl);
		}

		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_lock in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				mmap_assert_locked(tlb->mm);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
1590
		unsigned long end_addr,
1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
1614
			 * cleanup path of mmap_region. When
1615
			 * hugetlbfs ->mmap method fails,
1616
			 * mmap_region() nullifies vma->vm_file
1617 1618 1619 1620
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
1621
			if (vma->vm_file) {
1622
				i_mmap_lock_write(vma->vm_file->f_mapping);
1623
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1624
				i_mmap_unlock_write(vma->vm_file->f_mapping);
1625
			}
1626 1627 1628
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
L

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
1633
 * @tlb: address of the caller's struct mmu_gather
L
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
1638
 * Unmap all pages in the vma list.
L
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
A
L
1651
		unsigned long end_addr)
L
1653
	struct mmu_notifier_range range;
L
1655 1656
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
1657
	mmu_notifier_invalidate_range_start(&range);
1658
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1659
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1660
	mmu_notifier_invalidate_range_end(&range);
L

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
1666
 * @start: starting address of pages to zap
L
1668 1669
 *
 * Caller must protect the VMA list
L
1671
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1672
		unsigned long size)
L
Linus Torvalds 已提交
1673
{
1674
	struct mmu_notifier_range range;
P
Peter Zijlstra 已提交
1675
	struct mmu_gather tlb;
L
Linus Torvalds 已提交
1676 1677

	lru_add_drain();
1678
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1679
				start, start + size);
1680
	tlb_gather_mmu(&tlb, vma->vm_mm);
1681 1682 1683 1684 1685
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, range.end, NULL);
	mmu_notifier_invalidate_range_end(&range);
1686
	tlb_finish_mmu(&tlb);
L
Linus Torvalds 已提交
}
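
/*
 * Example (illustrative sketch only, not taken from any real caller): an
 * in-kernel user that wants to tear down the user mappings for a single
 * page of a VMA it is already working with could do:
 *
 *	zap_page_range(vma, addr & PAGE_MASK, PAGE_SIZE);
 *
 * where vma and addr are whatever the caller holds, and the locking rules
 * noted in the comment above are assumed to be satisfied.
 */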

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	tlb_gather_mmu(&tlb, vma->vm_mm);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	unmap_single_vma(&tlb, vma, address, range.end, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
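
/*
 * Example (illustrative sketch): a driver that set up a VM_PFNMAP mapping
 * in its mmap handler and later needs to revoke it might call, for a
 * hypothetical per-device structure "dev" that remembers the vma:
 *
 *	zap_vma_ptes(dev->vma, dev->vma->vm_start,
 *		     dev->vma->vm_end - dev->vma->vm_start);
 *
 * The dev->vma bookkeeping is invented for the example; only the call
 * itself reflects the interface documented above.
 */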

static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pmd;
}

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pmd_t *pmd = walk_to_pmd(mm, addr);

	if (!pmd)
		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

static int validate_page_before_insert(struct page *page)
{
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		return -EINVAL;
	flush_dcache_page(page);
	return 0;
}

static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	if (!pte_none(*pte))
		return -EBUSY;
	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));
	return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = validate_page_before_insert(page);
	if (retval)
		goto out;
	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
			unsigned long addr, struct page *page, pgprot_t prot)
{
	int err;

	if (!page_count(page))
		return -EINVAL;
	err = validate_page_before_insert(page);
	if (err)
		return err;
	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(mm, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	*num = remaining_pages_total;
	return ret;
}
#endif  /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
#ifdef pte_index
	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

	if (addr < vma->vm_start || end_addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	/* Defer page refcount checking till we're about to map that page. */
	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
	unsigned long idx = 0, pgcount = *num;
	int err = -EINVAL;

	for (; idx < pgcount; ++idx) {
		err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
		if (err)
			break;
	}
	*num = pgcount - idx;
	return err;
#endif  /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
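
/*
 * Example (illustrative sketch): a driver mmap handler holding an array of
 * already-allocated pages could map them in one call and report how many
 * were left unmapped on failure:
 *
 *	unsigned long num = npages;
 *	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);
 *
 *	if (err)
 *		pr_warn("mapped %lu of %lu pages\n", npages - num, npages);
 *
 * "pages" and "npages" are assumed to come from the driver; the error
 * handling shown is only one possible policy.
 */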

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
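
/*
 * Example (illustrative sketch): the classic use is from an mmap handler,
 * inserting one driver-owned page per user page.  Assuming a hypothetical
 * buf->pages[] array sized to cover the VMA:
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct foo_buf *buf = file->private_data;
 *		unsigned long i, uaddr = vma->vm_start;
 *		int err;
 *
 *		for (i = 0; i < vma_pages(vma); i++, uaddr += PAGE_SIZE) {
 *			err = vm_insert_page(vma, uaddr, buf->pages[i]);
 *			if (err)
 *				return err;
 *		}
 *		return 0;
 *	}
 *
 * struct foo_buf and its pages[] member are invented for the example.
 */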

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - maps range of kernel pages starts with non zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);
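
/*
 * Example (illustrative sketch): the per-page loop shown for the
 * vm_insert_page example above collapses to a single call when the pages
 * sit in a contiguous array, with vm_pgoff handled for free:
 *
 *	return vm_map_pages(vma, buf->pages, buf->nr_pages);
 *
 * Again, buf->pages and buf->nr_pages are hypothetical driver state.
 */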

/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

2061
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
R
Ross Zwisler 已提交
2062
			pfn_t pfn, pgprot_t prot, bool mkwrite)
N
Nick Piggin 已提交
2063 2064 2065 2066 2067 2068 2069
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
2070
		return VM_FAULT_OOM;
R
Ross Zwisler 已提交
2071 2072 2073 2074 2075 2076 2077
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
J
Jan Kara 已提交
2078 2079 2080 2081
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
R
Ross Zwisler 已提交
2082
			 */
J
Jan Kara 已提交
2083 2084
			if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
R
Ross Zwisler 已提交
2085
				goto out_unlock;
J
Jan Kara 已提交
2086
			}
2087 2088 2089 2090 2091 2092
			entry = pte_mkyoung(*pte);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
R
Ross Zwisler 已提交
2093
	}
N
Nick Piggin 已提交
2094 2095

	/* Ok, finally just insert the thing.. */
2096 2097 2098 2099
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
R
Ross Zwisler 已提交
2100 2101 2102 2103 2104 2105

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

N
Nick Piggin 已提交
2106
	set_pte_at(mm, addr, pte, entry);
2107
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
N
Nick Piggin 已提交
2108 2109 2110

out_unlock:
	pte_unmap_unlock(pte, ptl);
2111
	return VM_FAULT_NOPAGE;
N
Nick Piggin 已提交
2112 2113
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * See vmf_insert_mixed_prot() for a discussion of the implication of using
 * a value of @pgprot different from that of @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
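
/*
 * Example (illustrative sketch): a ->fault handler for a VM_PFNMAP region
 * backed by device memory at a known physical base might look like:
 *
 *	static vm_fault_t foo_fault(struct vm_fault *vmf)
 *	{
 *		struct foo_dev *dev = vmf->vma->vm_private_data;
 *		unsigned long pfn = (dev->phys_base >> PAGE_SHIFT) +
 *				    vmf->pgoff;
 *
 *		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
 *	}
 *
 * foo_dev and phys_base are stand-ins for whatever the driver tracks.
 */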

static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

2204
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2205 2206
		unsigned long addr, pfn_t pfn, pgprot_t pgprot,
		bool mkwrite)
N
Nick Piggin 已提交
2207
{
2208
	int err;
2209

2210
	BUG_ON(!vm_mixed_ok(vma, pfn));
N
Nick Piggin 已提交
2211

N
Nick Piggin 已提交
2212
	if (addr < vma->vm_start || addr >= vma->vm_end)
2213
		return VM_FAULT_SIGBUS;
2214 2215

	track_pfn_insert(vma, &pgprot, pfn);
N
Nick Piggin 已提交
2216

2217
	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2218
		return VM_FAULT_SIGBUS;
2219

N
Nick Piggin 已提交
2220 2221 2222 2223
	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
H
Hugh Dickins 已提交
2224 2225
	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would there be refcounted as a normal page.
N
Nick Piggin 已提交
2226
	 */
L
Laurent Dufour 已提交
2227 2228
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
N
Nick Piggin 已提交
2229 2230
		struct page *page;

2231 2232 2233 2234 2235 2236
		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
2237 2238
		err = insert_page(vma, addr, page, pgprot);
	} else {
2239
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
N
Nick Piggin 已提交
2240
	}
R
Ross Zwisler 已提交
2241

M
Matthew Wilcox 已提交
2242 2243 2244 2245 2246 2247
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
N
Nick Piggin 已提交
2248
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_mixed(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * Typically this function should be used by drivers to set caching- and
 * encryption bits different than those of @vma->vm_page_prot, because
 * the caching- or encryption mode may not be known at mmap() time.
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
				 pfn_t pfn, pgprot_t pgprot)
{
	return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);
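
/*
 * Example (illustrative sketch): a fault handler that must apply
 * write-combining on a per-page basis, which is exactly the situation the
 * comment above describes, might do:
 *
 *	pgprot_t prot = pgprot_writecombine(vmf->vma->vm_page_prot);
 *
 *	return vmf_insert_mixed_prot(vmf->vma, vmf->address,
 *				     __pfn_to_pfn_t(pfn, PFN_DEV), prot);
 *
 * The pfn is assumed to have been computed by the driver beforehand.
 */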

2283 2284 2285
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
2286
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2287
}
M
Matthew Wilcox 已提交
2288
EXPORT_SYMBOL(vmf_insert_mixed);
N
Nick Piggin 已提交
2289

2290 2291 2292 2293 2294 2295 2296
/*
 *  If the insertion of PTE failed because someone else already added a
 *  different entry in the mean time, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
R
Ross Zwisler 已提交
2297
{
2298
	return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
R
Ross Zwisler 已提交
2299
}
2300
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
R
Ross Zwisler 已提交
2301

L
Linus Torvalds 已提交
2302 2303 2304 2305 2306 2307 2308 2309 2310
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
2311
	pte_t *pte, *mapped_pte;
H
Hugh Dickins 已提交
2312
	spinlock_t *ptl;
2313
	int err = 0;
L
Linus Torvalds 已提交
2314

2315
	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
L
Linus Torvalds 已提交
2316 2317
	if (!pte)
		return -ENOMEM;
2318
	arch_enter_lazy_mmu_mode();
L
Linus Torvalds 已提交
2319 2320
	do {
		BUG_ON(!pte_none(*pte));
2321 2322 2323 2324
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
N
Nick Piggin 已提交
2325
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
L
Linus Torvalds 已提交
2326 2327
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
2328
	arch_leave_lazy_mmu_mode();
2329
	pte_unmap_unlock(mapped_pte, ptl);
2330
	return err;
L
Linus Torvalds 已提交
2331 2332 2333 2334 2335 2336 2337 2338
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
2339
	int err;
L
Linus Torvalds 已提交
2340 2341 2342 2343 2344

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
2345
	VM_BUG_ON(pmd_trans_huge(*pmd));
L
Linus Torvalds 已提交
2346 2347
	do {
		next = pmd_addr_end(addr, end);
2348 2349 2350 2351
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
L
Linus Torvalds 已提交
2352 2353 2354 2355
	} while (pmd++, addr = next, addr != end);
	return 0;
}

2356
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
L
Linus Torvalds 已提交
2357 2358 2359 2360 2361
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
2362
	int err;
L
Linus Torvalds 已提交
2363 2364

	pfn -= addr >> PAGE_SHIFT;
2365
	pud = pud_alloc(mm, p4d, addr);
L
Linus Torvalds 已提交
2366 2367 2368 2369
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
2370 2371 2372 2373
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
L
Linus Torvalds 已提交
2374 2375 2376 2377
	} while (pud++, addr = next, addr != end);
	return 0;
}

2378 2379 2380 2381 2382 2383
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
2384
	int err;
2385 2386 2387 2388 2389 2390 2391

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
2392 2393 2394 2395
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
2396 2397 2398 2399
	} while (p4d++, addr = next, addr != end);
	return 0;
}

2400 2401 2402
/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
2403
 */
2404 2405
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn, unsigned long size, pgprot_t prot)
L
Linus Torvalds 已提交
2406 2407 2408
{
	pgd_t *pgd;
	unsigned long next;
2409
	unsigned long end = addr + PAGE_ALIGN(size);
L
Linus Torvalds 已提交
2410 2411 2412
	struct mm_struct *mm = vma->vm_mm;
	int err;

2413 2414 2415
	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
		return -EINVAL;

L
Linus Torvalds 已提交
2416 2417 2418 2419 2420
	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
2421 2422 2423
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
2424 2425 2426 2427
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
L
Linus Torvalds 已提交
2428 2429 2430 2431
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2432
	 * See vm_normal_page() for details.
L
Linus Torvalds 已提交
2433
	 */
2434 2435 2436
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
L
Linus Torvalds 已提交
2437
		vma->vm_pgoff = pfn;
2438 2439
	}

2440
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
L
Linus Torvalds 已提交
2441 2442 2443 2444 2445 2446 2447

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
2448
		err = remap_p4d_range(mm, pgd, addr, next,
L
Linus Torvalds 已提交
2449 2450
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
2451
			return err;
L
Linus Torvalds 已提交
2452
	} while (pgd++, addr = next, addr != end);
2453

	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	int err;

	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
	if (err)
		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
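
/*
 * Example (illustrative sketch): the common pattern in a driver mmap
 * handler exposing a physically contiguous region "phys" to userspace:
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       phys >> PAGE_SHIFT,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 *
 * phys is hypothetical; real code would also validate the requested size
 * against the region being exposed.
 */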

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
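
/*
 * Example (illustrative sketch): the remap_pfn_range pattern above shrinks
 * further when the driver simply wants to expose one physical window,
 * since vm_iomap_memory derives the offset and length checks from the vma:
 *
 *	return vm_iomap_memory(vma, bar_start, bar_len);
 *
 * bar_start/bar_len stand for whatever resource the driver owns (for a
 * PCI BAR they would typically come from pci_resource_start()/_len()).
 */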

2534 2535
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
2536 2537
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
2538
{
2539
	pte_t *pte, *mapped_pte;
2540
	int err = 0;
2541
	spinlock_t *ptl;
2542

2543
	if (create) {
2544
		mapped_pte = pte = (mm == &init_mm) ?
2545
			pte_alloc_kernel_track(pmd, addr, mask) :
2546 2547 2548 2549
			pte_alloc_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -ENOMEM;
	} else {
2550
		mapped_pte = pte = (mm == &init_mm) ?
2551 2552 2553
			pte_offset_kernel(pmd, addr) :
			pte_offset_map_lock(mm, pmd, addr, &ptl);
	}
2554 2555 2556

	BUG_ON(pmd_huge(*pmd));

2557 2558
	arch_enter_lazy_mmu_mode();

2559 2560 2561 2562 2563 2564 2565 2566 2567
	if (fn) {
		do {
			if (create || !pte_none(*pte)) {
				err = fn(pte++, addr, data);
				if (err)
					break;
			}
		} while (addr += PAGE_SIZE, addr != end);
	}
2568
	*mask |= PGTBL_PTE_MODIFIED;
2569

2570 2571
	arch_leave_lazy_mmu_mode();

2572
	if (mm != &init_mm)
2573
		pte_unmap_unlock(mapped_pte, ptl);
2574 2575 2576 2577 2578
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
2579 2580
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
2581 2582 2583
{
	pmd_t *pmd;
	unsigned long next;
2584
	int err = 0;
2585

A
Andi Kleen 已提交
2586 2587
	BUG_ON(pud_huge(*pud));

2588
	if (create) {
2589
		pmd = pmd_alloc_track(mm, pud, addr, mask);
2590 2591 2592 2593 2594
		if (!pmd)
			return -ENOMEM;
	} else {
		pmd = pmd_offset(pud, addr);
	}
2595 2596
	do {
		next = pmd_addr_end(addr, end);
2597 2598 2599 2600 2601 2602 2603 2604
		if (pmd_none(*pmd) && !create)
			continue;
		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
			return -EINVAL;
		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
			if (!create)
				continue;
			pmd_clear_bad(pmd);
2605
		}
2606 2607 2608 2609
		err = apply_to_pte_range(mm, pmd, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
2610
	} while (pmd++, addr = next, addr != end);
2611

2612 2613 2614
	return err;
}

2615
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2616
				     unsigned long addr, unsigned long end,
2617 2618
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
2619 2620 2621
{
	pud_t *pud;
	unsigned long next;
2622
	int err = 0;
2623

2624
	if (create) {
2625
		pud = pud_alloc_track(mm, p4d, addr, mask);
2626 2627 2628 2629 2630
		if (!pud)
			return -ENOMEM;
	} else {
		pud = pud_offset(p4d, addr);
	}
2631 2632
	do {
		next = pud_addr_end(addr, end);
2633 2634 2635 2636 2637 2638 2639 2640
		if (pud_none(*pud) && !create)
			continue;
		if (WARN_ON_ONCE(pud_leaf(*pud)))
			return -EINVAL;
		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
			if (!create)
				continue;
			pud_clear_bad(pud);
2641
		}
2642 2643 2644 2645
		err = apply_to_pmd_range(mm, pud, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
2646
	} while (pud++, addr = next, addr != end);
2647

2648 2649 2650
	return err;
}

2651 2652
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
2653 2654
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
2655 2656 2657
{
	p4d_t *p4d;
	unsigned long next;
2658
	int err = 0;
2659

2660
	if (create) {
2661
		p4d = p4d_alloc_track(mm, pgd, addr, mask);
2662 2663 2664 2665 2666
		if (!p4d)
			return -ENOMEM;
	} else {
		p4d = p4d_offset(pgd, addr);
	}
2667 2668
	do {
		next = p4d_addr_end(addr, end);
2669 2670 2671 2672 2673 2674 2675 2676
		if (p4d_none(*p4d) && !create)
			continue;
		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
			return -EINVAL;
		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
			if (!create)
				continue;
			p4d_clear_bad(p4d);
2677
		}
2678 2679 2680 2681
		err = apply_to_pud_range(mm, p4d, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
2682
	} while (p4d++, addr = next, addr != end);
2683

2684 2685 2686
	return err;
}

2687 2688 2689
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn,
				 void *data, bool create)
2690 2691
{
	pgd_t *pgd;
2692
	unsigned long start = addr, next;
2693
	unsigned long end = addr + size;
2694
	pgtbl_mod_mask mask = 0;
2695
	int err = 0;
2696

2697 2698 2699
	if (WARN_ON(addr >= end))
		return -EINVAL;

2700 2701 2702
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
2703
		if (pgd_none(*pgd) && !create)
2704
			continue;
2705 2706 2707 2708 2709 2710 2711 2712 2713
		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
			return -EINVAL;
		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
			if (!create)
				continue;
			pgd_clear_bad(pgd);
		}
		err = apply_to_p4d_range(mm, pgd, addr, next,
					 fn, data, create, &mask);
2714 2715 2716
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);
2717

2718 2719 2720
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

2721 2722
	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
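
/*
 * Example (illustrative sketch): a caller that wants to mark every PTE in
 * a kernel range dirty could pass a small callback of the pte_fn_t shape:
 *
 *	static int foo_mkdirty(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		set_pte(pte, pte_mkdirty(*pte));
 *		return 0;
 *	}
 *
 *	apply_to_page_range(&init_mm, start, size, foo_mkdirty, NULL);
 *
 * The callback is only meant to show the interface; real users do their
 * own PTE manipulation appropriate to their situation.
 */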

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

2749
/*
2750 2751 2752 2753 2754
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
2755
 * and do_anonymous_page can safely check later on).
2756
 */
2757
static inline int pte_unmap_same(struct vm_fault *vmf)
2758 2759
{
	int same = 1;
2760
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2761
	if (sizeof(pte_t) > sizeof(unsigned long)) {
2762
		spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
H
Hugh Dickins 已提交
2763
		spin_lock(ptl);
2764
		same = pte_same(*vmf->pte, vmf->orig_pte);
H
Hugh Dickins 已提交
2765
		spin_unlock(ptl);
2766 2767
	}
#endif
2768 2769
	pte_unmap(vmf->pte);
	vmf->pte = NULL;
2770 2771 2772
	return same;
}

2773 2774
static inline bool cow_user_page(struct page *dst, struct page *src,
				 struct vm_fault *vmf)
2775
{
2776 2777 2778
	bool ret;
	void *kaddr;
	void __user *uaddr;
2779
	bool locked = false;
2780 2781 2782 2783 2784 2785 2786 2787 2788
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = vmf->address;

	if (likely(src)) {
		copy_user_highpage(dst, src, addr, vma);
		return true;
	}

2789 2790 2791 2792 2793 2794
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
2795 2796 2797 2798 2799 2800 2801
	kaddr = kmap_atomic(dst);
	uaddr = (void __user *)(addr & PAGE_MASK);

	/*
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
2802
	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2803
		pte_t entry;
L
Linus Torvalds 已提交
2804

2805
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2806
		locked = true;
2807 2808 2809
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
			/*
			 * Other thread has already handled the fault
2810
			 * and update local tlb only
2811
			 */
2812
			update_mmu_tlb(vma, addr, vmf->pte);
2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828
			ret = false;
			goto pte_unlock;
		}

		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
			update_mmu_cache(vma, addr, vmf->pte);
	}

	/*
	 * This really shouldn't fail, because the page is there
	 * in the page tables. But it might just be unreadable,
	 * in which case we just give up and fill the result with
	 * zeroes.
	 */
	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2829 2830 2831 2832 2833 2834 2835
		if (locked)
			goto warn;

		/* Re-validate under PTL if the page is still mapped */
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		locked = true;
		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2836 2837
			/* The PTE changed under us, update local tlb */
			update_mmu_tlb(vma, addr, vmf->pte);
2838 2839 2840 2841
			ret = false;
			goto pte_unlock;
		}

L
Linus Torvalds 已提交
2842
		/*
2843
		 * The same page can be mapped back since last copy attempt.
2844
		 * Try to copy again under PTL.
L
Linus Torvalds 已提交
2845
		 */
2846 2847 2848 2849 2850 2851 2852 2853 2854
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
			/*
			 * Give a warn in case there can be some obscure
			 * use-case
			 */
warn:
			WARN_ON_ONCE(1);
			clear_page(kaddr);
		}
2855 2856 2857 2858 2859
	}

	ret = true;

pte_unlock:
2860
	if (locked)
2861 2862 2863 2864 2865
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	kunmap_atomic(kaddr);
	flush_dcache_page(dst);

	return ret;
2866 2867
}

2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

2882 2883 2884 2885 2886 2887
/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
2888
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2889
{
2890
	vm_fault_t ret;
2891 2892
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;
2893

2894
	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2895

2896 2897 2898 2899
	if (vmf->vma->vm_file &&
	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
		return VM_FAULT_SIGBUS;

2900
	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2901 2902
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

2917 2918 2919 2920 2921
/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
2922
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2923
{
2924
	struct vm_area_struct *vma = vmf->vma;
2925
	struct address_space *mapping;
2926
	struct page *page = vmf->page;
2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().   The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

2941 2942 2943 2944 2945 2946 2947 2948 2949
	if (!page_mkwrite)
		file_update_time(vma->vm_file);

	/*
	 * Throttle page dirtying rate down to writeback speed.
	 *
	 * mapping may be NULL here because some device drivers do not
	 * set page.mapping but still dirty their pages
	 *
2950
	 * Drop the mmap_lock before waiting on IO, if we can. The file
2951 2952
	 * is pinning the mapping, as per above.
	 */
2953
	if ((dirtied || page_mkwrite) && mapping) {
2954 2955 2956
		struct file *fpin;

		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2957
		balance_dirty_pages_ratelimited(mapping);
2958 2959 2960 2961
		if (fpin) {
			fput(fpin);
			return VM_FAULT_RETRY;
		}
2962 2963
	}

2964
	return 0;
2965 2966
}

2967 2968 2969 2970 2971 2972 2973 2974
/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
2975
static inline void wp_page_reuse(struct vm_fault *vmf)
J
Jan Kara 已提交
2976
	__releases(vmf->ptl)
2977
{
J
Jan Kara 已提交
2978
	struct vm_area_struct *vma = vmf->vma;
J
Jan Kara 已提交
2979
	struct page *page = vmf->page;
2980 2981 2982 2983 2984 2985 2986 2987 2988
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

J
Jan Kara 已提交
2989 2990
	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
2991
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
J
Jan Kara 已提交
2992 2993 2994
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
P
Peter Xu 已提交
2995
	count_vm_event(PGREUSE);
2996 2997
}

2998 2999 3000
/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
3001
 * Called with mmap_lock locked and the old page referenced, but
3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
3014
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3015
{
J
Jan Kara 已提交
3016
	struct vm_area_struct *vma = vmf->vma;
K
Kirill A. Shutemov 已提交
3017
	struct mm_struct *mm = vma->vm_mm;
J
Jan Kara 已提交
3018
	struct page *old_page = vmf->page;
3019 3020 3021
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
3022
	struct mmu_notifier_range range;
3023 3024 3025 3026

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

J
Jan Kara 已提交
3027
	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
J
Jan Kara 已提交
3028 3029
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
3030 3031 3032
		if (!new_page)
			goto oom;
	} else {
K
Kirill A. Shutemov 已提交
3033
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
J
Jan Kara 已提交
3034
				vmf->address);
3035 3036
		if (!new_page)
			goto oom;
3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049

		if (!cow_user_page(new_page, old_page, vmf)) {
			/*
			 * COW failed, if the fault was solved by other,
			 * it's fine. If not, userspace would re-fault on
			 * the same address and we will handle the fault
			 * from the second attempt.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);
			return 0;
		}
3050 3051
	}

3052
	if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
3053
		goto oom_free_new;
3054
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
3055

3056 3057
	__SetPageUptodate(new_page);

3058
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
3059
				vmf->address & PAGE_MASK,
3060 3061
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
3062 3063 3064 3065

	/*
	 * Re-check the pte - we dropped the lock
	 */
J
Jan Kara 已提交
3066
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
J
Jan Kara 已提交
3067
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
3068 3069
		if (old_page) {
			if (!PageAnon(old_page)) {
3070 3071
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
3072 3073 3074 3075 3076
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
J
Jan Kara 已提交
3077
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3078
		entry = mk_pte(new_page, vma->vm_page_prot);
3079
		entry = pte_sw_mkyoung(entry);
3080
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3081

3082 3083
		/*
		 * Clear the pte entry and flush it first, before updating the
3084 3085 3086 3087
		 * pte with the new entry, to keep TLBs on different CPUs in
		 * sync. This code used to set the new PTE then flush TLBs, but
		 * that left a window where the new PTE could be loaded into
		 * some TLBs while the old PTE remains in others.
3088
		 */
J
Jan Kara 已提交
3089 3090
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
3091
		lru_cache_add_inactive_or_unevictable(new_page, vma);
3092 3093 3094 3095 3096
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
J
Jan Kara 已提交
3097 3098
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
3122
			page_remove_rmap(old_page, false);
3123 3124 3125 3126 3127 3128
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
3129
		update_mmu_tlb(vma, vmf->address, vmf->pte);
3130 3131 3132
	}

	if (new_page)
3133
		put_page(new_page);
3134

J
Jan Kara 已提交
3135
	pte_unmap_unlock(vmf->pte, vmf->ptl);
3136 3137 3138 3139
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
3140
	mmu_notifier_invalidate_range_only_end(&range);
3141 3142 3143 3144 3145 3146 3147
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
3148 3149
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
3150 3151
			unlock_page(old_page);
		}
3152 3153
		if (page_copied)
			free_swap_cache(old_page);
3154
		put_page(old_page);
3155 3156 3157
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
3158
	put_page(new_page);
3159 3160
oom:
	if (old_page)
3161
		put_page(old_page);
3162 3163 3164
	return VM_FAULT_OOM;
}

3165 3166 3167 3168 3169 3170 3171 3172
/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
3173
 * It handles locking of PTE and modifying it.
3174 3175 3176
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
3177
 *
3178
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3179
 * we acquired PTE lock.
3180
 */
3181
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
3182 3183 3184 3185 3186 3187 3188 3189 3190
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3191
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3192
		pte_unmap_unlock(vmf->pte, vmf->ptl);
3193
		return VM_FAULT_NOPAGE;
3194 3195
	}
	wp_page_reuse(vmf);
3196
	return 0;
3197 3198
}

3199 3200 3201 3202
/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
3203
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3204
{
J
Jan Kara 已提交
3205
	struct vm_area_struct *vma = vmf->vma;
K
Kirill A. Shutemov 已提交
3206

3207
	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3208
		vm_fault_t ret;
3209

J
Jan Kara 已提交
3210
		pte_unmap_unlock(vmf->pte, vmf->ptl);
3211
		vmf->flags |= FAULT_FLAG_MKWRITE;
3212
		ret = vma->vm_ops->pfn_mkwrite(vmf);
3213
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3214
			return ret;
3215
		return finish_mkwrite_fault(vmf);
3216
	}
3217 3218
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
3219 3220
}

3221
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
J
Jan Kara 已提交
3222
	__releases(vmf->ptl)
3223
{
J
Jan Kara 已提交
3224
	struct vm_area_struct *vma = vmf->vma;
3225
	vm_fault_t ret = VM_FAULT_WRITE;
3226

J
Jan Kara 已提交
3227
	get_page(vmf->page);
3228 3229

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3230
		vm_fault_t tmp;
3231

J
Jan Kara 已提交
3232
		pte_unmap_unlock(vmf->pte, vmf->ptl);
3233
		tmp = do_page_mkwrite(vmf);
3234 3235
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
J
Jan Kara 已提交
3236
			put_page(vmf->page);
3237 3238
			return tmp;
		}
3239
		tmp = finish_mkwrite_fault(vmf);
3240
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
J
Jan Kara 已提交
3241 3242
			unlock_page(vmf->page);
			put_page(vmf->page);
3243
			return tmp;
3244
		}
3245 3246
	} else {
		wp_page_reuse(vmf);
3247
		lock_page(vmf->page);
3248
	}
3249
	ret |= fault_dirty_shared_page(vmf);
3250
	put_page(vmf->page);
3251

3252
	return ret;
3253 3254
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return handle_userfault(vmf, VM_UFFD_WP);
	}

	/*
	 * Userfaultfd write-protect can defer flushes. Ensure the TLB
	 * is flushed in this case before copying.
	 */
	if (unlikely(userfaultfd_wp(vmf->vma) &&
		     mm_tlb_flush_pending(vmf->vma->vm_mm)))
		flush_tlb_page(vmf->vma, vmf->address);

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(vmf->page)) {
		struct page *page = vmf->page;

		/* PageKsm() doesn't necessarily raise the page refcount */
		if (PageKsm(page) || page_count(page) != 1)
			goto copy;
		if (!trylock_page(page))
			goto copy;
		if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
			unlock_page(page);
			goto copy;
		}
		/*
		 * Ok, we've got the only map reference, and the only
		 * page count reference, and the page is locked,
		 * it's dark out, and we're wearing sunglasses. Hit it.
		 */
		unlock_page(page);
		wp_page_reuse(vmf);
		return VM_FAULT_WRITE;
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}
copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}

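/*
 * Zap one address range in a single vma; the caller has already clipped the
 * range to this vma.
 */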
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

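/*
 * Walk the interval tree of vmas mapping [first_index, last_index] in this
 * address_space and unmap the overlapping portion of each vma.
 */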
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    pgoff_t first_index,
					    pgoff_t last_index,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root, first_index, last_index) {
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = first_index;
		if (zba < vba)
			zba = vba;
		zea = last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct zap_details details = { };
	pgoff_t	first_index;
	pgoff_t	last_index;

	VM_BUG_ON(!folio_test_locked(folio));

	first_index = folio->index;
	last_index = folio->index + folio_nr_pages(folio) - 1;

	details.zap_mapping = mapping;
	details.single_folio = folio;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };
	pgoff_t	first_index = start;
	pgoff_t	last_index = start + nr - 1;

	details.zap_mapping = even_cows ? NULL : mapping;
	if (last_index < first_index)
		last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
		return VM_FAULT_RETRY;
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
				vma->vm_mm, vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				&vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
		restore_exclusive_pte(vma, page, vmf->address, vmf->pte);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	unlock_page(page);

	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *swapcache;
	struct swap_info_struct *si = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	vm_fault_t ret = 0;
	void *shadow = NULL;

	if (!pte_unmap_same(vmf))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_device_exclusive_entry(entry)) {
			vmf->page = pfn_swap_entry_to_page(entry);
			ret = remove_device_exclusive_entry(vmf);
		} else if (is_device_private_entry(entry)) {
			vmf->page = pfn_swap_entry_to_page(entry);
			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}

	/* Prevent swapoff from happening to us. */
	si = get_swap_device(entry);
	if (unlikely(!si))
		goto out;

	page = lookup_swap_cache(entry, vma, vmf->address);
	swapcache = page;

	if (!page) {
		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1) {
			/* skip swapcache */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
							vmf->address);
			if (page) {
				__SetPageLocked(page);
				__SetPageSwapBacked(page);

				if (mem_cgroup_swapin_charge_page(page,
					vma->vm_mm, GFP_KERNEL, entry)) {
					ret = VM_FAULT_OOM;
					goto out_page;
				}
				mem_cgroup_swapin_uncharge_swap(entry);

				shadow = get_shadow_from_swap_cache(entry);
				if (shadow)
					workingset_refault(page_folio(page),
								shadow);

				lru_cache_add(page);

				/* To provide entry to swap_readpage() */
				set_page_private(page, entry.val);
				swap_readpage(page, true);
				set_page_private(page, 0);
			}
		} else {
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vmf);
			swapcache = page;
		}

		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		goto out_release;
	}

	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely((!PageSwapCache(page) ||
			page_private(page) != entry.val)) && swapcache)
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, vmf->address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		vmf->flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte)) {
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);
	}
	vmf->orig_pte = pte;

	/* ksm created a completely new copy */
	if (unlikely(page != swapcache && swapcache)) {
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	} else {
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
	}

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

	swap_free(entry);
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	}

	if (vmf->flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(vmf);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	if (si)
		put_swap_device(si);
	return ret;
out_nomap:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache && swapcache) {
		unlock_page(swapcache);
		put_page(swapcache);
	}
	if (si)
		put_swap_device(si);
	return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	vm_fault_t ret = 0;
	pte_t entry;

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd))
		return VM_FAULT_OOM;

	/* See comment in handle_pte_fault() */
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
		return 0;

	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
						vma->vm_page_prot));
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (!pte_none(*vmf->pte)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
			goto unlock;
		}
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
	if (!page)
		goto oom;

	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
		goto oom_free_page;
	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	entry = mk_pte(page, vma->vm_page_prot);
	entry = pte_sw_mkyoung(entry);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (!pte_none(*vmf->pte)) {
		update_mmu_cache(vma, vmf->address, vmf->pte);
		goto release;
	}

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		put_page(page);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, vmf->address, false);
	lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
release:
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}

/*
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_one
	 *   shrink_page_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	if (unlikely(PageHWPoison(vmf->page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf->page);
		put_page(vmf->page);
		vmf->page = NULL;
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf->page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
	mm_inc_nr_ptes(vma->vm_mm);
	vmf->prealloc_pte = NULL;
}

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t entry;
	int i;
	vm_fault_t ret = VM_FAULT_FALLBACK;

	if (!transhuge_vma_suitable(vma, haddr))
		return ret;

	page = compound_head(page);
	if (compound_order(page) != HPAGE_PMD_ORDER)
		return ret;

	/*
	 * Just backoff if any subpage of a THP is corrupted otherwise
	 * the corrupted page may be mapped by PMD silently to escape the
	 * check.  This kind of THP just can be PTE mapped.  Access to
	 * the corrupted subpage should trigger SIGBUS as expected.
	 */
	if (unlikely(PageHasHWPoisoned(page)))
		return ret;

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry. Use the preallocated table for that.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
		goto out;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_icache_page(vma, page + i);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
	page_add_file_rmap(page, true);
	/*
	 * deposit and withdraw with pmd lock held
	 */
	if (arch_needs_pgtable_deposit())
		deposit_prealloc_pte(vmf);

	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

	update_mmu_cache_pmd(vma, haddr, vmf->pmd);

	/* fault is handled */
	ret = 0;
	count_vm_event(THP_FILE_MAPPED);
out:
	spin_unlock(vmf->ptl);
	return ret;
}
#else
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	return VM_FAULT_FALLBACK;
}
#endif

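/*
 * Install a PTE for @page at @addr in @vmf->vma: pick young/old and
 * dirty/write bits from the fault flags, then update the rmap and per-mm
 * counters for either an anonymous (COW) page or a file page.
 */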
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool prefault = vmf->address != addr;
	pte_t entry;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);

	if (prefault && arch_wants_old_prefaulted_pte())
		entry = pte_mkold(entry);
	else
		entry = pte_sw_mkyoung(entry);

	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, addr, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	} else {
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
		page_add_file_rmap(page, false);
	}
	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	vm_fault_t ret;

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vma->vm_flags & VM_SHARED)) {
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			return ret;
	}

	if (pmd_none(*vmf->pmd)) {
		if (PageTransCompound(page)) {
			ret = do_set_pmd(vmf, page);
			if (ret != VM_FAULT_FALLBACK)
				return ret;
		}

		if (vmf->prealloc_pte)
			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
			return VM_FAULT_OOM;
	}

	/* See comment in handle_pte_fault() */
	if (pmd_devmap_trans_unstable(vmf->pmd))
		return 0;

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				      vmf->address, &vmf->ptl);
	ret = 0;
	/* Re-check under ptl */
	if (likely(pte_none(*vmf->pte)))
		do_set_pte(vmf, page, vmf->address);
	else
		ret = VM_FAULT_NOPAGE;

	update_mmu_tlb(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}

static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
				   &fault_around_bytes_fops);
	return 0;
}
late_initcall(fault_around_debugfs);
#endif

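/*
 * Example of tuning the knob from userspace (assuming debugfs is mounted at
 * the conventional /sys/kernel/debug; values are rounded down to a power of
 * two by fault_around_bytes_set()):
 *
 *	# cat /sys/kernel/debug/fault_around_bytes
 *	65536
 *	# echo 16384 > /sys/kernel/debug/fault_around_bytes
 */
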
/*
 * do_fault_around() tries to map a few pages around the fault address. The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock protects only those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	address = max(address & mask, vmf->vma->vm_start);
	off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 *  end_pgoff is either the end of the page table, the end of
	 *  the vma or nr_pages from start_pgoff, depending on what is nearest.
	 */
	end_pgoff = start_pgoff -
		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}

	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}

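/*
 * A read fault on a file-backed vma: try to map the surrounding pages with
 * ->map_pages() first, and fall back to a full ->fault() call for this page.
 */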
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		if (likely(!userfaultfd_minor(vmf->vma))) {
			ret = do_fault_around(vmf);
			if (ret)
				return ret;
		}
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}

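/*
 * A write fault on a private (COW) file mapping: read the page in via
 * ->fault(), copy it into a freshly allocated anonymous page and map the
 * copy instead of the page cache page.
 */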
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
				GFP_KERNEL)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}
	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	put_page(vmf->cow_page);
	return ret;
}

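/*
 * A write fault on a shared, writable file mapping: read the page in via
 * ->fault(), give the filesystem a chance to prepare it for writing via
 * ->page_mkwrite(), then mark the page dirty.
 */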
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	ret |= fault_dirty_shared_page(vmf);
	return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by another thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *vm_mm = vma->vm_mm;
	vm_fault_t ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS
		 */
		if (unlikely(!pmd_present(*vmf->pmd)))
			ret = VM_FAULT_SIGBUS;
		else {
			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
						       vmf->pmd,
						       vmf->address,
						       &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of pte
			 * by holding ptl and checking again. A R/M/W update
			 * of pte involves: take ptl, clearing the pte so that
			 * we don't have concurrent modification by hardware
			 * followed by an update.
			 */
			if (unlikely(pte_none(*vmf->pte)))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
		      unsigned long addr, int page_nid, int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

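/*
 * Handle a fault on a PROT_NONE "NUMA hinting" pte: decide whether the page
 * should move to the node that is actually touching it, and either migrate
 * the page or simply make the pte present again.
 */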
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL;
	int page_nid = NUMA_NO_NODE;
	int last_cpupid;
	int target_nid;
	pte_t pte, old_pte;
	bool was_writable = pte_savedwrite(vmf->orig_pte);
	int flags = 0;

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 */
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

	/* Get the normal PTE  */
	old_pte = ptep_get(vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page)
		goto out_map;

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page))
		goto out_map;

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!was_writable)
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out_map;
	}
	pte_unmap_unlock(vmf->pte, vmf->ptl);

	/* Migrate to the requested node */
	if (migrate_misplaced_page(page, vma, target_nid)) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		spin_lock(vmf->ptl);
		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
out_map:
	/*
	 * Make it present again, depending on how arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	goto out;
}

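/*
 * Helpers that dispatch huge-PMD/PUD faults either to the anonymous THP code
 * or to the vma's ->huge_fault() handler, falling back to splitting the
 * mapping when the fault cannot be handled at that level.
 */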
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}

/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma)) {
		if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
			return handle_userfault(vmf, VM_UFFD_WP);
		return do_huge_pmd_wp_page(vmf);
	}
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}

	/* COW or write-notify handled on pte level: split pmd. */
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);

	return VM_FAULT_FALLBACK;
}

static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		goto split;
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	}
split:
	/* COW or write-notify not handled on PUD level: split pud. */
	__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		/*
		 * If a huge pmd materialized under us just retry later.  Use
		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
		 * of pmd_trans_huge() to ensure the pmd didn't become
		 * pmd_trans_huge under us and then back to pmd_none, as a
		 * result of MADV_DONTNEED running immediately after a huge pmd
		 * fault in a different thread of this mm, in turn leading to a
		 * misleading pmd_trans_huge() retval. All we have to ensure is
		 * that it is a regular pmd that we can walk with
		 * pte_offset_map() and we can do that through an atomic read
		 * in C, which is what pmd_trans_unstable() provides.
		 */
		if (pmd_devmap_trans_unstable(vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_lock read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		vmf->orig_pte = *vmf->pte;

		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held. So here a barrier will do.
		 */
		barrier();
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
retry_pud:
	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

			if (dirty && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;

	/* Huge pud page fault raced with pmd_alloc? */
	if (pud_trans_unstable(vmf.pud))
		goto retry_pud;

	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		vmf.orig_pmd = *vmf.pmd;

		barrier();
		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					  !is_pmd_migration_entry(vmf.orig_pmd));
			if (is_pmd_migration_entry(vmf.orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
				return do_huge_pmd_numa_page(&vmf);

			if (dirty && !pmd_write(vmf.orig_pmd)) {
				ret = wp_huge_pmd(&vmf);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(&vmf);
				return 0;
			}
		}
	}

	return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct pt_regs *regs,
				    unsigned long address, unsigned int flags,
				    vm_fault_t ret)
{
	bool major;

	/*
	 * We don't do accounting for some specific faults:
	 *
	 * - Unsuccessful faults (e.g. when the address wasn't valid).  That
	 *   includes arch_vma_access_permitted() failing before reaching here.
	 *   So this is not a "this many hardware page faults" counter.  We
	 *   should use the hw profiling for that.
	 *
	 * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
	 *   once they're completed.
	 */
	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
		return;

	/*
	 * We define the fault as a major fault when the final successful fault
	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
	 * handle it immediately previously).
	 */
	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

	if (major)
		current->maj_flt++;
	else
		current->min_flt++;

	/*
	 * If the fault is done for GUP, regs will be NULL.  We only do the
	 * accounting for the per-thread fault counters of the task that
	 * triggered the fault, and we skip the perf event updates.
	 */
	if (!regs)
		return;

	if (major)
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	else
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
			   unsigned int flags, struct pt_regs *regs)
{
	vm_fault_t ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	count_memcg_event_mm(vma->vm_mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	mm_account_fault(regs, address, flags, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
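
/*
 * Illustrative sketch (not part of this file) of how an architecture's
 * page-fault handler typically drives handle_mm_fault(): take the
 * mmap_lock for read, look up the VMA, and retry once if the first
 * attempt returned VM_FAULT_RETRY (in which case the core mm has
 * already dropped the lock for us).  Error handling, signal checks and
 * per-arch permission checks are omitted; "do_page_fault" is a
 * placeholder for the arch-specific entry point.
 *
 *	static void do_page_fault(struct pt_regs *regs, unsigned long address,
 *				  bool is_write)
 *	{
 *		struct mm_struct *mm = current->mm;
 *		unsigned int flags = FAULT_FLAG_DEFAULT;
 *		struct vm_area_struct *vma;
 *		vm_fault_t fault;
 *
 *		if (is_write)
 *			flags |= FAULT_FLAG_WRITE;
 *	retry:
 *		mmap_read_lock(mm);
 *		vma = find_vma(mm, address);
 *		if (!vma || vma->vm_start > address) {
 *			mmap_read_unlock(mm);
 *			return;		(a real handler would raise SIGSEGV)
 *		}
 *
 *		fault = handle_mm_fault(vma, address, flags, regs);
 *		if (fault & VM_FAULT_RETRY) {
 *			flags |= FAULT_FLAG_TRIED;
 *			goto retry;	(mmap_lock was released by the core mm)
 *		}
 *		mmap_read_unlock(mm);
 *	}
 */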

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd)) {	/* Another has populated it */
		p4d_free(mm, new);
	} else {
		smp_wmb(); /* See comment in pmd_install() */
		pgd_populate(mm, pgd, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		smp_wmb(); /* See comment in pmd_install() */
		p4d_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	ptl = pud_lock(mm, pud);
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		smp_wmb(); /* See comment in pmd_install() */
		pud_populate(mm, pud, new);
	} else {	/* Another has populated it */
		pmd_free(mm, new);
	}
	spin_unlock(ptl);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
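
/*
 * Illustrative sketch (not part of this file): the slow paths above are
 * normally reached through the p4d_alloc()/pud_alloc()/pmd_alloc()
 * helpers, which only fall back to __p4d_alloc()/__pud_alloc()/
 * __pmd_alloc() when the corresponding entry is still empty.  A walk
 * that allocates down to the PMD level, similar to what
 * __handle_mm_fault() does, looks roughly like this ("alloc_pmd_for" is
 * a hypothetical helper; mm and address come from the caller, with the
 * mmap_lock held):
 *
 *	static pmd_t *alloc_pmd_for(struct mm_struct *mm, unsigned long address)
 *	{
 *		pgd_t *pgd = pgd_offset(mm, address);
 *		p4d_t *p4d = p4d_alloc(mm, pgd, address);
 *		pud_t *pud;
 *
 *		if (!p4d)
 *			return NULL;
 *		pud = pud_alloc(mm, p4d, address);
 *		if (!pud)
 *			return NULL;
 *		return pmd_alloc(mm, pud, address);
 *	}
 */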

int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
			  struct mmu_notifier_range *range, pte_t **ptepp,
			  pmd_t **pmdpp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));

	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

		if (range) {
			mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
						NULL, mm, address & PMD_MASK,
						(address & PMD_MASK) + PMD_SIZE);
			mmu_notifier_invalidate_range_start(range);
		}
		*ptlp = pmd_lock(mm, pmd);
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
		if (range)
			mmu_notifier_invalidate_range_end(range);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	if (range) {
		mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
					address & PAGE_MASK,
					(address & PAGE_MASK) + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(range);
	}
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
	if (range)
		mmu_notifier_invalidate_range_end(range);
out:
	return -EINVAL;
}

/**
 * follow_pte - look up PTE at a user virtual address
 * @mm: the mm_struct of the target address space
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 * The contents of the PTE are only stable until @ptlp is released;
 * any further use, if any, must be protected against invalidation
 * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 * it is not a good general-purpose API.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct mm_struct *mm, unsigned long address,
	       pte_t **ptepp, spinlock_t **ptlp)
{
	return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
EXPORT_SYMBOL_GPL(follow_pte);
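
/*
 * Illustrative sketch (not part of this file) of the calling convention
 * documented above: the PTE may only be inspected while the returned
 * lock is held, and pte_unmap_unlock() must be called before the values
 * read from it are used without further synchronization.
 * "read_pfn_and_writable" is a hypothetical helper; mm and address come
 * from the caller, which must hold the mmap_lock for read:
 *
 *	static int read_pfn_and_writable(struct mm_struct *mm,
 *					 unsigned long address,
 *					 unsigned long *pfn, bool *writable)
 *	{
 *		spinlock_t *ptl;
 *		pte_t *ptep;
 *		int ret;
 *
 *		ret = follow_pte(mm, address, &ptep, &ptl);
 *		if (ret)
 *			return ret;
 *		*pfn = pte_pfn(*ptep);
 *		*writable = pte_write(*ptep);
 *		pte_unmap_unlock(ptep, ptl);
 *		return 0;
 *	}
 */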

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * This function does not allow the caller to read the permissions
 * of the PTE.  Do not use it.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	int offset = offset_in_page(addr);
	int ret = -EINVAL;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

retry:
	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
		return -EINVAL;
	pte = *ptep;
	pte_unmap_unlock(ptep, ptl);

	prot = pgprot_val(pte_pgprot(pte));
	phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	if ((write & FOLL_WRITE) && !pte_write(pte))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
		goto out_unmap;

	if (!pte_same(pte, *ptep)) {
		pte_unmap_unlock(ptep, ptl);
		iounmap(maddr);

		goto retry;
	}

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	ret = len;
	pte_unmap_unlock(ptep, ptl);
out_unmap:
	iounmap(maddr);

	return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
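
/*
 * Illustrative sketch (not part of this file): a driver that mmap()s
 * MMIO with remap_pfn_range() can make its mapping visible to
 * access_process_vm()/ptrace by pointing its vm_operations_struct
 * .access callback at generic_access_phys().  "foo_mmap", "foo_vm_ops"
 * and "foo_phys_pfn" are hypothetical driver names:
 *
 *	static const struct vm_operations_struct foo_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_ops = &foo_vm_ops;
 *		return remap_pfn_range(vma, vma->vm_start, foo_phys_pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */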

/*
 * Access another process' address space as given in mm.
 */
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
		       int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;
	int write = gup_flags & FOLL_WRITE;

	if (mmap_read_lock_killable(mm))
		return 0;

	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(mm, addr, 1,
				gup_flags, &page, &vma, NULL);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = vma_lookup(mm, addr);
			if (!vma)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	mmap_read_unlock(mm);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * The source/target buffer must be in kernel space.
 * Do not walk the page tables directly; use get_user_pages().
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
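
/*
 * Illustrative sketch (not part of this file): this is essentially the
 * interface the ptrace() peek/poke paths are built on.  Reading one word
 * from another task's address space looks roughly like this
 * ("peek_remote_word" is a hypothetical helper; tsk and addr come from
 * the caller):
 *
 *	static int peek_remote_word(struct task_struct *tsk, unsigned long addr,
 *				    unsigned long *val)
 *	{
 *		int copied;
 *
 *		copied = access_process_vm(tsk, addr, val, sizeof(*val),
 *					   FOLL_FORCE);
 *		return copied == sizeof(*val) ? 0 : -EIO;
 *	}
 */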

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * We might be running from an atomic context, so we cannot sleep.
	 */
	if (!mmap_read_trylock(mm))
		return;

	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_NOWAIT);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_lock.  This is safe because kernel memory doesn't
	 * get paged out, so we will never actually fault; the annotations
	 * below would only generate false positives for it.
	 */
	if (uaccess_kernel())
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
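
/*
 * Illustrative note (not part of this file): might_fault() expands to
 * __might_fault() when CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP
 * is enabled, and the uaccess primitives call it on entry.  That is how
 * a buggy pattern like the sketch below gets flagged even if the copy
 * happens to hit resident pages and never actually faults ("lock",
 * "err", "dst", "ubuf" and "len" are placeholders):
 *
 *	spin_lock(&lock);
 *	err = copy_from_user(dst, ubuf, len);	<- flagged by __might_fault():
 *	spin_unlock(&lock);			   may sleep and take the
 *						   mmap_lock while atomic
 */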

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline void process_huge_page(
	unsigned long addr_hint, unsigned int pages_per_huge_page,
	void (*process_subpage)(unsigned long addr, int idx, void *arg),
	void *arg)
{
	int i, n, base, l;
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	/* Process target subpage last to keep its cache lines hot */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= pages_per_huge_page) {
		/* If target subpage in first half of huge page */
		base = 0;
		l = n;
		/* Process subpages at the end of huge page */
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	} else {
		/* If target subpage in second half of huge page */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		/* Process subpages at the beginning of huge page */
		for (i = 0; i < base; i++) {
			cond_resched();
			process_subpage(addr + i * PAGE_SIZE, i, arg);
		}
	}
	/*
	 * Process remaining subpages in left-right-left-right pattern
	 * towards the target subpage
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
		cond_resched();
		process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
	}
}
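
/*
 * Worked example (not part of this file), assuming pages_per_huge_page
 * is 8 and the faulting address lies in subpage 2: the tail loop above
 * touches subpages 7, 6, 5, 4 first, and the left-right loop then
 * touches 0, 3, 1, 2, so the target subpage (2) is written last and its
 * cache lines are the most likely to still be hot when the faulting
 * access finally runs.
 */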

static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}

static void clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct page *page = arg;

	clear_user_highpage(page + idx, addr);
}

void clear_huge_page(struct page *page,
		     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

struct copy_subpage_arg {
	struct page *dst;
	struct page *src;
	struct vm_area_struct *vma;
};

static void copy_subpage(unsigned long addr, int idx, void *arg)
{
	struct copy_subpage_arg *copy_arg = arg;

	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
			   addr, copy_arg->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr_hint, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
	struct copy_subpage_arg arg = {
		.dst = dst,
		.src = src,
		.vma = vma,
	};

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *page_kaddr;
	unsigned long i, rc = 0;
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
	struct page *subpage = dst_page;

	for (i = 0; i < pages_per_huge_page;
	     i++, subpage = mem_map_next(subpage, dst_page, i)) {
		if (allow_pagefault)
			page_kaddr = kmap(subpage);
		else
			page_kaddr = kmap_atomic(subpage);
		rc = copy_from_user(page_kaddr,
				usr_src + i * PAGE_SIZE, PAGE_SIZE);
		if (allow_pagefault)
			kunmap(subpage);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;

		flush_dcache_page(subpage);

		cond_resched();
	}
	return ret_val;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
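
/*
 * Illustrative sketch (not part of this file): with split PTE locks the
 * per-page-table spinlock allocated above is what pte_offset_map_lock()
 * hands back, so ordinary callers never touch page->ptl directly.  The
 * usual pattern, with mm, pmd and address assumed to come from a prior
 * page-table walk:
 *
 *	spinlock_t *ptl;
 *	pte_t *ptep;
 *	unsigned long pfn = 0;
 *
 *	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 *	if (pte_present(*ptep))
 *		pfn = pte_pfn(*ptep);
 *	pte_unmap_unlock(ptep, ptl);
 */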