// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;

static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
	return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
	       !inode_is_open_for_write(vma->vm_file->f_inode) &&
	       (vma->vm_flags & VM_EXEC);
}

bool transparent_hugepage_active(struct vm_area_struct *vma)
{
	/* The addr is used to check if the vma size fits */
	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;

	if (!transhuge_vma_suitable(vma, addr))
		return false;
	if (vma_is_anonymous(vma))
		return __transparent_hugepage_enabled(vma);
	if (vma_is_shmem(vma))
		return shmem_huge_enabled(vma);
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
		return file_thp_enabled(vma);

	return false;
}

static bool get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}
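
/*
 * Summary of the huge zero page refcounting used above and by the shrinker
 * below: get_huge_zero_page() leaves the count biased by one extra reference
 * that is owned by the shrinker, so mm users never free the page themselves.
 * Only shrink_huge_zero_page_scan() drops it, and only once the count has
 * fallen back to 1 (i.e. no mm still holds a reference).
 */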

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
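
/*
 * Example of the resulting admin interface (output format as emitted by
 * enabled_show() above, with the current mode bracketed):
 *
 *   $ cat /sys/kernel/mm/transparent_hugepage/enabled
 *   always [madvise] never
 *   # echo never > /sys/kernel/mm/transparent_hugepage/enabled
 */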

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);
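
/*
 * Example (output format as emitted by defrag_show() above):
 *
 *   $ cat /sys/kernel/mm/transparent_hugepage/defrag
 *   always defer defer+madvise [madvise] never
 *   # echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
 */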

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);
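
/*
 * hpage_pmd_size reports HPAGE_PMD_SIZE in bytes, e.g. 2097152 (2 MiB) on
 * x86-64 with 4 KiB base pages.
 */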

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		/*
		 * Hardware doesn't support hugepages, hence disable
		 * DAX PMD support.
		 */
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
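	/* 512 << (20 - PAGE_SHIFT) pages is 512 MiB worth of memory for any base page size. */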
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
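
/*
 * The same three modes can also be selected at boot, e.g. by appending
 * "transparent_hugepage=madvise" to the kernel command line (handled by the
 * __setup() hook above).
 */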

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct mem_cgroup *memcg = page_memcg(compound_head(page));
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	return &pgdat->deferred_split_queue;
}
#endif

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

static inline bool is_transparent_hugepage(struct page *page)
{
	if (!PageCompound(page))
		return false;

	page = compound_head(page);
	return is_huge_zero_page(page) ||
	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
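
/*
 * __thp_get_unmapped_area() below pads the requested length and then nudges
 * the returned address so that (addr & (size - 1)) == (off & (size - 1)).
 * Illustrative example with size == 2 MiB: if the file offset's low bits are
 * 0x1ff000, the chosen mapping address also ends in 0x1ff000, so page cache
 * THPs covering that range can be mapped with PMD entries.
 */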

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
					      off >> PAGE_SHIFT, flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	ret += (off - ret) & (size - 1);
	return ret;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
	if (ret)
		return ret;

	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr);
		lru_cache_add_inactive_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	put_page(page);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}
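
/*
 * For example, with "defrag" set to "madvise", a VM_HUGEPAGE (madvised) vma
 * gets GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM from the helper above,
 * while any other vma gets plain GFP_TRANSHUGE_LIGHT.
 */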

/* Caller must hold page table lock. */
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @pgprot: page protection to use
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
 * also consult the vmf_insert_mixed_prot() documentation when
 * @pgprot != @vmf->vma->vm_page_prot.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
				   pgprot_t pgprot, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud_prot - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @pgprot: page protection to use
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
 * also consult the vmf_insert_mixed_prot() documentation when
 * @pgprot != @vmf->vma->vm_page_prot.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
				   pgprot_t pgprot, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (flags & FOLL_WRITE)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd, flags & FOLL_WRITE))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	/*
	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
	 * not be in this function with `flags & FOLL_COW` set.
	 */
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return NULL;

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	if (!try_grab_page(page, flags))
		page = ERR_PTR(-ENOMEM);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		mm_get_huge_zero_page(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);

	get_page(src_page);
	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs. */
		put_page(src_page);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (flags & FOLL_WRITE)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				pud, _pud, flags & FOLL_WRITE))
		update_mmu_cache_pud(vma, addr, pud);
}

struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pud_pfn(*pud);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	assert_spin_locked(pud_lockptr(mm, pud));

	if (flags & FOLL_WRITE && !pud_write(*pud))
		return NULL;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return NULL;

	if (pud_present(*pud) && pud_devmap(*pud))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pud(vma, addr, pud, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 *
	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	if (!try_grab_page(page, flags))
		page = ERR_PTR(-ENOMEM);

	return page;
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only pud to
	 * a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
	 * and split if duplicating fails.
	 */
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	pud_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	entry = pud_mkyoung(orig_pud);
	if (write)
		entry = pud_mkdirty(entry);
	haddr = vmf->address & HPAGE_PUD_MASK;
	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);

unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	pmd_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	if (write)
		entry = pmd_mkdirty(entry);
	haddr = vmf->address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);

unlock:
	spin_unlock(vmf->ptl);
}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
	VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!trylock_page(page)) {
		get_page(page);
		spin_unlock(vmf->ptl);
		lock_page(page);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			unlock_page(page);
			put_page(page);
			return 0;
		}
		put_page(page);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		unlock_page(page);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the page exclusively if there are
	 * no additional references. Note that we always drain the LRU
	 * pagevecs immediately after adding a THP.
	 */
	if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
		goto unlock_fallback;
	if (PageSwapCache(page))
		try_to_free_swap(page);
	if (page_count(page) == 1) {
		pmd_t entry;

		page_move_anon_rmap(page, vma);
		unlock_page(page);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return VM_FAULT_WRITE;
	}

unlock_fallback:
	unlock_page(page);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

/*
 * FOLL_FORCE can write to even unwritable pmd's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
	return pmd_write(pmd) ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}

struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
		goto out;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	/* Full NUMA hinting faults to serialise migration in fault paths */
	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		goto out;

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);

	if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
		return ERR_PTR(-EMLINK);

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			!PageAnonExclusive(page), page);

	if (!try_grab_page(page, flags))
		return ERR_PTR(-ENOMEM);

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);

	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);

out:
	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int page_nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = -1;
	bool migrated = false;
	bool was_writable = pmd_savedwrite(oldpmd);
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	page = vm_normal_page_pmd(vma, haddr, pmd);
	if (!page)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!was_writable)
		flags |= TNF_NO_GROUP;

	page_nid = page_to_nid(page);
	last_cpupid = page_cpupid_last(page);
	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
				       &flags);

	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out_map;
	}

	spin_unlock(vmf->ptl);

	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
				flags);

	return 0;

out_map:
	/* Restore the PMD */
	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (was_writable)
		pmd = pmd_mkwrite(pmd);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);
	goto out;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct page *page;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	page = pmd_page(orig_pmd);
	/*
	 * If other processes are mapping this page, we couldn't discard
	 * the page unless they all do MADV_FREE so let's skip the page.
	 */
	if (total_mapcount(page) != 1)
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * If user want to discard part-pages of THP, split it so MADV_FREE
	 * will deactivate only them.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		get_page(page);
		spin_unlock(ptl);
		split_huge_page(page);
		unlock_page(page);
		put_page(page);
		goto out_unlocked;
	}

	if (PageDirty(page))
		ClearPageDirty(page);
	unlock_page(page);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	mark_page_lazyfree(page);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct page *page = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			page = pmd_page(orig_pmd);
			page_remove_rmap(page, vma, true);
			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			page = pfn_swap_entry_to_page(entry);
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (PageAnon(page)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
	}
	return 1;
}

1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644
#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

1656
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1657
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
1658
{
1659
	spinlock_t *old_ptl, *new_ptl;
1660 1661
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
1662
	bool force_flush = false;
1663 1664 1665 1666 1667 1668 1669

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have release it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
1670
		return false;
1671 1672
	}

1673 1674
	/*
	 * We don't have to worry about the ordering of src and dst
1675
	 * ptlocks because exclusive mmap_lock prevents deadlock.
1676
	 */
1677 1678
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
1679 1680 1681
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1682
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1683
		if (pmd_present(pmd))
1684
			force_flush = true;
1685
		VM_BUG_ON(!pmd_none(*new_pmd));
1686

1687
		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1688
			pgtable_t pgtable;
1689 1690 1691
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
1692 1693
		pmd = move_soft_dirty_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
1694 1695
		if (force_flush)
			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1696 1697
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
1698
		spin_unlock(old_ptl);
1699
		return true;
1700
	}
1701
	return false;
1702 1703
}

1704 1705 1706
/*
 * Returns
 *  - 0 if PMD could not be locked
I
Ingo Molnar 已提交
1707
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1708
 *      or if prot_numa but THP migration is not supported
I
Ingo Molnar 已提交
1709
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1710
 */
N
Nadav Amit 已提交
1711 1712 1713
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool preserve_write;
	int ret;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

	preserve_write = prot_numa && pmd_write(*pmd);
	ret = 1;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);
		struct page *page = pfn_swap_entry_to_page(entry);

		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
		if (is_writable_migration_entry(entry)) {
			pmd_t newpmd;
			/*
			 * A protection check is difficult so
			 * just be safe and disable write
			 */
			if (PageAnon(page))
				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
			else
				entry = make_readable_migration_entry(swp_offset(entry));
			newpmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*pmd))
				newpmd = pmd_swp_mksoft_dirty(newpmd);
			if (pmd_swp_uffd_wp(*pmd))
				newpmd = pmd_swp_mkuffd_wp(newpmd);
			set_pmd_at(mm, addr, pmd, newpmd);
		}
		goto unlock;
	}
#endif

	if (prot_numa) {
		struct page *page;
		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		if (pmd_protnone(*pmd))
			goto unlock;

		page = pmd_page(*pmd);
		/*
		 * Skip scanning top tier node if normal numa
		 * balancing is disabled
		 */
		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
		    node_is_toptier(page_to_nid(page)))
			goto unlock;
	}
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (preserve_write)
		entry = pmd_mk_savedwrite(entry);
	if (uffd_wp) {
		entry = pmd_wrprotect(entry);
		entry = pmd_mkuffd_wp(entry);
	} else if (uffd_wp_resolve) {
		/*
		 * Leave the write bit to be handled by PF interrupt
		 * handler, then things like COW could be properly
		 * handled.
		 */
		entry = pmd_clear_uffd_wp(entry);
	}
	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);

	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

/*
 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns true, this routine returns without unlocking page
 * table lock. So callers must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
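/*
 * Huge PUD mappings only exist for special/devmap VMAs (e.g. DAX); there is
 * no anonymous PUD THP support, which is why zap_huge_pud() below BUG()s on
 * anything that is not a special huge mapping.
 */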
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{
	spinlock_t *ptl;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pudp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pudp related
	 * operations.
	 */
	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
	tlb_remove_pud_tlb_entry(tlb, pud, addr);
	if (vma_is_special_huge(vma)) {
		spin_unlock(ptl);
		/* No zero page support yet */
	} else {
		/* No support for anonymous PUD pages yet */
		BUG();
	}
	return 1;
}

static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{
	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

	count_vm_event(THP_SPLIT_PUD);

	pudp_huge_clear_flush_notify(vma, haddr, pud);
}

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address & HPAGE_PUD_MASK,
				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pud_lock(vma->vm_mm, pud);
	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
		goto out;
	__split_huge_pud_locked(vma, pud, range.start);

out:
	spin_unlock(ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above pudp_huge_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

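/*
 * Split a huge zero-page pmd in place: the pmd is cleared and repopulated as
 * a page table whose PTEs all map the small zero page as special entries, so
 * userspace observes no functional change.
 */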
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd;
	int i;

	/*
	 * Leave pmd empty until pte is filled. Note that it is fine to delay
	 * notification until mmu_notifier_invalidate_range_end() as we are
	 * replacing a zero pmd write protected page with a zero pte write
	 * protected page.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	pmdp_huge_clear_flush(vma, haddr, pmd);

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

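/*
 * Split one huge pmd in place: the pmd is replaced by a page table mapping
 * the same range with PTEs.  For anonymous THPs with "freeze" set, the new
 * PTEs are written as migration entries so the caller can go on to split
 * the compound page itself.
 */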
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long haddr, bool freeze)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	pgtable_t pgtable;
	pmd_t old_pmd, _pmd;
	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
	bool anon_exclusive = false;
	unsigned long addr;
	int i;

	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
				&& !pmd_devmap(*pmd));

	count_vm_event(THP_SPLIT_PMD);

	if (!vma_is_anonymous(vma)) {
		old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		/*
		 * We are going to unmap this huge page. So
		 * just go ahead and zap it
		 */
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);
		if (vma_is_special_huge(vma))
			return;
		if (unlikely(is_pmd_migration_entry(old_pmd))) {
			swp_entry_t entry;

			entry = pmd_to_swp_entry(old_pmd);
			page = pfn_swap_entry_to_page(entry);
		} else {
			page = pmd_page(old_pmd);
			if (!PageDirty(page) && pmd_dirty(old_pmd))
				set_page_dirty(page);
			if (!PageReferenced(page) && pmd_young(old_pmd))
				SetPageReferenced(page);
			page_remove_rmap(page, vma, true);
			put_page(page);
		}
		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
		return;
	}

	if (is_huge_zero_pmd(*pmd)) {
		/*
		 * FIXME: Do we want to invalidate secondary mmu by calling
		 * mmu_notifier_invalidate_range() see comments below inside
		 * __split_huge_pmd() ?
		 *
		 * We are going from a zero huge page write protected to zero
		 * small page also write protected so it does not seem useful
		 * to invalidate secondary mmu at this time.
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	/*
	 * Up to this point the pmd is present and huge and userland has the
	 * whole access to the hugepage during the split (which happens in
	 * place). If we overwrite the pmd with the not-huge version pointing
	 * to the pte here (which of course we could if all CPUs were bug
	 * free), userland could trigger a small page size TLB miss on the
	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPU doesn't like that.
	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
	 * 383 on page 105. Intel should be safe but it also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLB is identical (which should be the case here).
	 * But it is generally safer to never allow small and huge TLB entries
	 * for the same virtual address to be loaded simultaneously. So instead
	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
	 * current pmd notpresent (atomically because here the pmd_trans_huge
	 * must remain set at all times on the pmd until the split is complete
	 * for this pmd), then we flush the SMP TLB and finally we write the
	 * non-huge version of the pmd entry with pmd_populate.
	 */
	old_pmd = pmdp_invalidate(vma, haddr, pmd);

	pmd_migration = is_pmd_migration_entry(old_pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = false;
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		page = pmd_page(old_pmd);
		if (pmd_dirty(old_pmd))
			SetPageDirty(page);
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_BUG_ON_PAGE(!page_count(page), page);
		page_ref_add(page, HPAGE_PMD_NR - 1);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 */
		anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
			freeze = false;
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
	 * This is critical for some architectures (Power).
	 */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		/*
		 * Note that NUMA hinting access restrictions are not
		 * transferred to avoid any possibility of altering
		 * permissions across VMAs.
		 */
		if (freeze || pmd_migration) {
			swp_entry_t swp_entry;
			if (write)
				swp_entry = make_writable_migration_entry(
							page_to_pfn(page + i));
			else if (anon_exclusive)
				swp_entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page + i));
			else
				swp_entry = make_readable_migration_entry(
							page_to_pfn(page + i));
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);
		} else {
			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
			entry = maybe_mkwrite(entry, vma);
			if (anon_exclusive)
				SetPageAnonExclusive(page + i);
			if (!write)
				entry = pte_wrprotect(entry);
			if (!young)
				entry = pte_mkold(entry);
			if (soft_dirty)
				entry = pte_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_mkuffd_wp(entry);
		}
		pte = pte_offset_map(&_pmd, addr);
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, entry);
		if (!pmd_migration)
			atomic_inc(&page[i]._mapcount);
		pte_unmap(pte);
	}

	if (!pmd_migration) {
		/*
		 * Set PG_double_map before dropping compound_mapcount to avoid
		 * false-negative page_mapped().
		 */
		if (compound_mapcount(page) > 1 &&
		    !TestSetPageDoubleMap(page)) {
			for (i = 0; i < HPAGE_PMD_NR; i++)
				atomic_inc(&page[i]._mapcount);
		}

		lock_page_memcg(page);
		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
			/* Last compound_mapcount is gone. */
			__mod_lruvec_page_state(page, NR_ANON_THPS,
						-HPAGE_PMD_NR);
			if (TestClearPageDoubleMap(page)) {
				/* No need in mapcount reference anymore */
				for (i = 0; i < HPAGE_PMD_NR; i++)
					atomic_dec(&page[i]._mapcount);
			}
		}
		unlock_page_memcg(page);

		/* Above is effectively page_remove_rmap(page, vma, true) */
		munlock_vma_page(page, vma, true);
	}

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);

	if (freeze) {
		for (i = 0; i < HPAGE_PMD_NR; i++) {
			page_remove_rmap(page + i, vma, false);
			put_page(page + i);
		}
	}
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);

	/*
	 * If caller asks to setup a migration entry, we need a folio to check
	 * pmd against. Otherwise we can end up replacing wrong folio.
	 */
	VM_BUG_ON(freeze && !folio);
	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd)) {
		if (folio && folio != page_folio(pmd_page(*pmd)))
			goto out;
		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
	}

out:
	spin_unlock(ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback.
	 * There are 3 cases to consider inside __split_huge_pmd_locked():
	 *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously
	 *  2) __split_huge_zero_page_pmd() maps the read-only zero page and any
	 *    write fault will trigger a flush_notify before pointing to a new
	 *    page (it is fine if the secondary mmu keeps pointing to the old
	 *    zero page in the meantime)
	 *  3) Split a huge pmd into pte pointing to the same page. No need
	 *     to invalidate secondary tlb entry; they are all still valid.
	 *     Any further changes to individual pte will notify. So no need
	 *     to call mmu_notifier->invalidate_range()
	 */
	mmu_notifier_invalidate_range_only_end(&range);
}

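/*
 * Walk the page tables for @address and, if a mapped pmd is found, split it
 * via __split_huge_pmd().  Missing pgd/p4d/pud levels simply mean there is
 * nothing to split.
 */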
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct folio *folio)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, address);

	__split_huge_pmd(vma, pmd, address, freeze, folio);
}

static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * If the new address isn't hpage aligned and it could previously
	 * contain a hugepage: check if we need to split a huge pmd.
	 */
	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
			 ALIGN(address, HPAGE_PMD_SIZE)))
		split_huge_pmd_address(vma, address, false, NULL);
}

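/*
 * Called when a VMA's boundaries are being adjusted: any huge pmd that the
 * new start, the new end, or the following VMA's shifted start would cut
 * through is split first, so no huge pmd ends up straddling a VMA boundary.
 */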
void vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/*
	 * If we're also updating the vma->vm_next->vm_start,
	 * check if we need to split it.
	 */
	if (adjust_next > 0) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long nstart = next->vm_start;
		nstart += adjust_next;
		split_huge_pmd_if_needed(next, nstart);
	}
}

static void unmap_page(struct page *page)
{
	struct folio *folio = page_folio(page);
	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
		TTU_SYNC;

	VM_BUG_ON_PAGE(!PageHead(page), page);

	/*
	 * Anon pages need migration entries to preserve them, but file
	 * pages can simply be left unmapped, then faulted back on demand.
	 * If that is ever changed (perhaps for mlock), update remap_page().
	 */
	if (folio_test_anon(folio))
		try_to_migrate(folio, ttu_flags);
	else
		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
}

static void remap_page(struct folio *folio, unsigned long nr)
{
	int i = 0;

	/* If unmap_page() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;
	for (;;) {
		remove_migration_ptes(folio, folio, true);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		folio = folio_next(folio);
	}
}

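/*
 * Place a freshly split tail page either on the private @list that page
 * reclaim passed in, or on the LRU next to its head page (whose refcount is
 * still frozen at this point).
 */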
static void lru_add_page_tail(struct page *head, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_PAGE(!PageHead(head), head);
	VM_BUG_ON_PAGE(PageCompound(tail), head);
	VM_BUG_ON_PAGE(PageLRU(tail), head);
	lockdep_assert_held(&lruvec->lru_lock);

	if (list) {
		/* page reclaim is reclaiming a huge page */
		VM_WARN_ON(PageLRU(head));
		get_page(tail);
		list_add_tail(&tail->lru, list);
	} else {
		/* head is still on lru (and we have it frozen) */
		VM_WARN_ON(!PageLRU(head));
		if (PageUnevictable(tail))
			tail->mlock_count = 0;
		else
			list_add_tail(&tail->lru, &head->lru);
		SetPageLRU(tail);
	}
}

static void __split_huge_page_tail(struct page *head, int tail,
		struct lruvec *lruvec, struct list_head *list)
{
	struct page *page_tail = head + tail;

	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

	/*
	 * Clone page flags before unfreezing refcount.
	 *
	 * After successful get_page_unless_zero() might follow flags change,
	 * for example lock_page() which set PG_waiters.
	 *
	 * Note that for mapped sub-pages of an anonymous THP,
	 * PG_anon_exclusive has been cleared in unmap_page() and is stored in
	 * the migration entry instead from where remap_page() will restore it.
	 * We can still have PG_anon_exclusive set on effectively unmapped and
	 * unreferenced sub-pages of an anonymous THP: we can simply drop
	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
	 */
	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	page_tail->flags |= (head->flags &
			((1L << PG_referenced) |
			 (1L << PG_swapbacked) |
			 (1L << PG_swapcache) |
			 (1L << PG_mlocked) |
			 (1L << PG_uptodate) |
			 (1L << PG_active) |
			 (1L << PG_workingset) |
			 (1L << PG_locked) |
			 (1L << PG_unevictable) |
#ifdef CONFIG_64BIT
			 (1L << PG_arch_2) |
#endif
			 (1L << PG_dirty)));

	/* ->mapping in first tail page is compound_mapcount */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
			page_tail);
	page_tail->mapping = head->mapping;
	page_tail->index = head->index + tail;

	/* Page flags must be visible before we make the page non-compound. */
	smp_wmb();

	/*
	 * Clear PageTail before unfreezing page refcount.
	 *
	 * After successful get_page_unless_zero() might follow put_page()
	 * which needs correct compound_head().
	 */
	clear_compound_head(page_tail);

	/* Finally unfreeze refcount. Additional reference from page cache. */
	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
					  PageSwapCache(head)));

	if (page_is_young(head))
		set_page_young(page_tail);
	if (page_is_idle(head))
		set_page_idle(page_tail);

	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));

	/*
	 * always add to the tail because some iterators expect new
	 * pages to show after the currently processed elements - e.g.
	 * migrate_pages
	 */
	lru_add_page_tail(head, page_tail, lruvec, list);
}

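/*
 * Do the actual split with the head page's refcount frozen and interrupts
 * disabled: create the tail pages, fix up the page cache or swap cache
 * slots, drop pages now beyond i_size, then unfreeze and remap.
 */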
static void __split_huge_page(struct page *page, struct list_head *list,
		pgoff_t end)
{
	struct folio *folio = page_folio(page);
	struct page *head = &folio->page;
	struct lruvec *lruvec;
	struct address_space *swap_cache = NULL;
	unsigned long offset = 0;
	unsigned int nr = thp_nr_pages(head);
	int i;

	/* complete memcg work before adding pages to LRU */
	split_page_memcg(head, nr);

	if (PageAnon(head) && PageSwapCache(head)) {
		swp_entry_t entry = { .val = page_private(head) };

		offset = swp_offset(entry);
		swap_cache = swap_address_space(entry);
		xa_lock(&swap_cache->i_pages);
	}

	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
	lruvec = folio_lruvec_lock(folio);

	ClearPageHasHWPoisoned(head);

	for (i = nr - 1; i >= 1; i--) {
		__split_huge_page_tail(head, i, lruvec, list);
		/* Some pages can be beyond EOF: drop them from page cache */
		if (head[i].index >= end) {
			ClearPageDirty(head + i);
			__delete_from_page_cache(head + i, NULL);
			if (shmem_mapping(head->mapping))
				shmem_uncharge(head->mapping->host, 1);
			put_page(head + i);
		} else if (!PageAnon(page)) {
			__xa_store(&head->mapping->i_pages, head[i].index,
					head + i, 0);
		} else if (swap_cache) {
			__xa_store(&swap_cache->i_pages, offset + i,
					head + i, 0);
		}
	}

	ClearPageCompound(head);
	unlock_page_lruvec(lruvec);
	/* Caller disabled irqs, so they are still disabled here */

	split_page_owner(head, nr);

	/* See comment in __split_huge_page_tail() */
	if (PageAnon(head)) {
		/* Additional pin to swap cache */
		if (PageSwapCache(head)) {
			page_ref_add(head, 2);
			xa_unlock(&swap_cache->i_pages);
		} else {
			page_ref_inc(head);
		}
	} else {
		/* Additional pin to page cache */
		page_ref_add(head, 2);
		xa_unlock(&head->mapping->i_pages);
	}
	local_irq_enable();

	remap_page(folio, nr);

	if (PageSwapCache(head)) {
		swp_entry_t entry = { .val = page_private(head) };

		split_swap_cluster(entry);
	}

	for (i = 0; i < nr; i++) {
		struct page *subpage = head + i;
		if (subpage == page)
			continue;
		unlock_page(subpage);

		/*
		 * Subpages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		put_page(subpage);
	}
}

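/*
 * A split can only succeed while the sole remaining references are the page
 * table mappings plus the pins counted below: one per subpage held by the
 * page cache (or swap cache) and the single pin held by the caller.
 */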
/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
	int extra_pins;

	/* Additional pins from page cache */
	if (folio_test_anon(folio))
		extra_pins = folio_test_swapcache(folio) ?
				folio_nr_pages(folio) : 0;
	else
		extra_pins = folio_nr_pages(folio);
	if (pextra_pins)
		*pextra_pins = extra_pins;
	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}

/*
 * This function splits a huge page into normal pages. @page can point to any
 * subpage of the huge page to split. Split doesn't change the position of @page.
 *
 * Only the caller may hold a pin on the @page, otherwise the split fails with
 * -EBUSY. The huge page must be locked.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Both head page and tail pages will inherit mapping, flags, and so on from
 * the hugepage.
 *
 * GUP pin and PG_locked transferred to @page. The rest of the subpages can be
 * freed if they are not mapped.
 *
 * Returns 0 if the hugepage is split successfully.
 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
 * us.
 */
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
	struct folio *folio = page_folio(page);
	struct page *head = &folio->page;
	struct deferred_split *ds_queue = get_deferred_split_queue(head);
	XA_STATE(xas, &head->mapping->i_pages, head->index);
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;
	int extra_pins, ret;
	pgoff_t end;

	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
	VM_BUG_ON_PAGE(!PageLocked(head), head);
	VM_BUG_ON_PAGE(!PageCompound(head), head);

	if (PageWriteback(head))
		return -EBUSY;

	if (PageAnon(head)) {
		/*
		 * The caller does not necessarily hold an mmap_lock that would
		 * prevent the anon_vma disappearing so we first take a
		 * reference to it and then lock the anon_vma for write. This
		 * is similar to folio_lock_anon_vma_read except the write lock
		 * is taken to serialise against parallel split or collapse
		 * operations.
		 */
		anon_vma = page_get_anon_vma(head);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
		end = -1;
		mapping = NULL;
		anon_vma_lock_write(anon_vma);
	} else {
		mapping = head->mapping;

		/* Truncated ? */
		if (!mapping) {
			ret = -EBUSY;
			goto out;
		}

		xas_split_alloc(&xas, head, compound_order(head),
				mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
		if (xas_error(&xas)) {
			ret = xas_error(&xas);
			goto out;
		}

		anon_vma = NULL;
		i_mmap_lock_read(mapping);

		/*
		 * __split_huge_page() may need to trim off pages beyond EOF:
		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
		 * which cannot be nested inside the page tree lock. So note
		 * end now: i_size itself may be changed at any moment, but
		 * head page lock is good enough to serialize the trimming.
		 */
		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (shmem_mapping(mapping))
			end = shmem_fallocend(mapping->host, end);
	}

	/*
	 * Racy check if we can split the page, before unmap_page() will
	 * split PMDs
	 */
	if (!can_split_folio(folio, &extra_pins)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	unmap_page(head);

	/* block interrupt reentry in xa_lock and spinlock */
	local_irq_disable();
	if (mapping) {
		/*
		 * Check if the head page is present in page cache.
		 * We assume all tails are present too, if head is there.
		 */
		xas_lock(&xas);
		xas_reset(&xas);
		if (xas_load(&xas) != head)
			goto fail;
	}

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
	if (page_ref_freeze(head, 1 + extra_pins)) {
		if (!list_empty(page_deferred_list(head))) {
			ds_queue->split_queue_len--;
			list_del(page_deferred_list(head));
		}
		spin_unlock(&ds_queue->split_queue_lock);
		if (mapping) {
			int nr = thp_nr_pages(head);

			xas_split(&xas, head, thp_order(head));
			if (PageSwapBacked(head)) {
				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
							-nr);
			} else {
				__mod_lruvec_page_state(head, NR_FILE_THPS,
							-nr);
				filemap_nr_thps_dec(mapping);
			}
		}

		__split_huge_page(page, list, end);
		ret = 0;
	} else {
		spin_unlock(&ds_queue->split_queue_lock);
fail:
		if (mapping)
			xas_unlock(&xas);
		local_irq_enable();
		remap_page(folio, folio_nr_pages(folio));
		ret = -EBUSY;
	}

out_unlock:
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	/* Free any memory we didn't use */
	xas_nomem(&xas, 0);
	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	return ret;
}

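/*
 * Free a THP: make sure the page is no longer queued for deferred splitting
 * before handing it to free_compound_page().
 */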
void free_transhuge_page(struct page *page)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(page);
	unsigned long flags;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (!list_empty(page_deferred_list(page))) {
		ds_queue->split_queue_len--;
		list_del(page_deferred_list(page));
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
	free_compound_page(page);
}

void deferred_split_huge_page(struct page *page)
{
	struct deferred_split *ds_queue = get_deferred_split_queue(page);
#ifdef CONFIG_MEMCG
	struct mem_cgroup *memcg = page_memcg(compound_head(page));
#endif
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageTransHuge(page), page);

	/*
	 * The try_to_unmap() in page reclaim path might reach here too,
	 * this may cause a race condition to corrupt deferred split queue.
	 * And, if page reclaim is already handling the same page, it is
	 * unnecessary to handle it again in shrinker.
	 *
	 * Check PageSwapCache to determine if the page is being
	 * handled by page reclaim since THP swap would add the page into
	 * swap cache before calling try_to_unmap().
	 */
	if (PageSwapCache(page))
		return;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (list_empty(page_deferred_list(page))) {
		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
		ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
		if (memcg)
			set_shrinker_bit(memcg, page_to_nid(page),
					 deferred_split_shrinker.id);
#endif
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

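/*
 * Shrinker callbacks for the deferred split queue: "count" reports the queue
 * length (per node, or per memcg when sc->memcg is set), and "scan" pulls
 * pages off the queue and tries to lock and split each one.
 */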
static unsigned long deferred_split_count(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif
	return READ_ONCE(ds_queue->split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list), *pos, *next;
	struct page *page;
	int split = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take pin on all head pages to avoid freeing them under us */
	list_for_each_safe(pos, next, &ds_queue->split_queue) {
		page = list_entry((void *)pos, struct page, deferred_list);
		page = compound_head(page);
		if (get_page_unless_zero(page)) {
			list_move(page_deferred_list(page), &list);
		} else {
			/* We lost race with put_compound_page() */
			list_del_init(page_deferred_list(page));
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	list_for_each_safe(pos, next, &list) {
		page = list_entry((void *)pos, struct page, deferred_list);
		if (!trylock_page(page))
			goto next;
		/* split_huge_page() removes page from list on success */
		if (!split_huge_page(page))
			split++;
		unlock_page(page);
next:
		put_page(page);
	}

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	/*
	 * Stop shrinker if we didn't split any page, but the queue is empty.
	 * This can happen if pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}

static struct shrinker deferred_split_shrinker = {
	.count_objects = deferred_split_count,
	.scan_objects = deferred_split_scan,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
		 SHRINKER_NONSLAB,
};

#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_populated_zone(zone) {
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			if (!pfn_valid(pfn))
				continue;

			page = pfn_to_page(pfn);
			if (!get_page_unless_zero(page))
				continue;

			if (zone != page_zone(page))
				goto next;

			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
				goto next;

			total++;
			lock_page(page);
			if (!split_huge_page(page))
				split++;
			unlock_page(page);
next:
			put_page(page);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
		    is_vm_hugetlb_page(vma);
}

static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	/* Find the task_struct from pid */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		ret = -ESRCH;
		goto out;
	}
	get_task_struct(task);
	rcu_read_unlock();

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = find_vma(mm, addr);
		struct page *page;

		if (!vma || addr < vma->vm_start)
			break;

		/* skip special VMA and hugetlb VMA */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

		if (IS_ERR(page))
			continue;
		if (!page)
			continue;

		if (!is_transparent_hugepage(page))
			goto next;

		total++;
		if (!can_split_folio(page_folio(page), NULL))
			goto next;

		if (!trylock_page(page))
			goto next;

		if (!split_huge_page(page))
			split++;

		unlock_page(page);
next:
		put_page(page);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}

static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;

	for (index = off_start; index < off_end; index += nr_pages) {
		struct page *fpage = pagecache_get_page(mapping, index,
						FGP_ENTRY | FGP_HEAD, 0);

		nr_pages = 1;
		if (xa_is_value(fpage) || !fpage)
			continue;

		if (!is_transparent_hugepage(fpage))
			goto next;

		total++;
		nr_pages = thp_nr_pages(fpage);

		if (!trylock_page(fpage))
			goto next;

		if (!split_huge_page(fpage))
			split++;

		unlock_page(fpage);
next:
		put_page(fpage);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}

#define MAX_INPUT_BUF_SZ 255

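/*
 * The debugfs file accepts three input formats, matching the parsing below
 * (the pid, addresses and path here are illustrative only):
 *   "<pid>,0x<vaddr_start>,0x<vaddr_end>"  split THPs in a process range, e.g.
 *       echo "1234,0x7f0000000000,0x7f0000400000" > split_huge_pages
 *   "<path>,0x<off_start>,0x<off_end>"     split file-backed THPs, e.g.
 *       echo "/mnt/file,0x0,0x200" > split_huge_pages
 *   "1"                                    split all THPs system-wide
 */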
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&buf, ",");
		if (tok) {
			strcpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
		if (ret != 2) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;

}

static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
	.llseek  = no_llseek,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
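/*
 * THP migration support: set_pmd_migration_entry() tears down a mapped huge
 * pmd and installs a migration swap entry in its place, while
 * remove_migration_pmd() re-establishes the pmd once migration has finished
 * (or failed and the old page is being remapped).
 */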
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
	if (anon_exclusive && page_try_share_anon_rmap(page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		set_page_dirty(page);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	page_remove_rmap(page, vma, true);
	put_page(page);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long mmun_start = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	get_page(new);
	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = maybe_pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));

	if (PageAnon(new)) {
		rmap_t rmap_flags = RMAP_COMPOUND;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
	} else {
		page_add_file_rmap(new, vma, true);
	}
	VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif