/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

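/*
 * Allocate a hugepte page table and link it from one or more higher-level
 * (hugepd) entries.  When the huge page size is larger than the region
 * covered by a single higher-level entry, several consecutive entries are
 * made to point at the same hugepte table.
 */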
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1 ; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is setup.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

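/*
 * Hand one boot-time gigantic page back to the hugetlb layer: pop its
 * address off gpage_freearray and queue it on huge_boot_pages.
 */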
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
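/*
 * On FSL Book3E and 8xx, hugepte tables are freed in per-cpu RCU batches so
 * that lockless page-table walkers (which run with interrupts disabled) never
 * see a table disappear under them; the structures below hold the tables
 * until a grace period has elapsed.
 */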
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

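/*
 * Clear the hugepd entries covering this range and free the hugepte table
 * they point at, provided the range spans the whole table and lies within
 * the floor/ceiling bounds.
 */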
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

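/*
 * Walk the pmd level of a range being torn down, handing any hugepd entries
 * to free_hugepd_range() and freeing the pmd table itself once the whole
 * pud-sized region is covered.
 */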
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * find it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

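/*
 * follow_page() support for hugepd-mapped huge pages: look up the pte under
 * mm->page_table_lock and return the subpage of the huge page that covers
 * @address, waiting for any migration entry to resolve first.
 */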
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

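/*
 * get_user_pages_fast() helper: walk every hugepte under one hugepd entry
 * and hand each one to gup_hugepte().
 */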
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slices, so derive the page size from the vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

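/* Parse the hugepagesz= command-line option and register that page size. */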
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

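/*
 * Register every huge page size the MMU reports, set up the page-table
 * caches needed for hugepd tables, and pick the default huge page size.
 */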
struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * If pdshift and shift are the same, we don't use a
		 * pgtable cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this.
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);
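
/*
 * Flush the data cache and invalidate the instruction cache for each
 * subpage of a compound huge page.
 */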
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but the
 * paca->irq_soft_mask = IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap. The returned pte_t * is still not
	 * stable, so those conditions must be checked there as well.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
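
/*
 * Lockless get_user_pages_fast() helper for one huge pte: take a speculative
 * reference on the head page, record each covered subpage in @pages, and
 * back out if the pte changed underneath us.
 */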
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}