/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

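/*
 * Allocate a hugepte table and install pointer(s) to it in the page table.
 * When the huge page is larger than the area mapped by a single directory
 * entry (as on FSL Book3E and 8xx), several consecutive hugepd entries are
 * made to point at the same hugepte table.
 */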
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for the comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1 ; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

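/*
 * Take one gigantic page off the early gpage_freearray list and hand it to
 * the generic hugetlb boot-time setup via huge_boot_pages.
 */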
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

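/*
 * Boot-time huge page allocation: pseries guests running hash take gigantic
 * pages from the firmware-provided list, everything else falls back to the
 * generic bootmem allocator.
 */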

int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
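/*
 * hugepte tables freed from a multi-threaded mm are batched per cpu and
 * released from an RCU callback; tables from a thread-local mm are freed
 * immediately.
 */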
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

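/*
 * Clear the hugepd entry (or entries, where several map the same table) and
 * free the hugepte table it points to, honouring the floor/ceiling limits
 * of the region being torn down.
 */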
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}
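/*
 * Walk the PMD level below a PUD, freeing any hugepd tables found there,
 * then free the PMD page itself once the covered range allows it.
 */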

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * if it is not a hugepd pointer, we should have already
			 * found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}
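/*
 * Walk the PUD level below a PGD, recursing into the PMD level or freeing
 * hugepd tables as appropriate, then free the PUD page itself.
 */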

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

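/*
 * follow_page() helper for addresses covered by a hugepd: look up the
 * hugepte under mm->page_table_lock, wait on a migration entry if one is
 * found, and return the subpage corresponding to the address.
 */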
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

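/* Clamp the end of the current hugepte step to the end of the gup range. */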
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

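/*
 * get_user_pages_fast() helper: walk the hugeptes under a hugepd entry and
 * take references on the pages they map.
 */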
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
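/*
 * Find an unmapped area for a hugetlb mapping: radix has its own helper,
 * hash uses the slice allocator with the hstate's page size.
 */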
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

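/*
 * Report the MMU page size backing a VMA: taken from the slice map on hash,
 * otherwise from the VMA's hstate (or PAGE_SIZE for non-hugetlb VMAs).
 */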
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	/* With radix we don't use slices, so derive it from the vma */
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

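/*
 * Validate a huge page size against the hardware and page table constraints
 * below and register the matching hstate with the hugetlb core.
 */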
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
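/*
 * Register every huge page size the MMU supports, set up the page table
 * caches needed for hugepd tables, and choose the default huge page size.
 */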
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * if pdshift and shift have the same value, we don't
		 * use the pgt cache for hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);
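/* Flush the data and instruction caches for every subpage of a huge page. */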

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * and we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap. The returned pte_t * is still not
	 * stable, so it must still be checked for the above conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and we run here
		 * with irqs disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does an hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
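/*
 * get_user_pages_fast() leaf helper: take a speculative reference on the
 * head page for all subpages in range, backing out if the pte changed
 * underneath us.
 */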

int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_present(pte) || !pte_read(pte))
		return 0;
	if (write && !pte_write(pte))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}