/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
12
#include <linux/io.h>
13
#include <linux/slab.h>
L
Linus Torvalds 已提交
14
#include <linux/hugetlb.h>
15
#include <linux/export.h>
B
Becky Bruce 已提交
16 17 18
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
19
#include <linux/moduleparam.h>
20 21
#include <linux/swap.h>
#include <linux/swapops.h>
22
#include <asm/pgtable.h>
L
Linus Torvalds 已提交
23 24
#include <asm/pgalloc.h>
#include <asm/tlb.h>
B
Becky Bruce 已提交
25
#include <asm/setup.h>
26
#include <asm/hugetlb.h>
27 28
#include <asm/pte-walk.h>

29 30

#ifdef CONFIG_HUGETLB_PAGE
L
Linus Torvalds 已提交
31

32
#define PAGE_SHIFT_64K	16
33 34
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
35 36
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34
37

B
Becky Bruce 已提交
38
unsigned int HPAGE_SHIFT;
39
EXPORT_SYMBOL(HPAGE_SHIFT);
40

41
#define hugepd_none(hpd)	(hpd_val(hpd) == 0)
42

43
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
44
{
45 46 47 48 49
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
50 51
}

52
/*
 * Allocate a zeroed hugepte table and install it in the hugepd entry, or
 * run of hugepd entries, starting at @hpdp.
 *
 * @mm:      address space whose page_table_lock protects the update
 * @hpdp:    first hugepd entry to populate
 * @address: not used by this function
 * @pdshift: shift of the page table level that holds @hpdp
 * @pshift:  shift of the huge page size
 *
 * When @pshift >= @pdshift, 1 << (pshift - pdshift) consecutive entries are
 * all pointed at the same table.
 *
 * Returns -ENOMEM if the table cannot be allocated.  Returns 0 otherwise,
 * including when an entry was found already populated: in that case the
 * entries written here are reset and the new table is released.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
	if (!new)
		return -ENOMEM;

	/* The low bits of the table address must be free for the hugepd encoding */
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, undo what we wrote */
	if (i < num_hugepd) {
		/*
		 * hpdp currently points at the entry that was already
		 * populated, which we must not touch.  Step back before each
		 * store so exactly the i entries written above are reset.
		 */
		while (i-- > 0) {
			hpdp--;
			*hpdp = __hugepd(0);
		}
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

118 119 120 121
/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
122
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
123 124 125 126 127 128 129
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

146
#ifdef CONFIG_PPC_BOOK3S_64
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
173
	if (pshift >= HUGEPD_PGD_SHIFT) {
174 175 176 177
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
178
		if (pshift >= HUGEPD_PUD_SHIFT) {
179 180 181 182 183 184 185
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
186
#endif
187 188 189 190 191 192 193 194
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

195
	return hugepte_offset(*hpdp, addr, pdshift);
196 197
}

198
#ifdef CONFIG_PPC_BOOK3S_64
B
Becky Bruce 已提交
199
/*
200 201
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
B
Becky Bruce 已提交
202
 */
203 204 205
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;
B
Becky Bruce 已提交
206 207

/*
208
 * Build list of addresses of gigantic pages.  This function is used in early
209
 * boot before the buddy allocator is setup.
B
Becky Bruce 已提交
210
 */
211
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
212 213 214 215 216 217 218 219 220 221 222
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

223
/*
 * Take the most recently recorded gigantic page off gpage_freearray and
 * queue it on huge_boot_pages for @hstate.
 *
 * Returns 1 if a page was queued, 0 if no gigantic pages are left.
 */
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *gpage;
	unsigned int slot;

	if (!nr_gpages)
		return 0;

	/* Pop the last recorded entry and clear its slot */
	slot = --nr_gpages;
	gpage = phys_to_virt(gpage_freearray[slot]);
	gpage_freearray[slot] = 0;

	list_add(&gpage->list, &huge_boot_pages);
	gpage->hstate = hstate;

	return 1;
}
B
Becky Bruce 已提交
234
#endif
235

236 237 238 239 240 241 242 243 244 245 246

/*
 * Boot-time huge page allocation hook.  On BOOK3S 64 with the LPAR firmware
 * feature and without radix, the page comes from the pseries gigantic page
 * list; in every other case the generic allocator is used.
 */
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		if (!radix_enabled())
			return pseries_alloc_bootmem_huge_page(h);
	}
#endif
	return __alloc_bootmem_huge_page(h);
}

247
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
B
Becky Bruce 已提交
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

/*
 * One batch of hugepte tables waiting to be freed from an RCU callback.
 * The structure sits at the start of a page; the rest of the page holds
 * the pointers.
 */
struct hugepd_freelist {
	struct rcu_head	rcu;	/* used to queue the batch for deferred freeing */
	unsigned int index;	/* number of valid entries in ptes[] */
	void *ptes[];		/* hugepte tables collected in this batch */
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

/*
 * RCU callback: return every hugepte table collected in the batch to
 * hugepte_cache, then free the page that held the batch.
 */
static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch;
	unsigned int slot = 0;

	batch = container_of(head, struct hugepd_freelist, rcu);

	while (slot < batch->index) {
		kmem_cache_free(hugepte_cache, batch->ptes[slot]);
		slot++;
	}

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

275
	batchp = &get_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
276 277

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
278
	    mm_is_thread_local(tlb->mm)) {
B
Becky Bruce 已提交
279
		kmem_cache_free(hugepte_cache, hugepte);
280
		put_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
281 282 283 284 285 286 287 288 289 290 291 292 293
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
294
	put_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
295
}
296 297
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
B
Becky Bruce 已提交
298 299
#endif

300 301 302
/*
 * Clear the hugepd entry (or run of entries) at @hpdp and free the hugepte
 * table it points to, provided the range aligned to 1 << @pdshift lies
 * within the [floor, ceiling) limits.
 */
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	unsigned int shift = hugepd_shift(*hpdp);
	unsigned long pdmask = ~((1UL << pdshift) - 1);
	/* Note: On fsl the hpdp may be the first of several */
	unsigned int num_hugepd = (shift > pdshift) ? 1 << (shift - pdshift) : 1;
	int i;

	/* Leave everything in place unless the aligned range is inside the limits */
	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	/* Reset every entry that refers to this hugepte table */
	for (i = 0; i < num_hugepd; i++)
		hpdp[i] = __hugepd(0);

	if (shift < pdshift)
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
	else
		hugepd_free(tlb, hugepte);
}

/*
 * Free the hugepd tables referenced from the pmd entries covering
 * [addr, end) under @pud, then free the pmd table itself if the
 * PUD-aligned range lies within the [floor, ceiling) limits.
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 *
		 * Shift an unsigned long so the size cannot overflow int.
		 */
		more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	/* Only free the pmd table when the whole PUD-sized range is ours */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

/*
 * Walk the pud entries covering [addr, end) under @pgd.  Entries that are
 * hugepd pointers have their hugepte table freed; other populated entries
 * are descended into.  Afterwards the pud table itself is freed if the
 * PGDIR-aligned range lies within the [floor, ceiling) limits.
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 *
			 * The shift can be 31 or more here (PAGE_SHIFT_16G is
			 * 34), so the constant must be an unsigned long rather
			 * than an int.
			 */
			more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	/* Only free the pud table when the whole PGDIR-sized range is ours */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
441
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
442 443 444 445 446 447 448
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
449 450 451 452 453 454 455 456 457 458
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
459
	 *
460 461 462
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
463 464 465 466
	 */

	do {
		next = pgd_addr_end(addr, end);
B
Becky Bruce 已提交
467
		pgd = pgd_offset(tlb->mm, addr);
468
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
469 470 471 472
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
473
			unsigned long more;
B
Becky Bruce 已提交
474 475
			/*
			 * Increment next by the size of the huge mapping since
476 477 478
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
B
Becky Bruce 已提交
479
			 */
480 481 482 483
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

484 485
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
486
		}
B
Becky Bruce 已提交
487
	} while (addr = next, addr != end);
L
Linus Torvalds 已提交
488 489
}

490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
/*
 * Look up the page backing @address through the hugepd entry @hpd.
 *
 * Returns the sub-page of the huge page that covers @address, with a
 * reference taken when FOLL_GET is set in @flags, or NULL if the hugepte
 * is not present.  If the hugepte is a migration entry, the lookup waits
 * for the migration and then starts over.
 */
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	struct mm_struct *mm = vma->vm_mm;
	int shift = hugepd_shift(hpd);
	struct page *page = NULL;
	unsigned long mask;
	spinlock_t *ptl;
	pte_t *ptep;

	for (;;) {
		ptl = &mm->page_table_lock;
		spin_lock(ptl);

		ptep = hugepte_offset(hpd, address, pdshift);
		if (pte_present(*ptep)) {
			/* Step from the head page to the sub-page for address */
			mask = (1UL << shift) - 1;
			page = pte_page(*ptep);
			page += ((address & mask) >> PAGE_SHIFT);
			if (flags & FOLL_GET)
				get_page(page);
			break;
		}

		if (!is_hugetlb_entry_migration(*ptep))
			break;

		/* Drop the lock, wait for the migration, then look again */
		spin_unlock(ptl);
		__migration_entry_wait(mm, ptep, ptl);
	}

	spin_unlock(ptl);
	return page;
}

D
David Gibson 已提交
523 524 525 526 527 528 529
/*
 * Return the next @sz-aligned boundary after @addr, or @end if that
 * boundary does not come before @end.  The comparison is done on
 * "value - 1" so that an @end or boundary of 0 is treated as the largest
 * value.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	if (boundary - 1 < end - 1)
		return boundary;

	return end;
}

530 531
/*
 * Call gup_hugepte() on each hugepte in the table behind @hugepd, from the
 * one covering @addr up to @end, advancing one huge page at a time.
 *
 * Returns 1 if every hugepte was handled, 0 as soon as gup_hugepte()
 * returns 0.
 */
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);
	unsigned long next;

	for (;;) {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;

		ptep++;
		addr = next;
		if (addr == end)
			break;
	}

	return 1;
}
L
Linus Torvalds 已提交
546

547
#ifdef CONFIG_PPC_MM_SLICES
L
Linus Torvalds 已提交
548 549 550 551
/*
 * Pick an unmapped area for a hugetlbfs mapping of @file.  With radix the
 * radix-specific helper is used; otherwise the slice allocator is asked
 * for an area of the MMU page size that matches the file's huge page size.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	int psize = shift_to_mmu_psize(huge_page_shift(h));

	if (!radix_enabled())
		return slice_get_unmapped_area(addr, len, flags, psize, 1);

	return radix__hugetlb_get_unmapped_area(file, addr, len,
					       pgoff, flags);
}
560
#endif
L
Linus Torvalds 已提交
561

562 563
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
564
#ifdef CONFIG_PPC_MM_SLICES
565
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
566 567 568 569
	/* With radix we don't use slice, so derive it from vma*/
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
B
Becky Bruce 已提交
570 571 572 573 574 575 576 577 578 579 580
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

/* True when @x is a power of two whose log2 is even, i.e. a power of four. */
static inline bool is_power_of_4(unsigned long x)
{
	if (!is_power_of_2(x))
		return false;

	return !(__ilog2(x) % 2);
}

583
/*
 * Register @size (in bytes) as a huge page size if the platform can
 * support it.
 *
 * Returns 0 if the size is now registered (or already was), -EINVAL if it
 * is not larger than PAGE_SIZE, fails the per-platform size checks, or has
 * no matching MMU page size.
 */
static int __init add_huge_page_size(unsigned long long size)
{
	int shift;
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;

	/*
	 * Only look for the lowest set bit once size is known to be
	 * non-zero: __ffs() of 0 is undefined.
	 */
	shift = __ffs(size);

#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	mmu_psize = shift_to_mmu_psize(shift);
	if (mmu_psize < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M, plus 1G unless the CPU has CPU_FTR_POWER9_DD1
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

/*
 * Parse a huge page size from @str and try to register it.  A size that
 * add_huge_page_size() rejects is reported; the function returns 1 either
 * way.
 */
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size = memparse(str, &str);
	int err = add_huge_page_size(size);

	if (err) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

B
Becky Bruce 已提交
650 651 652 653 654
struct kmem_cache *hugepte_cache;
/*
 * Register every usable MMU page size as a huge page size, create the
 * caches needed for their hugepte tables, and pick the default huge page
 * shift.
 *
 * Returns -ENODEV on platforms other than FSL_BOOK3E/8xx when neither
 * radix nor 16M pages are available, 0 otherwise.
 */
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!(radix_enabled() || mmu_has_feature(MMU_FTR_16M_PAGE)))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		/* Work out which page table level holds the hugepd */
		pdshift = PGDIR_SHIFT;
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;

		/*
		 * if we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (pdshift > shift) {
			pgtable_cache_add(pdshift - shift, NULL);
			continue;
		}
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache for hugeptes\n",
				      __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/*
	 * Set default large page size. We pick the first available of
	 * 16M, 1M and 2M, in that order.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}
721

722
arch_initcall(hugetlbpage_init);
723 724 725 726

/*
 * Flush the data and instruction caches for every base page of the
 * compound page @page.  When @page is in high memory, each sub-page is
 * temporarily mapped with kmap_atomic() for the flush.
 */
void flush_dcache_icache_hugepage(struct page *page)
{
	void *start;
	int i;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (PageHighMem(page)) {
			start = kmap_atomic(page + i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
			continue;
		}

		__flush_dcache_icache(page_address(page + i));
	}
}
741 742 743 744 745 746 747

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
A
Aneesh Kumar K.V 已提交
748 749
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
750 751 752
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the the page and take a ref on it.
753 754
 * This function need to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
755
 */
756 757
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
758
{
759 760 761
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
762 763 764 765
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

766 767
	if (hpage_shift)
		*hpage_shift = 0;
768

769 770 771
	if (is_thp)
		*is_thp = false;

772
	pgdp = pgdir + pgd_index(ea);
773
	pgd  = READ_ONCE(*pgdp);
774
	/*
775 776 777 778
	 * Always operate on the local stack value. This make sure the
	 * value don't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap. The return pte_t * is still not
	 * stable. So should be checked there for above conditions.
779
	 */
780
	if (pgd_none(pgd))
781
		return NULL;
782 783
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
784
		goto out;
785
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
786
		hpdp = (hugepd_t *)&pgd;
787
	else {
788 789 790 791 792
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
793
		pdshift = PUD_SHIFT;
794
		pudp = pud_offset(&pgd, ea);
795
		pud  = READ_ONCE(*pudp);
796

797
		if (pud_none(pud))
798
			return NULL;
799 800
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
801
			goto out;
802
		} else if (is_hugepd(__hugepd(pud_val(pud))))
803
			hpdp = (hugepd_t *)&pud;
804
		else {
805
			pdshift = PMD_SHIFT;
806
			pmdp = pmd_offset(&pud, ea);
807
			pmd  = READ_ONCE(*pmdp);
808 809 810 811
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it mark the pmd none and do a hpte invalidate.
			 */
812
			if (pmd_none(pmd))
813
				return NULL;
814

815
			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
816 817 818 819 820 821 822
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
823
				ret_pte = (pte_t *) pmdp;
824
				goto out;
825
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
826
				hpdp = (hugepd_t *)&pmd;
827
			else
828
				return pte_offset_kernel(&pmd, ea);
829 830 831 832 833
		}
	}
	if (!hpdp)
		return NULL;

834
	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
835 836
	pdshift = hugepd_shift(*hpdp);
out:
837 838
	if (hpage_shift)
		*hpage_shift = pdshift;
839 840
	return ret_pte;
}
841
EXPORT_SYMBOL_GPL(__find_linux_pte);
842 843 844 845 846

/*
 * Collect into @pages the sub-pages of the huge page mapped by @ptep that
 * cover [addr, end), with the range clipped to the end of that huge page,
 * and take one reference on the head page per sub-page collected.
 *
 * Returns 1 on success.  Returns 0, leaving *nr as it was on entry, when
 * the pte is not present or readable, is not writable while @write is set,
 * the speculative reference cannot be taken, or the pte changed while the
 * pages were being collected.
 */
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end = (addr + sz) & ~(sz - 1);
	struct page *head, *page;
	pte_t pte;
	int refs = 0;

	/* Never walk past the end of this huge page */
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_present(pte) || !pte_read(pte))
		return 0;
	if (write && !pte_write(pte))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	head = pte_page(pte);
	page = head + ((addr & (sz - 1)) >> PAGE_SHIFT);

	for (;;) {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;

		addr += PAGE_SIZE;
		if (addr == end)
			break;
	}

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	/* The pte changed under us: drop the references and report failure */
	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		for (; refs > 0; refs--)
			put_page(head);
		return 0;
	}

	return 1;
}