/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);
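
/*
 * A "hugepd" (huge page directory) entry lives at some level of the
 * regular page table tree but, instead of pointing at the next level
 * down, points at a table of huge PTEs.  The exact encoding is
 * described in the "4 cases" comment above __find_linux_pte() at the
 * bottom of this file.
 */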
#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
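
/*
 * Allocate a hugepte table and install pointer(s) to it at *hpdp.
 * Returns 0 on success (including when another thread raced us and
 * installed its table first) and -ENOMEM on allocation failure.
 */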
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

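	/*
	 * When the huge page is larger than the range covered by a single
	 * directory entry (pshift >= pdshift), several consecutive entries
	 * will point at the one hugepte table, so it is carved out of the
	 * dedicated hugepte_cache; otherwise a table sized for the
	 * remaining (pdshift - pshift) bits comes from the pgtable cache.
	 */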
	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/*
	 * If we bailed out of the loop early, another thread has already
	 * installed an entry here; back out the entries we set and free
	 * the now-unused table.
	 */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do this special placement only for Book3S 64; it
 * would possibly work on other subarchs as well.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
static u64 gpage_freearray[MAX_NUMBER_GPAGES] __initdata;
static unsigned nr_gpages __initdata;

/*
 * Build list of addresses of gigantic pages.  This function is used in
 * early boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif
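
/*
 * On pseries LPARs (without radix) gigantic pages for hugetlbfs come
 * from the gpage list collected above; everywhere else fall back to
 * the generic bootmem allocation path.
 */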
int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
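/*
 * On these subarchs hugepte tables are freed via RCU: frees are batched
 * per cpu onto a single page (struct hugepd_freelist below) and the
 * batch is released only after a grace period, so lockless walkers
 * never see a table vanish underneath them.
 */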
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

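/*
 * Clear the hugepd entry (or entries, on FSL) at *hpdp and free the
 * hugepte table it points at, but only if the whole range it maps lies
 * inside [floor, ceiling).
 */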
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, the entry should
			 * already have been cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

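/*
 * Called from the generic follow_page() path when it finds a hugepd
 * entry: return the struct page backing @address, waiting for a
 * migration entry to resolve first if need be.
 */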
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

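/*
 * Lockless get_user_pages_fast() helper: walk every huge PTE in the
 * hugepd table overlapping [addr, end) and take page references via
 * gup_hugepte() (defined at the bottom of this file).
 */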
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slices, so derive the page size from the vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}

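/* A power of 4 is a power of 2 whose log2 is even. */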
static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/*
	 * Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits.
	 */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
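/*
 * For example (assuming a hash MMU that supports 16M pages), booting
 * with "hugepagesz=16M hugepages=8" selects 16M huge pages and reserves
 * eight of them; "hugepages=" itself is parsed by the generic hugetlb
 * code, not here.
 */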

struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift have the same value, we don't use
		 * the pgtable cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/*
	 * Set the default large page size.  Currently we pick 16M, 1M or
	 * 2M, depending on what is available.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);
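
/*
 * Flush the data and instruction caches for each base page of a huge
 * page; highmem pages (possible on 32-bit FSL parts) are temporarily
 * mapped with kmap_atomic() first.
 */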
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a
 * ref on it.
 * This function needs to be called with interrupts disabled.  We use this
 * variant when we have MSR[EE] = 0 but paca->irq_soft_mask = IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap.  The returned pte_t * is still not
	 * stable, so the caller must re-check for these conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an RCU free and interrupts are
		 * disabled here.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none,
			 * because it marks the pmd none and does a hpte
			 * invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

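	/*
	 * Take references on the compound head speculatively, then confirm
	 * that the PTE did not change while we were counting; back out all
	 * references if it did.
	 */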
	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}