/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

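/*
 * Allocate a hugepte table and link it from one or more hugepd entries.
 * Several consecutive hugepd entries may point at the same hugepte table
 * (when pshift >= pdshift), so fill them all; if another thread raced us
 * and populated the entries first, back out our changes and free the
 * table.  Returns 0 on success (or a lost race), -ENOMEM otherwise.
 */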
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif


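/*
 * Allocate a boot-time gigantic page: on pseries (LPAR, hash MMU) take one
 * of the gpages reserved via the device tree, otherwise fall back to the
 * generic bootmem allocator.
 */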
int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

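/*
 * Free a hugepte table.  For an mm that may be in use on other CPUs the
 * table is queued on a per-cpu batch and released after an RCU grace
 * period (see hugepd_free_rcu_callback() above); otherwise it is freed
 * directly.
 */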
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

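/*
 * Clear the hugepd entry (or, where several entries map the same hugepte
 * table, all of them) and free the hugepte table it points to, respecting
 * the floor/ceiling limits in the same way as free_pgd_range().
 */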
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

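/*
 * Walk the PMD level of a PUD range and tear down any hugepd entries found
 * there; hugetlb_free_pud_range() below does the same one level up.
 */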
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should have
			 * already found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

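/*
 * follow_page() helper for addresses mapped by a hugepd entry: look up the
 * huge pte under mm->page_table_lock, wait for a migration entry to settle
 * if necessary, and return the struct page for the given address.
 */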
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

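/*
 * Fast-path GUP helper for a hugepd directory entry: iterate over the huge
 * ptes covering [addr, end) and hand each one to gup_hugepte().
 */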
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
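/*
 * When the MMU uses slices, pick an unmapped area through the slice
 * allocator so the mapping lands in a slice of the right page size;
 * on radix this is handled by radix__hugetlb_get_unmapped_area().
 */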
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

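/*
 * Return the page size backing a VMA: derived from the slice psize where
 * slices are in use (non-radix), otherwise from the VMA itself.
 */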
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

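/*
 * Validate a huge page size (from the command line or from the MMU feature
 * tables) and register a hstate for it if it is supported and not already
 * registered.
 */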
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M and 1G (1G not on POWER9 DD1)
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
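/*
 * Register every huge page size the MMU supports, set up the matching
 * page table caches, and pick a default size for HPAGE_SHIFT.
 */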
static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift are the same, we don't use the
		 * pgt cache for hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M, 1M or 2M,
	 * depending on what is available.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

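/*
 * Flush the data and instruction caches for every sub-page of a huge page,
 * mapping highmem pages temporarily where needed.
 */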
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap. The returned pte_t * is still not
	 * stable, so the caller must recheck it for the above conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

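/*
 * Fast GUP on a single huge pte: record every sub-page in the range in
 * pages[], take the matching number of references on the head page, then
 * recheck that the pte has not changed under us and back out if it has.
 */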
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}