/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

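/*
 * Allocate the hugepte table that backs a hugepd entry and hook it up.
 * When the huge page is larger than the area mapped by one entry at this
 * level (FSL Book3E and 8xx), several consecutive hugepd entries are all
 * made to point at the same newly allocated table.
 */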
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (! new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1 ; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

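/*
 * On FSL Book3E and 8xx, hugepte tables come from hugepte_cache and are
 * not freed immediately: hugepd_free() batches them per cpu and releases
 * them from an RCU-sched callback, so concurrent lockless page table
 * walkers never see a table disappear underneath them.
 */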
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

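/*
 * Clear the hugepd entry (the whole run of consecutive entries on
 * FSL/8xx) and free the hugepte table it pointed to, but only if the
 * covered region lies entirely within [floor, ceiling).
 */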
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (! ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

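/*
 * follow_page() helper for an address mapped through a hugepd: look the
 * pte up under mm->page_table_lock and return (optionally pinning, for
 * FOLL_GET) the subpage for the address; migration entries are waited
 * on and the lookup is retried.
 */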
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

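/*
 * Fast GUP helper: walk every hugepte underneath one hugepd entry and
 * take references on the pages backing [addr, end).
 */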
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	/* With radix we don't use slices, so derive the page size from the vma */
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
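/*
 * Register every huge page size the MMU reports, set up the page table
 * caches that back the hugepd entries and pick the default huge page
 * size (HPAGE_SHIFT).
 */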
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * If pdshift and shift are the same, we don't use the
		 * pgt cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap. The returned pte_t * is still not
	 * stable, so the caller must recheck it for the above conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an RCU free and we are called
		 * here with interrupts disabled.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

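/*
 * Fast GUP worker for one hugepte: take a speculative reference on the
 * compound head for every subpage in [addr, end) and back out if the pte
 * changed underneath us.
 */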
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}