/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>


#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

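/*
 * Allocate a fresh hugepte table and hook it up under *hpdp.  On FSL
 * Book3E and 8xx one huge page is covered by several consecutive hugepd
 * entries, so all of them are populated here (and unwound again on error).
 */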
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (! new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1 ; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

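/*
 * Hand one of the gigantic pages recorded above to the hstate as a
 * boot-time huge page.  Returns 1 if a page was used, 0 if none are left.
 */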
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif


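/*
 * Boot-time huge page allocation: on hash LPARs the gigantic pages come
 * from the list the device tree gave us, everywhere else we fall back to
 * the generic bootmem allocator.
 */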
int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

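/*
 * Free a hugepte table, batching the frees per CPU and deferring them
 * with RCU.  A table only visible to the current thread is freed
 * immediately since no other CPU can be walking it.
 */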
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

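/*
 * Clear all hugepd entries covering this range and free the hugepte table
 * they point to, obeying the same floor/ceiling rules as the generic
 * free_pgd_range().
 */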
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (! ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * have found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

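/*
 * Resolve an address mapped by a hugepd entry to its struct page.  The
 * hugepd is stable under mm->page_table_lock; a migration entry makes us
 * drop the lock, wait and retry.
 */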
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

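/*
 * Lockless get_user_pages_fast() path for a hugepd directory: walk every
 * hugepte under the entry and hand each one to gup_hugepte().
 */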
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
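/*
 * Find an unmapped area for a hugetlbfs mapping: radix has its own
 * implementation, hash goes through the slice code so the range lands in
 * a slice of the matching page size.
 */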
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

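/*
 * Report the MMU page size used for this VMA: derived from the slice
 * psize on hash, taken from the VMA itself otherwise.
 */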
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

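/*
 * Validate a huge page size against what this MMU family can map and
 * register the matching hstate if it is not known yet.
 */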
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M and 1G
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
			return -EINVAL;
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
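/*
 * Register an hstate and the backing page table caches for every huge
 * page size the MMU supports, then pick the default huge page size.
 */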
static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift are the same we don't use the pgtable
		 * cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

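/*
 * Flush the data/instruction caches for every subpage of a huge compound
 * page, going through kmap_atomic() for highmem subpages.
 */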
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap. The returned pte_t * is still not
	 * stable, so it must be re-checked for those conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

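/*
 * Lockless get_user_pages_fast() helper for a single huge pte: take a
 * speculative reference on the head page and back out if the pte changed
 * under us.
 */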
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}