/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/* Only called for hugetlbfs pages, hence can ignore THP */
	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
}

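/*
 * Allocate the page-table page that backs a huge mapping and hook it up at
 * @hpdp.  On FSL Book3E and 8xx several consecutive higher-level entries can
 * point at the same hugepte table, so all of them are populated here under
 * the page_table_lock.
 */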
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif


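/*
 * On pseries, gigantic pages set aside at early boot (see pseries_add_gpage())
 * are handed out from gpage_freearray; everything else falls back to the
 * generic bootmem huge page allocator.
 */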
int __init alloc_bootmem_huge_page(struct hstate *h)
{

#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

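/*
 * Hugepd tables may still be visible to concurrent lockless walkers (e.g.
 * the fast GUP path), so unless the mm is known to be local to this CPU the
 * tables are queued on a per-CPU batch and freed after an RCU grace period.
 */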
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

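/*
 * Clear every higher-level entry that points at this hugepte table and free
 * the table, going through hugepd_free()/pgtable_free_tlb() so that
 * concurrent lockless page-table walkers remain safe.
 */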
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

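/*
 * Walk the pmd directory below @pud, tear down any hugepd entries found
 * there, and free the pmd page itself once floor/ceiling allow it.
 */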
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * find it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that here we can't use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

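/*
 * follow_page() helper for hugepd-mapped ranges: resolve @address to its
 * backing struct page, take a reference when FOLL_GET is set, and wait for
 * migration entries to settle before retrying.
 */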
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

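/*
 * Lockless get_user_pages_fast() walk of one hugepd directory: visit every
 * hugepte covering [addr, end) and hand each one to gup_hugepte().
 */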
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

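/*
 * Validate a hugepage size passed on the command line or discovered in the
 * MMU page-size table, and register the matching hstate with the generic
 * hugetlb code.
 */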
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by the Linux page table layout.
	 * For now we have
	 * Radix: 2M
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if the huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

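/* Early handler for the "hugepagesz=" command-line option. */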
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
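/*
 * Register every hugepage size the MMU supports with the hugetlb core and
 * set up the page-table / kmem caches needed to back hugepd tables.
 */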
static int __init hugetlbpage_init(void)
{
	int psize;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
		/*
		 * If pdshift and shift are the same, we don't use the
		 * pgt cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);

		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);

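/*
 * Flush each subpage of a compound huge page, temporarily mapping highmem
 * subpages with kmap_atomic() where required.
 */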
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * and we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
 */

pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
				   bool *is_thp, unsigned *shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value. This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or page unmap. The returned pte_t * is still not
	 * stable, so the above conditions should be checked for there.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an RCU free and we are here
		 * with interrupts disabled.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (shift)
		*shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);

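/*
 * Lockless reference grab on one hugepte: snapshot the pte, accumulate the
 * covered subpages into @pages, take a speculative reference on the head
 * page, then re-check that the pte did not change underneath us.
 */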
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_present(pte) || !pte_read(pte))
		return 0;
	if (write && !pte_write(pte))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}