hugetlbpage.c 25.6 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
B
Becky Bruce 已提交
2
 * PPC Huge TLB Page Support for Kernel.
L
Linus Torvalds 已提交
3 4
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
B
Becky Bruce 已提交
5
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
L
Linus Torvalds 已提交
6 7 8 9 10 11
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
12
#include <linux/io.h>
13
#include <linux/slab.h>
L
Linus Torvalds 已提交
14
#include <linux/hugetlb.h>
15
#include <linux/export.h>
B
Becky Bruce 已提交
16 17 18
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
19
#include <linux/moduleparam.h>
20
#include <asm/pgtable.h>
L
Linus Torvalds 已提交
21 22
#include <asm/pgalloc.h>
#include <asm/tlb.h>
B
Becky Bruce 已提交
23
#include <asm/setup.h>
24 25 26
#include <asm/hugetlb.h>

#ifdef CONFIG_HUGETLB_PAGE
L
Linus Torvalds 已提交
27

28
#define PAGE_SHIFT_64K	16
29 30
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
31 32
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34
33

B
Becky Bruce 已提交
34
unsigned int HPAGE_SHIFT;
35

B
Becky Bruce 已提交
36 37
/*
 * Tracks gpages after the device tree is scanned and before the
38 39 40 41
 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
 * just used to track 16G pages and so is a single array.  FSL-based
 * implementations may have more than one gpage size, so we need multiple
 * arrays
B
Becky Bruce 已提交
42
 */
43
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
B
Becky Bruce 已提交
44 45 46 47 48 49
/* Capacity of each per-page-size list of gigantic page addresses. */
#define MAX_NUMBER_GPAGES	128
/*
 * Addresses of gigantic pages recorded for one MMU page size, together
 * with the number of entries of gpage_list currently in use.
 */
struct psize_gpages {
	u64 gpage_list[MAX_NUMBER_GPAGES];
	unsigned int nr_gpages;
};
/* One list per MMU page size, indexed by the MMU psize index. */
static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
50 51 52 53
#else
/* Capacity of the single list of gigantic page addresses. */
#define MAX_NUMBER_GPAGES	1024
/* Gigantic page addresses; filled by add_gpage(), drained from the tail. */
static u64 gpage_freearray[MAX_NUMBER_GPAGES];
/* Number of valid entries at the start of gpage_freearray. */
static unsigned nr_gpages;
B
Becky Bruce 已提交
54
#endif
55

56 57 58 59
#define hugepd_none(hpd)	((hpd).pd == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
60
	/* Only called for hugetlbfs pages, hence can ignore THP */
61
	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
62 63
}

64
/*
 * Allocate a zeroed hugepte table and publish it in the hugepd entry (or
 * run of entries) starting at @hpdp.
 *
 * @mm:      address space whose page_table_lock protects the update
 * @hpdp:    first hugepd entry to fill
 * @address: faulting address (not used here)
 * @pdshift: shift of the page table level that holds @hpdp
 * @pshift:  shift of the huge page size
 *
 * When @pshift >= @pdshift the table comes from hugepte_cache and
 * 1 << (@pshift - @pdshift) consecutive entries are pointed at it;
 * otherwise a single entry is filled from PGT_CACHE(@pdshift - @pshift).
 *
 * Returns -ENOMEM if the table cannot be allocated, 0 otherwise.  0 is
 * also returned when an entry was found already populated; in that case
 * the new table is freed and the entries written by this call are undone.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);

	new = kmem_cache_zalloc(cachep, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* The low bits of the pointer are reused to encode the size. */
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);

	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
#ifdef CONFIG_PPC_BOOK3S_64
		hpdp->pd = __pa(new) |
			   (shift_to_mmu_psize(pshift) << 2);
#elif defined(CONFIG_PPC_8xx)
		hpdp->pd = __pa(new) |
			   (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
						      _PMD_PAGE_512K) |
			   _PMD_PRESENT;
#else
		/* We use the old format for PPC_FSL_BOOK3E */
		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
	}

	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		/*
		 * hpdp points at the entry that was already populated by
		 * someone else; only the i entries before it were written
		 * here.  Step back before clearing so that exactly those
		 * entries are undone and the foreign entry is left alone.
		 */
		while (i-- > 0) {
			hpdp--;
			hpdp->pd = 0;
		}
		kmem_cache_free(cachep, new);
	}
	spin_unlock(&mm->page_table_lock);
	return 0;
}

130 131 132 133
/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
134
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
135 136 137 138 139 140 141
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT PUD_SHIFT
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

158
#ifdef CONFIG_PPC_BOOK3S_64
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT)
		/*
		 * We need to use hugepd table
		 */
		hpdp = (hugepd_t *)pg;
	else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT)
			hpdp = (hugepd_t *)pu;
		else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else
				hpdp = (hugepd_t *)pm;
		}
	}
#else
185
	if (pshift >= HUGEPD_PGD_SHIFT) {
186 187 188 189
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
190
		if (pshift >= HUGEPD_PUD_SHIFT) {
191 192 193 194 195 196 197
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}
198
#endif
199 200 201 202 203 204 205 206
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

207
	return hugepte_offset(*hpdp, addr, pdshift);
208 209
}

210
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
211
/* Build list of addresses of gigantic pages.  This function is used in early
212
 * boot before the buddy allocator is setup.
213
 */
B
Becky Bruce 已提交
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
/*
 * Record @number_of_pages consecutive gigantic pages of @page_size bytes,
 * the first one at @addr, in the list that belongs to that page size.
 *
 * Nothing is recorded when @addr or @page_size is 0 or when @page_size
 * does not map to an MMU page size index.  At most MAX_NUMBER_GPAGES
 * pages are recorded, which is the capacity of each list.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	unsigned long i;
	int idx;

	if (addr == 0 || page_size == 0)
		return;

	/* shift_to_mmu_psize() reports an unknown shift as a negative value */
	idx = shift_to_mmu_psize(__ffs(page_size));
	if (idx < 0 || idx >= MMU_PAGE_COUNT)
		return;

	if (number_of_pages > MAX_NUMBER_GPAGES)
		number_of_pages = MAX_NUMBER_GPAGES;

	gpage_freearray[idx].nr_gpages = number_of_pages;

	for (i = 0; i < number_of_pages; i++) {
		gpage_freearray[idx].gpage_list[i] = addr;
		addr += page_size;
	}
}

/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
237
	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
B
Becky Bruce 已提交
238 239 240 241 242 243 244 245 246 247
	int nr_gpages = gpage_freearray[idx].nr_gpages;

	if (nr_gpages == 0)
		return 0;

#ifdef CONFIG_HIGHMEM
	/*
	 * If gpages can be in highmem we can't use the trick of storing the
	 * data structure in the page; allocate space for this
	 */
248
	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
B
Becky Bruce 已提交
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
#else
	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
#endif

	list_add(&m->list, &huge_boot_pages);
	gpage_freearray[idx].nr_gpages = nr_gpages;
	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
	m->hstate = hstate;

	return 1;
}
/*
 * Scan the command line hugepagesz= options for gigantic pages; store those in
 * a list that we use to allocate the memory once all options are parsed.
 */

unsigned long gpage_npages[MMU_PAGE_COUNT];

268
/*
 * parse_args() callback that collects the number of gigantic pages
 * requested per page size into gpage_npages[].
 *
 * @param:  parameter name
 * @val:    parameter value; may be NULL for a parameter given without '='
 * @unused: not used
 * @arg:    not used
 *
 * Always returns 0 so that parsing continues.
 */
static int __init do_gpage_early_setup(char *param, char *val,
				       const char *unused, void *arg)
{
	static phys_addr_t size;
	unsigned long npages;
	int psize;

	/* Nothing below can work without a value to parse. */
	if (!val)
		return 0;

	/*
	 * The hugepagesz and hugepages cmdline options are interleaved.  We
	 * use the size variable to keep track of whether or not this was done
	 * properly and skip over instances where it is incorrect.  Other
	 * command-line parsing code will issue warnings, so we don't need to.
	 *
	 */
	if ((strcmp(param, "default_hugepagesz") == 0) ||
	    (strcmp(param, "hugepagesz") == 0)) {
		size = memparse(val, NULL);
	} else if (strcmp(param, "hugepages") == 0) {
		if (size != 0) {
			if (sscanf(val, "%lu", &npages) <= 0)
				npages = 0;
			if (npages > MAX_NUMBER_GPAGES) {
				pr_warn("MMU: %lu pages requested for page "
#ifdef CONFIG_PHYS_ADDR_T_64BIT
					"size %llu KB, limiting to "
#else
					"size %u KB, limiting to "
#endif
					__stringify(MAX_NUMBER_GPAGES) "\n",
					npages, size / 1024);
				npages = MAX_NUMBER_GPAGES;
			}
			/*
			 * A size that is not an MMU page size yields a
			 * negative index; drop the request instead of
			 * writing outside gpage_npages[].
			 */
			psize = shift_to_mmu_psize(__ffs(size));
			if (psize >= 0 && psize < MMU_PAGE_COUNT)
				gpage_npages[psize] = npages;
			size = 0;
		}
	}
	return 0;
}


/*
 * This function allocates physical space for pages that are larger than the
 * buddy allocator can handle.  We want to allocate these in highmem because
 * the amount of lowmem is limited.  This means that this function MUST be
 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
 * allocate to grab highmem.
 */
/*
 * Parse a private copy of the boot command line for gigantic page requests
 * and reserve physical memory for each requested size, handing every
 * reserved region to add_gpage().
 */
void __init reserve_hugetlb_gpages(void)
{
	static __initdata char cmdline[COMMAND_LINE_SIZE];
	phys_addr_t gpage_bytes, region;
	int psize;

	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
			NULL, &do_gpage_early_setup);

	/*
	 * Go from the largest page size index downwards.  Sizes the MMU
	 * does not define, or for which nothing was requested, are passed
	 * over; the walk ends at the first remaining size that is below
	 * MAX_ORDER + PAGE_SHIFT, i.e. no longer a gigantic page.
	 */
	psize = MMU_PAGE_COUNT;
	while (--psize >= 0) {
		if (mmu_psize_defs[psize].shift == 0 || gpage_npages[psize] == 0)
			continue;

		if (mmu_psize_to_shift(psize) < (MAX_ORDER + PAGE_SHIFT))
			break;

		gpage_bytes = (phys_addr_t)(1ULL << mmu_psize_to_shift(psize));
		region = memblock_alloc_base(gpage_bytes * gpage_npages[psize],
					     gpage_bytes,
					     MEMBLOCK_ALLOC_ANYWHERE);
		add_gpage(region, gpage_bytes, gpage_npages[psize]);
	}
}

343
#else /* !PPC_FSL_BOOK3E */
B
Becky Bruce 已提交
344 345

/* Build list of addresses of gigantic pages.  This function is used in early
346
 * boot before the buddy allocator is setup.
B
Becky Bruce 已提交
347 348
 */
/*
 * Append @number_of_pages consecutive gigantic pages of @page_size bytes,
 * the first one at @addr, to gpage_freearray.
 *
 * Nothing is recorded when @addr is 0.  Pages that would not fit into the
 * MAX_NUMBER_GPAGES slots of the array are not recorded.
 */
void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	/* Stop at the end of the array rather than writing past it. */
	while (number_of_pages > 0 && nr_gpages < MAX_NUMBER_GPAGES) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

360
/* Moves the gigantic page addresses from the temporary list to the
361 362 363
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
364 365 366 367 368 369 370
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
371
	m->hstate = hstate;
372 373
	return 1;
}
B
Becky Bruce 已提交
374
#endif
375

376
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
B
Becky Bruce 已提交
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
/*
 * Number of pointers collected in one batch before it is handed to RCU:
 * what is left of a page after the header, divided by sizeof(pte_t).
 */
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

/*
 * One page-sized batch of hugepte tables waiting to be freed.
 * @rcu:   head used to queue the batch for hugepd_free_rcu_callback()
 * @index: number of valid entries in @ptes
 * @ptes:  the tables to free, stored in the rest of the page
 */
struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

/* Batch currently being filled on each CPU, or NULL if there is none. */
static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

/*
 * RCU callback: return every hugepte table collected in the batch that
 * embeds @head to hugepte_cache, then free the page holding the batch.
 */
static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *fl;
	unsigned int n;

	fl = container_of(head, struct hugepd_freelist, rcu);

	n = 0;
	while (n < fl->index) {
		kmem_cache_free(hugepte_cache, fl->ptes[n]);
		n++;
	}

	free_page((unsigned long)fl);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

404
	batchp = &get_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
405 406 407 408 409

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    cpumask_equal(mm_cpumask(tlb->mm),
			  cpumask_of(smp_processor_id()))) {
		kmem_cache_free(hugepte_cache, hugepte);
410
		put_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
411 412 413 414 415 416 417 418 419 420 421 422 423
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
424
	put_cpu_var(hugepd_freelist_cur);
B
Becky Bruce 已提交
425
}
426 427
#else
/* Empty stub used when neither CONFIG_PPC_FSL_BOOK3E nor CONFIG_PPC_8xx is set. */
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
B
Becky Bruce 已提交
428 429
#endif

430 431 432
/*
 * Clear the hugepd entry (or run of entries) at @hpdp and free the hugepte
 * table behind it, provided the range [@start, @end), widened to the
 * granularity of the @pdshift level, lies within [@floor, @ceiling).
 * A @ceiling of 0 is not masked and compares as the top of the address
 * space.
 */
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *table = hugepd_page(*hpdp);
	unsigned int shift = hugepd_shift(*hpdp);
	unsigned long level_mask = ~((1UL << pdshift) - 1);
	unsigned int remaining = 1;

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		remaining = 1 << (shift - pdshift);

	start &= level_mask;
	if (start < floor)
		return;

	if (ceiling) {
		ceiling &= level_mask;
		if (!ceiling)
			return;
	}

	if (end - 1 > ceiling - 1)
		return;

	while (remaining) {
		hpdp->pd = 0;
		hpdp++;
		remaining--;
	}

	if (shift < pdshift)
		pgtable_free_tlb(tlb, table, pdshift - shift);
	else
		hugepd_free(tlb, table);
}

/*
 * Free the hugepd tables hanging off the pmd entries that cover
 * [@addr, @end) below @pud, then free the pmd table itself if the
 * PUD-aligned range fits within [@floor, @ceiling).
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 *
		 * The size is computed as unsigned long: an int shift
		 * overflows once the huge page shift reaches 31.
		 */
		more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

/*
 * Walk the pud entries that cover [@addr, @end) below @pgd.  Entries that
 * are hugepd pointers have their hugepte table freed, other populated
 * entries are descended into.  Afterwards the pud table itself is freed if
 * the PGDIR-aligned range fits within [@floor, @ceiling).
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 *
			 * The size is computed as unsigned long: an int shift
			 * overflows once the huge page shift reaches 31.
			 */
			more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
571
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
572 573 574 575 576 577 578
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
579 580 581 582 583 584 585 586 587 588
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
589
	 *
590 591 592
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
593 594 595 596
	 */

	do {
		next = pgd_addr_end(addr, end);
B
Becky Bruce 已提交
597
		pgd = pgd_offset(tlb->mm, addr);
598
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
599 600 601 602
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
603
			unsigned long more;
B
Becky Bruce 已提交
604 605
			/*
			 * Increment next by the size of the huge mapping since
606 607 608
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
B
Becky Bruce 已提交
609
			 */
610 611 612 613
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

614 615
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
616
		}
B
Becky Bruce 已提交
617
	} while (addr = next, addr != end);
L
Linus Torvalds 已提交
618 619
}

620 621 622 623
/*
 * We are holding mmap_sem, so a parallel huge page collapse cannot run.
 * To prevent hugepage split, disable irq.
 */
L
Linus Torvalds 已提交
624 625 626
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
627
	bool is_thp;
628
	pte_t *ptep, pte;
629
	unsigned shift;
630
	unsigned long mask, flags;
631 632 633
	struct page *page = ERR_PTR(-EINVAL);

	local_irq_save(flags);
634
	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
635 636 637
	if (!ptep)
		goto no_page;
	pte = READ_ONCE(*ptep);
638
	/*
639
	 * Verify it is a huge page else bail.
640 641 642
	 * Transparent hugepages are handled by generic code. We can skip them
	 * here.
	 */
643
	if (!shift || is_thp)
644
		goto no_page;
L
Linus Torvalds 已提交
645

646 647 648
	if (!pte_present(pte)) {
		page = NULL;
		goto no_page;
649
	}
650
	mask = (1UL << shift) - 1;
651
	page = pte_page(pte);
652 653
	if (page)
		page += (address & mask) / PAGE_SIZE;
L
Linus Torvalds 已提交
654

655
no_page:
656
	local_irq_restore(flags);
L
Linus Torvalds 已提交
657 658 659 660 661 662 663 664 665 666 667
	return page;
}

/*
 * Unconditionally hits BUG(); the return statement after it only
 * satisfies the function's signature.
 */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

668 669 670 671 672 673 674 675
/*
 * Unconditionally hits BUG(); the return statement after it only
 * satisfies the function's signature.
 */
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	BUG();
	return NULL;
}

D
David Gibson 已提交
676 677 678 679 680 681 682
/*
 * Return the end of the current step when walking [@addr, @end) in units
 * of @sz bytes: the next @sz-aligned boundary above @addr, or @end if that
 * comes first.  Both sides of the comparison are decremented so that a
 * value of 0 (wrap-around at the top of the address space) compares as the
 * largest address.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = addr + sz;

	boundary &= ~(sz - 1);

	if (boundary - 1 < end - 1)
		return boundary;

	return end;
}

683 684
/*
 * Collect page references for [@addr, @end) from the hugepte table behind
 * @hugepd, one hugepte at a time.
 *
 * Returns 1 when every hugepte in the range was handled by gup_hugepte(),
 * 0 as soon as one of them fails.
 */
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long hpage_sz = 1UL << hugepd_shift(hugepd);
	pte_t *cur = hugepte_offset(hugepd, addr, pdshift);

	for (;;) {
		unsigned long step_end = hugepte_addr_end(addr, end, hpage_sz);

		if (!gup_hugepte(cur, hpage_sz, addr, end, write, pages, nr))
			return 0;

		cur++;
		addr = step_end;
		if (addr == end)
			break;
	}

	return 1;
}
L
Linus Torvalds 已提交
699

700
#ifdef CONFIG_PPC_MM_SLICES
L
Linus Torvalds 已提交
701 702 703 704
/*
 * Pick an unmapped area for a hugetlbfs mapping of @file.  With radix
 * enabled the radix-specific helper does the work; otherwise the slice
 * allocator is asked for an area of the MMU page size that matches the
 * file's huge page size.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	int psize = shift_to_mmu_psize(huge_page_shift(h));
	unsigned long area;

	if (!radix_enabled())
		area = slice_get_unmapped_area(addr, len, flags, psize, 1);
	else
		area = radix__hugetlb_get_unmapped_area(file, addr, len,
						       pgoff, flags);

	return area;
}
713
#endif
L
Linus Torvalds 已提交
714

715 716
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
717
#ifdef CONFIG_PPC_MM_SLICES
718
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
719 720 721 722
	/* With radix we don't use slice, so derive it from vma*/
	if (!radix_enabled())
		return 1UL << mmu_psize_to_shift(psize);
#endif
B
Becky Bruce 已提交
723 724 725 726 727 728 729 730 731 732 733
	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	return huge_page_size(hstate_vma(vma));
}

/* True when @x is a power of two whose base-2 logarithm is even. */
static inline bool is_power_of_4(unsigned long x)
{
	if (!is_power_of_2(x))
		return false;

	return (__ilog2(x) % 2) == 0;
}

736
/*
 * Register @size (in bytes) as a huge page size.
 *
 * Returns 0 when the size was registered or had been set up before, and
 * -EINVAL when it is not larger than PAGE_SIZE, fails the platform's
 * power-of-4 / power-of-2 / slice limit check, or has no MMU page size
 * index.
 */
static int __init add_huge_page_size(unsigned long long size)
{
	int shift;
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;

	/*
	 * Only look for the lowest set bit once size is known to be
	 * non-zero; the result for 0 is not meaningful.
	 */
	shift = __ffs(size);

#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

/*
 * Handler for the "hugepagesz=" boot option: parse the size from @str and
 * register it, reporting sizes that add_huge_page_size() rejects.
 * Always returns 1.
 */
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long requested = memparse(str, &str);

	if (add_huge_page_size(requested) == 0)
		return 1;

	hugetlb_bad_size();
	pr_err("Invalid huge page size specified(%llu)\n", requested);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

B
Becky Bruce 已提交
782 783 784 785 786
struct kmem_cache *hugepte_cache;
/*
 * Initcall: register every MMU page size that can be used as a huge page
 * size, create the caches needed for its hugepte tables and choose the
 * default huge page shift.
 *
 * Returns -ENODEV on platforms other than FSL BOOK3E / 8xx when neither
 * radix nor the 16M page feature is available, 0 otherwise.
 */
static int __init hugetlbpage_init(void)
{
	int idx;

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!(radix_enabled() || mmu_has_feature(MMU_FTR_16M_PAGE)))
		return -ENODEV;
#endif
	for (idx = 0; idx < MMU_PAGE_COUNT; ++idx) {
		unsigned hshift;
		unsigned level_shift;

		/* Page size not defined for this MMU */
		if (!mmu_psize_defs[idx].shift)
			continue;

		hshift = mmu_psize_to_shift(idx);

		if (add_huge_page_size(1ULL << hshift) < 0)
			continue;

		/* Page table level whose entries hold the hugepd pointer */
		if (hshift < HUGEPD_PUD_SHIFT)
			level_shift = PMD_SHIFT;
		else if (hshift < HUGEPD_PGD_SHIFT)
			level_shift = PUD_SHIFT;
		else
			level_shift = PGDIR_SHIFT;

		/*
		 * if we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (level_shift > hshift) {
			pgtable_cache_add(level_shift - hshift, NULL);
		}
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/*
	 * Default large page size: the first of 16M, 1M and 2M that is
	 * defined.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}
853

854
arch_initcall(hugetlbpage_init);
855 856 857 858

/*
 * Flush the data and instruction caches for every base page of the
 * compound page @page.  When @page is in highmem each base page is mapped
 * with kmap_atomic() for the duration of its flush.
 */
void flush_dcache_icache_hugepage(struct page *page)
{
	int idx;

	BUG_ON(!PageCompound(page));

	for (idx = 0; idx < (1UL << compound_order(page)); idx++) {
		void *kaddr;

		if (PageHighMem(page)) {
			kaddr = kmap_atomic(page+idx);
			__flush_dcache_icache(kaddr);
			kunmap_atomic(kaddr);
			continue;
		}

		__flush_dcache_icache(page_address(page+idx));
	}
}
873 874 875 876 877 878 879

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
A
Aneesh Kumar K.V 已提交
880 881
 * (3) leaf pte for huge page _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
882 883 884
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the page and take a ref on it.
885 886
 * This function needs to be called with interrupts disabled. We use this variant
 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
887
 */
888

889
pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
890
				   bool *is_thp, unsigned *shift)
891
{
892 893 894
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
895 896 897 898 899 900 901
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

902 903 904
	if (is_thp)
		*is_thp = false;

905
	pgdp = pgdir + pgd_index(ea);
906
	pgd  = READ_ONCE(*pgdp);
907
	/*
908 909 910 911
	 * Always operate on the local stack value. This make sure the
	 * value don't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap. The return pte_t * is still not
	 * stable. So should be checked there for above conditions.
912
	 */
913
	if (pgd_none(pgd))
914
		return NULL;
915 916
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
917
		goto out;
918
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
919
		hpdp = (hugepd_t *)&pgd;
920
	else {
921 922 923 924 925
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
926
		pdshift = PUD_SHIFT;
927
		pudp = pud_offset(&pgd, ea);
928
		pud  = READ_ONCE(*pudp);
929

930
		if (pud_none(pud))
931
			return NULL;
932 933
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
934
			goto out;
935
		} else if (is_hugepd(__hugepd(pud_val(pud))))
936
			hpdp = (hugepd_t *)&pud;
937
		else {
938
			pdshift = PMD_SHIFT;
939
			pmdp = pmd_offset(&pud, ea);
940
			pmd  = READ_ONCE(*pmdp);
941 942 943 944
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it mark the pmd none and do a hpte invalidate.
			 */
945
			if (pmd_none(pmd))
946
				return NULL;
947

948 949 950 951 952 953 954 955
			if (pmd_trans_huge(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
956
				ret_pte = (pte_t *) pmdp;
957
				goto out;
958
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
959
				hpdp = (hugepd_t *)&pmd;
960
			else
961
				return pte_offset_kernel(&pmd, ea);
962 963 964 965 966
		}
	}
	if (!hpdp)
		return NULL;

967
	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
968 969 970 971 972 973
	pdshift = hugepd_shift(*hpdp);
out:
	if (shift)
		*shift = pdshift;
	return ret_pte;
}
974
EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
975 976 977 978 979 980

/*
 * Collect references to the base pages of the huge page mapped by @ptep
 * for the part of [@addr, @end) that falls inside that huge page.
 *
 * @sz is the huge page size in bytes.  The pages are appended to @pages at
 * index *@nr, which is advanced accordingly.
 *
 * Returns 1 on success.  Returns 0, with *@nr restored, when the pte lacks
 * the required permission bits, when the speculative reference on the head
 * page cannot be taken, or when the pte changed while the references were
 * being taken.
 */
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long required;
	unsigned long hpage_end;
	struct page *head, *page;
	pte_t pte;
	int refs = 0;

	/* Never walk past the end of this huge page */
	hpage_end = (addr + sz) & ~(sz-1);
	if (hpage_end < end)
		end = hpage_end;

	pte = READ_ONCE(*ptep);

	/*
	 * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
	 * as 0 and _PAGE_RO has to be set when a page is not writable
	 */
	required = _PAGE_PRESENT | _PAGE_READ;
	if (!write)
		required |= _PAGE_RO;
	else
		required |= _PAGE_WRITE;

	if ((pte_val(pte) & required) != required)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	head = pte_page(pte);
	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);

	for (;;) {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;

		addr += PAGE_SIZE;
		if (addr == end)
			break;
	}

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		for (; refs > 0; refs--)
			put_page(head);
		return 0;
	}

	return 1;
}