hash_pgtable.c 15.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-or-later
2 3 4 5 6 7 8
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
9
#include <linux/mm_types.h>
10
#include <linux/mm.h>
11
#include <linux/stop_machine.h>
12

13 14
#include <asm/sections.h>
#include <asm/mmu.h>
15 16
#include <asm/tlb.h>

17
#include <mm/mmu_decl.h>
18

19 20
#include <trace/events/thp.h>

21 22 23 24
#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
#warning Limited user VSID range means pagetable space is wasted
#endif

25
#ifdef CONFIG_SPARSEMEM_VMEMMAP
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of the fact whether the corresponding PFN is valid or not. This achieves
 * constant relationship between address of struct page and its PFN.
 *
 * During boot or memory hotplug operation when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 *		----------------------------------------------
 *		| PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
 *		----------------------------------------------
 *
 *	   f000000000000000                  c000000000000000
 * vmemmap +--------------+                  +--------------+
 *  +      |  page struct | +--------------> |  page struct |
 *  |      +--------------+                  +--------------+
 *  |      |  page struct | +--------------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | +       +------> |  page struct |
 *  |      +--------------+         |        +--------------+
 *  |      |  page struct |         |   +--> |  page struct |
 *  |      +--------------+         |   |    +--------------+
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct | +-------+   |
 *  |      +--------------+             |
 *  |      |  page struct | +-----------+
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  v      +--------------+
 *
 *		-----------------------------------------
 *		| RELATION BETWEEN STRUCT PAGES AND PFNS|
 *		-----------------------------------------
 *
 * vmemmap +--------------+                 +---------------+
 *  +      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  v      +--------------+                 +---------------+
 */
101 102 103 104
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 *
 */
105 106 107
int __meminit hash__vmemmap_create_mapping(unsigned long start,
				       unsigned long page_size,
				       unsigned long phys)
108
{
109 110 111
	int rc;

	if ((start + page_size) >= H_VMEMMAP_END) {
112
		pr_warn("Outside the supported range\n");
113 114 115 116 117 118
		return -1;
	}

	rc = htab_bolt_mapping(start, start + page_size, phys,
			       pgprot_val(PAGE_KERNEL),
			       mmu_vmemmap_psize, mmu_kernel_ssize);
119 120 121 122 123 124 125 126 127 128
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
129 130
void hash__vmemmap_remove_mapping(unsigned long start,
			      unsigned long page_size)
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it
 */
146
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
147 148
{
	pgd_t *pgdp;
149
	p4d_t *p4dp;
150 151 152 153
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

154
	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
155 156
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
157 158
		p4dp = p4d_offset(pgdp, ea);
		pudp = pud_alloc(&init_mm, p4dp, ea);
159 160 161 162 163 164 165 166
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
167
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
168 169 170 171 172 173 174
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 *
		 */
175
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
176 177 178 179 180 181 182 183 184 185
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

	smp_wmb();
	return 0;
}
186 187 188

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

189 190 191
unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				    pmd_t *pmdp, unsigned long clr,
				    unsigned long set)
192 193 194 195 196
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
197
	WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
198
	assert_spin_locked(pmd_lockptr(mm, pmdp));
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}

222 223
pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
224 225 226 227 228
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));
229
	VM_BUG_ON(pmd_devmap(*pmdp));
230 231 232 233 234 235 236 237 238

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
239
	 * will wait on mmap_lock. But we could very well be in a
240 241 242 243 244 245 246 247
	 * hash_page with local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
248
	serialize_against_pte_lookup(vma->vm_mm);
249 250 251 252
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This make sure we take a
	 * fault and will find the pmd as none, which will
253
	 * result in a major fault which takes mmap_lock and
254 255 256 257 258 259 260 261 262 263 264 265
	 * hence wait for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
266 267
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				  pgtable_t pgtable)
268 269
{
	pgtable_t *pgtable_slot;
270 271

	assert_spin_locked(pmd_lockptr(mm, pmdp));
272 273 274 275 276 277 278 279 280 281 282 283 284 285
	/*
	 * we store the pgtable in the second half of PMD
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * expose the deposited pgtable to other cpus.
	 * before we set the hugepage PTE at pmd level
	 * hash fault code looks at the deposted pgtable
	 * to store hash index values.
	 */
	smp_wmb();
}

286
pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
287 288 289 290
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

291 292
	assert_spin_locked(pmd_lockptr(mm, pmdp));

293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * neesd to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;

	/* get the base page size,vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
331
		vsid = get_user_vsid(&mm->context, addr, ssize);
332 333 334 335 336 337
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

338
	if (mm_is_thread_local(mm))
339 340 341 342 343
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

344 345
pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				unsigned long addr, pmd_t *pmdp)
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out old valid and hash index details
	 * hash fault look at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return old_pmd;
}

369
int hash__has_transparent_hugepage(void)
370 371 372 373 374 375 376 377 378 379
{

	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
J
Julia Lawall 已提交
380
	 * We need to make sure that we support 16MB hugepage in a segment
381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
398 399
EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);

400
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
401 402

#ifdef CONFIG_STRICT_KERNEL_RWX
403 404 405 406 407 408 409 410 411 412 413 414 415

struct change_memory_parms {
	unsigned long start, end, newpp;
	unsigned int step, nr_cpus, master_cpu;
	atomic_t cpu_counter;
};

// We'd rather this was on the stack but it has to be in the RMO
static struct change_memory_parms chmem_parms;

// And therefore we need a lock to protect it from concurrent use
static DEFINE_MUTEX(chmem_lock);

416 417 418 419 420 421 422 423 424 425 426 427 428 429
static void change_memory_range(unsigned long start, unsigned long end,
				unsigned int step, unsigned long newpp)
{
	unsigned long idx;

	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
		 start, end, newpp, step);

	for (idx = start; idx < end; idx += step)
		/* Not sure if we can do much with the return value */
		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
							mmu_kernel_ssize);
}

430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
	unsigned long msr, tmp, flags;
	int *p;

	p = &parms->cpu_counter.counter;

	local_irq_save(flags);
	hard_irq_disable();

	asm volatile (
	// Switch to real mode and leave interrupts off
	"mfmsr	%[msr]			;"
	"li	%[tmp], %[MSR_IR_DR]	;"
	"andc	%[tmp], %[msr], %[tmp]	;"
	"mtmsrd %[tmp]			;"

	// Tell the master we are in real mode
	"1:				"
	"lwarx	%[tmp], 0, %[p]		;"
	"addic	%[tmp], %[tmp], -1	;"
	"stwcx.	%[tmp], 0, %[p]		;"
	"bne-	1b			;"

	// Spin until the counter goes to zero
	"2:				;"
	"lwz	%[tmp], 0(%[p])		;"
	"cmpwi	%[tmp], 0		;"
	"bne-	2b			;"

	// Switch back to virtual mode
	"mtmsrd %[msr]			;"

	: // outputs
	  [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
	: // inputs
	  [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
	: // clobbers
	  "cc", "xer"
	);

	local_irq_restore(flags);

	return 0;
}

static int change_memory_range_fn(void *data)
{
	struct change_memory_parms *parms = data;

	if (parms->master_cpu != smp_processor_id())
		return chmem_secondary_loop(parms);

	// Wait for all but one CPU (this one) to call-in
	while (atomic_read(&parms->cpu_counter) > 1)
		barrier();

	change_memory_range(parms->start, parms->end, parms->step, parms->newpp);

	mb();

	// Signal the other CPUs that we're done
	atomic_dec(&parms->cpu_counter);

	return 0;
}

497 498
static bool hash__change_memory_range(unsigned long start, unsigned long end,
				      unsigned long newpp)
499 500 501 502 503 504
{
	unsigned int step, shift;

	shift = mmu_psize_defs[mmu_linear_psize].shift;
	step = 1 << shift;

505 506
	start = ALIGN_DOWN(start, step);
	end = ALIGN(end, step); // aligns up
507

508 509
	if (start >= end)
		return false;
510

511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		mutex_lock(&chmem_lock);

		chmem_parms.start = start;
		chmem_parms.end = end;
		chmem_parms.step = step;
		chmem_parms.newpp = newpp;
		chmem_parms.master_cpu = smp_processor_id();

		cpus_read_lock();

		atomic_set(&chmem_parms.cpu_counter, num_online_cpus());

		// Ensure state is consistent before we call the other CPUs
		mb();

		stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
					cpu_online_mask);

		cpus_read_unlock();
		mutex_unlock(&chmem_lock);
	} else
		change_memory_range(start, end, step, newpp);
534

535 536 537 538 539
	return true;
}

void hash__mark_rodata_ro(void)
{
540
	unsigned long start, end, pp;
541 542 543 544

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

545 546 547
	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);

	WARN_ON(!hash__change_memory_range(start, end, pp));
548
}
549 550 551 552 553 554 555 556

void hash__mark_initmem_nx(void)
{
	unsigned long start, end, pp;

	start = (unsigned long)__init_begin;
	end = (unsigned long)__init_end;

557
	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
558 559 560

	WARN_ON(!hash__change_memory_range(start, end, pp));
}
561
#endif