// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/stop_machine.h>

#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
#include <asm/firmware.h>

#include <mm/mmu_decl.h>

#include <trace/events/thp.h>

#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
#warning Limited user VSID range means pagetable space is wasted
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of whether the corresponding PFN is valid or not. This achieves a
 * constant relationship between the address of a struct page and its PFN.
 *
 * During boot or memory hotplug operation when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 *		----------------------------------------------
 *		| PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
 *		----------------------------------------------
 *
 *	   f000000000000000                  c000000000000000
 * vmemmap +--------------+                  +--------------+
 *  +      |  page struct | +--------------> |  page struct |
 *  |      +--------------+                  +--------------+
 *  |      |  page struct | +--------------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | +       +------> |  page struct |
 *  |      +--------------+         |        +--------------+
 *  |      |  page struct |         |   +--> |  page struct |
 *  |      +--------------+         |   |    +--------------+
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct |         |   |
 *  |      +--------------+         |   |
 *  |      |  page struct | +-------+   |
 *  |      +--------------+             |
 *  |      |  page struct | +-----------+
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  v      +--------------+
 *
 *		-----------------------------------------
 *		| RELATION BETWEEN STRUCT PAGES AND PFNS|
 *		-----------------------------------------
 *
 * vmemmap +--------------+                 +---------------+
 *  +      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  v      +--------------+                 +---------------+
 */
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
				       unsigned long page_size,
				       unsigned long phys)
{
	int rc;

	if ((start + page_size) >= H_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	rc = htab_bolt_mapping(start, start + page_size, phys,
			       pgprot_val(PAGE_KERNEL),
			       mmu_vmemmap_psize, mmu_kernel_ssize);
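	/*
	 * If bolting failed partway through, remove whatever portion of the
	 * range did get mapped; -ENOENT from the removal just means nothing
	 * was bolted.
	 */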
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
			      unsigned long page_size)
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		p4dp = p4d_offset(pgdp, ea);
		pudp = pud_alloc(&init_mm, p4dp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
				      mmu_io_psize, mmu_kernel_ssize)) {
			pr_err("Failed to do bolted mapping IO memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

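	/* Make sure the new mapping is visible before it is used. */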
	smp_wmb();
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				    pmd_t *pmdp, unsigned long clr,
				    unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

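	/*
	 * Atomically update the PMD: spin while H_PAGE_BUSY is set, then
	 * clear the bits in 'clr' and apply the bits in 'set' using a
	 * ldarx/stdcx. sequence.
	 */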
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}

pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_lock. But we could very well be in a
	 * hash_page with local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	serialize_against_pte_lookup(vma->vm_mm);
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This makes sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_lock and
	 * hence wait for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				  pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;

	assert_spin_locked(pmd_lockptr(mm, pmdp));
	/*
	 * We store the pgtable in the second half of the PMD.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * Expose the deposited pgtable to other CPUs before we set the
	 * hugepage PTE at the pmd level: the hash fault code looks at the
	 * deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

/*
 * A Linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_user_vsid(&mm->context, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

	if (mm_is_thread_local(mm))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

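	/*
	 * Clear every bit of the PMD (clr = ~0UL); this also flushes any
	 * hash table entries that were created for the hugepage.
	 */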
	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Zero out the old valid bit and hash index details so that the
	 * hash fault code doesn't look at stale values.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return old_pmd;
}

int hash__has_transparent_hugepage(void)
{

	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support a 16MB hugepage in a segment
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_STRICT_KERNEL_RWX

struct change_memory_parms {
	unsigned long start, end, newpp;
	unsigned int step, nr_cpus, master_cpu;
	atomic_t cpu_counter;
};

// We'd rather this was on the stack but it has to be in the RMO
static struct change_memory_parms chmem_parms;

// And therefore we need a lock to protect it from concurrent use
static DEFINE_MUTEX(chmem_lock);

static void change_memory_range(unsigned long start, unsigned long end,
				unsigned int step, unsigned long newpp)
{
	unsigned long idx;

	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
		 start, end, newpp, step);

	for (idx = start; idx < end; idx += step)
		/* Not sure if we can do much with the return value */
		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
							mmu_kernel_ssize);
}

static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
	unsigned long msr, tmp, flags;
	int *p;

	p = &parms->cpu_counter.counter;

	local_irq_save(flags);
	hard_irq_disable();

	asm volatile (
	// Switch to real mode and leave interrupts off
	"mfmsr	%[msr]			;"
	"li	%[tmp], %[MSR_IR_DR]	;"
	"andc	%[tmp], %[msr], %[tmp]	;"
	"mtmsrd %[tmp]			;"

	// Tell the master we are in real mode
	"1:				"
	"lwarx	%[tmp], 0, %[p]		;"
	"addic	%[tmp], %[tmp], -1	;"
	"stwcx.	%[tmp], 0, %[p]		;"
	"bne-	1b			;"

	// Spin until the counter goes to zero
	"2:				;"
	"lwz	%[tmp], 0(%[p])		;"
	"cmpwi	%[tmp], 0		;"
	"bne-	2b			;"

	// Switch back to virtual mode
	"mtmsrd %[msr]			;"

	: // outputs
	  [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
	: // inputs
	  [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
	: // clobbers
	  "cc", "xer"
	);

	local_irq_restore(flags);

	return 0;
}

static int change_memory_range_fn(void *data)
{
	struct change_memory_parms *parms = data;

	if (parms->master_cpu != smp_processor_id())
		return chmem_secondary_loop(parms);

	// Wait for all but one CPU (this one) to call-in
	while (atomic_read(&parms->cpu_counter) > 1)
		barrier();

	change_memory_range(parms->start, parms->end, parms->step, parms->newpp);

	mb();

	// Signal the other CPUs that we're done
	atomic_dec(&parms->cpu_counter);

	return 0;
}

static bool hash__change_memory_range(unsigned long start, unsigned long end,
				      unsigned long newpp)
{
	unsigned int step, shift;

	shift = mmu_psize_defs[mmu_linear_psize].shift;
	step = 1 << shift;

	start = ALIGN_DOWN(start, step);
	end = ALIGN(end, step); // aligns up

	if (start >= end)
		return false;

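	/*
	 * Under an LPAR we can't safely update the bolted HPTEs while other
	 * CPUs may be executing through the mappings being changed, so use
	 * stop_machine() and have every CPU sit in real mode (see
	 * chmem_secondary_loop()) while the update is performed.
	 */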
	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		mutex_lock(&chmem_lock);

		chmem_parms.start = start;
		chmem_parms.end = end;
		chmem_parms.step = step;
		chmem_parms.newpp = newpp;
		chmem_parms.master_cpu = smp_processor_id();

		cpus_read_lock();

		atomic_set(&chmem_parms.cpu_counter, num_online_cpus());

		// Ensure state is consistent before we call the other CPUs
		mb();

		stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
					cpu_online_mask);

		cpus_read_unlock();
		mutex_unlock(&chmem_lock);
	} else
		change_memory_range(start, end, step, newpp);

	return true;
}

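/*
 * Mark kernel text and rodata (_stext .. __end_rodata) read-only and
 * executable by updating the protection on the bolted HPTEs that map them.
 */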
void hash__mark_rodata_ro(void)
{
	unsigned long start, end, pp;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);

	WARN_ON(!hash__change_memory_range(start, end, pp));
}

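/*
 * Restore normal PAGE_KERNEL (read-write, no-execute) protection on the
 * init sections (__init_begin .. __init_end) once they are no longer
 * executed.
 */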
void hash__mark_initmem_nx(void)
{
	unsigned long start, end, pp;

	start = (unsigned long)__init_begin;
	end = (unsigned long)__init_end;

	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);

	WARN_ON(!hash__change_memory_range(start, end, pp));
}
#endif