// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>
#include <asm/set_memory.h>

#include "../mm_internal.h"

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgd_t		*pgd;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	unsigned long	numpages;
	unsigned long	curpage;
	unsigned long	pfn;
	unsigned int	flags;
	unsigned int	force_split		: 1,
			force_static_prot	: 1;
	struct page	**pages;
};

enum cpa_warn {
	CPA_CONFLICT,
	CPA_PROTECT,
	CPA_DETECT,
};

static const int cpa_warn_level = CPA_PROTECT;

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
 * entries to change the page attribute in parallel while some other CPU is
 * splitting a large page entry and changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	/* Protect against CPA */
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
}

static void split_page_count(int level)
{
	if (direct_pages_count[level] == 0)
		return;

	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G:    %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_CPA_STATISTICS

static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;

static inline void cpa_inc_1g_checked(void)
{
	cpa_1g_checked++;
}

static inline void cpa_inc_2m_checked(void)
{
	cpa_2m_checked++;
}

static inline void cpa_inc_4k_install(void)
{
	cpa_4k_install++;
}

static inline void cpa_inc_lp_sameprot(int level)
{
	if (level == PG_LEVEL_1G)
		cpa_1g_sameprot++;
	else
		cpa_2m_sameprot++;
}

static inline void cpa_inc_lp_preserved(int level)
{
	if (level == PG_LEVEL_1G)
		cpa_1g_preserved++;
	else
		cpa_2m_preserved++;
}

static int cpastats_show(struct seq_file *m, void *p)
{
	seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
	seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
	seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
	seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
	seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
	seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
	return 0;
}

static int cpastats_open(struct inode *inode, struct file *file)
{
	return single_open(file, cpastats_show, NULL);
}

static const struct file_operations cpastats_fops = {
	.open		= cpastats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init cpa_stats_init(void)
{
	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
			    &cpastats_fops);
	return 0;
}
late_initcall(cpa_stats_init);
#else
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif


static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr <= end;
}

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa_symbol(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	/* Do not reference physical address outside the kernel. */
	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}

static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/*
	 * Kernel text has an alias mapping at a high address, known
	 * here as "highmap".
	 */
	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
}

#else

static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/* There is no highmap on 32-bit */
	return false;
}

#endif

/*
 * See set_mce_nospec().
 *
 * Machine check recovery code needs to change cache mode of poisoned pages to
 * UC to avoid speculative access logging another error. But passing the
 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
 * speculative access. So we cheat and flip the top bit of the address. This
 * works fine for the code that updates the page tables. But at the end of the
 * process we need to flush the TLB and cache and the non-canonical address
 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
 *
 * But in the common case we already have a canonical address. This code
 * will fix the top bit if needed and is a no-op otherwise.
 */
static inline unsigned long fix_addr(unsigned long addr)
{
#ifdef CONFIG_X86_64
	return (long)(addr << 1) >> 1;
#else
	return addr;
#endif
}
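/*
 * Worked example (illustrative, not part of the original source): with the
 * top bit flipped by set_mce_nospec(), e.g. addr == 0x7fff888000000000, the
 * shift-left/arithmetic-shift-right pair sign-extends from bit 62:
 *
 *	0x7fff888000000000 << 1        == 0xffff110000000000
 *	(long)0xffff110000000000 >> 1  == 0xffff888000000000
 *
 * recovering the canonical direct-map address; an already-canonical address
 * passes through unchanged.
 */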

static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[idx];

		if (unlikely(PageHighMem(page)))
			return 0;

		return (unsigned long)page_address(page);
	}

	if (cpa->flags & CPA_ARRAY)
		return cpa->vaddr[idx];

	return *cpa->vaddr + idx * PAGE_SIZE;
}

/*
 * Flushing functions
 */

static void clflush_cache_range_opt(void *vaddr, unsigned int size)
{
	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
	void *vend = vaddr + size;

	if (p >= vend)
		return;

	for (; p < vend; p += clflush_size)
		clflushopt(p);
}

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
 * SFENCE to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	mb();
	clflush_cache_range_opt(vaddr, size);
	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);

void arch_invalidate_pmem(void *addr, size_t size)
{
	clflush_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);

static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around Errata in early athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

void __cpa_flush_tlb(void *data)
{
	struct cpa_data *cpa = data;
	unsigned int i;

	for (i = 0; i < cpa->numpages; i++)
		__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}

static void cpa_flush(struct cpa_data *data, int cache)
{
	struct cpa_data *cpa = data;
	unsigned int i;

	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		cpa_flush_all(cache);
		return;
	}

	if (cpa->numpages <= tlb_single_page_flush_ceiling)
		on_each_cpu(__cpa_flush_tlb, cpa, 1);
	else
		flush_tlb_all();

	if (!cache)
		return;

	mb();
	for (i = 0; i < cpa->numpages; i++) {
		unsigned long addr = __cpa_addr(cpa, i);
		unsigned int level;

		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
	}
	mb();
}

static bool overlaps(unsigned long r1_start, unsigned long r1_end,
		     unsigned long r2_start, unsigned long r2_end)
{
	return (r1_start <= r2_end && r1_end >= r2_start) ||
		(r2_start <= r1_end && r2_end >= r1_start);
}

#ifdef CONFIG_PCI_BIOS
/*
 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
 * based config access (CONFIG_PCI_GOBIOS) support.
 */
#define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)

static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
		return _PAGE_NX;
	return 0;
}
#else
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	return 0;
}
#endif

/*
 * The .rodata section needs to be read-only. Using the pfn catches all
 * aliases.  This also includes __ro_after_init, so do not enforce until
 * kernel_set_to_readonly is true.
 */
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
	unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));

	/*
	 * Note: __end_rodata is page aligned and not inclusive, so
	 * subtract 1 to get the last enforced PFN in the rodata area.
	 */
	epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;

	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
		return _PAGE_RW;
	return 0;
}

/*
 * Protect kernel text against becoming non executable by forbidding
 * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
 * out of which the kernel actually executes.  Do not protect the low
 * mapping.
 *
 * This does not cover __inittext since that is gone after boot.
 */
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
	unsigned long t_end = (unsigned long)_etext - 1;
	unsigned long t_start = (unsigned long)_text;

	if (overlaps(start, end, t_start, t_end))
		return _PAGE_NX;
	return 0;
}

#if defined(CONFIG_X86_64)
/*
 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
 * the kernel text mappings for the large page aligned text and rodata
 * sections will always be read-only. The kernel identity mappings covering
 * the holes caused by this alignment can be anything the user asks for.
 *
 * This will preserve the large page mappings for kernel text/data at no
 * extra cost.
 */
static pgprotval_t protect_kernel_text_ro(unsigned long start,
					  unsigned long end)
{
	unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
	unsigned long t_start = (unsigned long)_text;
	unsigned int level;

	if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
		return 0;
	/*
	 * Don't enforce the !RW mapping for the kernel text mapping, if
	 * the current mapping is already using small page mapping.  No
	 * need to work hard to preserve large page mappings in this case.
	 *
	 * This also fixes the Linux Xen paravirt guest boot failure caused
	 * by unexpected read-only mappings for kernel identity
	 * mappings. In this paravirt guest case, the kernel text mapping
	 * and the kernel identity mapping share the same page-table pages,
	 * so the protections for kernel text and identity mappings have to
	 * be the same.
	 */
	if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
		return _PAGE_RW;
	return 0;
}
#else
static pgprotval_t protect_kernel_text_ro(unsigned long start,
					  unsigned long end)
{
	return 0;
}
#endif

static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}

static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
				  unsigned long start, unsigned long end,
				  unsigned long pfn, const char *txt)
{
	static const char *lvltxt[] = {
		[CPA_CONFLICT]	= "conflict",
		[CPA_PROTECT]	= "protect",
		[CPA_DETECT]	= "detect",
	};

	if (warnlvl > cpa_warn_level || !conflicts(prot, val))
		return;

	pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
		lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
		(unsigned long long)val);
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
					  unsigned long pfn, unsigned long npg,
					  unsigned long lpsize, int warnlvl)
{
	pgprotval_t forbidden, res;
	unsigned long end;

	/*
	 * There is no point in checking RW/NX conflicts when the requested
	 * mapping is setting the page !PRESENT.
	 */
	if (!(pgprot_val(prot) & _PAGE_PRESENT))
		return prot;

	/* Operate on the virtual address */
	end = start + npg * PAGE_SIZE - 1;

	res = protect_kernel_text(start, end);
	check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
	forbidden = res;

	/*
	 * Special case to preserve a large page. If the change spans the
	 * full large page mapping then there is no point to split it
	 * up. Happens with ftrace and is going to be removed once ftrace
	 * switched to text_poke().
	 */
	if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
		res = protect_kernel_text_ro(start, end);
		check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
		forbidden |= res;
	}

	/* Check the PFN directly */
	res = protect_pci_bios(pfn, pfn + npg - 1);
	check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
	forbidden |= res;

	res = protect_rodata(pfn, pfn + npg - 1);
	check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
	forbidden |= res;

	return __pgprot(pgprot_val(prot) & ~forbidden);
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry and the level of the mapping.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
			     unsigned int *level)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return NULL;

	*level = PG_LEVEL_512G;
	if (p4d_large(*p4d) || !p4d_present(*p4d))
		return (pte_t *)p4d;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);

static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
				  unsigned int *level)
{
	if (cpa->pgd)
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					       address, level);

	return lookup_address(address, level);
}

/*
 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 * or NULL if not present.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(address);
	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems.  The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * This could be optimized, but it is only intended to be
 * used at initialization time, and keeping it
 * unoptimized should increase the testing coverage for
 * the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
	unsigned long virt_addr = (unsigned long)__virt_addr;
	phys_addr_t phys_addr;
	unsigned long offset;
	enum pg_level level;
	pte_t *pte;

	pte = lookup_address(virt_addr, &level);
	BUG_ON(!pte);

	/*
	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
	 * make 32-PAE kernel work correctly.
	 */
	switch (level) {
	case PG_LEVEL_1G:
		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PUD_PAGE_MASK;
		break;
	case PG_LEVEL_2M:
		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PMD_PAGE_MASK;
		break;
	default:
		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
		offset = virt_addr & ~PAGE_MASK;
	}

	return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
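/*
 * Usage sketch (illustrative only, not part of the original source): this
 * helper is what to reach for when __pa() would be wrong for the mapping,
 * e.g. a vmalloc()'d buffer whose physical address is needed:
 *
 *	void *buf = vmalloc(PAGE_SIZE);
 *	phys_addr_t pa = slow_virt_to_phys(buf);
 */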

/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
	if (!SHARED_KERNEL_PMD) {
		struct page *page;

		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			p4d = p4d_offset(pgd, address);
			pud = pud_offset(p4d, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
	}
#endif
}

static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{
	/*
	 * _PAGE_GLOBAL means "global page" for present PTEs.
	 * But, it is also used to indicate _PAGE_PROTNONE
	 * for non-present PTEs.
	 *
	 * This ensures that a _PAGE_GLOBAL PTE going from
	 * present to non-present is not confused as
	 * _PAGE_PROTNONE.
	 */
	if (!(pgprot_val(prot) & _PAGE_PRESENT))
		pgprot_val(prot) &= ~_PAGE_GLOBAL;

	return prot;
}

static int __should_split_large_page(pte_t *kpte, unsigned long address,
				     struct cpa_data *cpa)
{
	unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
	pgprot_t old_prot, new_prot, req_prot, chk_prot;
	pte_t new_pte, *tmp;
	enum pg_level level;

	/*
	 * Check for races, another CPU might have split this page
	 * up already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte)
		return 1;

	switch (level) {
	case PG_LEVEL_2M:
		old_prot = pmd_pgprot(*(pmd_t *)kpte);
		old_pfn = pmd_pfn(*(pmd_t *)kpte);
		cpa_inc_2m_checked();
		break;
	case PG_LEVEL_1G:
		old_prot = pud_pgprot(*(pud_t *)kpte);
		old_pfn = pud_pfn(*(pud_t *)kpte);
		cpa_inc_1g_checked();
		break;
	default:
		return -EINVAL;
	}

	psize = page_level_size(level);
	pmask = page_level_mask(level);

	/*
	 * Calculate the number of pages, which fit into this large
	 * page starting at address:
	 */
	lpaddr = (address + psize) & pmask;
	numpages = (lpaddr - address) >> PAGE_SHIFT;
	if (numpages < cpa->numpages)
		cpa->numpages = numpages;

	/*
	 * We are safe now. Check whether the new pgprot is the same:
	 * Convert protection attributes to 4k-format, as cpa->mask* are set
	 * up accordingly.
	 */

	/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
	req_prot = pgprot_large_2_4k(old_prot);

	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * req_prot is in format of 4k pages. It must be converted to large
	 * page format: the caching mode includes the PAT bit located at
	 * different bit positions in the two formats.
	 */
	req_prot = pgprot_4k_2_large(req_prot);
	req_prot = pgprot_clear_protnone_bits(req_prot);
	if (pgprot_val(req_prot) & _PAGE_PRESENT)
		pgprot_val(req_prot) |= _PAGE_PSE;

	/*
	 * old_pfn points to the large page base pfn. So we need to add the
	 * offset of the virtual address:
	 */
	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
	cpa->pfn = pfn;

	/*
	 * Calculate the large page base address and the number of 4K pages
	 * in the large page
	 */
	lpaddr = address & pmask;
	numpages = psize >> PAGE_SHIFT;

	/*
	 * Sanity check that the existing mapping is correct versus the static
	 * protections. static_protections() guards against !PRESENT, so no
	 * extra conditional required here.
	 */
	chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
				      psize, CPA_CONFLICT);

	if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
		/*
		 * Split the large page and tell the split code to
		 * enforce static protections.
		 */
		cpa->force_static_prot = 1;
		return 1;
	}

	/*
	 * Optimization: If the requested pgprot is the same as the current
	 * pgprot, then the large page can be preserved and no updates are
	 * required independent of alignment and length of the requested
	 * range. The above already established that the current pgprot is
	 * correct, which in consequence makes the requested pgprot correct
	 * as well if it is the same. The static protection scan below will
	 * not come to a different conclusion.
	 */
	if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
		cpa_inc_lp_sameprot(level);
		return 0;
	}

	/*
	 * If the requested range does not cover the full page, split it up
	 */
	if (address != lpaddr || cpa->numpages != numpages)
		return 1;

	/*
	 * Check whether the requested pgprot is conflicting with a static
	 * protection requirement in the large page.
	 */
	new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
				      psize, CPA_DETECT);

	/*
	 * If there is a conflict, split the large page.
	 *
	 * There used to be a 4k wise evaluation trying really hard to
	 * preserve the large pages, but experimentation has shown that this
	 * does not help at all. There might be corner cases which would
	 * preserve one large page occasionally, but it's really not worth the
	 * extra code and cycles for the common case.
	 */
	if (pgprot_val(req_prot) != pgprot_val(new_prot))
		return 1;

	/* All checks passed. Update the large page mapping. */
	new_pte = pfn_pte(old_pfn, new_prot);
	__set_pmd_pte(kpte, address, new_pte);
	cpa->flags |= CPA_FLUSHTLB;
	cpa_inc_lp_preserved(level);
	return 0;
}

static int should_split_large_page(pte_t *kpte, unsigned long address,
				   struct cpa_data *cpa)
{
	int do_split;

	if (cpa->force_split)
		return 1;

	spin_lock(&pgd_lock);
	do_split = __should_split_large_page(kpte, address, cpa);
	spin_unlock(&pgd_lock);

	return do_split;
}

static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
			  pgprot_t ref_prot, unsigned long address,
			  unsigned long size)
{
	unsigned int npg = PFN_DOWN(size);
	pgprot_t prot;

	/*
	 * If should_split_large_page() discovered an inconsistent mapping,
	 * remove the invalid protection in the split mapping.
	 */
	if (!cpa->force_static_prot)
		goto set;

	/* Hand in lpsize = 0 to enforce the protection mechanism */
	prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);

	if (pgprot_val(prot) == pgprot_val(ref_prot))
		goto set;

	/*
	 * If this is splitting a PMD, fix it up. PUD splits cannot be
	 * fixed trivially as that would require to rescan the newly
	 * installed PMD mappings after returning from split_large_page()
	 * so an eventual further split can allocate the necessary PTE
	 * pages. Warn for now and revisit it in case this actually
	 * happens.
	 */
	if (size == PAGE_SIZE)
		ref_prot = prot;
	else
		pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
set:
	set_pte(pte, pfn_pte(pfn, ref_prot));
}

static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
		   struct page *base)
{
	unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
	pte_t *pbase = (pte_t *)page_address(base);
	unsigned int i, level;
	pgprot_t ref_prot;
	pte_t *tmp;

	spin_lock(&pgd_lock);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte) {
		spin_unlock(&pgd_lock);
		return 1;
	}

	paravirt_alloc_pte(&init_mm, page_to_pfn(base));

	switch (level) {
	case PG_LEVEL_2M:
		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
		/*
		 * Clear PSE (aka _PAGE_PAT) and move
		 * PAT bit to correct position.
		 */
		ref_prot = pgprot_large_2_4k(ref_prot);
		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
		lpaddr = address & PMD_MASK;
		lpinc = PAGE_SIZE;
		break;

	case PG_LEVEL_1G:
		ref_prot = pud_pgprot(*(pud_t *)kpte);
		ref_pfn = pud_pfn(*(pud_t *)kpte);
		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
		lpaddr = address & PUD_MASK;
		lpinc = PMD_SIZE;
		/*
		 * Clear the PSE flags if the PRESENT flag is not set
		 * otherwise pmd_present/pmd_huge will return true
		 * even on a non present pmd.
		 */
		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
			pgprot_val(ref_prot) &= ~_PAGE_PSE;
		break;

	default:
		spin_unlock(&pgd_lock);
		return 1;
	}

	ref_prot = pgprot_clear_protnone_bits(ref_prot);

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = ref_pfn;
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);

	if (virt_addr_valid(address)) {
		unsigned long pfn = PFN_DOWN(__pa(address));

		if (pfn_range_is_mapped(pfn, pfn + 1))
			split_page_count(level);
	}

	/*
	 * Install the new, split up pagetable.
	 *
	 * We use the standard kernel pagetable protections for the new
	 * pagetable protections, the actual ptes set above control the
	 * primary protection behavior:
	 */
	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

	/*
	 * Do a global flush tlb after splitting the large page
	 * and before we do the actual change page attribute in the PTE.
	 *
	 * Without this, we violate the TLB application note, that says:
	 * "The TLBs may contain both ordinary and large-page
	 *  translations for a 4-KByte range of linear addresses. This
	 *  may occur if software modifies the paging structures so that
	 *  the page size used for the address range changes. If the two
	 *  translations differ with respect to page frame or attributes
	 *  (e.g., permissions), processor behavior is undefined and may
	 *  be implementation-specific."
	 *
	 * We do this global tlb flush inside the cpa_lock, so that we
	 * don't allow any other cpu, with stale tlb entries change the
	 * page attribute in parallel, that also falls into the
	 * just split large page entry.
	 */
	flush_tlb_all();
	spin_unlock(&pgd_lock);

	return 0;
}

static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
			    unsigned long address)
{
	struct page *base;

	if (!debug_pagealloc_enabled())
		spin_unlock(&cpa_lock);
	base = alloc_pages(GFP_KERNEL, 0);
	if (!debug_pagealloc_enabled())
		spin_lock(&cpa_lock);
	if (!base)
		return -ENOMEM;

	if (__split_large_page(cpa, kpte, address, base))
		__free_page(base);

	return 0;
}

static bool try_to_free_pte_page(pte_t *pte)
{
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++)
		if (!pte_none(pte[i]))
			return false;

	free_page((unsigned long)pte);
	return true;
}

static bool try_to_free_pmd_page(pmd_t *pmd)
{
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++)
		if (!pmd_none(pmd[i]))
			return false;

	free_page((unsigned long)pmd);
	return true;
}

static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
	pte_t *pte = pte_offset_kernel(pmd, start);

	while (start < end) {
		set_pte(pte, __pte(0));

		start += PAGE_SIZE;
		pte++;
	}

	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
		pmd_clear(pmd);
		return true;
	}
	return false;
}

static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
			      unsigned long start, unsigned long end)
{
	if (unmap_pte_range(pmd, start, end))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, start);

	/*
	 * Not on a 2MB page boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		__unmap_pmd_range(pud, pmd, start, pre_end);

		start = pre_end;
		pmd++;
	}

	/*
	 * Try to unmap in 2M chunks.
	 */
	while (end - start >= PMD_SIZE) {
		if (pmd_large(*pmd))
			pmd_clear(pmd);
		else
			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);

		start += PMD_SIZE;
		pmd++;
	}

	/*
	 * 4K leftovers?
	 */
	if (start < end)
		return __unmap_pmd_range(pud, pmd, start, end);

	/*
	 * Try again to free the PMD page if haven't succeeded above.
	 */
	if (!pud_none(*pud))
		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
			pud_clear(pud);
}

static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
	pud_t *pud = pud_offset(p4d, start);

	/*
	 * Not on a GB page boundary?
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
		unsigned long pre_end	= min_t(unsigned long, end, next_page);

		unmap_pmd_range(pud, start, pre_end);

		start = pre_end;
		pud++;
	}

	/*
	 * Try to unmap in 1G chunks?
	 */
	while (end - start >= PUD_SIZE) {

		if (pud_large(*pud))
			pud_clear(pud);
		else
			unmap_pmd_range(pud, start, start + PUD_SIZE);

		start += PUD_SIZE;
		pud++;
	}

	/*
	 * 2M leftovers?
	 */
	if (start < end)
		unmap_pmd_range(pud, start, end);

	/*
	 * No need to try to free the PUD page because we'll free it in
	 * populate_pgd's error path
	 */
}

static int alloc_pte_page(pmd_t *pmd)
{
	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
	if (!pte)
		return -1;

	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	return 0;
}

static int alloc_pmd_page(pud_t *pud)
{
	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
	if (!pmd)
		return -1;

	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	return 0;
}

static void populate_pte(struct cpa_data *cpa,
			 unsigned long start, unsigned long end,
			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, start);

	pgprot = pgprot_clear_protnone_bits(pgprot);

	while (num_pages-- && start < end) {
		set_pte(pte, pfn_pte(cpa->pfn, pgprot));

		start	 += PAGE_SIZE;
		cpa->pfn++;
		pte++;
	}
}

static long populate_pmd(struct cpa_data *cpa,
			 unsigned long start, unsigned long end,
			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
	long cur_pages = 0;
	pmd_t *pmd;
	pgprot_t pmd_pgprot;

	/*
	 * Not on a 2M boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;

		pre_end   = min_t(unsigned long, pre_end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(unsigned int, num_pages, cur_pages);

		/*
		 * Need a PTE page?
		 */
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);

		start = pre_end;
	}

	/*
	 * We mapped them all?
	 */
	if (num_pages == cur_pages)
		return cur_pages;

	pmd_pgprot = pgprot_4k_2_large(pgprot);

	while (end - start >= PMD_SIZE) {

		/*
		 * We cannot use a 1G page so allocate a PMD page if needed.
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		pmd = pmd_offset(pud, start);

		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
					canon_pgprot(pmd_pgprot))));

		start	  += PMD_SIZE;
		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
		cur_pages += PMD_SIZE >> PAGE_SHIFT;
	}

	/*
	 * Map trailing 4K pages.
	 */
	if (start < end) {
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, end, num_pages - cur_pages,
			     pmd, pgprot);
	}
	return num_pages;
}

static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
			pgprot_t pgprot)
{
	pud_t *pud;
	unsigned long end;
	long cur_pages = 0;
	pgprot_t pud_pgprot;

	end = start + (cpa->numpages << PAGE_SHIFT);

	/*
	 * Not on a Gb page boundary? => map everything up to it with
	 * smaller pages.
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long pre_end;
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;

		pre_end   = min_t(unsigned long, end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);

		pud = pud_offset(p4d, start);

		/*
		 * Need a PMD page?
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
					 pud, pgprot);
		if (cur_pages < 0)
			return cur_pages;

		start = pre_end;
	}

	/* We mapped them all? */
	if (cpa->numpages == cur_pages)
		return cur_pages;

	pud = pud_offset(p4d, start);
	pud_pgprot = pgprot_4k_2_large(pgprot);

	/*
	 * Map everything starting from the Gb boundary, possibly with 1G pages
	 */
	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
				   canon_pgprot(pud_pgprot))));

		start	  += PUD_SIZE;
		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
		cur_pages += PUD_SIZE >> PAGE_SHIFT;
		pud++;
	}

	/* Map trailing leftover */
	if (start < end) {
		long tmp;

		pud = pud_offset(p4d, start);
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
				   pud, pgprot);
		if (tmp < 0)
			return cur_pages;

		cur_pages += tmp;
	}
	return cur_pages;
}

/*
 * Restrictions for kernel page table do not necessarily apply when mapping in
 * an alternate PGD.
 */
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
	pud_t *pud = NULL;	/* shut up gcc */
	p4d_t *p4d;
	pgd_t *pgd_entry;
	long ret;

	pgd_entry = cpa->pgd + pgd_index(addr);

	if (pgd_none(*pgd_entry)) {
		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
		if (!p4d)
			return -1;

		set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}

	/*
	 * Allocate a PUD page and hand it down for mapping.
	 */
	p4d = p4d_offset(pgd_entry, addr);
	if (p4d_none(*p4d)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			return -1;

		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}

	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);

	ret = populate_pud(cpa, addr, p4d, pgprot);
	if (ret < 0) {
		/*
		 * Leave the PUD page in place in case some other CPU or thread
		 * already found it, but remove any useless entries we just
		 * added to it.
		 */
		unmap_pud_range(p4d, addr,
				addr + (cpa->numpages << PAGE_SHIFT));
		return ret;
	}

	cpa->numpages = ret;
	return 0;
}

static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
			       int primary)
{
	if (cpa->pgd) {
		/*
		 * Right now, we only execute this code path when mapping
		 * the EFI virtual memory map regions, no other users
		 * provide a ->pgd value. This may change in the future.
		 */
		return populate_pgd(cpa, vaddr);
	}

	/*
	 * Ignore all non primary paths.
	 */
	if (!primary) {
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
	 * to have holes.
	 * Also set numpages to '1' indicating that we processed cpa req for
	 * one virtual address page and its pfn. TBD: numpages can be set based
	 * on the initial value and the level returned by lookup_address().
	 */
	if (within(vaddr, PAGE_OFFSET,
		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
		cpa->numpages = 1;
		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
		return 0;

	} else if (__cpa_pfn_in_highmap(cpa->pfn)) {
		/* Faults in the highmap are OK, so do not warn: */
		return -EFAULT;
	} else {
		WARN(1, KERN_WARNING "CPA: called for zero pte. "
			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
			*cpa->vaddr);

		return -EFAULT;
	}
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
	unsigned long address;
	int do_split, err;
	unsigned int level;
	pte_t *kpte, old_pte;

	address = __cpa_addr(cpa, cpa->curpage);
repeat:
	kpte = _lookup_address_cpa(cpa, address, &level);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (pte_none(old_pte))
		return __cpa_process_fault(cpa, address, primary);

	if (level == PG_LEVEL_4K) {
		pte_t new_pte;
		pgprot_t new_prot = pte_pgprot(old_pte);
		unsigned long pfn = pte_pfn(old_pte);

		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

		cpa_inc_4k_install();
		/* Hand in lpsize = 0 to enforce the protection mechanism */
		new_prot = static_protections(new_prot, address, pfn, 1, 0,
					      CPA_PROTECT);

		new_prot = pgprot_clear_protnone_bits(new_prot);

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, new_prot);
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = should_split_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated in
	 * should_split_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(cpa, kpte, address);
	if (!err)
		goto repeat;

	return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
	struct cpa_data alias_cpa;
	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
	unsigned long vaddr;
	int ret;

	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
		return 0;

	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	vaddr = __cpa_addr(cpa, cpa->curpage);
	if (!(within(vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

		alias_cpa = *cpa;
		alias_cpa.vaddr = &laddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
		alias_cpa.curpage = 0;

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
		if (ret)
			return ret;
	}

#ifdef CONFIG_X86_64
	/*
	 * If the primary call didn't touch the high mapping already
	 * and the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
	 */
	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
	    __cpa_pfn_in_highmap(cpa->pfn)) {
		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
					       __START_KERNEL_map - phys_base;
		alias_cpa = *cpa;
		alias_cpa.vaddr = &temp_cpa_vaddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
		alias_cpa.curpage = 0;

		/*
		 * The high mapping range is imprecise, so ignore the
		 * return value.
		 */
		__change_page_attr_set_clr(&alias_cpa, 0);
	}
#endif

	return 0;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
	unsigned long numpages = cpa->numpages;
	unsigned long rempages = numpages;
	int ret = 0;

	while (rempages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = rempages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		if (!debug_pagealloc_enabled())
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, checkalias);
		if (!debug_pagealloc_enabled())
			spin_unlock(&cpa_lock);
		if (ret)
			goto out;

		if (checkalias) {
			ret = cpa_process_alias(cpa);
			if (ret)
				goto out;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
		rempages -= cpa->numpages;
		cpa->curpage += cpa->numpages;
	}

out:
	/* Restore the original numpages */
	cpa->numpages = numpages;
	return ret;
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check whether we are being asked to set a feature that is not
	 * supported.  Clearing non-supported features is OK.
	 */
	mask_set = canon_pgprot(mask_set);

	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
		cpa.flags |= in_flag;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
	/* Has caller explicitly disabled alias checking? */
	if (in_flag & CPA_NO_CHECK_ALIAS)
		checkalias = 0;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On error; flush everything to be sure.
	 */
	if (ret) {
		cpa_flush_all(cache);
		goto out;
	}

	cpa_flush(&cpa, cache);
out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int cpa_set_pages_array(struct page **pages, int numpages,
				       pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
		CPA_PAGES_ARRAY, pages);
}

static inline int cpa_clear_pages_array(struct page **pages, int numpages,
					 pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
		CPA_PAGES_ARRAY, pages);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * for now UC MINUS. see comments in ioremap()
	 * If you really need strong UC use ioremap_uc(), but note
	 * that you cannot override IO areas with set_memory_*() as
	 * these helpers cannot work with IO memory.
	 */
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				    0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * for now UC MINUS. see comments in ioremap()
	 */
	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_uc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_uc);

int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = change_page_attr_set(&addr, numpages,
				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				   0);
	if (!ret) {
		ret = change_page_attr_set_clr(&addr, numpages,
					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}

int set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
		_PAGE_CACHE_MODE_WC, NULL);
	if (ret)
		return ret;

	ret = _set_memory_wc(addr, numpages);
	if (ret)
		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return ret;
}
EXPORT_SYMBOL(set_memory_wc);

int _set_memory_wt(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
}

int _set_memory_wb(unsigned long addr, int numpages)
{
	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
	int ret;

	ret = _set_memory_wb(addr, numpages);
	if (ret)
		return ret;

	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
	return 0;
}
EXPORT_SYMBOL(set_memory_wb);

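/*
 * set_memory_x()/set_memory_nx() toggle _PAGE_NX and are no-ops when the
 * CPU or configuration does not support NX.
 */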
int set_memory_x(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}

int set_memory_nx(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}

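/*
 * Usage sketch (illustrative only; 'table' and 'nr_pages' are made-up
 * names): write-protect a page-aligned structure once it is initialized
 * and lift the protection only around updates:
 *
 *	set_memory_ro((unsigned long)table, nr_pages);
 *	...
 *	set_memory_rw((unsigned long)table, nr_pages);
 *	... update the table ...
 *	set_memory_ro((unsigned long)table, nr_pages);
 */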
int set_memory_ro(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}

int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

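/*
 * Like set_memory_np(), but CPA_NO_CHECK_ALIAS skips fixing up other
 * mappings (such as the direct map alias) of the same physical pages;
 * the caller is expected to handle those itself.
 */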
int set_memory_np_noalias(unsigned long addr, int numpages)
{
	int cpa_flags = CPA_NO_CHECK_ALIAS;

	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(_PAGE_PRESENT), 0,
					cpa_flags, NULL);
}

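/*
 * Split the mappings covering the range down to 4k pages (force_split)
 * without changing any protection bits.
 */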
int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(0), 1, 0, NULL);
}

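/*
 * set_memory_nonglobal()/set_memory_global() clear or set _PAGE_GLOBAL,
 * i.e. whether TLB entries for these mappings may survive a CR3 switch.
 */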
int set_memory_nonglobal(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_GLOBAL), 0);
}

int set_memory_global(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    __pgprot(_PAGE_GLOBAL), 0);
}

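/*
 * Set or clear the SME/SEV encryption bit (_PAGE_ENC) on a range of the
 * kernel mapping.  Usage sketch (illustrative only; 'buf' and 'size' are
 * made-up names): a driver that must share a buffer with an unencrypted
 * device or hypervisor decrypts it for the lifetime of the sharing:
 *
 *	set_memory_decrypted((unsigned long)buf, size >> PAGE_SHIFT);
 *	... share buf ...
 *	set_memory_encrypted((unsigned long)buf, size >> PAGE_SHIFT);
 */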
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
	struct cpa_data cpa;
	int ret;

	/* Nothing to do if memory encryption is not active */
	if (!mem_encrypt_active())
		return 0;

	/* Should not be working on unaligned addresses */
	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
		addr &= PAGE_MASK;

	memset(&cpa, 0, sizeof(cpa));
	cpa.vaddr = &addr;
	cpa.numpages = numpages;
	cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
	cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
	cpa.pgd = init_mm.pgd;

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();
	vm_unmap_aliases();

	/*
	 * Before changing the encryption attribute, we need to flush caches.
	 */
	cpa_flush(&cpa, 1);

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * After changing the encryption attribute, we need to flush TLBs again
	 * in case any speculative TLB caching occurred (but no need to flush
	 * caches again).  We could just use cpa_flush_all(), but in case TLB
	 * flushing gets optimized in the cpa_flush() path use the same logic
	 * as above.
	 */
	cpa_flush(&cpa, 0);

	return ret;
}

int set_memory_encrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, true);
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);

int set_memory_decrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, false);
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);

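/*
 * The set_pages_*() variants below operate on struct page pointers and
 * simply resolve page_address(), so they are only meaningful for pages
 * with a permanent kernel mapping; the array helpers skip highmem pages.
 */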
int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

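/*
 * Reserve the requested memtype for every lowmem page in the array, then
 * apply it via CPA.  WC is applied in two steps (UC- first, then WC), and
 * all reservations are rolled back if anything fails.
 */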
static int _set_pages_array(struct page **pages, int numpages,
		enum page_cache_mode new_type)
{
	unsigned long start;
	unsigned long end;
	enum page_cache_mode set_type;
	int i;
	int free_idx;
	int ret;

	for (i = 0; i < numpages; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		if (reserve_memtype(start, end, new_type, NULL))
			goto err_out;
	}

	/* If WC, set to UC- first and then WC */
	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
				_PAGE_CACHE_MODE_UC_MINUS : new_type;

	ret = cpa_set_pages_array(pages, numpages,
				  cachemode2pgprot(set_type));
	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
		ret = change_page_attr_set_clr(NULL, numpages,
					       cachemode2pgprot(
						_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_PAGES_ARRAY, pages);
	if (ret)
		goto err_out;
	return 0; /* Success */
err_out:
	free_idx = i;
	for (i = 0; i < free_idx; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}
	return -EINVAL;
}

int set_pages_array_uc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);

int set_pages_array_wc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);

int set_pages_array_wt(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT);
}
EXPORT_SYMBOL_GPL(set_pages_array_wt);

int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

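/*
 * Switching the pages back to WB also drops the memtype reservations
 * taken by _set_pages_array().
 */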
int set_pages_array_wb(struct page **pages, int numpages)
{
	int retval;
	unsigned long start;
	unsigned long end;
	int i;

	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	retval = cpa_clear_pages_array(pages, numpages,
			__pgprot(_PAGE_CACHE_MASK));
	if (retval)
		return retval;

	for (i = 0; i < numpages; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}

	return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

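/*
 * __set_pages_p()/__set_pages_np() map and unmap pages in the kernel
 * mapping by toggling _PAGE_PRESENT | _PAGE_RW.  No TLB flush is done
 * here; callers such as __kernel_map_pages() take care of flushing.
 */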
static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = 0};

	/*
	 * No alias checking needed for setting the present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.flags = 0};

	/*
	 * No alias checking needed for setting the not-present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

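/*
 * Remove a single page from, or restore it to, the kernel's direct
 * mapping.  As the _noflush suffix says, TLB flushing is entirely left
 * to the caller.
 */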
int set_direct_map_invalid_noflush(struct page *page)
{
	return __set_pages_np(page, 1);
}

int set_direct_map_default_noflush(struct page *page)
{
	return __set_pages_p(page, 1);
}

void __kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations during large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all TLBs,
	 * but that can deadlock, so we flush only the current CPU.
	 * Preemption needs to be disabled around __flush_tlb_all() due to
	 * CR3 reload in __native_flush_tlb().
	 */
	preempt_disable();
	__flush_tlb_all();
	preempt_enable();

	arch_flush_lazy_mmu_mode();
}

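/*
 * kernel_page_present() reports whether a lowmem page is currently mapped
 * in the kernel page tables; the hibernation code uses this to decide
 * whether a page can be read directly while saving the image.
 */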
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	return (pte_val(*pte) & _PAGE_PRESENT);
}
#endif /* CONFIG_HIBERNATION */

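/*
 * Map a physically contiguous range into a non-default page table (e.g.
 * the EFI page table) with the given protection flags.  Like the unmap
 * variant below, this relies on __flush_tlb_all() and may therefore only
 * be used before SMP is up.
 */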
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
				   unsigned numpages, unsigned long page_flags)
{
	int retval = -EINVAL;

	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = pfn,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		.mask_clr = __pgprot(0),
		.flags = 0,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	if (!(__supported_pte_mask & _PAGE_NX))
		goto out;

	if (!(page_flags & _PAGE_NX))
		cpa.mask_clr = __pgprot(_PAGE_NX);

	if (!(page_flags & _PAGE_RW))
		cpa.mask_clr = __pgprot(_PAGE_RW);

	if (!(page_flags & _PAGE_ENC))
		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);

	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

	retval = __change_page_attr_set_clr(&cpa, 0);
	__flush_tlb_all();

out:
	return retval;
}

/*
 * __flush_tlb_all() flushes mappings only on current CPU and hence this
 * function shouldn't be used in an SMP environment. Presently, it's used only
 * during boot (way before smp_init()) by EFI subsystem and hence is ok.
 */
int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
				     unsigned long numpages)
{
	int retval;

	/*
	 * The typical sequence for unmapping is to find a pte through
	 * lookup_address_in_pgd() (ideally, it should never return NULL because
	 * the address is already mapped) and change its protections. As pfn is
	 * the *target* of a mapping, it's not useful while unmapping.
	 */
	struct cpa_data cpa = {
		.vaddr		= &address,
		.pfn		= 0,
		.pgd		= pgd,
		.numpages	= numpages,
		.mask_set	= __pgprot(0),
		.mask_clr	= __pgprot(_PAGE_PRESENT | _PAGE_RW),
		.flags		= 0,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	retval = __change_page_attr_set_clr(&cpa, 0);
	__flush_tlb_all();

	return retval;
}

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "cpa-test.c"
#endif