pageattr.c 22.9 KB
Newer Older
1 2
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
L
Linus Torvalds 已提交
3
 * Thanks to Ben LaHaise for precious feedback.
4
 */
L
Linus Torvalds 已提交
5
#include <linux/highmem.h>
I
Ingo Molnar 已提交
6
#include <linux/bootmem.h>
L
Linus Torvalds 已提交
7
#include <linux/module.h>
8
#include <linux/sched.h>
L
Linus Torvalds 已提交
9
#include <linux/slab.h>
10
#include <linux/mm.h>
11
#include <linux/interrupt.h>
12 13
#include <linux/seq_file.h>
#include <linux/debugfs.h>
14

15
#include <asm/e820.h>
L
Linus Torvalds 已提交
16 17
#include <asm/processor.h>
#include <asm/tlbflush.h>
D
Dave Jones 已提交
18
#include <asm/sections.h>
19 20
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
T
Thomas Gleixner 已提交
21
#include <asm/proto.h>
22
#include <asm/pat.h>
L
Linus Torvalds 已提交
23

I
Ingo Molnar 已提交
24 25 26
/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
T
Thomas Gleixner 已提交
27 28 29 30
struct cpa_data {
	unsigned long	vaddr;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
31
	int		numpages;
32
	int		flushtlb;
T
Thomas Gleixner 已提交
33
	unsigned long	pfn;
T
Thomas Gleixner 已提交
34 35
};

T
Thomas Gleixner 已提交
36 37 38 39 40 41 42 43 44 45 46 47 48 49
#ifdef CONFIG_X86_64

/* Page frame number of the physical address backing _text. */
static inline unsigned long highmap_start_pfn(void)
{
	unsigned long text_phys = __pa(_text);

	return text_phys >> PAGE_SHIFT;
}

/* Page frame number of _end after rounding it up to a PMD_SIZE boundary. */
static inline unsigned long highmap_end_pfn(void)
{
	unsigned long end_vaddr = round_up((unsigned long)_end, PMD_SIZE);

	return __pa(end_vaddr) >> PAGE_SHIFT;
}

#endif

50 51 52 53 54 55
/*
 * Compile-time flag: 1 when CONFIG_DEBUG_PAGEALLOC is enabled, 0
 * otherwise. Usable in plain C conditions instead of #ifdef blocks.
 */
#ifndef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 0
#else
# define debug_pagealloc 1
#endif

56 57
/*
 * Return non-zero when @addr lies in the half-open interval
 * [@start, @end), zero otherwise.
 */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

T
Thomas Gleixner 已提交
62 63 64
/*
 * Flushing functions
 */
65 66 67 68 69 70 71 72 73

/**
 * clflush_cache_range - flush a cache range with clflush
 * @addr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
I
Ingo Molnar 已提交
74
void clflush_cache_range(void *vaddr, unsigned int size)
T
Thomas Gleixner 已提交
75
{
I
Ingo Molnar 已提交
76
	void *vend = vaddr + size - 1;
T
Thomas Gleixner 已提交
77

78
	mb();
I
Ingo Molnar 已提交
79 80 81 82 83 84 85 86

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clflush(vaddr);
	/*
	 * Flush any possible final partial cacheline:
	 */
	clflush(vend);

87
	mb();
T
Thomas Gleixner 已提交
88 89
}

90
static void __cpa_flush_all(void *arg)
T
Thomas Gleixner 已提交
91
{
92 93
	unsigned long cache = (unsigned long)arg;

T
Thomas Gleixner 已提交
94 95 96 97 98 99
	/*
	 * Flush all to work around Errata in early athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

100
	if (cache && boot_cpu_data.x86_model >= 4)
T
Thomas Gleixner 已提交
101 102 103
		wbinvd();
}

104
static void cpa_flush_all(unsigned long cache)
T
Thomas Gleixner 已提交
105 106 107
{
	BUG_ON(irqs_disabled());

108
	on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
T
Thomas Gleixner 已提交
109 110
}

111 112 113 114 115 116 117 118 119 120
/*
 * Per-CPU callback used by cpa_flush_range(); @arg is unused.
 */
static void __cpa_flush_range(void *arg)
{
	/*
	 * A full TLB flush is the simple choice. Invalidating single
	 * pages for small ranges would be a possible optimization, but
	 * on 64bit the high aliases would have to be flushed too.
	 */
	__flush_tlb_all();
}

121
static void cpa_flush_range(unsigned long start, int numpages, int cache)
122
{
I
Ingo Molnar 已提交
123 124 125
	unsigned int i, level;
	unsigned long addr;

126
	BUG_ON(irqs_disabled());
I
Ingo Molnar 已提交
127
	WARN_ON(PAGE_ALIGN(start) != start);
128

T
Thomas Gleixner 已提交
129
	on_each_cpu(__cpa_flush_range, NULL, 1, 1);
130

131 132 133
	if (!cache)
		return;

T
Thomas Gleixner 已提交
134 135 136 137 138 139
	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
I
Ingo Molnar 已提交
140 141 142 143 144 145
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
146
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
I
Ingo Molnar 已提交
147 148
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
149 150
}

151 152 153 154 155 156
/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
T
Thomas Gleixner 已提交
157 158
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
				   unsigned long pfn)
159 160 161
{
	pgprot_t forbidden = __pgprot(0);

I
Ingo Molnar 已提交
162
	/*
163 164
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
I
Ingo Molnar 已提交
165
	 */
T
Thomas Gleixner 已提交
166
	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
167 168 169 170
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The kernel text needs to be executable for obvious reasons
T
Thomas Gleixner 已提交
171 172
	 * Does not cover __inittext since that is gone later on. On
	 * 64bit we do not enforce !NX on the low mapping
173 174 175
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;
176 177

	/*
T
Thomas Gleixner 已提交
178 179
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
180
	 */
T
Thomas Gleixner 已提交
181 182
	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
183
		pgprot_val(forbidden) |= _PAGE_RW;
184 185

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
I
Ingo Molnar 已提交
186 187 188 189

	return prot;
}

T
Thomas Gleixner 已提交
190 191 192 193 194 195 196 197
/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
198
pte_t *lookup_address(unsigned long address, unsigned int *level)
199
{
L
Linus Torvalds 已提交
200 201 202
	pgd_t *pgd = pgd_offset_k(address);
	pud_t *pud;
	pmd_t *pmd;
203

T
Thomas Gleixner 已提交
204 205
	*level = PG_LEVEL_NONE;

L
Linus Torvalds 已提交
206 207
	if (pgd_none(*pgd))
		return NULL;
I
Ingo Molnar 已提交
208

L
Linus Torvalds 已提交
209 210 211
	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return NULL;
212 213 214 215 216

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

L
Linus Torvalds 已提交
217 218 219
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;
T
Thomas Gleixner 已提交
220 221

	*level = PG_LEVEL_2M;
T
Thomas Gleixner 已提交
222
	if (pmd_large(*pmd) || !pmd_present(*pmd))
L
Linus Torvalds 已提交
223 224
		return (pte_t *)pmd;

T
Thomas Gleixner 已提交
225
	*level = PG_LEVEL_4K;
I
Ingo Molnar 已提交
226

227 228 229
	return pte_offset_kernel(pmd, address);
}

I
Ingo Molnar 已提交
230 231 232
/*
 * Set the new pmd in all the pgds we know about:
 */
I
Ingo Molnar 已提交
233
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
234 235 236
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
237
#ifdef CONFIG_X86_32
238
	if (!SHARED_KERNEL_PMD) {
239 240
		struct page *page;

241
		list_for_each_entry(page, &pgd_list, lru) {
242 243 244 245 246 247 248 249 250
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			pud = pud_offset(pgd, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
L
Linus Torvalds 已提交
251
	}
252
#endif
L
Linus Torvalds 已提交
253 254
}

I
Ingo Molnar 已提交
255 256 257
/*
 * Try to apply the attribute change described by @cpa to the large
 * page entry @kpte, which maps @address, without splitting it.
 *
 * Returns 0 when no split is needed (the large entry was updated in
 * place or already has the requested protections), 1 when the caller
 * has to go on to split_large_page(), and -EINVAL when the entry is
 * not a 2M (or, on 64bit, 1G) mapping.
 *
 * Once the entry is validated, cpa->numpages is limited to the number
 * of pages from @address to the end of the large page and cpa->pfn is
 * set to the pfn of @address. cpa->flushtlb is set when the entry was
 * modified.
 */
static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
			struct cpa_data *cpa)
{
	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
	pte_t new_pte, old_pte, *tmp;
	pgprot_t old_prot, new_prot;
	int i, do_split = 1;
	unsigned int level;

	spin_lock_irqsave(&pgd_lock, flags);
	/*
	 * Check for races, another CPU might have split this page
	 * up already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	switch (level) {
	case PG_LEVEL_2M:
		psize = PMD_PAGE_SIZE;
		pmask = PMD_PAGE_MASK;
		break;
#ifdef CONFIG_X86_64
	case PG_LEVEL_1G:
		psize = PUD_PAGE_SIZE;
		pmask = PUD_PAGE_MASK;
		break;
#endif
	default:
		do_split = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Calculate the number of pages, which fit into this large
	 * page starting at address:
	 */
	nextpage_addr = (address + psize) & pmask;
	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
	if (numpages < cpa->numpages)
		cpa->numpages = numpages;

	/*
	 * We are safe now. Check whether the new pgprot is the same:
	 */
	old_pte = *kpte;
	old_prot = new_prot = pte_pgprot(old_pte);

	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * old_pte points to the large page base address. So we need
	 * to add the offset of the virtual address:
	 */
	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
	cpa->pfn = pfn;

	new_prot = static_protections(new_prot, address, pfn);

	/*
	 * We need to check the full range, whether
	 * static_protections() requires a different pgprot for one of
	 * the pages in the range we try to preserve:
	 */
	addr = address + PAGE_SIZE;
	pfn++;
	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);

		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
			goto out_unlock;
	}

	/*
	 * If there are no changes, return. cpa->numpages has been
	 * updated above:
	 */
	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
		do_split = 0;
		goto out_unlock;
	}

	/*
	 * We need to change the attributes. Check, whether we can
	 * change the large page in one go. We request a split, when
	 * the address is not aligned or the number of pages is
	 * smaller than the number of pages in the large page. Note
	 * that we limited the number of possible pages already to
	 * the number of pages in the large page.
	 */
	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
		/*
		 * The address is aligned and the number of pages
		 * covers the full page.
		 */
		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
		__set_pmd_pte(kpte, address, new_pte);
		cpa->flushtlb = 1;
		do_split = 0;
	}

out_unlock:
	spin_unlock_irqrestore(&pgd_lock, flags);

	return do_split;
}

365 366
/*
 * Pool of preallocated pages used as new page tables when a large page
 * is split. The list and pool_pages are modified under pgd_lock.
 */
static LIST_HEAD(page_pool);
/* target fill level, pages currently pooled, lowest fill level seen */
static unsigned long pool_size, pool_pages, pool_low;
/* pages consumed by splits, failed page allocations */
static unsigned long pool_used, pool_failed;
368

369
static void cpa_fill_pool(struct page **ret)
370 371
{
	gfp_t gfp = GFP_KERNEL;
372 373
	unsigned long flags;
	struct page *p;
374 375

	/*
376 377
	 * Avoid recursion (on debug-pagealloc) and also signal
	 * our priority to get to these pagetables:
378
	 */
379
	if (current->flags & PF_MEMALLOC)
380
		return;
381
	current->flags |= PF_MEMALLOC;
382 383

	/*
384
	 * Allocate atomically from atomic contexts:
385
	 */
386 387
	if (in_atomic() || irqs_disabled() || debug_pagealloc)
		gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
388

389
	while (pool_pages < pool_size || (ret && !*ret)) {
390 391 392 393 394
		p = alloc_pages(gfp, 0);
		if (!p) {
			pool_failed++;
			break;
		}
395 396 397 398 399 400 401 402
		/*
		 * If the call site needs a page right now, provide it:
		 */
		if (ret && !*ret) {
			*ret = p;
			continue;
		}
		spin_lock_irqsave(&pgd_lock, flags);
403 404
		list_add(&p->lru, &page_pool);
		pool_pages++;
405
		spin_unlock_irqrestore(&pgd_lock, flags);
406
	}
407 408

	current->flags &= ~PF_MEMALLOC;
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
}

/* Constants for sizing the page pool from the amount of RAM: */
#define POOL_PAGES_PER_GB	16			/* pool pages per GiB */
#define SHIFT_MB_GB		10			/* MiB -> GiB shift */
#define ROUND_MB_GB		((1 << SHIFT_MB_GB) - 1) /* round up to GiB */
#define SHIFT_MB		(20 - PAGE_SHIFT)	/* pages -> MiB shift */

/*
 * Size the page pool and do the initial fill. With DEBUG_PAGEALLOC the
 * pool is scaled with the amount of RAM, otherwise it holds one page.
 */
void __init cpa_init(void)
{
	struct sysinfo si;
	unsigned long gb;

	si_meminfo(&si);
	/*
	 * Calculate the number of pool pages:
	 *
	 * Convert totalram (nr of pages) to MiB and round to the next
	 * GiB. Shift MiB to Gib and multiply the result by
	 * POOL_PAGES_PER_GB:
	 */
	if (debug_pagealloc) {
		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
		pool_size = POOL_PAGES_PER_GB * gb;
	} else {
		pool_size = 1;
	}
	pool_low = pool_size;

	cpa_fill_pool(NULL);
	printk(KERN_DEBUG
	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
	       pool_pages, pool_size);
}

443
/*
 * Replace the large page entry @kpte, which maps @address, by a page
 * table of PTRS_PER_PTE smaller entries covering the same range.
 *
 * Returns 0 on success and also when another CPU changed the mapping
 * in the meantime (the page table page then goes back into the pool),
 * -ENOMEM when no page for the new page table could be obtained.
 */
static int split_large_page(pte_t *kpte, unsigned long address)
{
	unsigned long flags, pfn, pfninc = 1;
	unsigned int i, level;
	pte_t *pbase, *tmp;
	pgprot_t ref_prot;
	struct page *base;

	/*
	 * Get a page from the pool. The pool list is protected by the
	 * pgd_lock, which we have to take anyway for the split
	 * operation:
	 */
	spin_lock_irqsave(&pgd_lock, flags);
	if (list_empty(&page_pool)) {
		spin_unlock_irqrestore(&pgd_lock, flags);
		base = NULL;
		cpa_fill_pool(&base);
		if (!base)
			return -ENOMEM;
		spin_lock_irqsave(&pgd_lock, flags);
	} else {
		base = list_first_entry(&page_pool, struct page, lru);
		list_del(&base->lru);
		pool_pages--;

		if (pool_pages < pool_low)
			pool_low = pool_pages;
	}

	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	pbase = (pte_t *)page_address(base);
#ifdef CONFIG_X86_32
	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
#endif
	ref_prot = pte_pgprot(pte_clrhuge(*kpte));

#ifdef CONFIG_X86_64
	/* A 1G page is split into 2M pages, which are large pages too: */
	if (level == PG_LEVEL_1G) {
		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
		pgprot_val(ref_prot) |= _PAGE_PSE;
	}
#endif

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = pte_pfn(*kpte);
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));

	/*
	 * Install the new, split up pagetable. Important details here:
	 *
	 * On Intel the NX bit of all levels must be cleared to make a
	 * page executable. See section 4.13.2 of Intel 64 and IA-32
	 * Architectures Software Developer's Manual).
	 *
	 * Mark the entry present. The current mapping might be
	 * set to not present, which we preserved above.
	 */
	ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
	pgprot_val(ref_prot) |= _PAGE_PRESENT;
	__set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
	base = NULL;

out_unlock:
	/*
	 * If we dropped out via the lookup_address check under
	 * pgd_lock then stick the page back into the pool:
	 */
	if (base) {
		list_add(&base->lru, &page_pool);
		pool_pages++;
	} else
		pool_used++;
	spin_unlock_irqrestore(&pgd_lock, flags);

	return 0;
}

T
Thomas Gleixner 已提交
531
static int __change_page_attr(struct cpa_data *cpa, int primary)
532
{
T
Thomas Gleixner 已提交
533
	unsigned long address = cpa->vaddr;
534 535
	int do_split, err;
	unsigned int level;
T
Thomas Gleixner 已提交
536
	pte_t *kpte, old_pte;
L
Linus Torvalds 已提交
537

538
repeat:
539
	kpte = lookup_address(address, &level);
L
Linus Torvalds 已提交
540
	if (!kpte)
T
Thomas Gleixner 已提交
541 542 543 544 545 546 547 548 549 550
		return primary ? -EINVAL : 0;

	old_pte = *kpte;
	if (!pte_val(old_pte)) {
		if (!primary)
			return 0;
		printk(KERN_WARNING "CPA: called for zero pte. "
		       "vaddr = %lx cpa->vaddr = %lx\n", address,
		       cpa->vaddr);
		WARN_ON(1);
L
Linus Torvalds 已提交
551
		return -EINVAL;
T
Thomas Gleixner 已提交
552
	}
553

T
Thomas Gleixner 已提交
554
	if (level == PG_LEVEL_4K) {
T
Thomas Gleixner 已提交
555
		pte_t new_pte;
556
		pgprot_t new_prot = pte_pgprot(old_pte);
T
Thomas Gleixner 已提交
557
		unsigned long pfn = pte_pfn(old_pte);
I
Ingo Molnar 已提交
558

T
Thomas Gleixner 已提交
559 560
		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
I
Ingo Molnar 已提交
561

T
Thomas Gleixner 已提交
562
		new_prot = static_protections(new_prot, address, pfn);
I
Ingo Molnar 已提交
563

564 565 566 567 568
		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change it's attributes
		 * not the memory it points to
		 */
T
Thomas Gleixner 已提交
569 570
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
571 572 573 574 575 576 577
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flushtlb = 1;
		}
578
		cpa->numpages = 1;
579
		return 0;
L
Linus Torvalds 已提交
580
	}
581 582 583 584 585

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
I
Ingo Molnar 已提交
586
	do_split = try_preserve_large_page(kpte, address, cpa);
587 588
	/*
	 * When the range fits into the existing large page,
589
	 * return. cp->numpages and cpa->tlbflush have been updated in
590 591
	 * try_large_page:
	 */
I
Ingo Molnar 已提交
592 593
	if (do_split <= 0)
		return do_split;
594 595 596 597

	/*
	 * We have to split the large page:
	 */
I
Ingo Molnar 已提交
598 599 600 601 602
	err = split_large_page(kpte, address);
	if (!err) {
		cpa->flushtlb = 1;
		goto repeat;
	}
I
Ingo Molnar 已提交
603

I
Ingo Molnar 已提交
604
	return err;
605
}
L
Linus Torvalds 已提交
606

T
Thomas Gleixner 已提交
607 608 609
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
L
Linus Torvalds 已提交
610
{
T
Thomas Gleixner 已提交
611
	struct cpa_data alias_cpa;
612
	int ret = 0;
613

T
Thomas Gleixner 已提交
614 615
	if (cpa->pfn > max_pfn_mapped)
		return 0;
616

617 618 619 620 621 622
	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	if (!within(cpa->vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
623

624 625 626 627 628
		alias_cpa = *cpa;
		alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
	}
629 630

#ifdef CONFIG_X86_64
T
Thomas Gleixner 已提交
631 632
	if (ret)
		return ret;
633 634 635 636 637 638 639
	/*
	 * No need to redo, when the primary call touched the high
	 * mapping already:
	 */
	if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
		return 0;

A
Arjan van de Ven 已提交
640
	/*
641 642
	 * If the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
A
Arjan van de Ven 已提交
643
	 */
T
Thomas Gleixner 已提交
644 645
	if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
		return 0;
646

T
Thomas Gleixner 已提交
647 648 649 650 651 652 653 654
	alias_cpa = *cpa;
	alias_cpa.vaddr =
		(cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;

	/*
	 * The high mapping range is imprecise, so ignore the return value.
	 */
	__change_page_attr_set_clr(&alias_cpa, 0);
A
Arjan van de Ven 已提交
655
#endif
T
Thomas Gleixner 已提交
656
	return ret;
L
Linus Torvalds 已提交
657 658
}

T
Thomas Gleixner 已提交
659
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
660
{
661
	int ret, numpages = cpa->numpages;
662

663 664 665 666 667
	while (numpages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
668
		cpa->numpages = numpages;
T
Thomas Gleixner 已提交
669 670

		ret = __change_page_attr(cpa, checkalias);
671 672 673
		if (ret)
			return ret;

T
Thomas Gleixner 已提交
674 675 676 677 678 679
		if (checkalias) {
			ret = cpa_process_alias(cpa);
			if (ret)
				return ret;
		}

680 681 682 683 684
		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
685 686 687
		BUG_ON(cpa->numpages > numpages);
		numpages -= cpa->numpages;
		cpa->vaddr += cpa->numpages * PAGE_SIZE;
688
	}
689 690 691
	return 0;
}

692 693 694 695 696 697
/* Extract the caching related bits (PCD, PWT, PAT) from @attr. */
static inline int cache_attr(pgprot_t attr)
{
	return pgprot_val(attr) &
		(_PAGE_PCD | _PAGE_PWT | _PAGE_PAT_LARGE | _PAGE_PAT);
}

698 699 700
/*
 * Set the bits in @mask_set and clear the bits in @mask_clr in the
 * kernel mappings of @numpages pages starting at @addr, then flush
 * TLBs (and caches, when caching attributes were set) if an entry was
 * modified. An unaligned @addr is rounded down to a page boundary with
 * a one-time warning.
 *
 * Returns 0 on success or a negative error code.
 */
static int change_page_attr_set_clr(unsigned long addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;

	/*
	 * Check, if we are requested to change a not supported
	 * feature:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (addr & ~PAGE_MASK) {
		addr &= PAGE_MASK;
		/*
		 * People should not be passing in unaligned addresses:
		 */
		WARN_ON_ONCE(1);
	}

	cpa.vaddr = addr;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flushtlb = 0;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!cpa.flushtlb)
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = cache_attr(mask_set);

	/*
	 * On success we use clflush, when the CPU supports it to
	 * avoid the wbinvd. If the CPU does not support it and in the
	 * error case we fall back to cpa_flush_all (which uses
	 * wbinvd):
	 */
	if (!ret && cpu_has_clflush)
		cpa_flush_range(addr, numpages, cache);
	else
		cpa_flush_all(cache);

out:
	/* Top the page pool up again; splits may have consumed pages: */
	cpa_fill_pool(NULL);

	return ret;
}

762 763
/* Set the bits in @mask for @numpages pages starting at @addr. */
static inline int change_page_attr_set(unsigned long addr, int numpages,
				       pgprot_t mask)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
}

768 769
/* Clear the bits in @mask for @numpages pages starting at @addr. */
static inline int change_page_attr_clear(unsigned long addr, int numpages,
					 pgprot_t mask)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
}

774
int _set_memory_uc(unsigned long addr, int numpages)
775 776
{
	return change_page_attr_set(addr, numpages,
777
				    __pgprot(_PAGE_CACHE_UC));
778
}
779 780 781 782 783 784 785 786 787

/*
 * Reserve the UC memory type for @numpages pages starting at the
 * virtual address @addr and change their mapping to uncached.
 *
 * Returns -EINVAL when the reservation fails, otherwise the result of
 * _set_memory_uc().
 */
int set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * NOTE(review): memtype reservations are expected to be keyed by
	 * physical address, so translate the virtual address first.
	 */
	unsigned long paddr = __pa(addr);

	if (reserve_memtype(paddr, paddr + numpages * PAGE_SIZE,
			    _PAGE_CACHE_UC, NULL))
		return -EINVAL;

	return _set_memory_uc(addr, numpages);
}
788 789
EXPORT_SYMBOL(set_memory_uc);

790
int _set_memory_wb(unsigned long addr, int numpages)
791
{
792
	return change_page_attr_clear(addr, numpages,
793
				      __pgprot(_PAGE_CACHE_MASK));
794
}
795 796 797 798 799 800 801

/*
 * Release the memory type reservation for @numpages pages starting at
 * the virtual address @addr and clear the cache attribute bits of
 * their mapping.
 */
int set_memory_wb(unsigned long addr, int numpages)
{
	/*
	 * NOTE(review): memtype reservations are expected to be keyed by
	 * physical address, so translate the virtual address first.
	 */
	unsigned long paddr = __pa(addr);

	free_memtype(paddr, paddr + numpages * PAGE_SIZE);

	return _set_memory_wb(addr, numpages);
}
802 803 804 805
EXPORT_SYMBOL(set_memory_wb);

int set_memory_x(unsigned long addr, int numpages)
{
806
	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
807 808 809 810 811
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
812
	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
813 814 815 816 817
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
818
	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
819 820 821 822
}

int set_memory_rw(unsigned long addr, int numpages)
{
823
	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
824
}
I
Ingo Molnar 已提交
825 826 827

int set_memory_np(unsigned long addr, int numpages)
{
828
	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
I
Ingo Molnar 已提交
829
}
830 831 832 833 834

/* struct page variant of set_memory_uc(): operates on page_address(@page). */
int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

/* struct page variant of set_memory_wb(): operates on page_address(@page). */
int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

/* struct page variant of set_memory_x(): operates on page_address(@page). */
int set_pages_x(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

/* struct page variant of set_memory_nx(): operates on page_address(@page). */
int set_pages_nx(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

/* struct page variant of set_memory_ro(): operates on page_address(@page). */
int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

/* struct page variant of set_memory_rw(): operates on page_address(@page). */
int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

L
Linus Torvalds 已提交
877
#ifdef CONFIG_DEBUG_PAGEALLOC
I
Ingo Molnar 已提交
878 879 880

static int __set_pages_p(struct page *page, int numpages)
{
T
Thomas Gleixner 已提交
881 882 883 884
	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0)};
885

T
Thomas Gleixner 已提交
886
	return __change_page_attr_set_clr(&cpa, 1);
I
Ingo Molnar 已提交
887 888 889 890
}

static int __set_pages_np(struct page *page, int numpages)
{
T
Thomas Gleixner 已提交
891 892 893 894
	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
895

T
Thomas Gleixner 已提交
896
	return __change_page_attr_set_clr(&cpa, 1);
I
Ingo Molnar 已提交
897 898
}

L
Linus Torvalds 已提交
899 900 901 902
void kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
903
	if (!enable) {
904 905
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
906
	}
907

908 909 910 911 912 913
	/*
	 * If page allocator is not up yet then do not call c_p_a():
	 */
	if (!debug_pagealloc_enabled)
		return;

914
	/*
I
Ingo Molnar 已提交
915 916 917 918 919 920
	 * The return value is ignored as the calls cannot fail.
	 * Large pages are kept enabled at boot time, and are
	 * split up quickly with DEBUG_PAGEALLOC. If a splitup
	 * fails here (due to temporary memory shortage) no damage
	 * is done because we just keep the largepage intact up
	 * to the next attempt when it will likely be split up:
L
Linus Torvalds 已提交
921
	 */
I
Ingo Molnar 已提交
922 923 924 925
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);
926 927

	/*
928 929
	 * We should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu:
L
Linus Torvalds 已提交
930 931
	 */
	__flush_tlb_all();
932 933 934 935 936

	/*
	 * Try to refill the page pool here. We can do this only after
	 * the tlb flush.
	 */
937
	cpa_fill_pool(NULL);
L
Linus Torvalds 已提交
938
}
939

940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978
#ifdef CONFIG_DEBUG_FS
/* seq_file show callback: print the page pool statistics. */
static int dpa_show(struct seq_file *m, void *v)
{
	seq_puts(m, "DEBUG_PAGEALLOC\n");
	seq_printf(m,
		   "pool_size     : %lu\n"
		   "pool_pages    : %lu\n"
		   "pool_low      : %lu\n"
		   "pool_used     : %lu\n"
		   "pool_failed   : %lu\n",
		   pool_size, pool_pages, pool_low, pool_used, pool_failed);

	return 0;
}

/* open callback: bind dpa_show() to the file via single_open(). */
static int dpa_open(struct inode *inode, struct file *filp)
{
	int err = single_open(filp, dpa_show, NULL);

	return err;
}

/* File operations of the "debug_pagealloc" debugfs file. */
static const struct file_operations dpa_fops = {
	.release	= single_release,
	.llseek		= seq_lseek,
	.read		= seq_read,
	.open		= dpa_open,
};

/*
 * Create the "debug_pagealloc" debugfs file. Returns -ENOMEM when
 * debugfs_create_file() returns NULL, 0 otherwise.
 */
int __init debug_pagealloc_proc_init(void)
{
	struct dentry *entry = debugfs_create_file("debug_pagealloc", 0600,
						   NULL, NULL, &dpa_fops);

	return entry ? 0 : -ENOMEM;
}
__initcall(debug_pagealloc_proc_init);
#endif

979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995
#ifdef CONFIG_HIBERNATION

/*
 * Return true when @page is currently mapped present in the kernel
 * mapping. Highmem pages and pages without a page table entry are
 * reported as not present.
 */
bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	/* lookup_address() returns NULL when an upper level entry is empty: */
	if (!pte)
		return false;

	return (pte_val(*pte) & _PAGE_PRESENT) != 0;
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */
996 997 998 999 1000 1001 1002 1003

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif