/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

15 16
#include <linux/debugfs.h>
#include <linux/mm.h>
17
#include <linux/init.h>
18
#include <linux/sched.h>
19 20
#include <linux/seq_file.h>

21
#include <asm/kasan.h>
22 23 24 25 26 27 28 29 30 31 32 33
#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;
	pgprot_t current_prot;
	unsigned long start_address;
	unsigned long current_address;
34
	const struct addr_marker *marker;
35
	unsigned long lines;
36
	bool to_dmesg;
S
Stephen Smalley 已提交
37 38
	bool check_wx;
	unsigned long wx_pages;
39 40
};

41 42 43
struct addr_marker {
	/* First address belonging to the region */
	unsigned long start_address;
	/* Label printed in the "---[ name ]---" header line */
	const char *name;
	/* Maximum number of lines printed for the region; 0 means no limit */
	unsigned long max_lines;
};

/* Indices into address_markers[]; keep in sync with the address_markers[] table below */
enum address_markers_idx {
	USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
# ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
# endif
# ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
# endif
# ifdef CONFIG_EFI
	/*
	 * address_markers[] carries an "EFI Runtime Services" slot at this
	 * position; without a matching index every later constant would
	 * point one entry too early on CONFIG_EFI builds.
	 */
	EFI_END_NR,
# endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
#else
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
# endif
	FIXADDR_START_NR,
#endif
};

/* Address space marker hints */
static struct addr_marker address_markers[] = {
	/*
	 * Entries must stay sorted by start address and in the same order
	 * as enum address_markers_idx. A start address written as 0 with a
	 * name in a comment is a placeholder: the value is not a
	 * compile-time constant and is filled in by pt_dump_init().
	 */
	{ 0, "User Space" },
#ifdef CONFIG_X86_64
	{ 0x8000000000000000UL, "Kernel Space" },
	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/* VMEMMAP_START */, "Vmemmap" },
# ifdef CONFIG_KASAN
	{ KASAN_SHADOW_START,	"KASAN shadow" },
	{ KASAN_SHADOW_END,	"KASAN shadow end" },
# endif
# ifdef CONFIG_X86_ESPFIX64
	/* Third field is max_lines: print at most 16 lines for this region */
	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
# endif
# ifdef CONFIG_EFI
	{ EFI_VA_END,		"EFI Runtime Services" },
# endif
	{ __START_KERNEL_map,   "High Kernel Mapping" },
	{ MODULES_VADDR,        "Modules" },
	{ MODULES_END,          "End Modules" },
#else
	{ PAGE_OFFSET,          "Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
# endif
	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
#endif
	/*
	 * Sentinel: -1 converts to the largest unsigned long, so the
	 * "current_address >= marker[1].start_address" test in note_page()
	 * never advances past the last real entry.
	 */
	{ -1, NULL }		/* End of list */
};
108

109 110 111 112
/*
 * Multipliers for offsets within the page tables: the number of bytes of
 * virtual address space covered by a single entry at each level. The
 * walkers compute an entry's address as base + index * <LEVEL>_LEVEL_MULT.
 */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
115

116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
/*
 * Start a new output record: goes to the kernel log at KERN_INFO when
 * @to_dmesg is set, otherwise to seq_file @m. Nothing is printed when
 * @to_dmesg is clear and @m is NULL. @fmt has to be a string literal
 * because it is pasted directly after the log-level prefix.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_INFO fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})

/*
 * Continue the record in progress: uses KERN_CONT when @to_dmesg is set,
 * otherwise appends to seq_file @m. Nothing is printed when @to_dmesg is
 * clear and @m is NULL. @fmt has to be a string literal because it is
 * pasted directly after the log-level prefix.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_CONT fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})

134 135 136
/*
 * Print a readable form of a pgprot_t to the seq_file
 */
137
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
138
{
139 140 141 142 143 144
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "pud", "pmd", "pte" };

	if (!pgprot_val(prot)) {
		/* Not present */
145
		pt_dump_cont_printf(m, dmsg, "                              ");
146 147
	} else {
		if (pr & _PAGE_USER)
148
			pt_dump_cont_printf(m, dmsg, "USR ");
149
		else
150
			pt_dump_cont_printf(m, dmsg, "    ");
151
		if (pr & _PAGE_RW)
152
			pt_dump_cont_printf(m, dmsg, "RW ");
153
		else
154
			pt_dump_cont_printf(m, dmsg, "ro ");
155
		if (pr & _PAGE_PWT)
156
			pt_dump_cont_printf(m, dmsg, "PWT ");
157
		else
158
			pt_dump_cont_printf(m, dmsg, "    ");
159
		if (pr & _PAGE_PCD)
160
			pt_dump_cont_printf(m, dmsg, "PCD ");
161
		else
162
			pt_dump_cont_printf(m, dmsg, "    ");
163

164 165 166 167 168 169 170
		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 3 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 4 && pr & _PAGE_PAT) ||
		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
171
			pt_dump_cont_printf(m, dmsg, "PAT ");
172 173
		else
			pt_dump_cont_printf(m, dmsg, "    ");
174
		if (pr & _PAGE_GLOBAL)
175
			pt_dump_cont_printf(m, dmsg, "GLB ");
176
		else
177
			pt_dump_cont_printf(m, dmsg, "    ");
178
		if (pr & _PAGE_NX)
179
			pt_dump_cont_printf(m, dmsg, "NX ");
180
		else
181
			pt_dump_cont_printf(m, dmsg, "x  ");
182
	}
183
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
184 185 186
}

/*
 * On 64 bits, sign-extend the virtual address to the full 64 bits so that
 * addresses in the upper half are printed in canonical form.
 *
 * The number of implemented address bits is taken from
 * __VIRTUAL_MASK_SHIFT rather than assumed to be 48, so the result stays
 * correct when the address width changes; with 48-bit addresses the shift
 * evaluates to 16. On 32 bits the address is returned unchanged.
 */
static unsigned long normalize_addr(unsigned long u)
{
#ifdef CONFIG_X86_64
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);

	/* Relies on arithmetic right shift of a signed long */
	return (signed long)(u << shift) >> shift;
#else
	return u;
#endif
}

/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
204
		      pgprot_t new_prot, int level)
205
{
206
	pgprotval_t prot, cur;
207
	static const char units[] = "BKMGTPE";
208 209 210

	/*
	 * If we have a "break" in the series, we need to flush the state that
211 212
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
213
	 */
214 215
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);
216

217 218 219 220 221
	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->level = level;
		st->marker = address_markers;
222
		st->lines = 0;
223 224
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
225 226 227
	} else if (prot != cur || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
228
		unsigned long delta;
229
		int width = sizeof(unsigned long) * 2;
S
Stephen Smalley 已提交
230 231 232 233 234 235 236 237 238 239
		pgprotval_t pr = pgprot_val(st->current_prot);

		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}
240 241 242 243

		/*
		 * Now print the actual finished series
		 */
244 245 246 247 248 249
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx   ",
					   width, st->start_address,
					   width, st->current_address);
250

251 252 253 254 255 256 257 258 259
			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
260
		}
261
		st->lines++;
262 263 264 265 266 267 268

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
269 270 271 272 273 274 275 276 277
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
278
			st->marker++;
279
			st->lines = 0;
280 281
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
282
		}
283

284 285 286
		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->level = level;
287
	}
288 289
}

290
/*
 * Note every entry of the PTE table referenced by the PMD entry @addr.
 * @P is the virtual address mapped by the first entry of that table.
 */
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(addr);
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		pgprotval_t prot = pte_flags(*pte);

		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		note_page(m, st, __pgprot(prot), 4);
	}
}

305
#if PTRS_PER_PMD > 1

/*
 * Walk the PMD table referenced by the PUD entry @addr. @P is the virtual
 * address mapped by the first entry of that table.
 *
 * Empty entries are noted with a zero protection value. Large or
 * non-present entries are noted with their own flags. Everything else
 * points to a PTE table, which is walked in turn.
 */
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
	pmd_t *pmd = (pmd_t *)pud_page_vaddr(addr);
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*pmd)) {
			if (pmd_large(*pmd) || !pmd_present(*pmd)) {
				pgprotval_t prot = pmd_flags(*pmd);

				note_page(m, st, __pgprot(prot), 3);
			} else {
				walk_pte_level(m, st, *pmd,
					       P + i * PMD_LEVEL_MULT);
			}
		} else {
			note_page(m, st, __pgprot(0), 3);
		}
	}
}

#else
/* Only one PMD entry per table: treat the PUD entry as a PMD entry */
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
335

336 337
#if PTRS_PER_PUD > 1

/*
 * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
 * KASAN fills page tables with the same values. Since there is no
 * point in checking page table more than once we just skip repeated
 * entries. This saves us dozens of seconds during boot.
 *
 * Returns true only in W+X checking mode, and only when @pud holds the
 * same value as the entry visited immediately before it.
 */
static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
{
	return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}

/*
 * Walk the PUD table referenced by the P4D entry @addr. @P is the virtual
 * address mapped by the first entry of that table.
 *
 * Empty entries, and repeated entries when checking for W+X, are noted
 * with a zero protection value. Large or non-present entries are noted
 * with their own flags. Everything else points to a PMD table, which is
 * walked in turn.
 */
static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
	pud_t *pud = (pud_t *)p4d_page_vaddr(addr);
	pud_t *prev_pud = NULL;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*pud) &&
		    !pud_already_checked(prev_pud, pud, st->check_wx)) {
			if (pud_large(*pud) || !pud_present(*pud)) {
				pgprotval_t prot = pud_flags(*pud);

				note_page(m, st, __pgprot(prot), 2);
			} else {
				walk_pmd_level(m, st, *pud,
					       P + i * PUD_LEVEL_MULT);
			}
		} else {
			note_page(m, st, __pgprot(0), 2);
		}

		prev_pud = pud;
		pud++;
	}
}

#else
/* Only one PUD entry per table: treat the P4D entry as a PUD entry */
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

#if PTRS_PER_P4D > 1

/*
 * Walk the P4D table referenced by the PGD entry @addr. @P is the virtual
 * address mapped by the first entry of that table.
 *
 * Empty entries are noted with a zero protection value. Large or
 * non-present entries are noted with their own flags. Everything else
 * points to a PUD table, which is walked in turn.
 *
 * NOTE(review): entries found here are reported with level 2, the same
 * value walk_pud_level() uses, so they are labelled "pud" in the output.
 * Giving P4D its own level means renumbering every walker and extending
 * level_name[] in printk_prot() together.
 */
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
	p4d_t *p4d = (p4d_t *)pgd_page_vaddr(addr);
	int i;

	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*p4d)) {
			if (p4d_large(*p4d) || !p4d_present(*p4d)) {
				pgprotval_t prot = p4d_flags(*p4d);

				note_page(m, st, __pgprot(prot), 2);
			} else {
				walk_pud_level(m, st, *p4d,
					       P + i * P4D_LEVEL_MULT);
			}
		} else {
			note_page(m, st, __pgprot(0), 2);
		}
	}
}

#else
/* Only one P4D entry per table: treat the PGD entry as a P4D entry */
#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
#define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
#endif

416 417
/*
 * Tell whether top-level page table slot @idx lies in the range reserved
 * for the hypervisor: the 16 slots directly below the one that maps
 * __PAGE_OFFSET. Always false on 32 bits.
 */
static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor.
	 */
	return	(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
		(idx <  pgd_index(__PAGE_OFFSET));
#else
	return false;
#endif
}
429

S
Stephen Smalley 已提交
430 431
/*
 * Walk a complete set of page tables, feeding every entry to note_page().
 *
 * @m:       seq_file receiving the dump; may be NULL
 * @pgd:     top-level table to walk, or NULL for the kernel's own tables.
 *           A non-NULL @pgd also redirects the output to the kernel log.
 * @checkwx: count writable + executable mappings and log a summary line
 */
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot;
	int i;
	/* Zero-initialised, so wx_pages and level both start out as 0 */
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = true;
	}

	st.check_wx = checkwx;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			if (pgd_large(*start) || !pgd_present(*start)) {
				prot = pgd_flags(*start);
				note_page(m, &st, __pgprot(prot), 1);
			} else {
				walk_p4d_level(m, &st, *start,
					       i * PGD_LEVEL_MULT);
			}
		} else {
			/* Empty or hypervisor-reserved slot */
			note_page(m, &st, __pgprot(0), 1);
		}

		/* A full walk is long; yield between top-level entries */
		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

/*
 * Dump the page tables rooted at @pgd, or the kernel's own page tables
 * when @pgd is NULL, without W+X checking. Output goes to @m, or to the
 * kernel log when @pgd is non-NULL.
 */
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
485

S
Stephen Smalley 已提交
486 487 488 489 490
void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true);
}

491
/*
 * Fill in the address_markers[] entries whose start address is left as a
 * 0 placeholder in the table initialiser. Always returns 0.
 */
static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif

	return 0;
}
__initcall(pt_dump_init);