/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>

#include <asm/kasan.h>
#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;
	pgprot_t current_prot;
	unsigned long start_address;
	unsigned long current_address;
	const struct addr_marker *marker;
	unsigned long lines;
	bool to_dmesg;
	bool check_wx;
	unsigned long wx_pages;
};
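
/*
 * For illustration, note_page() below renders each coalesced run as,
 * e.g. (addresses, sizes and flags depend on the running kernel):
 *
 *   ---[ High Kernel Mapping ]---
 *   0xffffffff81000000-0xffffffff81e00000    14M  ro  PSE  GLB  x  pmd
 */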

struct addr_marker {
	unsigned long start_address;
	const char *name;
	unsigned long max_lines;
};

/* Indices for address_markers; keep in sync with address_markers below */
enum address_markers_idx {
	USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
# ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
# endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
#else
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
# endif
	FIXADDR_START_NR,
#endif
};

/* Address space marker hints */
static struct addr_marker address_markers[] = {
	{ 0, "User Space" },
#ifdef CONFIG_X86_64
	{ 0x8000000000000000UL, "Kernel Space" },
	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
	{ KASAN_SHADOW_START,	"KASAN shadow" },
	{ KASAN_SHADOW_END,	"KASAN shadow end" },
#endif
# ifdef CONFIG_X86_ESPFIX64
	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
# endif
# ifdef CONFIG_EFI
	{ EFI_VA_END,		"EFI Runtime Services" },
# endif
	{ __START_KERNEL_map,   "High Kernel Mapping" },
	{ MODULES_VADDR,        "Modules" },
	{ MODULES_END,          "End Modules" },
#else
	{ PAGE_OFFSET,          "Kernel Mapping" },
	{ 0/* VMALLOC_START */, "vmalloc() Area" },
	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
# endif
	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
#endif
	{ -1, NULL }		/* End of list */
};
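
/*
 * The zeroed start_address slots above are not compile-time constants;
 * pt_dump_init() at the end of this file fills them in at boot.
 */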

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
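
/*
 * Worked example with 4 KiB pages and 512 entries per level (typical
 * x86-64): PTE_LEVEL_MULT = 4 KiB, PMD_LEVEL_MULT = 2 MiB,
 * PUD_LEVEL_MULT = 1 GiB and P4D_LEVEL_MULT = 512 GiB; with 4-level
 * paging PTRS_PER_P4D == 1, so PGD_LEVEL_MULT stays 512 GiB as well.
 */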

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)					\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)					\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})
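
/*
 * Note: with a NULL seq_file and to_dmesg false (as in the W+X scan),
 * these wrappers print nothing at all; to_dmesg is only set when a pgd
 * is handed to ptdump_walk_pgd_level() for dumping to the kernel log.
 */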

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "pud", "pmd", "pte" };

	if (!pgprot_val(prot)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, "                              ");
	} else {
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 3 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 4 && pr & _PAGE_PAT) ||
		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x  ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}

/*
 * On 64-bit kernels, sign-extend the 48-bit address to 64 bits
 */
static unsigned long normalize_addr(unsigned long u)
{
	int shift;
	if (!IS_ENABLED(CONFIG_X86_64))
		return u;

	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	return (signed long)(u << shift) >> shift;
}
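
/*
 * e.g. with __VIRTUAL_MASK_SHIFT == 47 (4-level paging), shift == 16:
 * 0x0000800000000000 << 16 puts bit 47 into the sign bit, and the
 * arithmetic shift back down yields 0xffff800000000000.
 */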

/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, int level)
{
	pgprotval_t prot, cur;
	static const char units[] = "BKMGTPE";

	/*
	 * If we have a "break" in the series, we need to flush the state that
	 * we have now. "break" is either changing perms, levels or
	 * address space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);

	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;
		pgprotval_t pr = pgprot_val(st->current_prot);

		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx   ",
					   width, st->start_address,
					   width, st->current_address);

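			/* Scale the span to the largest exact binary unit. */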
			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->level = level;
	}
}

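/*
 * Leaf level: the level argument to note_page() indexes level_name[]
 * in printk_prot() (1 = pgd, 2 = pud, 3 = pmd, 4 = pte; the p4d walk
 * below reuses level 2).
 */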
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
	int i;
	pte_t *start;
	pgprotval_t prot;

	start = (pte_t *)pmd_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		prot = pte_flags(*start);
		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		note_page(m, st, __pgprot(prot), 4);
		start++;
	}
}

#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
	int i;
	pmd_t *start;
	pgprotval_t prot;

	start = (pmd_t *)pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*start)) {
			if (pmd_large(*start) || !pmd_present(*start)) {
				prot = pmd_flags(*start);
				note_page(m, st, __pgprot(prot), 3);
			} else {
				walk_pte_level(m, st, *start,
					       P + i * PMD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 3);
		start++;
	}
}

#else
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
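
/*
 * When the PMD is folded (2-level paging), a PUD entry maps PTEs
 * directly, so the PMD walk collapses into walk_pte_level() and the
 * pud_* helpers are expressed through their pmd_* counterparts above.
 */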

#if PTRS_PER_PUD > 1

/*
 * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
 * KASAN fills page tables with the same values. Since there is no
 * point in checking page table more than once we just skip repeated
 * entries. This saves us dozens of seconds during boot.
 */
static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
{
	return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
	int i;
	pud_t *start;
	pgprotval_t prot;
	pud_t *prev_pud = NULL;

	start = (pud_t *)p4d_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*start) &&
		    !pud_already_checked(prev_pud, start, st->check_wx)) {
			if (pud_large(*start) || !pud_present(*start)) {
				prot = pud_flags(*start);
				note_page(m, st, __pgprot(prot), 2);
			} else {
				walk_pmd_level(m, st, *start,
					       P + i * PUD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 2);

		prev_pud = start;
		start++;
	}
}

#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

#if PTRS_PER_P4D > 1

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
	int i;
	p4d_t *start;
	pgprotval_t prot;

	start = (p4d_t *)pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_P4D; i++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*start)) {
			if (p4d_large(*start) || !p4d_present(*start)) {
				prot = p4d_flags(*start);
				note_page(m, st, __pgprot(prot), 2);
			} else {
				walk_pud_level(m, st, *start,
					       P + i * P4D_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 2);

		start++;
	}
}

#else
#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
#define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor.
	 */
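	/*
	 * Illustration: with 4-level paging and the classic (pre-KASLR)
	 * __PAGE_OFFSET of 0xffff880000000000, pgd_index(__PAGE_OFFSET)
	 * is 272, so indices 256-271 match, i.e. exactly the range above.
	 */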
	return	(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
		(idx <  pgd_index(__PAGE_OFFSET));
#else
	return false;
#endif
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_top_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot;
	int i;
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = true;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			if (pgd_large(*start) || !pgd_present(*start)) {
				prot = pgd_flags(*start);
				note_page(m, &st, __pgprot(prot), 1);
			} else {
				walk_p4d_level(m, &st, *start,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 1);

		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);

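/*
 * Invoked from the CONFIG_DEBUG_WX check once the permanent kernel
 * mappings are in place (typically after mark_rodata_ro()); the walk
 * itself prints nothing and only W+X pages are reported.
 */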
void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true);
}

static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif

	return 0;
}
__initcall(pt_dump_init);