/*
 * Low level x86 E820 memory map handling functions.
 *
 * The firmware and bootloader pass us the "E820 table", which is the primary
 * physical memory layout description available about x86 systems.
 *
 * The kernel takes the E820 memory layout and optionally modifies it with
 * quirks and other tweaks, and feeds that into the generic Linux memory
 * allocation code routines via a platform independent interface (memblock, etc.).
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/export.h>
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/setup.h>
#include <asm/cpufeature.h>

/*
 * We organize the E820 table into two main data structures:
 *
 * - 'e820_table_firmware': the original firmware version passed to us by the
 *   bootloader - not modified by the kernel. We use this to:
 *
 *       - inform the user about the firmware's notion of memory layout
 *         via /sys/firmware/memmap
 *
 *       - the hibernation code uses it to generate a kernel-independent MD5
 *         fingerprint of the physical memory layout of a system.
 *
 *       - kexec, which is a bootloader in disguise, uses the original E820
 *         layout to pass to the kexec-ed kernel. This way the original kernel
 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
 *         can have access to full memory - etc.
 *
 * - 'e820_table': this is the main E820 table that is massaged by the
 *   low level x86 platform code, or modified by boot parameters, before
 *   being passed on to higher level MM layers.
 *
 * Once the E820 map has been converted to the standard Linux memory layout
 * information its role stops - modifying it has no effect and does not get
 * re-propagated. So its main role is a temporary bootstrap storage of firmware
 * specific memory layout data during early bootup.
 */
static struct e820_table e820_table_init		__initdata;
static struct e820_table e820_table_firmware_init	__initdata;

struct e820_table *e820_table __refdata			= &e820_table_init;
struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;

/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif

/*
 * This function checks if any part of the range <start,end> is mapped
 * with type.
 */
int e820_any_mapped(u64 start, u64 end, unsigned type)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (type && entry->type != type)
			continue;
		if (entry->addr >= end || entry->addr + entry->size <= start)
			continue;
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(e820_any_mapped);

/*
 * This function checks if the entire <start,end> range is mapped with 'type'.
 *
 * Note: this function only works correctly once the E820 table is sorted and
 * not-overlapping (at least for the range specified), which is the case normally.
 */
int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (type && entry->type != type)
			continue;

		/* Is the region (part) in overlap with the current region? */
		if (entry->addr >= end || entry->addr + entry->size <= start)
			continue;

		/*
		 * If the region is at the beginning of <start,end> we move
		 * 'start' to the end of the region since it's ok until there
		 */
		if (entry->addr <= start)
			start = entry->addr + entry->size;

		/*
		 * If 'start' is now at or beyond 'end', we're done, full
		 * coverage of the desired range exists:
		 */
		if (start >= end)
			return 1;
	}
	return 0;
}

/*
 * Add a memory region to the kernel E820 map.
 */
static void __init __e820_add_region(struct e820_table *table, u64 start, u64 size, int type)
{
	int x = table->nr_entries;

	if (x >= ARRAY_SIZE(table->entries)) {
		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
		return;
	}

	table->entries[x].addr = start;
	table->entries[x].size = size;
	table->entries[x].type = type;
	table->nr_entries++;
}

void __init e820_add_region(u64 start, u64 size, int type)
{
	__e820_add_region(e820_table, start, size, type);
}

static void __init e820_print_type(u32 type)
{
	switch (type) {
	case E820_RAM:			/* Fall through: */
	case E820_RESERVED_KERN:	pr_cont("usable");			break;
	case E820_RESERVED:		pr_cont("reserved");			break;
	case E820_ACPI:			pr_cont("ACPI data");			break;
	case E820_NVS:			pr_cont("ACPI NVS");			break;
	case E820_UNUSABLE:		pr_cont("unusable");			break;
	case E820_PMEM:			/* Fall through: */
	case E820_PRAM:			pr_cont("persistent (type %u)", type);	break;
	default:			pr_cont("type %u", type);		break;
	}
}

void __init e820_print_map(char *who)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
		       e820_table->entries[i].addr,
		       e820_table->entries[i].addr + e820_table->entries[i].size - 1);

		e820_print_type(e820_table->entries[i].type);
		pr_cont("\n");
	}
}

/*
 * Sanitize the BIOS E820 map.
 *
 * Some E820 responses include overlapping entries. The following
 * replaces the original E820 map with a new one, removing overlaps,
 * and resolving conflicting memory types in favor of the highest
 * numbered type.
 *
 * The input parameter 'biosmap' points to an array of 'struct
 * e820_entry' which on entry has elements in the range [0, *pnr_map)
 * valid, and which has space for up to max_nr_map entries.
 * On return, the resulting sanitized E820 map entries will be
 * overwritten in the same location, starting at biosmap.
 *
 * The integer pointed to by pnr_map must be valid on entry (the
 * current number of valid entries located at biosmap). If the
 * sanitizing succeeds the *pnr_map will be updated with the new
 * number of valid entries (something no more than max_nr_map).
 *
 * The return value from sanitize_e820_table() is zero if it
 * successfully 'sanitized' the map entries passed in, and is -1
 * if it did nothing, which can happen if either of (1) it was
 * only passed one map entry, or (2) any of the input map entries
 * were invalid (start + size < start, meaning that the size was
 * so big the described memory range wrapped around through zero.)
 *
 *	Visually we're performing the following
 *	(1,2,3,4 = memory types)...
 *
 *	Sample memory map (w/overlaps):
 *	   ____22__________________
 *	   ______________________4_
 *	   ____1111________________
 *	   _44_____________________
 *	   11111111________________
 *	   ____________________33__
 *	   ___________44___________
 *	   __________33333_________
 *	   ______________22________
 *	   ___________________2222_
 *	   _________111111111______
 *	   _____________________11_
 *	   _________________4______
 *
 *	Sanitized equivalent (no overlap):
 *	   1_______________________
 *	   _44_____________________
 *	   ___1____________________
 *	   ____22__________________
 *	   ______11________________
 *	   _________1______________
 *	   __________3_____________
 *	   ___________44___________
 *	   _____________33_________
 *	   _______________2________
 *	   ________________1_______
 *	   _________________4______
 *	   ___________________2____
 *	   ____________________33__
 *	   ______________________4_
 */
struct change_member {
	/* Pointer to the original BIOS entry: */
	struct e820_entry	*pbios;
	/* Address for this change point: */
	unsigned long long	addr;
};

static int __init cpcompare(const void *a, const void *b)
{
	struct change_member * const *app = a, * const *bpp = b;
	const struct change_member *ap = *app, *bp = *bpp;

	/*
	 * Inputs are pointers to two elements of change_point[].  If their
	 * addresses are not equal, their difference dominates.  If the addresses
	 * are equal, then consider one that represents the end of its region
	 * to be greater than one that does not.
	 */
	if (ap->addr != bp->addr)
		return ap->addr > bp->addr ? 1 : -1;

	return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
}

int __init sanitize_e820_table(struct e820_entry *biosmap, int max_nr_map, u32 *pnr_map)
{
	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
	static struct change_member *change_point[2*E820_X_MAX] __initdata;
	static struct e820_entry *overlap_list[E820_X_MAX] __initdata;
	static struct e820_entry new_bios[E820_X_MAX] __initdata;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	int chgidx;
	int overlap_entries;
	int new_bios_entry;
	int old_nr, new_nr, chg_nr;
	int i;

	/* If there's only one memory region, don't bother: */
	if (*pnr_map < 2)
		return -1;

	old_nr = *pnr_map;
	BUG_ON(old_nr > max_nr_map);

	/* Bail out if we find any unreasonable addresses in the BIOS map: */
	for (i = 0; i < old_nr; i++) {
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
			return -1;
	}

	/* Create pointers for initial change-point information (for sorting): */
	for (i = 0; i < 2 * old_nr; i++)
		change_point[i] = &change_point_list[i];

	/*
	 * Record all known change-points (starting and ending addresses),
	 * omitting empty memory regions:
	 */
	chgidx = 0;
	for (i = 0; i < old_nr; i++)	{
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr	= biosmap[i].addr;
			change_point[chgidx++]->pbios	= &biosmap[i];
			change_point[chgidx]->addr	= biosmap[i].addr + biosmap[i].size;
			change_point[chgidx++]->pbios	= &biosmap[i];
		}
	}
	chg_nr = chgidx;

	/* Sort change-point list by memory addresses (low -> high): */
	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);

	/* Create a new BIOS memory map, removing overlaps: */
	overlap_entries = 0;	 /* Number of entries in the overlap table */
	new_bios_entry = 0;	 /* Index for creating new bios map entries */
	last_type = 0;		 /* Start with undefined memory type */
	last_addr = 0;		 /* Start with 0 as last starting address */

	/* Loop through change-points, determining effect on the new BIOS map: */
	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
		/* Keep track of all overlapping BIOS entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) {
			/* Add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++] = change_point[chgidx]->pbios;
		} else {
			/* Remove entry from list (order independent, so swap with last): */
			for (i = 0; i < overlap_entries; i++) {
				if (overlap_list[i] == change_point[chgidx]->pbios)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/*
		 * If there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		current_type = 0;
		for (i = 0; i < overlap_entries; i++) {
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		}

		/* Continue building up new BIOS map based on this information: */
		if (current_type != last_type || current_type == E820_PRAM) {
			if (last_type != 0)	 {
				new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr;
				/* Move forward only if the new size was non-zero: */
				if (new_bios[new_bios_entry].size != 0)
					/* No more space left for new BIOS entries? */
					if (++new_bios_entry >= max_nr_map)
						break;
			}
			if (current_type != 0)	{
				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr = change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}

	/* Retain count for new BIOS entries: */
	new_nr = new_bios_entry;

	/* Copy new BIOS mapping into the original location: */
	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820_entry));
	*pnr_map = new_nr;

	return 0;
}

static int __init __append_e820_table(struct e820_entry *biosmap, int nr_map)
{
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size - 1;
		u32 type = biosmap->type;

		/* Ignore the entry on 64-bit overflow: */
		if (start > end && likely(size))
			return -1;

		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}

/*
 * Copy the BIOS E820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 */
static int __init append_e820_table(struct e820_entry *biosmap, int nr_map)
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_map < 2)
		return -1;

	return __append_e820_table(biosmap, nr_map);
}

static u64 __init
__e820_update_range(struct e820_table *table, u64 start, u64 size, unsigned old_type, unsigned new_type)
{
	u64 end;
	unsigned int i;
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	pr_debug("e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
	e820_print_type(old_type);
	pr_cont(" ==> ");
	e820_print_type(new_type);
	pr_cont("\n");

	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered by new range? */
		if (entry->addr >= start && entry_end <= end) {
			entry->type = new_type;
			real_updated_size += entry->size;
			continue;
		}

		/* New range is completely covered? */
		if (entry->addr < start && entry_end > end) {
			__e820_add_region(table, start, size, new_type);
			__e820_add_region(table, end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_updated_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		__e820_add_region(table, final_start, final_end - final_start, new_type);

		real_updated_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * its size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		entry->addr = final_end;
	}
	return real_updated_size;
}

u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type)
{
	return __e820_update_range(e820_table, start, size, old_type, new_type);
}

static u64 __init e820_update_range_firmware(u64 start, u64 size, unsigned old_type, unsigned new_type)
{
	return __e820_update_range(e820_table_firmware, start, size, old_type, new_type);
}

/* Remove a range of memory from the E820 table: */
u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype)
{
	int i;
	u64 end;
	u64 real_removed_size = 0;

	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	pr_debug("e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
	if (checktype)
		e820_print_type(old_type);
	pr_cont("\n");

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (checktype && entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered? */
		if (entry->addr >= start && entry_end <= end) {
			real_removed_size += entry->size;
			memset(entry, 0, sizeof(struct e820_entry));
			continue;
		}

		/* Is the new range completely covered? */
		if (entry->addr < start && entry_end > end) {
			e820_add_region(end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_removed_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		real_removed_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * the size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		entry->addr = final_end;
	}
	return real_removed_size;
}

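/*
 * Re-sanitize the kernel E820 table after it has been modified during early
 * boot and print the updated layout; if there was nothing to sanitize, the
 * table is left untouched and nothing is printed:
 */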
void __init update_e820(void)
{
	if (sanitize_e820_table(e820_table->entries, ARRAY_SIZE(e820_table->entries), &e820_table->nr_entries))
		return;

	pr_info("e820: modified physical RAM map:\n");
	e820_print_map("modified");
}

static void __init update_e820_table_firmware(void)
{
	sanitize_e820_table(e820_table_firmware->entries, ARRAY_SIZE(e820_table_firmware->entries), &e820_table_firmware->nr_entries);
}

#define MAX_GAP_END 0x100000000ull

/*
 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
 */
static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
{
	unsigned long long last = MAX_GAP_END;
	int i = e820_table->nr_entries;
	int found = 0;

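	/*
	 * Walk the table from the last entry downwards (entries are assumed to
	 * be sorted by address): 'last' tracks the lowest start address seen so
	 * far, and any hole between an entry's end and 'last' is a candidate gap.
	 */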
	while (--i >= 0) {
		unsigned long long start = e820_table->entries[i].addr;
		unsigned long long end = start + e820_table->entries[i].size;

		/*
		 * Since "last" is at most 4GB, we know we'll
		 * fit in 32 bits if this condition is true:
		 */
		if (last > end) {
			unsigned long gap = last - end;

			if (gap >= *gapsize) {
				*gapsize = gap;
				*gapstart = end;
				found = 1;
			}
		}
		if (start < last)
			last = start;
	}
	return found;
}

/*
 * Search for the biggest gap in the low 32 bits of the E820
 * memory space. We pass this space to the PCI subsystem, so
 * that it can assign MMIO resources for hotplug or
 * unconfigured devices in.
 *
 * Hopefully the BIOS left enough space for it.
 */
__init void e820_setup_gap(void)
{
	unsigned long gapstart, gapsize;
	int found;

	gapsize = 0x400000;
	found  = e820_search_gap(&gapstart, &gapsize);

	if (!found) {
#ifdef CONFIG_X86_64
		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
		pr_err(
			"e820: Cannot find an available gap in the 32-bit address range\n"
			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
#else
		gapstart = 0x10000000;
#endif
	}

	/*
	 * e820_reserve_resources_late() protects stolen RAM already:
	 */
	pci_mem_start = gapstart;

	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
}

/*
 * Called late during init, in free_initmem().
 *
 * Initial e820_table and e820_table_firmware are largish __initdata arrays.
 *
 * Copy them to a (usually much smaller) dynamically allocated area that is
 * sized precisely after the number of e820 entries.
 *
 * This is done after we've performed all the fixes and tweaks to the tables.
 * All functions which modify them are __init functions, which won't exist
 * after free_initmem().
 */
__init void e820_reallocate_tables(void)
{
	struct e820_table *n;
	int size;

	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
	memcpy(n, e820_table, size);
	e820_table = n;

	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
	memcpy(n, e820_table_firmware, size);
	e820_table_firmware = n;
}

/*
 * Because of the small fixed size of struct boot_params, only the first
 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 * struct setup_data, which is parsed here.
 */
void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
	int entries;
	struct e820_entry *extmap;
	struct setup_data *sdata;

	sdata = early_memremap(phys_addr, data_len);
	entries = sdata->len / sizeof(struct e820_entry);
	extmap = (struct e820_entry *)(sdata->data);

	__append_e820_table(extmap, entries);
	sanitize_e820_table(e820_table->entries, ARRAY_SIZE(e820_table->entries), &e820_table->nr_entries);

	early_memunmap(sdata, data_len);
	pr_info("e820: extended physical RAM map:\n");
	e820_print_map("extended");
}

/**
 * Find the ranges of physical addresses that do not correspond to
 * E820 RAM areas and mark the corresponding pages as 'nosave' for
 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 *
 * This function requires the E820 map to be sorted and without any
 * overlapping entries.
 */
void __init e820_mark_nosave_regions(unsigned long limit_pfn)
{
	int i;
	unsigned long pfn = 0;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (pfn < PFN_UP(entry->addr))
			register_nosave_region(pfn, PFN_UP(entry->addr));

		pfn = PFN_DOWN(entry->addr + entry->size);

		if (entry->type != E820_RAM && entry->type != E820_RESERVED_KERN)
			register_nosave_region(PFN_UP(entry->addr), pfn);

		if (pfn >= limit_pfn)
			break;
	}
}

#ifdef CONFIG_ACPI
/*
 * Register ACPI NVS memory regions, so that we can save/restore them during
 * hibernation and the subsequent resume:
 */
static int __init e820_mark_nvs_memory(void)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (entry->type == E820_NVS)
			acpi_nvs_register(entry->addr, entry->size);
	}

	return 0;
}
core_initcall(e820_mark_nvs_memory);
#endif

/*
 * Allocate a memory area early via memblock and mark it as reserved in the
 * firmware E820 table (e820_table_firmware):
 */
u64 __init early_reserve_e820(u64 size, u64 align)
{
	u64 addr;

	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
	if (addr) {
		e820_update_range_firmware(addr, size, E820_RAM, E820_RESERVED);
		pr_info("e820: update e820_table_firmware for early_reserve_e820\n");
		update_e820_table_firmware();
	}

	return addr;
}

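/*
 * The highest page frame number that is architecturally addressable:
 * 36-bit physical addresses with PAE on 32-bit kernels, 32 bits without
 * PAE, and MAXMEM-derived on 64-bit kernels:
 */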
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
#endif

/*
 * Find the highest page frame number we have available
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		if (entry->type != type)
			continue;

		start_pfn = entry->addr >> PAGE_SHIFT;
		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		if (start_pfn >= limit_pfn)
			continue;
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
			 last_pfn, max_arch_pfn);
	return last_pfn;
}

unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}

unsigned long __init e820_end_of_low_ram_pfn(void)
{
	return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM);
}

static void __init early_panic(char *msg)
{
	early_printk(msg);
	panic(msg);
}

static int userdef __initdata;

/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
static int __init parse_memopt(char *p)
{
	u64 mem_size;

	if (!p)
		return -EINVAL;

	if (!strcmp(p, "nopentium")) {
#ifdef CONFIG_X86_32
		setup_clear_cpu_cap(X86_FEATURE_PSE);
		return 0;
#else
		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
		return -EINVAL;
#endif
	}

	userdef = 1;
	mem_size = memparse(p, &p);

	/* Don't remove all memory when getting "mem={invalid}" parameter: */
	if (mem_size == 0)
		return -EINVAL;

	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);

	return 0;
}
early_param("mem", parse_memopt);

static int __init parse_memmap_one(char *p)
{
	char *oldp;
	u64 start_at, mem_size;

	if (!p)
		return -EINVAL;

	if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * If we are doing a crash dump, we still need to know
		 * the real memory size before the original memory map is
		 * reset.
		 */
		saved_max_pfn = e820_end_of_ram_pfn();
#endif
		e820_table->nr_entries = 0;
		userdef = 1;
		return 0;
	}

	oldp = p;
	mem_size = memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	userdef = 1;
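	/*
	 * memmap=nn[KMG]@ss marks the range as usable RAM, '#' marks it as
	 * ACPI data, '$' as reserved and '!' as persistent (legacy) memory;
	 * with no suffix, all RAM above 'mem_size' is removed:
	 */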
	if (*p == '@') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_RAM);
	} else if (*p == '#') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_ACPI);
	} else if (*p == '$') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_RESERVED);
	} else if (*p == '!') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_PRAM);
	} else {
		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
	}

	return *p == '\0' ? 0 : -EINVAL;
}

static int __init parse_memmap_opt(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		parse_memmap_one(str);
		str = k;
	}

	return 0;
}
early_param("memmap", parse_memmap_opt);

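/*
 * Walk the setup_data linked list handed over by the bootloader and mark each
 * blob's memory as E820_RESERVED_KERN, then re-sanitize the E820 table and
 * mirror the result into e820_table_firmware:
 */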
void __init e820_reserve_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data;

	pa_data = boot_params.hdr.setup_data;
	if (!pa_data)
		return;

	while (pa_data) {
		data = early_memremap(pa_data, sizeof(*data));
		e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN);
		pa_data = data->next;
		early_memunmap(data, sizeof(*data));
	}

	sanitize_e820_table(e820_table->entries, ARRAY_SIZE(e820_table->entries), &e820_table->nr_entries);
	memcpy(e820_table_firmware, e820_table, sizeof(struct e820_table));
	printk(KERN_INFO "extended physical RAM map:\n");
	e820_print_map("reserve setup_data");
}

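/*
 * Called after the early parameters have been parsed: if the user changed the
 * layout via 'mem=' or 'memmap=', re-sanitize the E820 table and print the
 * user-defined map:
 */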
void __init finish_e820_parsing(void)
{
	if (userdef) {
		if (sanitize_e820_table(e820_table->entries, ARRAY_SIZE(e820_table->entries), &e820_table->nr_entries) < 0)
			early_panic("Invalid user supplied memory map");

		pr_info("e820: user-defined physical RAM map:\n");
		e820_print_map("user");
	}
}

static const char *__init e820_type_to_string(int e820_type)
{
	switch (e820_type) {
	case E820_RESERVED_KERN: /* Fall-through: */
	case E820_RAM:		 return "System RAM";
	case E820_ACPI:		 return "ACPI Tables";
	case E820_NVS:		 return "ACPI Non-volatile Storage";
	case E820_UNUSABLE:	 return "Unusable memory";
	case E820_PRAM:		 return "Persistent Memory (legacy)";
	case E820_PMEM:		 return "Persistent Memory";
	default:		 return "Reserved";
	}
}

static unsigned long __init e820_type_to_iomem_type(int e820_type)
{
	switch (e820_type) {
	case E820_RESERVED_KERN: /* Fall-through: */
	case E820_RAM:		 return IORESOURCE_SYSTEM_RAM;
	case E820_ACPI:		 /* Fall-through: */
	case E820_NVS:		 /* Fall-through: */
	case E820_UNUSABLE:	 /* Fall-through: */
	case E820_PRAM:		 /* Fall-through: */
	case E820_PMEM:		 /* Fall-through: */
	default:		 return IORESOURCE_MEM;
	}
}

static unsigned long __init e820_type_to_iores_desc(int e820_type)
{
	switch (e820_type) {
	case E820_ACPI:		 return IORES_DESC_ACPI_TABLES;
	case E820_NVS:		 return IORES_DESC_ACPI_NV_STORAGE;
	case E820_PMEM:		 return IORES_DESC_PERSISTENT_MEMORY;
	case E820_PRAM:		 return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
	case E820_RESERVED_KERN: /* Fall-through: */
	case E820_RAM:		 /* Fall-through: */
	case E820_UNUSABLE:	 /* Fall-through: */
	default:		 return IORES_DESC_NONE;
	}
}

static bool __init do_mark_busy(u32 type, struct resource *res)
{
	/* this is the legacy bios/dos rom-shadow + mmio region */
	if (res->start < (1ULL<<20))
		return true;

	/*
	 * Treat persistent memory like device memory, i.e. reserve it
	 * for exclusive use of a driver
	 */
	switch (type) {
	case E820_RESERVED:
	case E820_PRAM:
	case E820_PMEM:
		return false;
	default:
		return true;
	}
}

/*
 * Mark E820 reserved areas as busy for the resource manager:
 */

static struct resource __initdata *e820_res;

void __init e820_reserve_resources(void)
{
	int i;
	struct resource *res;
	u64 end;

	res = alloc_bootmem(sizeof(struct resource) * e820_table->nr_entries);
	e820_res = res;
	for (i = 0; i < e820_table->nr_entries; i++) {
		end = e820_table->entries[i].addr + e820_table->entries[i].size - 1;
		if (end != (resource_size_t)end) {
			res++;
			continue;
		}
		res->name = e820_type_to_string(e820_table->entries[i].type);
		res->start = e820_table->entries[i].addr;
		res->end = end;

		res->flags = e820_type_to_iomem_type(e820_table->entries[i].type);
		res->desc = e820_type_to_iores_desc(e820_table->entries[i].type);

		/*
		 * Don't register regions that could conflict with PCI device
		 * BAR resources here; they are inserted later, in
		 * pcibios_resource_survey():
		 */
		if (do_mark_busy(e820_table->entries[i].type, res)) {
			res->flags |= IORESOURCE_BUSY;
			insert_resource(&iomem_resource, res);
		}
		res++;
	}

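	/* Export the unmodified firmware E820 table via /sys/firmware/memmap: */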
	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
		struct e820_entry *entry = &e820_table_firmware->entries[i];

		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry->type));
	}
}

/* How much should we pad the end of RAM, depending on where it is? */
static unsigned long __init ram_alignment(resource_size_t pos)
{
	unsigned long mb = pos >> 20;

	/* To 64kB in the first megabyte */
	if (!mb)
		return 64*1024;

	/* To 1MB in the first 16MB */
	if (mb < 16)
		return 1024*1024;

	/* To 64MB for anything above that */
	return 64*1024*1024;
}

#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

void __init e820_reserve_resources_late(void)
{
	int i;
	struct resource *res;

	res = e820_res;
	for (i = 0; i < e820_table->nr_entries; i++) {
		if (!res->parent && res->end)
			insert_resource_expand_to_fit(&iomem_resource, res);
		res++;
	}

	/*
	 * Try to bump up RAM regions to reasonable boundaries, to
	 * avoid stolen RAM:
	 */
	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		u64 start, end;

		if (entry->type != E820_RAM)
			continue;

		start = entry->addr + entry->size;
		end = round_up(start, ram_alignment(start)) - 1;
		if (end > MAX_RESOURCE_SIZE)
			end = MAX_RESOURCE_SIZE;
		if (start >= end)
			continue;

		pr_debug("e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
	}
}

/*
 * Pass the firmware (bootloader) E820 map to the kernel and process it:
 */
char *__init e820__memory_setup_default(void)
{
	char *who = "BIOS-e820";
	u32 new_nr;

	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
	new_nr = boot_params.e820_entries;
	sanitize_e820_table(boot_params.e820_table, ARRAY_SIZE(boot_params.e820_table), &new_nr);
	boot_params.e820_entries = new_nr;

	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
		u64 mem_size;

		/* Compare results from other methods and take the one that gives more RAM: */
		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

		e820_table->nr_entries = 0;
		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
	}

	return who;
}

/*
 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
 * E820 map - with an optional platform quirk available for virtual platforms
 * to override this method of boot environment processing:
 */
void __init e820__memory_setup(void)
{
	char *who;

	who = x86_init.resources.memory_setup();

	memcpy(e820_table_firmware, e820_table, sizeof(struct e820_table));

	pr_info("e820: BIOS-provided physical RAM map:\n");
	e820_print_map(who);
}

void __init e820__memblock_setup(void)
{
	int i;
	u64 end;

	/*
	 * The bootstrap memblock region count maximum is 128 entries
	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
	 * than that - so allow memblock resizing.
	 *
	 * This is safe, because this call happens pretty late during x86 setup,
	 * so we know about reserved memory regions already. (This is important
	 * so that memblock resizing does not stomp over reserved areas.)
	 */
	memblock_allow_resize();

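	/*
	 * Only usable RAM (E820_RAM and E820_RESERVED_KERN entries) is added to
	 * memblock; entries whose end does not fit into resource_size_t are skipped:
	 */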
	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		end = entry->addr + entry->size;
		if (end != (resource_size_t)end)
			continue;

		if (entry->type != E820_RAM && entry->type != E820_RESERVED_KERN)
			continue;

		memblock_add(entry->addr, entry->size);
	}

	/* Throw away partial pages: */
	memblock_trim_memory(PAGE_SIZE);

	memblock_dump_all();
}