e820.c 33.0 KB
Newer Older
1
/*
2
 * Low level x86 E820 memory map handling functions.
3
 *
4 5
 * The firmware and bootloader pass us the "E820 table", which is the primary
 * physical memory layout description available about x86 systems.
6
 *
7 8 9
 * The kernel takes the E820 memory layout and optionally modifies it with
 * quirks and other tweaks, and feeds that into the generic Linux memory
 * allocation code routines via a platform independent interface (memblock, etc.).
10 11 12 13
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
14
#include <linux/crash_dump.h>
15
#include <linux/export.h>
16 17
#include <linux/bootmem.h>
#include <linux/pfn.h>
18
#include <linux/suspend.h>
19
#include <linux/acpi.h>
20
#include <linux/firmware-map.h>
21
#include <linux/memblock.h>
22
#include <linux/sort.h>
23

24
#include <asm/e820/api.h>
25
#include <asm/proto.h>
26
#include <asm/setup.h>
27
#include <asm/cpufeature.h>
28

29
/*
30 31 32 33 34 35 36 37 38 39 40
 * We organize the E820 table into two main data structures:
 *
 * - 'e820_table_firmware': the original firmware version passed to us by the
 *   bootloader - not modified by the kernel. We use this to:
 *
 *       - inform the user about the firmware's notion of memory layout
 *         via /sys/firmware/memmap
 *
 *       - the hibernation code uses it to generate a kernel-independent MD5
 *         fingerprint of the physical memory layout of a system.
 *
41
 *       - kexec, which is a bootloader in disguise, uses the original E820
42
 *         layout to pass to the kexec-ed kernel. This way the original kernel
43
 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
44 45
 *         can have access to full memory - etc.
 *
46
 * - 'e820_table': this is the main E820 table that is massaged by the
47 48 49
 *   low level x86 platform code, or modified by boot parameters, before
 *   passed on to higher level MM layers.
 *
50
 * Once the E820 map has been converted to the standard Linux memory layout
51 52 53
 * information its role stops - modifying it has no effect and does not get
 * re-propagated. So its main role is a temporary bootstrap storage of firmware
 * specific memory layout data during early bootup.
54
 */
55 56 57 58 59
/*
 * Static __initdata backing storage for the two tables; the pointers below
 * are redirected to precisely-sized kmalloc()ed copies later in boot, in
 * e820_reallocate_tables().
 */
static struct e820_table e820_table_init		__initdata;
static struct e820_table e820_table_firmware_init	__initdata;

struct e820_table *e820_table __refdata			= &e820_table_init;
struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;

/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif

/*
 * This function checks if any part of the range <start,end> is mapped
 * with type.
 */
71
int e820__mapped_any(u64 start, u64 end, enum e820_type type)
72 73 74
{
	int i;

75
	for (i = 0; i < e820_table->nr_entries; i++) {
76
		struct e820_entry *entry = &e820_table->entries[i];
77

78
		if (type && entry->type != type)
79
			continue;
80
		if (entry->addr >= end || entry->addr + entry->size <= start)
81 82 83 84 85
			continue;
		return 1;
	}
	return 0;
}
86
EXPORT_SYMBOL_GPL(e820__mapped_any);
87 88

/*
 * This function checks if the entire <start,end> range is mapped with 'type'.
 *
 * Note: this function only works correctly once the E820 table is sorted and
 * not-overlapping (at least for the range specified), which is the case normally.
 */
int __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		/* Skip entries of the wrong type (type==0 matches any type): */
		if (type && entry->type != type)
			continue;

		/* Is the region (part) in overlap with the current region? */
		if (entry->addr >= end || entry->addr + entry->size <= start)
			continue;

		/*
		 * If the region is at the beginning of <start,end> we move
		 * 'start' to the end of the region since it's ok until there
		 */
		if (entry->addr <= start)
			start = entry->addr + entry->size;

		/*
		 * If 'start' is now at or beyond 'end', we're done, full
		 * coverage of the desired range exists:
		 */
		if (start >= end)
			return 1;
	}
	return 0;
}

/*
126
 * Add a memory region to the kernel E820 map.
127
 */
128
static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
129
{
130
	int x = table->nr_entries;
131

132
	if (x >= ARRAY_SIZE(table->entries)) {
133
		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
134 135 136
		return;
	}

137 138 139 140
	table->entries[x].addr = start;
	table->entries[x].size = size;
	table->entries[x].type = type;
	table->nr_entries++;
Y
Yinghai Lu 已提交
141 142
}

143
/* Add a region to the main, kernel-modifiable E820 table: */
void __init e820__range_add(u64 start, u64 size, enum e820_type type)
{
	__e820__range_add(e820_table, start, size, type);
}

148
/* Print a human-readable name for an E820 entry type (no trailing newline): */
static void __init e820_print_type(enum e820_type type)
{
	if (type == E820_TYPE_RAM || type == E820_TYPE_RESERVED_KERN) {
		pr_cont("usable");
	} else if (type == E820_TYPE_RESERVED) {
		pr_cont("reserved");
	} else if (type == E820_TYPE_ACPI) {
		pr_cont("ACPI data");
	} else if (type == E820_TYPE_NVS) {
		pr_cont("ACPI NVS");
	} else if (type == E820_TYPE_UNUSABLE) {
		pr_cont("unusable");
	} else if (type == E820_TYPE_PMEM || type == E820_TYPE_PRAM) {
		pr_cont("persistent (type %u)", type);
	} else {
		pr_cont("type %u", type);
	}
}

163
/* Dump every entry of the main E820 table, prefixed with 'who': */
void __init e820__print_table(char *who)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *entry = &e820_table->entries[idx];

		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
			entry->addr,
			entry->addr + entry->size - 1);

		e820_print_type(entry->type);
		pr_cont("\n");
	}
}

/*
178
 * Sanitize an E820 map.
179
 *
180
 * Some E820 layouts include overlapping entries. The following
181
 * replaces the original E820 map with a new one, removing overlaps,
182 183
 * and resolving conflicting memory types in favor of highest
 * numbered type.
184
 *
185 186 187
 * The input parameter 'entries' points to an array of 'struct
 * e820_entry' which on entry has elements in the range [0, *nr_entries)
 * valid, and which has space for up to max_nr_entries entries.
188
 * On return, the resulting sanitized E820 map entries will be in
189
 * overwritten in the same location, starting at 'entries'.
190
 *
191 192 193 194
 * The integer pointed to by nr_entries must be valid on entry (the
 * current number of valid entries located at 'entries'). If the
 * sanitizing succeeds the *nr_entries will be updated with the new
 * number of valid entries (something no more than max_nr_entries).
195
 *
196
 * The return value from e820__update_table() is zero if it
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
 * successfully 'sanitized' the map entries passed in, and is -1
 * if it did nothing, which can happen if either of (1) it was
 * only passed one map entry, or (2) any of the input map entries
 * were invalid (start + size < start, meaning that the size was
 * so big the described memory range wrapped around through zero.)
 *
 *	Visually we're performing the following
 *	(1,2,3,4 = memory types)...
 *
 *	Sample memory map (w/overlaps):
 *	   ____22__________________
 *	   ______________________4_
 *	   ____1111________________
 *	   _44_____________________
 *	   11111111________________
 *	   ____________________33__
 *	   ___________44___________
 *	   __________33333_________
 *	   ______________22________
 *	   ___________________2222_
 *	   _________111111111______
 *	   _____________________11_
 *	   _________________4______
 *
 *	Sanitized equivalent (no overlap):
 *	   1_______________________
 *	   _44_____________________
 *	   ___1____________________
 *	   ____22__________________
 *	   ______11________________
 *	   _________1______________
 *	   __________3_____________
 *	   ___________44___________
 *	   _____________33_________
 *	   _______________2________
 *	   ________________1_______
 *	   _________________4______
 *	   ___________________2____
 *	   ____________________33__
 *	   ______________________4_
237
 */
238
/* One start- or end-address "change point" of an E820 entry: */
struct change_member {
	/* Pointer to the original entry: */
	struct e820_entry	*entry;
	/* Address for this change point: */
	unsigned long long	addr;
};

/* sort() comparator for pointers into change_point[]: */
static int __init cpcompare(const void *a, const void *b)
{
	struct change_member * const *app = a, * const *bpp = b;
	const struct change_member *ap = *app, *bp = *bpp;

	/*
	 * Inputs are pointers to two elements of change_point[].  If their
	 * addresses are not equal, their difference dominates.  If the addresses
	 * are equal, then consider one that represents the end of its region
	 * to be greater than one that does not.
	 */
	if (ap->addr != bp->addr)
		return ap->addr > bp->addr ? 1 : -1;

	/* addr != entry->addr identifies an end-of-region change point: */
	return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
}
261

262
/* Core of the sanitizer described in the long comment above: */
static int __init __e820__update_table(struct e820_entry *entries, u32 max_nr_entries, u32 *nr_entries)
{
	static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
	static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
	static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
	static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
	enum e820_type current_type, last_type;
	unsigned long long last_addr;
	u32 chgidx;
	u32 overlap_entries;
	u32 new_nr_entries;
	u32 old_nr, new_nr, chg_nr;
	u32 i;

	/* If there's only one memory region, don't bother: */
	if (*nr_entries < 2)
		return -1;

	old_nr = *nr_entries;
	BUG_ON(old_nr > max_nr_entries);

	/* Bail out if we find any unreasonable addresses in the map: */
	for (i = 0; i < old_nr; i++) {
		if (entries[i].addr + entries[i].size < entries[i].addr)
			return -1;
	}

	/* Create pointers for initial change-point information (for sorting): */
	for (i = 0; i < 2 * old_nr; i++)
		change_point[i] = &change_point_list[i];

	/*
	 * Record all known change-points (starting and ending addresses),
	 * omitting empty memory regions:
	 */
	chgidx = 0;
	for (i = 0; i < old_nr; i++)	{
		if (entries[i].size != 0) {
			change_point[chgidx]->addr	= entries[i].addr;
			change_point[chgidx++]->entry	= &entries[i];
			change_point[chgidx]->addr	= entries[i].addr + entries[i].size;
			change_point[chgidx++]->entry	= &entries[i];
		}
	}
	chg_nr = chgidx;

	/* Sort change-point list by memory addresses (low -> high): */
	sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);

	/* Create a new memory map, removing overlaps: */
	overlap_entries = 0;	 /* Number of entries in the overlap table */
	new_nr_entries = 0;	 /* Index for creating new map entries */
	last_type = 0;		 /* Start with undefined memory type */
	last_addr = 0;		 /* Start with 0 as last starting address */

	/* Loop through change-points, determining effect on the new map: */
	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
		/* Keep track of all overlapping entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->entry->addr) {
			/* Add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++] = change_point[chgidx]->entry;
		} else {
			/* Remove entry from list (order independent, so swap with last): */
			for (i = 0; i < overlap_entries; i++) {
				if (overlap_list[i] == change_point[chgidx]->entry)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/*
		 * If there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		current_type = 0;
		for (i = 0; i < overlap_entries; i++) {
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		}

		/* Continue building up new map based on this information: */
		if (current_type != last_type || current_type == E820_TYPE_PRAM) {
			if (last_type != 0)	 {
				new_entries[new_nr_entries].size = change_point[chgidx]->addr - last_addr;
				/* Move forward only if the new size was non-zero: */
				if (new_entries[new_nr_entries].size != 0)
					/* No more space left for new entries? */
					if (++new_nr_entries >= max_nr_entries)
						break;
			}
			if (current_type != 0)	{
				new_entries[new_nr_entries].addr = change_point[chgidx]->addr;
				new_entries[new_nr_entries].type = current_type;
				last_addr = change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}

	/* Retain count for the new entries: */
	new_nr = new_nr_entries;

	/* Copy the new entries into the original location: */
	memcpy(entries, new_entries, new_nr*sizeof(*entries));
	*nr_entries = new_nr;

	return 0;
}

371 372 373 374 375
/* Sanitize (sort + de-overlap) the given E820 table in place: */
int __init e820__update_table(struct e820_table *table)
{
	return __e820__update_table(table->entries, ARRAY_SIZE(table->entries), &table->nr_entries);
}

376
/*
 * Append 'nr_entries' raw firmware entries to the kernel E820 table.
 * Returns -1 (stopping without adding further entries) if an entry with
 * a 64-bit address wrap-around (start + size overflows) is encountered.
 */
static int __init __append_e820_table(struct e820_entry *entries, u32 nr_entries)
{
	struct e820_entry *entry = entries;

	while (nr_entries) {
		u64 start = entry->addr;
		u64 size = entry->size;
		u64 end = start + size - 1;
		u32 type = entry->type;

		/* Bail out on 64-bit overflow (only possible for non-zero size): */
		if (start > end && likely(size))
			return -1;

		e820__range_add(start, size, type);

		entry++;
		nr_entries--;
	}
	return 0;
}

398
/*
 * Copy the BIOS E820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 */
static int __init append_e820_table(struct e820_entry *entries, u32 nr_entries)
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_entries < 2)
		return -1;

	return __append_e820_table(entries, nr_entries);
}

416
/*
 * Change the type of the [start, start+size) range from 'old_type' to
 * 'new_type' in the given table, splitting partially-covered entries as
 * needed. Returns the number of bytes whose type was actually changed.
 */
static u64 __init
__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	u64 end;
	unsigned int i;
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

	/* Clamp 'size' so that start + size does not wrap around 64 bits: */
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	pr_debug("e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
	e820_print_type(old_type);
	pr_cont(" ==> ");
	e820_print_type(new_type);
	pr_cont("\n");

	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered by new range? */
		if (entry->addr >= start && entry_end <= end) {
			entry->type = new_type;
			real_updated_size += entry->size;
			continue;
		}

		/* New range is completely covered? */
		if (entry->addr < start && entry_end > end) {
			/* Split the entry into head (old), middle (new) and tail (old): */
			__e820__range_add(table, start, size, new_type);
			__e820__range_add(table, end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_updated_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		__e820__range_add(table, final_start, final_end - final_start, new_type);

		real_updated_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * its size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		/* The remainder is the tail: move its start past the updated chunk. */
		entry->addr = final_end;
	}
	return real_updated_size;
}

484
/* Update a range in the main kernel E820 table: */
u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	return __e820__range_update(e820_table, start, size, old_type, new_type);
}

489
/* Update a range in the (otherwise unmodified) firmware E820 table: */
static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
{
	return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
}

494
/*
 * Remove a range of memory from the E820 table. If 'checktype' is set,
 * only entries of 'old_type' are removed; otherwise all types are.
 * Returns the number of bytes actually removed.
 */
u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, int checktype)
{
	int i;
	u64 end;
	u64 real_removed_size = 0;

	/* Clamp 'size' so that start + size does not wrap around 64 bits: */
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	pr_debug("e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
	if (checktype)
		e820_print_type(old_type);
	pr_cont("\n");

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (checktype && entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered? */
		if (entry->addr >= start && entry_end <= end) {
			real_removed_size += entry->size;
			/* Zeroed entries (size 0) are dropped by the next table update: */
			memset(entry, 0, sizeof(*entry));
			continue;
		}

		/* Is the new range completely covered? */
		if (entry->addr < start && entry_end > end) {
			/* Split into head (kept) and tail (re-added), drop the middle: */
			e820__range_add(end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_removed_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		real_removed_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * the size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		entry->addr = final_end;
	}
	return real_removed_size;
}

556
/* Re-sanitize the main E820 table and print it if the sanitization ran: */
void __init e820__update_table_print(void)
{
	int failed = e820__update_table(e820_table);

	if (failed)
		return;

	pr_info("e820: modified physical RAM map:\n");
	e820__print_table("modified");
}
564

565
/* Re-sanitize the firmware E820 table after it was modified: */
static void __init e820__update_table_firmware(void)
{
	e820__update_table(e820_table_firmware);
}
569

A
Alok Kataria 已提交
570
#define MAX_GAP_END 0x100000000ull

/*
 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
 *
 * On input, *gapsize is the minimum acceptable gap; on success it is
 * updated to the size of the largest gap found (>= the minimum) and
 * *gapstart to its start. Returns 1 if such a gap was found, 0 otherwise.
 *
 * NOTE(review): the top-down scan tracks only the lowest start seen so
 * far — presumably the table is already sorted when this runs; confirm
 * against callers.
 */
static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
{
	unsigned long long last = MAX_GAP_END;
	int i = e820_table->nr_entries;
	int found = 0;

	while (--i >= 0) {
		unsigned long long start = e820_table->entries[i].addr;
		unsigned long long end = start + e820_table->entries[i].size;

		/*
		 * Since "last" is at most 4GB, we know we'll
		 * fit in 32 bits if this condition is true:
		 */
		if (last > end) {
			unsigned long gap = last - end;

			if (gap >= *gapsize) {
				*gapsize = gap;
				*gapstart = end;
				found = 1;
			}
		}
		if (start < last)
			last = start;
	}
	return found;
}

/*
605 606 607 608 609
 * Search for the biggest gap in the low 32 bits of the E820
 * memory space. We pass this space to the PCI subsystem, so
 * that it can assign MMIO resources for hotplug or
 * unconfigured devices in.
 *
610 611
 * Hopefully the BIOS left enough space for this.
 */
612
__init void e820__setup_pci_gap(void)
{
	unsigned long gapstart, gapsize;
	int found;

	/* Require at least a 4MB gap: */
	gapsize = 0x400000;
	found  = e820_search_gap(&gapstart, &gapsize);

	if (!found) {
#ifdef CONFIG_X86_64
		/* Fall back to just above the end of RAM, plus 1MB of slack: */
		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
		pr_err(
			"e820: Cannot find an available gap in the 32-bit address range\n"
			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
#else
		gapstart = 0x10000000;
#endif
	}

	/*
	 * e820_reserve_resources_late protect stolen RAM already
	 */
	pci_mem_start = gapstart;

	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
}

639 640 641
/*
 * Called late during init, in free_initmem().
 *
 * Initial e820_table and e820_table_firmware are largish __initdata arrays.
 *
 * Copy them to a (usually much smaller) dynamically allocated area that is
 * sized precisely after the number of e820 entries.
 *
 * This is done after we've performed all the fixes and tweaks to the tables.
 * All functions which modify them are __init functions, which won't exist
 * after free_initmem().
 */
__init void e820_reallocate_tables(void)
{
	struct e820_table *n;
	int size;

	/* Shrink-copy the main table; allocation failure here is fatal: */
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
	memcpy(n, e820_table, size);
	e820_table = n;

	/* Same for the firmware table: */
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
	memcpy(n, e820_table_firmware, size);
	e820_table_firmware = n;
}

669 670 671 672 673
/*
 * Because of the small fixed size of struct boot_params, only the first
 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 * struct setup_data, which is parsed here.
 */
void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
	int entries;
	struct e820_entry *extmap;
	struct setup_data *sdata;

	sdata = early_memremap(phys_addr, data_len);
	/* The setup_data payload is an array of raw e820 entries: */
	entries = sdata->len / sizeof(*extmap);
	extmap = (struct e820_entry *)(sdata->data);

	/* Append the extra entries, then re-sanitize the combined table: */
	__append_e820_table(extmap, entries);
	e820__update_table(e820_table);

	early_memunmap(sdata, data_len);
	pr_info("e820: extended physical RAM map:\n");
	e820__print_table("extended");
}

693 694
/**
 * Find the ranges of physical addresses that do not correspond to
 * E820 RAM areas and mark the corresponding pages as 'nosave' for
 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 *
 * This function requires the E820 map to be sorted and without any
 * overlapping entries.
 */
void __init e820_mark_nosave_regions(unsigned long limit_pfn)
{
	int i;
	unsigned long pfn = 0;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		/* A hole between the previous entry and this one is not saveable: */
		if (pfn < PFN_UP(entry->addr))
			register_nosave_region(pfn, PFN_UP(entry->addr));

		pfn = PFN_DOWN(entry->addr + entry->size);

		/* Non-RAM entries themselves are not saveable either: */
		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
			register_nosave_region(PFN_UP(entry->addr), pfn);

		if (pfn >= limit_pfn)
			break;
	}
}
721

H
Huang Ying 已提交
722
#ifdef CONFIG_ACPI
/*
 * Register ACPI NVS memory regions, so that we can save/restore them during
 * hibernation and the subsequent resume:
 */
static int __init e820_mark_nvs_memory(void)
{
	int idx;

	for (idx = 0; idx < e820_table->nr_entries; idx++) {
		struct e820_entry *e = &e820_table->entries[idx];

		if (e->type != E820_TYPE_NVS)
			continue;

		acpi_nvs_register(e->addr, e->size);
	}

	return 0;
}
core_initcall(e820_mark_nvs_memory);
#endif

Y
Yinghai Lu 已提交
743
/*
 * Allocate the requested number of bytes with the requested alignment
 * and return (the physical address) to the caller. Also register this
 * range in the 'firmware' E820 table as a reserved range.
 *
 * This allows kexec to fake a new mptable, as if it came from the real
 * system.
 */
u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
	u64 addr;

	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
	if (addr) {
		/* Mark the range reserved in the firmware table, then re-sanitize it: */
		e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
		pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n");
		e820__update_table_firmware();
	}

	return addr;
}

765 766 767 768 769 770 771
/* Architectural limit on addressable page frames: */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
#endif

/*
 * Find the highest page frame number we have available
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		if (entry->type != type)
			continue;

		start_pfn = entry->addr >> PAGE_SHIFT;
		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		if (start_pfn >= limit_pfn)
			continue;
		/*
		 * NOTE(review): the 'break' on a limit-straddling entry
		 * presumably relies on the table being sorted — confirm.
		 */
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	/* Never report more than the architecture can address: */
	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
			 last_pfn, max_arch_pfn);
	return last_pfn;
}
812

813 814
/* Highest available RAM pfn, clamped to the architectural maximum: */
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
}
817

818 819
/* Highest available RAM pfn below the 4GB boundary: */
unsigned long __init e820_end_of_low_ram_pfn(void)
{
	return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
}
822

823
/* Print 'msg' on the early console, then panic — boot cannot continue: */
static void __init early_panic(char *msg)
{
	early_printk(msg);
	panic(msg);
}

829 830
/* Set when the user overrode the memory map via "mem=" or "memmap=": */
static int userdef __initdata;

/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
static int __init parse_memopt(char *p)
{
	u64 mem_size;

	if (!p)
		return -EINVAL;

	if (!strcmp(p, "nopentium")) {
#ifdef CONFIG_X86_32
		setup_clear_cpu_cap(X86_FEATURE_PSE);
		return 0;
#else
		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
		return -EINVAL;
#endif
	}

	userdef = 1;
	mem_size = memparse(p, &p);

	/* Don't remove all memory when getting "mem={invalid}" parameter: */
	if (mem_size == 0)
		return -EINVAL;

	/* Trim all RAM above the "mem=" limit: */
	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);

	return 0;
}
early_param("mem", parse_memopt);

862
/* Parse a single "memmap=" option: "exactmap" or "size[@#$!]address": */
static int __init parse_memmap_one(char *p)
{
	char *oldp;
	u64 start_at, mem_size;

	if (!p)
		return -EINVAL;

	if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * If we are doing a crash dump, we still need to know
		 * the real memory size before the original memory map is
		 * reset.
		 */
		saved_max_pfn = e820_end_of_ram_pfn();
#endif
		/* Drop the firmware-provided map entirely: */
		e820_table->nr_entries = 0;
		userdef = 1;
		return 0;
	}

	oldp = p;
	mem_size = memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	userdef = 1;
	if (*p == '@') {
		/* size@addr: force a usable RAM region */
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
	} else if (*p == '#') {
		/* size#addr: mark as ACPI data */
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
	} else if (*p == '$') {
		/* size$addr: mark as reserved */
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
	} else if (*p == '!') {
		/* size!addr: mark as persistent (legacy) memory */
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
	} else {
		/* Plain size: limit usable RAM to 'mem_size' bytes */
		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
	}

	return *p == '\0' ? 0 : -EINVAL;
}
908

909 910 911 912 913 914 915 916 917 918 919 920 921 922
static int __init parse_memmap_opt(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		parse_memmap_one(str);
		str = k;
	}

	return 0;
}
923 924
early_param("memmap", parse_memmap_opt);

925 926 927 928 929 930
/*
 * Reserve all entries from the bootloader's extensible data nodes list,
 * because if present we are going to use it later on to fetch e820
 * entries from it:
 */
void __init e820__reserve_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data;

	pa_data = boot_params.hdr.setup_data;
	if (!pa_data)
		return;

	/* Walk the singly-linked setup_data list and reserve each node: */
	while (pa_data) {
		data = early_memremap(pa_data, sizeof(*data));
		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
		pa_data = data->next;
		early_memunmap(data, sizeof(*data));
	}

	e820__update_table(e820_table);

	/* Propagate the reservations into the firmware table as well: */
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));

	pr_info("extended physical RAM map:\n");
	e820__print_table("reserve setup_data");
}

954 955 956 957 958 959
/*
 * Called after parse_early_param(), after early parameters (such as mem=)
 * have been processed, in which case we already have an E820 table filled in
 * via the parameter callback function(s), but it's not sorted and printed yet:
 */
void __init e820__finish_early_params(void)
{
	if (userdef) {
		/* A user-supplied map that cannot be sanitized is unusable: */
		if (e820__update_table(e820_table) < 0)
			early_panic("Invalid user supplied memory map");

		pr_info("e820: user-defined physical RAM map:\n");
		e820__print_table("user");
	}
}
969

970
/* Map an E820 entry type to the resource name shown in the iomem tree: */
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
	switch (entry->type) {
	case E820_TYPE_RAM:
	case E820_TYPE_RESERVED_KERN:
		return "System RAM";
	case E820_TYPE_ACPI:
		return "ACPI Tables";
	case E820_TYPE_NVS:
		return "ACPI Non-volatile Storage";
	case E820_TYPE_UNUSABLE:
		return "Unusable memory";
	case E820_TYPE_PRAM:
		return "Persistent Memory (legacy)";
	case E820_TYPE_PMEM:
		return "Persistent Memory";
	default:
		return "Reserved";
	}
}

984
/* Map an E820 entry type to its IORESOURCE_* resource flags: */
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
	/* Only RAM (and kernel-reserved RAM) counts as system RAM: */
	if (entry->type == E820_TYPE_RAM || entry->type == E820_TYPE_RESERVED_KERN)
		return IORESOURCE_SYSTEM_RAM;

	/* Everything else (ACPI, NVS, unusable, pmem, reserved, ...): */
	return IORESOURCE_MEM;
}

998
static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
999
{
1000
	switch (entry->type) {
1001 1002 1003 1004 1005 1006 1007 1008
	case E820_TYPE_ACPI:		return IORES_DESC_ACPI_TABLES;
	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
	case E820_TYPE_RAM:		/* Fall-through: */
	case E820_TYPE_UNUSABLE:	/* Fall-through: */
	default:			return IORES_DESC_NONE;
1009 1010 1011
	}
}

1012
static bool __init do_mark_busy(u32 type, struct resource *res)
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022
{
	/* this is the legacy bios/dos rom-shadow + mmio region */
	if (res->start < (1ULL<<20))
		return true;

	/*
	 * Treat persistent memory like device memory, i.e. reserve it
	 * for exclusive use of a driver
	 */
	switch (type) {
1023 1024 1025
	case E820_TYPE_RESERVED:
	case E820_TYPE_PRAM:
	case E820_TYPE_PMEM:
1026 1027 1028 1029 1030 1031
		return false;
	default:
		return true;
	}
}

1032
/*
1033
 * Mark E820 reserved areas as busy for the resource manager:
1034
 */
1035

1036
static struct resource __initdata *e820_res;
1037

1038 1039 1040
void __init e820_reserve_resources(void)
{
	int i;
1041
	struct resource *res;
1042
	u64 end;
1043

1044
	res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
1045
	e820_res = res;
1046

1047
	for (i = 0; i < e820_table->nr_entries; i++) {
1048 1049 1050
		struct e820_entry *entry = e820_table->entries + i;

		end = entry->addr + entry->size - 1;
1051
		if (end != (resource_size_t)end) {
1052 1053 1054
			res++;
			continue;
		}
1055 1056 1057 1058 1059
		res->start = entry->addr;
		res->end   = end;
		res->name  = e820_type_to_string(entry);
		res->flags = e820_type_to_iomem_type(entry);
		res->desc  = e820_type_to_iores_desc(entry);
1060 1061 1062 1063 1064 1065

		/*
		 * don't register the region that could be conflicted with
		 * pci device BAR resource and insert them later in
		 * pcibios_resource_survey()
		 */
1066
		if (do_mark_busy(entry->type, res)) {
1067
			res->flags |= IORESOURCE_BUSY;
1068
			insert_resource(&iomem_resource, res);
1069
		}
1070 1071
		res++;
	}
1072

1073
	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1074
		struct e820_entry *entry = e820_table_firmware->entries + i;
1075

1076
		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1077
	}
1078 1079
}

1080
/* How much should we pad RAM ending depending on where it is? */
1081
static unsigned long __init ram_alignment(resource_size_t pos)
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092
{
	unsigned long mb = pos >> 20;

	/* To 64kB in the first megabyte */
	if (!mb)
		return 64*1024;

	/* To 1MB in the first 16MB */
	if (mb < 16)
		return 1024*1024;

1093 1094
	/* To 64MB for anything above that */
	return 64*1024*1024;
1095 1096
}

1097 1098
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

1099 1100 1101 1102 1103 1104
void __init e820_reserve_resources_late(void)
{
	int i;
	struct resource *res;

	res = e820_res;
1105
	for (i = 0; i < e820_table->nr_entries; i++) {
1106
		if (!res->parent && res->end)
1107
			insert_resource_expand_to_fit(&iomem_resource, res);
1108 1109
		res++;
	}
1110 1111

	/*
1112
	 * Try to bump up RAM regions to reasonable boundaries, to
1113 1114
	 * avoid stolen RAM:
	 */
1115 1116
	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
1117
		u64 start, end;
1118

1119
		if (entry->type != E820_TYPE_RAM)
1120
			continue;
1121

1122
		start = entry->addr + entry->size;
1123 1124 1125 1126
		end = round_up(start, ram_alignment(start)) - 1;
		if (end > MAX_RESOURCE_SIZE)
			end = MAX_RESOURCE_SIZE;
		if (start >= end)
1127
			continue;
1128

1129
		pr_debug("e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1130
		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1131
	}
1132 1133
}

1134 1135 1136
/*
 * Pass the firmware (bootloader) E820 map to the kernel and process it:
 */
1137
char *__init e820__memory_setup_default(void)
1138 1139
{
	char *who = "BIOS-e820";
1140
	u32 new_nr;
1141

1142 1143 1144 1145 1146 1147 1148
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
	new_nr = boot_params.e820_entries;
1149
	__e820__update_table(boot_params.e820_table, ARRAY_SIZE(boot_params.e820_table), &new_nr);
1150
	boot_params.e820_entries = new_nr;
1151 1152

	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1153
		u64 mem_size;
1154

1155 1156
		/* Compare results from other methods and take the one that gives more RAM: */
		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1157 1158 1159 1160 1161 1162 1163
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

1164
		e820_table->nr_entries = 0;
1165 1166
		e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
		e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1167 1168 1169 1170 1171
	}

	return who;
}

1172 1173 1174 1175 1176 1177
/*
 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
 * E820 map - with an optional platform quirk available for virtual platforms
 * to override this method of boot environment processing:
 */
void __init e820__memory_setup(void)
1178
{
1179 1180
	char *who;

1181 1182 1183
	/* This is a firmware interface ABI - make sure we don't break it: */
	BUILD_BUG_ON(sizeof(struct e820_entry) != 20);

1184
	who = x86_init.resources.memory_setup();
1185

1186
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1187

1188
	pr_info("e820: BIOS-provided physical RAM map:\n");
1189
	e820__print_table(who);
1190
}
1191

1192
void __init e820__memblock_setup(void)
1193 1194 1195 1196 1197
{
	int i;
	u64 end;

	/*
1198 1199 1200 1201 1202 1203 1204
	 * The bootstrap memblock region count maximum is 128 entries
	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
	 * than that - so allow memblock resizing.
	 *
	 * This is safe, because this call happens pretty late during x86 setup,
	 * so we know about reserved memory regions already. (This is important
	 * so that memblock resizing does no stomp over reserved areas.)
1205
	 */
1206
	memblock_allow_resize();
1207

1208
	for (i = 0; i < e820_table->nr_entries; i++) {
1209
		struct e820_entry *entry = &e820_table->entries[i];
1210

1211
		end = entry->addr + entry->size;
1212 1213 1214
		if (end != (resource_size_t)end)
			continue;

1215
		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1216 1217
			continue;

1218
		memblock_add(entry->addr, entry->size);
1219 1220
	}

1221
	/* Throw away partial pages: */
1222 1223
	memblock_trim_memory(PAGE_SIZE);

1224 1225
	memblock_dump_all();
}