e820.c 33.1 KB
Newer Older
1
/*
2
 * Low level x86 E820 memory map handling functions.
3
 *
4 5
 * The firmware and bootloader passes us the "E820 table", which is the primary
 * physical memory layout description available about x86 systems.
6
 *
7 8 9
 * The kernel takes the E820 memory layout and optionally modifies it with
 * quirks and other tweaks, and feeds that into the generic Linux memory
 * allocation code routines via a platform independent interface (memblock, etc.).
10
 */
11
#include <linux/crash_dump.h>
12
#include <linux/bootmem.h>
13
#include <linux/suspend.h>
14
#include <linux/acpi.h>
15
#include <linux/firmware-map.h>
16
#include <linux/memblock.h>
17
#include <linux/sort.h>
18

19
#include <asm/e820/api.h>
20 21
#include <asm/setup.h>

22
/*
23 24 25 26 27 28 29 30 31 32 33
 * We organize the E820 table into two main data structures:
 *
 * - 'e820_table_firmware': the original firmware version passed to us by the
 *   bootloader - not modified by the kernel. We use this to:
 *
 *       - inform the user about the firmware's notion of memory layout
 *         via /sys/firmware/memmap
 *
 *       - the hibernation code uses it to generate a kernel-independent MD5
 *         fingerprint of the physical memory layout of a system.
 *
34
 *       - kexec, which is a bootloader in disguise, uses the original E820
35
 *         layout to pass to the kexec-ed kernel. This way the original kernel
36
 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
37 38
 *         can have access to full memory - etc.
 *
39
 * - 'e820_table': this is the main E820 table that is massaged by the
40 41 42
 *   low level x86 platform code, or modified by boot parameters, before
 *   passed on to higher level MM layers.
 *
43
 * Once the E820 map has been converted to the standard Linux memory layout
44 45 46
 * information its role stops - modifying it has no effect and does not get
 * re-propagated. So its main role is a temporary bootstrap storage of firmware
 * specific memory layout data during early bootup.
47
 */
48 49 50 51 52
static struct e820_table e820_table_init		__initdata;
static struct e820_table e820_table_firmware_init	__initdata;

struct e820_table *e820_table __refdata			= &e820_table_init;
struct e820_table *e820_table_firmware __refdata	= &e820_table_firmware_init;
53 54 55 56 57 58 59 60 61 62 63

/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif

/*
 * This function checks if any part of the range <start,end> is mapped
 * with type.
 */
64
/*
 * Check if any part of the range [start, end) is mapped with the given
 * E820 'type'. A 'type' of 0 matches entries of any type.
 *
 * Scans the kernel's main 'e820_table'; entries need not be sorted.
 */
bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (type && entry->type != type)
			continue;
		/* No overlap with [start, end)? */
		if (entry->addr >= end || entry->addr + entry->size <= start)
			continue;
		return true;
	}
	return false;
}
79
EXPORT_SYMBOL_GPL(e820__mapped_any);
80 81

/*
82
 * This function checks if the entire <start,end> range is mapped with 'type'.
83
 *
84 85
 * Note: this function only works correctly once the E820 table is sorted and
 * not-overlapping (at least for the range specified), which is the case normally.
86
 */
87
/*
 * Check if the entire range [start, end) is mapped with the given E820
 * 'type'. A 'type' of 0 matches entries of any type.
 *
 * Note: this function only works correctly once the E820 table is sorted and
 * not-overlapping (at least for the range specified), which is the case normally.
 */
bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (type && entry->type != type)
			continue;

		/* Is the region (part) in overlap with the current region? */
		if (entry->addr >= end || entry->addr + entry->size <= start)
			continue;

		/*
		 * If the region is at the beginning of <start,end> we move
		 * 'start' to the end of the region since it's ok until there
		 */
		if (entry->addr <= start)
			start = entry->addr + entry->size;

		/*
		 * If 'start' is now at or beyond 'end', we're done, full
		 * coverage of the desired range exists:
		 */
		if (start >= end)
			return true;
	}
	return false;
}

/*
119
 * Add a memory region to the kernel E820 map.
120
 */
121
/*
 * Append one entry to 'table'. If the table is already full the request is
 * logged and silently dropped (the E820 table has a fixed maximum size).
 */
static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
{
	struct e820_entry *entry;

	if (table->nr_entries >= ARRAY_SIZE(table->entries)) {
		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
		return;
	}

	entry = &table->entries[table->nr_entries];
	entry->addr = start;
	entry->size = size;
	entry->type = type;
	table->nr_entries++;
}

136
/* Add a memory range to the kernel's main E820 table ('e820_table'): */
void __init e820__range_add(u64 start, u64 size, enum e820_type type)
{
	__e820__range_add(e820_table, start, size, type);
}

141
/*
 * Print a human-readable name for an E820 entry type, as a continuation
 * of the current console line (pr_cont).
 */
static void __init e820_print_type(enum e820_type type)
{
	switch (type) {
	case E820_TYPE_RAM:		/* Fall through: */
	case E820_TYPE_RESERVED_KERN:	pr_cont("usable");			break;
	case E820_TYPE_RESERVED:	pr_cont("reserved");			break;
	case E820_TYPE_ACPI:		pr_cont("ACPI data");			break;
	case E820_TYPE_NVS:		pr_cont("ACPI NVS");			break;
	case E820_TYPE_UNUSABLE:	pr_cont("unusable");			break;
	case E820_TYPE_PMEM:		/* Fall through: */
	case E820_TYPE_PRAM:		pr_cont("persistent (type %u)", type);	break;
	default:			pr_cont("type %u", type);		break;
	}
}

156
/*
 * Print the current E820 table to the kernel log, one line per entry,
 * prefixed with 'who' (the origin of the table, e.g. "BIOS-e820").
 */
void __init e820__print_table(char *who)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
		       entry->addr,
		       entry->addr + entry->size - 1);

		e820_print_type(entry->type);
		pr_cont("\n");
	}
}

/*
171
 * Sanitize an E820 map.
172
 *
173
 * Some E820 layouts include overlapping entries. The following
174
 * replaces the original E820 map with a new one, removing overlaps,
175 176
 * and resolving conflicting memory types in favor of highest
 * numbered type.
177
 *
178 179 180
 * The input parameter 'entries' points to an array of 'struct
 * e820_entry' which on entry has elements in the range [0, *nr_entries)
 * valid, and which has space for up to max_nr_entries entries.
181
 * On return, the resulting sanitized E820 map entries will be
 * overwritten in the same location, starting at 'entries'.
183
 *
184 185 186 187
 * The integer pointed to by nr_entries must be valid on entry (the
 * current number of valid entries located at 'entries'). If the
 * sanitizing succeeds the *nr_entries will be updated with the new
 * number of valid entries (something no more than max_nr_entries).
188
 *
189
 * The return value from e820__update_table() is zero if it
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
 * successfully 'sanitized' the map entries passed in, and is -1
 * if it did nothing, which can happen if either of (1) it was
 * only passed one map entry, or (2) any of the input map entries
 * were invalid (start + size < start, meaning that the size was
 * so big the described memory range wrapped around through zero.)
 *
 *	Visually we're performing the following
 *	(1,2,3,4 = memory types)...
 *
 *	Sample memory map (w/overlaps):
 *	   ____22__________________
 *	   ______________________4_
 *	   ____1111________________
 *	   _44_____________________
 *	   11111111________________
 *	   ____________________33__
 *	   ___________44___________
 *	   __________33333_________
 *	   ______________22________
 *	   ___________________2222_
 *	   _________111111111______
 *	   _____________________11_
 *	   _________________4______
 *
 *	Sanitized equivalent (no overlap):
 *	   1_______________________
 *	   _44_____________________
 *	   ___1____________________
 *	   ____22__________________
 *	   ______11________________
 *	   _________1______________
 *	   __________3_____________
 *	   ___________44___________
 *	   _____________33_________
 *	   _______________2________
 *	   ________________1_______
 *	   _________________4______
 *	   ___________________2____
 *	   ____________________33__
 *	   ______________________4_
230
 */
231
/* One 'change point': the start or end boundary of an E820 entry. */
struct change_member {
	/* Pointer to the original entry: */
	struct e820_entry	*entry;
	/* Address for this change point: */
	unsigned long long	addr;
};

/* Scratch storage for e820__update_table(); __initdata, freed after boot: */
static struct change_member	change_point_list[2*E820_MAX_ENTRIES]	__initdata;
static struct change_member	*change_point[2*E820_MAX_ENTRIES]	__initdata;
static struct e820_entry	*overlap_list[E820_MAX_ENTRIES]		__initdata;
static struct e820_entry	new_entries[E820_MAX_ENTRIES]		__initdata;

243 244 245 246 247 248 249
/*
 * sort() comparator for change points.
 *
 * Inputs are pointers to two elements of change_point[]. Lower addresses
 * sort first; at equal addresses, a change point that represents the end
 * of its region (addr != entry->addr) sorts after one that marks a start.
 */
static int __init cpcompare(const void *a, const void *b)
{
	const struct change_member *lhs = *(struct change_member * const *)a;
	const struct change_member *rhs = *(struct change_member * const *)b;

	if (lhs->addr < rhs->addr)
		return -1;
	if (lhs->addr > rhs->addr)
		return 1;

	return (lhs->addr != lhs->entry->addr) - (rhs->addr != rhs->entry->addr);
}
259

260
/*
 * Sanitize 'table' in place: sort entries, remove overlaps and resolve
 * conflicting types in favor of the highest numbered type (see the large
 * comment above for a worked example).
 *
 * Returns 0 on success, -1 if nothing was done (fewer than two entries,
 * or an entry whose address range wraps around past zero).
 *
 * Fix vs. previous version: dropped the pointless self-assignment
 * 'table->nr_entries = table->nr_entries;' (leftover from an older
 * parameter-based interface).
 */
int __init e820__update_table(struct e820_table *table)
{
	struct e820_entry *entries = table->entries;
	u32 max_nr_entries = ARRAY_SIZE(table->entries);
	enum e820_type current_type, last_type;
	unsigned long long last_addr;
	u32 new_nr_entries, overlap_entries;
	u32 i, chg_idx, chg_nr;

	/* If there's only one memory region, don't bother: */
	if (table->nr_entries < 2)
		return -1;

	BUG_ON(table->nr_entries > max_nr_entries);

	/* Bail out if we find any unreasonable addresses in the map: */
	for (i = 0; i < table->nr_entries; i++) {
		if (entries[i].addr + entries[i].size < entries[i].addr)
			return -1;
	}

	/* Create pointers for initial change-point information (for sorting): */
	for (i = 0; i < 2 * table->nr_entries; i++)
		change_point[i] = &change_point_list[i];

	/*
	 * Record all known change-points (starting and ending addresses),
	 * omitting empty memory regions:
	 */
	chg_idx = 0;
	for (i = 0; i < table->nr_entries; i++)	{
		if (entries[i].size != 0) {
			change_point[chg_idx]->addr	= entries[i].addr;
			change_point[chg_idx++]->entry	= &entries[i];
			change_point[chg_idx]->addr	= entries[i].addr + entries[i].size;
			change_point[chg_idx++]->entry	= &entries[i];
		}
	}
	chg_nr = chg_idx;

	/* Sort change-point list by memory addresses (low -> high): */
	sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);

	/* Create a new memory map, removing overlaps: */
	overlap_entries = 0;	 /* Number of entries in the overlap table */
	new_nr_entries = 0;	 /* Index for creating new map entries */
	last_type = 0;		 /* Start with undefined memory type */
	last_addr = 0;		 /* Start with 0 as last starting address */

	/* Loop through change-points, determining effect on the new map: */
	for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
		/* Keep track of all overlapping entries */
		if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
			/* Add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
		} else {
			/* Remove entry from list (order independent, so swap with last): */
			for (i = 0; i < overlap_entries; i++) {
				if (overlap_list[i] == change_point[chg_idx]->entry)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/*
		 * If there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		current_type = 0;
		for (i = 0; i < overlap_entries; i++) {
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		}

		/* Continue building up new map based on this information: */
		if (current_type != last_type || current_type == E820_TYPE_PRAM) {
			if (last_type != 0)	 {
				new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
				/* Move forward only if the new size was non-zero: */
				if (new_entries[new_nr_entries].size != 0)
					/* No more space left for new entries? */
					if (++new_nr_entries >= max_nr_entries)
						break;
			}
			if (current_type != 0)	{
				new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
				new_entries[new_nr_entries].type = current_type;
				last_addr = change_point[chg_idx]->addr;
			}
			last_type = current_type;
		}
	}

	/* Copy the new entries into the original location: */
	memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
	table->nr_entries = new_nr_entries;

	return 0;
}

361
/*
 * Append 'nr_entries' boot-protocol E820 entries to the kernel E820 table.
 * Returns -1 (without appending the rest) on the first entry whose range
 * wraps around the 64-bit address space; 0 otherwise.
 */
static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
	u32 i;

	for (i = 0; i < nr_entries; i++) {
		struct boot_e820_entry *entry = &entries[i];

		/* Reject the entry on 64-bit address wrap-around: */
		if (entry->addr > entry->addr + entry->size - 1 && likely(entry->size))
			return -1;

		e820__range_add(entry->addr, entry->size, entry->type);
	}
	return 0;
}

383
/*
384
 * Copy the BIOS E820 map into a safe place.
385 386 387 388 389 390 391
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 */
392
static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_entries < 2)
		return -1;

	return __append_e820_table(entries, nr_entries);
}

401
/*
 * Change the type of all 'old_type' memory in [start, start+size) within
 * 'table' to 'new_type', splitting partially-covered entries as needed.
 *
 * Returns the total number of bytes whose type was actually changed.
 * 'size' is clamped so that start+size cannot wrap past ULLONG_MAX.
 */
static u64 __init
__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	u64 end;
	unsigned int i;
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
	e820_print_type(old_type);
	pr_cont(" ==> ");
	e820_print_type(new_type);
	pr_cont("\n");

	for (i = 0; i < table->nr_entries; i++) {
		struct e820_entry *entry = &table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered by new range? */
		if (entry->addr >= start && entry_end <= end) {
			entry->type = new_type;
			real_updated_size += entry->size;
			continue;
		}

		/*
		 * New range is completely covered? Split the entry in three:
		 * keep the head, add the updated middle and the old-type tail
		 * as new entries (appended at the end of the table).
		 */
		if (entry->addr < start && entry_end > end) {
			__e820__range_add(table, start, size, new_type);
			__e820__range_add(table, end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_updated_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		__e820__range_add(table, final_start, final_end - final_start, new_type);

		real_updated_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * its size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		entry->addr = final_end;
	}
	return real_updated_size;
}

469
/* Update a range's type in the kernel's main E820 table ('e820_table'): */
u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
	return __e820__range_update(e820_table, start, size, old_type, new_type);
}

474
/* Update a range's type in the firmware copy of the table ('e820_table_firmware'): */
static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
{
	return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
}

479
/*
 * Remove a range of memory from the E820 table.
 *
 * If 'check_type' is true only entries of type 'old_type' are removed;
 * otherwise any entry overlapping [start, start+size) is trimmed/removed.
 * Returns the total number of bytes removed. Fully-covered entries are
 * zeroed in place (size 0), partially-covered ones are shrunk or split.
 */
u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
{
	int i;
	u64 end;
	u64 real_removed_size = 0;

	/* Clamp so that start+size cannot wrap past ULLONG_MAX: */
	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
	if (check_type)
		e820_print_type(old_type);
	pr_cont("\n");

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		u64 final_start, final_end;
		u64 entry_end;

		if (check_type && entry->type != old_type)
			continue;

		entry_end = entry->addr + entry->size;

		/* Completely covered? */
		if (entry->addr >= start && entry_end <= end) {
			real_removed_size += entry->size;
			memset(entry, 0, sizeof(*entry));
			continue;
		}

		/* Is the new range completely covered? Keep the head, re-add the tail: */
		if (entry->addr < start && entry_end > end) {
			e820__range_add(end, entry_end - end, entry->type);
			entry->size = start - entry->addr;
			real_removed_size += size;
			continue;
		}

		/* Partially covered: */
		final_start = max(start, entry->addr);
		final_end = min(end, entry_end);
		if (final_start >= final_end)
			continue;

		real_removed_size += final_end - final_start;

		/*
		 * Left range could be head or tail, so need to update
		 * the size first:
		 */
		entry->size -= final_end - final_start;
		if (entry->addr < final_start)
			continue;

		entry->addr = final_end;
	}
	return real_removed_size;
}

541
/* Sanitize the main E820 table and, on success, log the modified map: */
void __init e820__update_table_print(void)
{
	if (e820__update_table(e820_table) == 0) {
		pr_info("e820: modified physical RAM map:\n");
		e820__print_table("modified");
	}
}
549

550
/* Sanitize the firmware copy of the E820 table: */
static void __init e820__update_table_firmware(void)
{
	e820__update_table(e820_table_firmware);
}
554

A
Alok Kataria 已提交
555
#define MAX_GAP_END 0x100000000ull
556

557
/*
558
 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
559
 */
560
static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
561
{
562
	unsigned long long last = MAX_GAP_END;
563
	int i = e820_table->nr_entries;
564 565 566
	int found = 0;

	while (--i >= 0) {
567 568
		unsigned long long start = e820_table->entries[i].addr;
		unsigned long long end = start + e820_table->entries[i].size;
569 570 571

		/*
		 * Since "last" is at most 4GB, we know we'll
572
		 * fit in 32 bits if this condition is true:
573 574 575 576
		 */
		if (last > end) {
			unsigned long gap = last - end;

577 578 579
			if (gap >= *gapsize) {
				*gapsize = gap;
				*gapstart = end;
580 581 582 583 584 585
				found = 1;
			}
		}
		if (start < last)
			last = start;
	}
586 587 588 589
	return found;
}

/*
 * Search for the biggest gap in the low 32 bits of the E820
 * memory space. We pass this space to the PCI subsystem, so
 * that it can assign MMIO resources for hotplug or
 * unconfigured devices in.
 *
 * Hopefully the BIOS let enough space left.
 */
__init void e820__setup_pci_gap(void)
{
	unsigned long gapstart, gapsize;
	int found;

	/* Require at least a 4MB gap: */
	gapsize = 0x400000;
	found  = e820_search_gap(&gapstart, &gapsize);

	if (!found) {
#ifdef CONFIG_X86_64
		/* Fall back to just past the end of RAM (plus 1MB slack): */
		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
		pr_err(
			"e820: Cannot find an available gap in the 32-bit address range\n"
			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
#else
		gapstart = 0x10000000;
#endif
	}

	/*
	 * e820__reserve_resources_late() protects stolen RAM already:
	 */
	pci_mem_start = gapstart;

	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
}

624 625 626
/*
 * Called late during init, in free_initmem().
 *
627 628 629 630 631 632 633 634
 * Initial e820_table and e820_table_firmware are largish __initdata arrays.
 *
 * Copy them to a (usually much smaller) dynamically allocated area that is
 * sized precisely after the number of e820 entries.
 *
 * This is done after we've performed all the fixes and tweaks to the tables.
 * All functions which modify them are __init functions, which won't exist
 * after free_initmem().
635
 */
636
__init void e820__reallocate_tables(void)
637
{
638
	struct e820_table *n;
639 640
	int size;

641
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
642 643
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
644 645
	memcpy(n, e820_table, size);
	e820_table = n;
646

647
	size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
648 649
	n = kmalloc(size, GFP_KERNEL);
	BUG_ON(!n);
650 651
	memcpy(n, e820_table_firmware, size);
	e820_table_firmware = n;
652 653
}

654 655 656 657 658
/*
 * Because of the small fixed size of struct boot_params, only the first
 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 * struct setup_data, which is parsed here.
 */
void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
	int entries;
	struct boot_e820_entry *extmap;
	struct setup_data *sdata;

	/* Temporarily map the setup_data node to read the extra entries: */
	sdata = early_memremap(phys_addr, data_len);
	entries = sdata->len / sizeof(*extmap);
	extmap = (struct boot_e820_entry *)(sdata->data);

	__append_e820_table(extmap, entries);
	e820__update_table(e820_table);

	early_memunmap(sdata, data_len);
	pr_info("e820: extended physical RAM map:\n");
	e820__print_table("extended");
}

678
/*
 * Find the ranges of physical addresses that do not correspond to
 * E820 RAM areas and register the corresponding pages as 'nosave' for
 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 *
 * This function requires the E820 map to be sorted and without any
 * overlapping entries.
 */
void __init e820__register_nosave_regions(unsigned long limit_pfn)
{
	int i;
	unsigned long pfn = 0;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		/* Gap between the previous entry and this one is not RAM: */
		if (pfn < PFN_UP(entry->addr))
			register_nosave_region(pfn, PFN_UP(entry->addr));

		pfn = PFN_DOWN(entry->addr + entry->size);

		/* Non-RAM entries themselves are also 'nosave': */
		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
			register_nosave_region(PFN_UP(entry->addr), pfn);

		if (pfn >= limit_pfn)
			break;
	}
}
706

H
Huang Ying 已提交
707
#ifdef CONFIG_ACPI
708 709 710
/*
 * Register ACPI NVS memory regions, so that we can save/restore them during
 * hibernation and the subsequent resume:
 */
static int __init e820__register_nvs_regions(void)
{
	int i;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];

		if (entry->type == E820_TYPE_NVS)
			acpi_nvs_register(entry->addr, entry->size);
	}

	return 0;
}
core_initcall(e820__register_nvs_regions);
726 727
#endif

Y
Yinghai Lu 已提交
728
/*
 * Allocate the requested number of bytes with the requested alignment
 * and return (the physical address) to the caller. Also register this
 * range in the 'firmware' E820 table as a reserved range.
 *
 * This allows kexec to fake a new mptable, as if it came from the real
 * system.
 */
u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
	u64 addr;

	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
	if (addr) {
		/* Mark the range reserved in the firmware table, then re-sanitize it: */
		e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
		pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n");
		e820__update_table_firmware();
	}

	return addr;
}

750 751 752 753 754 755 756
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
757
# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
758 759 760 761 762
#endif

/*
 * Find the highest page frame number we have available
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		if (entry->type != type)
			continue;

		start_pfn = entry->addr >> PAGE_SHIFT;
		end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		if (start_pfn >= limit_pfn)
			continue;
		/* Entry straddles the limit - clamp to the limit and stop: */
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	/* Never exceed what the architecture can address: */
	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
			 last_pfn, max_arch_pfn);
	return last_pfn;
}
797

798
/* Highest usable RAM page frame number on this system: */
unsigned long __init e820__end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
}
802

803
/* Highest usable RAM page frame number below 4GB: */
unsigned long __init e820__end_of_low_ram_pfn(void)
{
	return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
}
807

808
/* Print 'msg' via early console (in case printk isn't up yet) and panic: */
static void __init early_panic(char *msg)
{
	early_printk(msg);
	panic(msg);
}

814 815
static int userdef __initdata;

816
/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
static int __init parse_memopt(char *p)
{
	u64 mem_size;

	if (!p)
		return -EINVAL;

	if (!strcmp(p, "nopentium")) {
#ifdef CONFIG_X86_32
		setup_clear_cpu_cap(X86_FEATURE_PSE);
		return 0;
#else
		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
		return -EINVAL;
#endif
	}

	/* "mem=SIZE": cap usable RAM at SIZE by removing everything above it: */
	userdef = 1;
	mem_size = memparse(p, &p);

	/* Don't remove all memory when getting "mem={invalid}" parameter: */
	if (mem_size == 0)
		return -EINVAL;

	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);

	return 0;
}
early_param("mem", parse_memopt);

847
/*
 * Parse one comma-separated clause of the "memmap=" boot option:
 *
 *   "exactmap"      - wipe the table, user will supply the full map
 *   SIZE@ADDR       - add a RAM range
 *   SIZE#ADDR       - add an ACPI-data range
 *   SIZE$ADDR       - add a reserved range
 *   SIZE!ADDR       - add a persistent-memory (PRAM) range
 *   SIZE            - cap usable RAM at SIZE (like "mem=")
 */
static int __init parse_memmap_one(char *p)
{
	char *oldp;
	u64 start_at, mem_size;

	if (!p)
		return -EINVAL;

	if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * If we are doing a crash dump, we still need to know
		 * the real memory size before the original memory map is
		 * reset.
		 */
		saved_max_pfn = e820__end_of_ram_pfn();
#endif
		e820_table->nr_entries = 0;
		userdef = 1;
		return 0;
	}

	oldp = p;
	mem_size = memparse(p, &p);
	/* No number could be parsed: */
	if (p == oldp)
		return -EINVAL;

	userdef = 1;
	if (*p == '@') {
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
	} else if (*p == '#') {
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
	} else if (*p == '$') {
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
	} else if (*p == '!') {
		start_at = memparse(p+1, &p);
		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
	} else {
		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
	}

	return *p == '\0' ? 0 : -EINVAL;
}
893

894 895 896 897 898 899 900 901 902 903 904 905 906 907
static int __init parse_memmap_opt(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		parse_memmap_one(str);
		str = k;
	}

	return 0;
}
908 909
early_param("memmap", parse_memmap_opt);

910 911 912 913 914 915
/*
 * Reserve all entries from the bootloader's extensible data nodes list,
 * because if present we are going to use it later on to fetch e820
 * entries from it:
 */
void __init e820__reserve_setup_data(void)
{
	struct setup_data *data;
	u64 pa_data;

	pa_data = boot_params.hdr.setup_data;
	if (!pa_data)
		return;

	/* Walk the singly-linked setup_data chain, reserving each node: */
	while (pa_data) {
		data = early_memremap(pa_data, sizeof(*data));
		e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
		pa_data = data->next;
		early_memunmap(data, sizeof(*data));
	}

	e820__update_table(e820_table);

	/* Keep the firmware copy in sync with the reservations just made: */
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));

	pr_info("extended physical RAM map:\n");
	e820__print_table("reserve setup_data");
}

939 940 941 942 943 944
/*
 * Called after parse_early_param(), after early parameters (such as mem=)
 * have been processed, in which case we already have an E820 table filled in
 * via the parameter callback function(s), but it's not sorted and printed yet:
 */
void __init e820__finish_early_params(void)
{
	if (userdef) {
		if (e820__update_table(e820_table) < 0)
			early_panic("Invalid user supplied memory map");

		pr_info("e820: user-defined physical RAM map:\n");
		e820__print_table("user");
	}
}
954

955
/* Map an E820 entry type to the resource name shown in /proc/iomem: */
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
	switch (entry->type) {
	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
	case E820_TYPE_RAM:		return "System RAM";
	case E820_TYPE_ACPI:		return "ACPI Tables";
	case E820_TYPE_NVS:		return "ACPI Non-volatile Storage";
	case E820_TYPE_UNUSABLE:	return "Unusable memory";
	case E820_TYPE_PRAM:		return "Persistent Memory (legacy)";
	case E820_TYPE_PMEM:		return "Persistent Memory";
	case E820_TYPE_RESERVED:	return "Reserved";
	default:			return "Unknown E820 type";
	}
}

970
/* Map an E820 entry type to IORESOURCE_* flags for the resource tree: */
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
	switch (entry->type) {
	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
	case E820_TYPE_RAM:		return IORESOURCE_SYSTEM_RAM;
	case E820_TYPE_ACPI:		/* Fall-through: */
	case E820_TYPE_NVS:		/* Fall-through: */
	case E820_TYPE_UNUSABLE:	/* Fall-through: */
	case E820_TYPE_PRAM:		/* Fall-through: */
	case E820_TYPE_PMEM:		/* Fall-through: */
	case E820_TYPE_RESERVED:	/* Fall-through: */
	default:			return IORESOURCE_MEM;
	}
}

985
static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
986
{
987
	switch (entry->type) {
988 989 990 991 992 993 994
	case E820_TYPE_ACPI:		return IORES_DESC_ACPI_TABLES;
	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
	case E820_TYPE_RESERVED_KERN:	/* Fall-through: */
	case E820_TYPE_RAM:		/* Fall-through: */
	case E820_TYPE_UNUSABLE:	/* Fall-through: */
995
	case E820_TYPE_RESERVED:	/* Fall-through: */
996
	default:			return IORES_DESC_NONE;
997 998 999
	}
}

1000
static bool __init do_mark_busy(enum e820_type type, struct resource *res)
1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
{
	/* this is the legacy bios/dos rom-shadow + mmio region */
	if (res->start < (1ULL<<20))
		return true;

	/*
	 * Treat persistent memory like device memory, i.e. reserve it
	 * for exclusive use of a driver
	 */
	switch (type) {
1011 1012 1013
	case E820_TYPE_RESERVED:
	case E820_TYPE_PRAM:
	case E820_TYPE_PMEM:
1014
		return false;
1015 1016 1017 1018 1019
	case E820_TYPE_RESERVED_KERN:
	case E820_TYPE_RAM:
	case E820_TYPE_ACPI:
	case E820_TYPE_NVS:
	case E820_TYPE_UNUSABLE:
1020 1021 1022 1023 1024
	default:
		return true;
	}
}

1025
/*
1026
 * Mark E820 reserved areas as busy for the resource manager:
1027
 */
1028

1029
static struct resource __initdata *e820_res;
1030

1031
void __init e820__reserve_resources(void)
1032 1033
{
	int i;
1034
	struct resource *res;
1035
	u64 end;
1036

1037
	res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
1038
	e820_res = res;
1039

1040
	for (i = 0; i < e820_table->nr_entries; i++) {
1041 1042 1043
		struct e820_entry *entry = e820_table->entries + i;

		end = entry->addr + entry->size - 1;
1044
		if (end != (resource_size_t)end) {
1045 1046 1047
			res++;
			continue;
		}
1048 1049 1050 1051 1052
		res->start = entry->addr;
		res->end   = end;
		res->name  = e820_type_to_string(entry);
		res->flags = e820_type_to_iomem_type(entry);
		res->desc  = e820_type_to_iores_desc(entry);
1053 1054

		/*
1055 1056 1057
		 * Don't register the region that could be conflicted with
		 * PCI device BAR resources and insert them later in
		 * pcibios_resource_survey():
1058
		 */
1059
		if (do_mark_busy(entry->type, res)) {
1060
			res->flags |= IORESOURCE_BUSY;
1061
			insert_resource(&iomem_resource, res);
1062
		}
1063 1064
		res++;
	}
1065

1066
	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1067
		struct e820_entry *entry = e820_table_firmware->entries + i;
1068

1069
		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1070
	}
1071 1072
}

1073 1074 1075
/*
 * How much should we pad the end of RAM, depending on where it is?
 */
1076
static unsigned long __init ram_alignment(resource_size_t pos)
1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087
{
	unsigned long mb = pos >> 20;

	/* To 64kB in the first megabyte */
	if (!mb)
		return 64*1024;

	/* To 1MB in the first 16MB */
	if (mb < 16)
		return 1024*1024;

1088 1089
	/* To 64MB for anything above that */
	return 64*1024*1024;
1090 1091
}

1092 1093
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

1094
void __init e820__reserve_resources_late(void)
1095 1096 1097 1098 1099
{
	int i;
	struct resource *res;

	res = e820_res;
1100
	for (i = 0; i < e820_table->nr_entries; i++) {
1101
		if (!res->parent && res->end)
1102
			insert_resource_expand_to_fit(&iomem_resource, res);
1103 1104
		res++;
	}
1105 1106

	/*
1107
	 * Try to bump up RAM regions to reasonable boundaries, to
1108 1109
	 * avoid stolen RAM:
	 */
1110 1111
	for (i = 0; i < e820_table->nr_entries; i++) {
		struct e820_entry *entry = &e820_table->entries[i];
1112
		u64 start, end;
1113

1114
		if (entry->type != E820_TYPE_RAM)
1115
			continue;
1116

1117
		start = entry->addr + entry->size;
1118 1119 1120 1121
		end = round_up(start, ram_alignment(start)) - 1;
		if (end > MAX_RESOURCE_SIZE)
			end = MAX_RESOURCE_SIZE;
		if (start >= end)
1122
			continue;
1123

1124
		printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1125
		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1126
	}
1127 1128
}

1129 1130 1131
/*
 * Pass the firmware (bootloader) E820 map to the kernel and process it:
 */
1132
char *__init e820__memory_setup_default(void)
1133 1134
{
	char *who = "BIOS-e820";
1135

1136 1137 1138 1139 1140 1141
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
1142
	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1143
		u64 mem_size;
1144

1145 1146
		/* Compare results from other methods and take the one that gives more RAM: */
		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1147 1148 1149 1150 1151 1152 1153
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

1154
		e820_table->nr_entries = 0;
1155 1156
		e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
		e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1157 1158
	}

1159 1160 1161
	/* We just appended a lot of ranges, sanitize the table: */
	e820__update_table(e820_table);

1162 1163 1164
	return who;
}

1165 1166 1167 1168 1169 1170
/*
 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
 * E820 map - with an optional platform quirk available for virtual platforms
 * to override this method of boot environment processing:
 */
void __init e820__memory_setup(void)
1171
{
1172 1173
	char *who;

1174
	/* This is a firmware interface ABI - make sure we don't break it: */
1175
	BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
1176

1177
	who = x86_init.resources.memory_setup();
1178

1179
	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1180

1181
	pr_info("e820: BIOS-provided physical RAM map:\n");
1182
	e820__print_table(who);
1183
}
1184

1185
void __init e820__memblock_setup(void)
1186 1187 1188 1189 1190
{
	int i;
	u64 end;

	/*
1191 1192 1193 1194 1195 1196 1197
	 * The bootstrap memblock region count maximum is 128 entries
	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
	 * than that - so allow memblock resizing.
	 *
	 * This is safe, because this call happens pretty late during x86 setup,
	 * so we know about reserved memory regions already. (This is important
	 * so that memblock resizing does no stomp over reserved areas.)
1198
	 */
1199
	memblock_allow_resize();
1200

1201
	for (i = 0; i < e820_table->nr_entries; i++) {
1202
		struct e820_entry *entry = &e820_table->entries[i];
1203

1204
		end = entry->addr + entry->size;
1205 1206 1207
		if (end != (resource_size_t)end)
			continue;

1208
		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1209 1210
			continue;

1211
		memblock_add(entry->addr, entry->size);
1212 1213
	}

1214
	/* Throw away partial pages: */
1215 1216
	memblock_trim_memory(PAGE_SIZE);

1217 1218
	memblock_dump_all();
}