e820.c 28.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Handle the memory map.
 * The functions here do the job until bootmem takes over.
 *
 *  Getting sanitize_e820_map() in sync with i386 version by applying change:
 *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
 *     Alex Achenbach <xela@slit.de>, December 2002.
 *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
14
#include <linux/crash_dump.h>
15
#include <linux/export.h>
16 17
#include <linux/bootmem.h>
#include <linux/pfn.h>
18
#include <linux/suspend.h>
19
#include <linux/acpi.h>
20
#include <linux/firmware-map.h>
21
#include <linux/memblock.h>
22
#include <linux/sort.h>
23 24

#include <asm/e820.h>
25
#include <asm/proto.h>
26 27
#include <asm/setup.h>

28 29 30 31 32 33 34 35 36 37 38 39 40 41
/*
 * The e820 map is the map that gets modified e.g. with command line parameters
 * and that is also registered with modifications in the kernel resource tree
 * with the iomem_resource as parent.
 *
 * The e820_saved is directly saved after the BIOS-provided memory map is
 * copied. It doesn't get modified afterwards. It's registered for the
 * /sys/firmware/memmap interface.
 *
 * That memory map is not modified and is used as base for kexec. The kexec'd
 * kernel should get the same memory map as the firmware provides. Then the
 * user can e.g. boot the original kernel with mem=1G while still booting the
 * next kernel with full memory.
 */
42
struct e820map e820;
43
struct e820map e820_saved;
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109

/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif

/*
 * Report whether at least one e820 entry of the requested type intersects
 * the range [start, end).
 *
 * A type of 0 acts as a wildcard and matches entries of every type.
 * Returns 1 as soon as an intersecting entry is found, 0 otherwise.
 */
int
e820_any_mapped(u64 start, u64 end, unsigned type)
{
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];

		/* skip entries of the wrong type (0 == any type) */
		if (type != 0 && entry->type != type)
			continue;

		/* entry begins before the range ends and ends after it begins */
		if (entry->addr < end && entry->addr + entry->size > start)
			return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(e820_any_mapped);

/*
 * Report whether the whole range [start, end) is covered by e820 entries
 * of the given type (a type of 0 matches every entry).
 *
 * Note: this function only works correctly if the e820 table is sorted
 * and non-overlapping, because coverage is tracked by advancing 'start'
 * past each matching entry in table order.
 *
 * Returns 1 on full coverage, 0 otherwise.
 */
int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		u64 entry_end = entry->addr + entry->size;

		if (type && entry->type != type)
			continue;

		/* no intersection with what is left of the range? */
		if (entry->addr >= end || entry_end <= start)
			continue;

		/*
		 * The entry covers the current beginning of the range, so
		 * everything up to the entry's end is accounted for.
		 */
		if (entry->addr <= start)
			start = entry_end;

		/* nothing left uncovered: full coverage */
		if (start >= end)
			return 1;
	}

	return 0;
}

/*
 * Add a memory region to the kernel e820 map.
 */
Y
Yinghai Lu 已提交
110 111
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
					 int type)
112
{
Y
Yinghai Lu 已提交
113
	int x = e820x->nr_map;
114

115
	if (x >= ARRAY_SIZE(e820x->map)) {
116 117 118
		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
		       (unsigned long long) start,
		       (unsigned long long) (start + size - 1));
119 120 121
		return;
	}

Y
Yinghai Lu 已提交
122 123 124 125 126 127 128 129 130
	e820x->map[x].addr = start;
	e820x->map[x].size = size;
	e820x->map[x].type = type;
	e820x->nr_map++;
}

/* Append a region to the live kernel map (the global e820). */
void __init e820_add_region(u64 start, u64 size, int type)
{
	__e820_add_region(&e820, start, size, type);
}

133 134 135 136 137
/*
 * Print a human-readable name for an e820 type as a continuation
 * (KERN_CONT) of the current log line.  Types without a dedicated name
 * are printed numerically.
 */
static void __init e820_print_type(u32 type)
{
	switch (type) {
	case E820_RAM:
	case E820_RESERVED_KERN:
		printk(KERN_CONT "usable");
		break;
	case E820_RESERVED:
		printk(KERN_CONT "reserved");
		break;
	case E820_ACPI:
		printk(KERN_CONT "ACPI data");
		break;
	case E820_NVS:
		printk(KERN_CONT "ACPI NVS");
		break;
	case E820_UNUSABLE:
		printk(KERN_CONT "unusable");
		break;
	case E820_PMEM:
	case E820_PRAM:
		printk(KERN_CONT "persistent (type %u)", type);
		break;
	default:
		printk(KERN_CONT "type %u", type);
		break;
	}
}

162 163 164 165 166
/*
 * Log every entry of the global e820 map, one line per entry, as
 * "<who>: [mem start-end] <type>".  The printed end address is inclusive.
 */
void __init e820_print_map(char *who)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
		       (unsigned long long) e820.map[i].addr,
		       (unsigned long long)
		       (e820.map[i].addr + e820.map[i].size - 1));
		e820_print_type(e820.map[i].type);
		printk(KERN_CONT "\n");
	}
}

/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries. The following
 * replaces the original e820 map with a new one, removing overlaps,
 * and resolving conflicting memory types in favor of highest
 * numbered type.
 *
 * The input parameter biosmap points to an array of 'struct
 * e820entry' which on entry has elements in the range [0, *pnr_map)
 * valid, and which has space for up to max_nr_map entries.
 * On return, the resulting sanitized e820 map entries will be
 * overwritten in the same location, starting at biosmap.
 *
 * The integer pointed to by pnr_map must be valid on entry (the
 * current number of valid entries located at biosmap). If the
 * sanitizing succeeds the *pnr_map will be updated with the new
 * number of valid entries (something no more than max_nr_map).
 *
 * The return value from sanitize_e820_map() is zero if it
 * successfully 'sanitized' the map entries passed in, and is -1
 * if it did nothing, which can happen if either of (1) it was
 * only passed one map entry, or (2) any of the input map entries
 * were invalid (start + size < start, meaning that the size was
 * so big the described memory range wrapped around through zero.)
 *
 *	Visually we're performing the following
 *	(1,2,3,4 = memory types)...
 *
 *	Sample memory map (w/overlaps):
 *	   ____22__________________
 *	   ______________________4_
 *	   ____1111________________
 *	   _44_____________________
 *	   11111111________________
 *	   ____________________33__
 *	   ___________44___________
 *	   __________33333_________
 *	   ______________22________
 *	   ___________________2222_
 *	   _________111111111______
 *	   _____________________11_
 *	   _________________4______
 *
 *	Sanitized equivalent (no overlap):
 *	   1_______________________
 *	   _44_____________________
 *	   ___1____________________
 *	   ____22__________________
 *	   ______11________________
 *	   _________1______________
 *	   __________3_____________
 *	   ___________44___________
 *	   _____________33_________
 *	   _______________2________
 *	   ________________1_______
 *	   _________________4______
 *	   ___________________2____
 *	   ____________________33__
 *	   ______________________4_
 */
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
/*
 * One boundary ("change point") of a BIOS e820 entry, used while
 * sanitizing the map.  sanitize_e820_map() records two of these for each
 * non-empty entry: one at the entry's start address and one at its end.
 */
struct change_member {
	struct e820entry *pbios; /* pointer to original bios entry */
	unsigned long long addr; /* address for this change point */
};

/*
 * sort() comparison callback for the change_point[] pointer array.
 *
 * @a and @b point at two elements of change_point[], i.e. at pointers to
 * struct change_member.  Ordering is by address first; among equal
 * addresses, a change point that marks the end of its region sorts after
 * one that marks the start.
 */
static int __init cpcompare(const void *a, const void *b)
{
	const struct change_member *lhs = *(struct change_member * const *)a;
	const struct change_member *rhs = *(struct change_member * const *)b;
	int lhs_is_end, rhs_is_end;

	if (lhs->addr < rhs->addr)
		return -1;
	if (lhs->addr > rhs->addr)
		return 1;

	/* a change point whose address differs from its entry's start is an end */
	lhs_is_end = lhs->addr != lhs->pbios->addr;
	rhs_is_end = rhs->addr != rhs->pbios->addr;

	return lhs_is_end - rhs_is_end;
}
258

259
/*
 * Sanitize an e820 map in place.  The region boundaries are sorted, and a
 * new map is built by walking them and giving each stretch the highest
 * numbered type among the input entries covering it.  The arguments,
 * return value and a worked example are described in the comment block
 * that precedes struct change_member above.
 */
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
			     u32 *pnr_map)
{
	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
	static struct change_member *change_point[2*E820_X_MAX] __initdata;
	static struct e820entry *overlap_list[E820_X_MAX] __initdata;
	static struct e820entry new_bios[E820_X_MAX] __initdata;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	int chgidx;
	int overlap_entries;
	int new_bios_entry;
	int old_nr, new_nr, chg_nr;
	int i;

	/* if there's only one memory region, don't bother */
	if (*pnr_map < 2)
		return -1;

	old_nr = *pnr_map;
	BUG_ON(old_nr > max_nr_map);

	/* bail out if we find any unreasonable addresses in bios map */
	for (i = 0; i < old_nr; i++)
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
			return -1;

	/* create pointers for initial change-point information (for sorting) */
	for (i = 0; i < 2 * old_nr; i++)
		change_point[i] = &change_point_list[i];

	/* record all known change-points (starting and ending addresses),
	   omitting those that are for empty memory regions */
	chgidx = 0;
	for (i = 0; i < old_nr; i++)	{
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr = biosmap[i].addr;
			change_point[chgidx++]->pbios = &biosmap[i];
			change_point[chgidx]->addr = biosmap[i].addr +
				biosmap[i].size;
			change_point[chgidx++]->pbios = &biosmap[i];
		}
	}
	chg_nr = chgidx;

	/* sort change-point list by memory addresses (low -> high) */
	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);

	/* create a new bios memory map, removing overlaps */
	overlap_entries = 0;	 /* number of entries in the overlap table */
	new_bios_entry = 0;	 /* index for creating new bios map entries */
	last_type = 0;		 /* start with undefined memory type */
	last_addr = 0;		 /* start with 0 as last starting address */

	/* loop through change-points, determining effect on the new bios map */
	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
		/* keep track of all overlapping bios entries */
		if (change_point[chgidx]->addr ==
		    change_point[chgidx]->pbios->addr) {
			/*
			 * add map entry to overlap list (> 1 entry
			 * implies an overlap)
			 */
			overlap_list[overlap_entries++] =
				change_point[chgidx]->pbios;
		} else {
			/*
			 * remove entry from list (order independent,
			 * so swap with last)
			 */
			for (i = 0; i < overlap_entries; i++) {
				if (overlap_list[i] ==
				    change_point[chgidx]->pbios)
					overlap_list[i] =
						overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/*
		 * if there are overlapping entries, decide which
		 * "type" to use (larger value takes precedence --
		 * 1=usable, 2,3,4,4+=unusable)
		 */
		current_type = 0;
		for (i = 0; i < overlap_entries; i++)
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		/*
		 * continue building up new bios map based on this
		 * information
		 */
		if (current_type != last_type || current_type == E820_PRAM) {
			if (last_type != 0)	 {
				new_bios[new_bios_entry].size =
					change_point[chgidx]->addr - last_addr;
				/*
				 * move forward only if the new size
				 * was non-zero
				 */
				if (new_bios[new_bios_entry].size != 0)
					/*
					 * no more space left for new
					 * bios entries ?
					 */
					if (++new_bios_entry >= max_nr_map)
						break;
			}
			if (current_type != 0)	{
				new_bios[new_bios_entry].addr =
					change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr = change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}
	/* retain count for new bios entries */
	new_nr = new_bios_entry;

	/* copy new bios mapping into original location */
	memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
	*pnr_map = new_nr;

	return 0;
}

385
/*
 * Append @nr_map entries starting at @biosmap to the global e820 map.
 *
 * Returns 0 on success, or -1 as soon as an entry is found whose
 * start + size wraps around in 64 bits.  Entries preceding the bad one
 * have already been added at that point.
 */
static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
{
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size;
		u32 type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}

405 406 407 408 409 410 411 412 413
/*
 * Copy the BIOS e820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 */
414
static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
415 416 417 418 419
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_map < 2)
		return -1;

420
	return __append_e820_map(biosmap, nr_map);
421 422
}

Y
Yinghai Lu 已提交
423
/*
 * Change the type of every part of [start, start + size) that currently
 * has type @old_type in @e820x to @new_type.
 *
 * Entries that lie only partly inside the range are split: the affected
 * part is appended as a new entry via __e820_add_region() and the
 * original entry is shrunk.  @size is clamped so that start + size does
 * not exceed ULLONG_MAX.  @old_type must differ from @new_type.
 *
 * Returns the total number of bytes whose type was changed.
 */
static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
					u64 size, unsigned old_type,
					unsigned new_type)
{
	u64 end;
	unsigned int i;
	u64 real_updated_size = 0;

	BUG_ON(old_type == new_type);

	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
	       (unsigned long long) start, (unsigned long long) (end - 1));
	e820_print_type(old_type);
	printk(KERN_CONT " ==> ");
	e820_print_type(new_type);
	printk(KERN_CONT "\n");

	for (i = 0; i < e820x->nr_map; i++) {
		struct e820entry *ei = &e820x->map[i];
		u64 final_start, final_end;
		u64 ei_end;

		if (ei->type != old_type)
			continue;

		ei_end = ei->addr + ei->size;
		/* totally covered by new range? */
		if (ei->addr >= start && ei_end <= end) {
			ei->type = new_type;
			real_updated_size += ei->size;
			continue;
		}

		/* new range is totally covered? */
		if (ei->addr < start && ei_end > end) {
			/* split in three: head stays in place, middle and tail are appended */
			__e820_add_region(e820x, start, size, new_type);
			__e820_add_region(e820x, end, ei_end - end, ei->type);
			ei->size = start - ei->addr;
			real_updated_size += size;
			continue;
		}

		/* partially covered */
		final_start = max(start, ei->addr);
		final_end = min(end, ei_end);
		if (final_start >= final_end)
			continue;

		__e820_add_region(e820x, final_start, final_end - final_start,
				  new_type);

		real_updated_size += final_end - final_start;

		/*
		 * left range could be head or tail, so need to update
		 * size at first.
		 */
		ei->size -= final_end - final_start;
		if (ei->addr < final_start)
			continue;
		ei->addr = final_end;
	}
	return real_updated_size;
}

492 493 494
/*
 * Retype [start, start + size) from @old_type to @new_type in the live
 * kernel map (the global e820).  Returns the number of bytes retyped.
 */
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
			     unsigned new_type)
{
	return __e820_update_range(&e820, start, size, old_type, new_type);
}

/*
 * Same as e820_update_range(), but operates on e820_saved instead of the
 * live map.  Returns the number of bytes retyped.
 */
static u64 __init e820_update_range_saved(u64 start, u64 size,
					  unsigned old_type, unsigned new_type)
{
	return __e820_update_range(&e820_saved, start, size, old_type,
				     new_type);
}

Y
Yinghai Lu 已提交
505 506 507 508 509
/*
 * Make e820 not cover the range [start, start + size).
 *
 * When @checktype is non-zero only entries of type @old_type are touched;
 * otherwise entries of every type are.  Entries fully inside the range are
 * zeroed in place (not compacted away); entries partly inside are trimmed
 * or, when the range falls strictly inside one entry, split in two.
 * @size is clamped so that start + size does not exceed ULLONG_MAX.
 *
 * Returns the total number of bytes removed.
 */
u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
			     int checktype)
{
	int i;
	u64 end;
	u64 real_removed_size = 0;

	if (size > (ULLONG_MAX - start))
		size = ULLONG_MAX - start;

	end = start + size;
	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
	       (unsigned long long) start, (unsigned long long) (end - 1));
	if (checktype)
		e820_print_type(old_type);
	printk(KERN_CONT "\n");

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
		u64 final_start, final_end;
		u64 ei_end;

		if (checktype && ei->type != old_type)
			continue;

		ei_end = ei->addr + ei->size;
		/* totally covered? */
		if (ei->addr >= start && ei_end <= end) {
			real_removed_size += ei->size;
			memset(ei, 0, sizeof(struct e820entry));
			continue;
		}

		/* new range is totally covered? */
		if (ei->addr < start && ei_end > end) {
			/* keep the head in place and append the tail as a new entry */
			e820_add_region(end, ei_end - end, ei->type);
			ei->size = start - ei->addr;
			real_removed_size += size;
			continue;
		}

		/* partially covered */
		final_start = max(start, ei->addr);
		final_end = min(end, ei_end);
		if (final_start >= final_end)
			continue;
		real_removed_size += final_end - final_start;

		/*
		 * left range could be head or tail, so need to update
		 * size at first.
		 */
		ei->size -= final_end - final_start;
		if (ei->addr < final_start)
			continue;
		ei->addr = final_end;
	}
	return real_removed_size;
}

566 567
/*
 * Re-sanitize the live e820 map and, if sanitizing returned zero, print
 * the resulting map.  A non-zero return from sanitize_e820_map() makes
 * this function return without printing.
 */
void __init update_e820(void)
{
	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
		return;
	printk(KERN_INFO "e820: modified physical RAM map:\n");
	e820_print_map("modified");
}
573 574
/* Re-sanitize e820_saved in place; the result code is not checked. */
static void __init update_e820_saved(void)
{
	sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
				&e820_saved.nr_map);
}
A
Alok Kataria 已提交
578
#define MAX_GAP_END 0x100000000ull
579
/*
A
Alok Kataria 已提交
580
 * Search for a gap in the e820 memory space from start_addr to end_addr.
581
 */
582
__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
A
Alok Kataria 已提交
583
		unsigned long start_addr, unsigned long long end_addr)
584
{
A
Alok Kataria 已提交
585
	unsigned long long last;
586
	int i = e820.nr_map;
587 588
	int found = 0;

A
Alok Kataria 已提交
589 590
	last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;

591 592 593 594
	while (--i >= 0) {
		unsigned long long start = e820.map[i].addr;
		unsigned long long end = start + e820.map[i].size;

595 596 597
		if (end < start_addr)
			continue;

598 599 600 601 602 603 604
		/*
		 * Since "last" is at most 4GB, we know we'll
		 * fit in 32 bits if this condition is true
		 */
		if (last > end) {
			unsigned long gap = last - end;

605 606 607
			if (gap >= *gapsize) {
				*gapsize = gap;
				*gapstart = end;
608 609 610 611 612 613
				found = 1;
			}
		}
		if (start < last)
			last = start;
	}
614 615 616 617 618 619 620 621 622 623 624
	return found;
}

/*
 * Search for the biggest gap in the low 32 bits of the e820
 * memory space.  We pass this space to PCI to assign MMIO resources
 * for hotplug or unconfigured devices in.
 * Hopefully the BIOS let enough space left.
 */
__init void e820_setup_gap(void)
{
625
	unsigned long gapstart, gapsize;
626 627 628 629
	int found;

	gapstart = 0x10000000;
	gapsize = 0x400000;
A
Alok Kataria 已提交
630
	found  = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
631 632 633

#ifdef CONFIG_X86_64
	if (!found) {
Y
Yinghai Lu 已提交
634
		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
635
		printk(KERN_ERR
636 637
	"e820: cannot find a gap in the 32bit address range\n"
	"e820: PCI devices with unassigned 32bit BARs may break!\n");
638 639 640 641
	}
#endif

	/*
642
	 * e820_reserve_resources_late protect stolen RAM already
643
	 */
644
	pci_mem_start = gapstart;
645 646

	printk(KERN_INFO
647 648
	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
	       gapstart, gapstart + gapsize - 1);
649 650
}

651 652 653 654 655 656
/**
 * Because of the size limitation of struct boot_params, only first
 * 128 E820 memory entries are passed to kernel via
 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
 * linked list of struct setup_data, which is parsed here.
 */
657
void __init parse_e820_ext(u64 phys_addr, u32 data_len)
658 659 660
{
	int entries;
	struct e820entry *extmap;
661
	struct setup_data *sdata;
662

663
	sdata = early_memremap(phys_addr, data_len);
664 665
	entries = sdata->len / sizeof(struct e820entry);
	extmap = (struct e820entry *)(sdata->data);
666
	__append_e820_map(extmap, entries);
667
	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
668
	early_memunmap(sdata, data_len);
669
	printk(KERN_INFO "e820: extended physical RAM map:\n");
670 671 672
	e820_print_map("extended");
}

673 674 675 676 677 678 679 680
#if defined(CONFIG_X86_64) || \
	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
/**
 * Find the ranges of physical addresses that do not correspond to
 * e820 RAM areas and mark the corresponding pages as nosave for
 * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
 *
 * This function requires the e820 map to be sorted and without any
 * overlapping entries.
 *
 * The walk stops after the first entry that ends at or beyond @limit_pfn.
 */
void __init e820_mark_nosave_regions(unsigned long limit_pfn)
{
	int i;
	unsigned long pfn = 0;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		/* hole between the previous entry and this one */
		if (pfn < PFN_UP(ei->addr))
			register_nosave_region(pfn, PFN_UP(ei->addr));

		pfn = PFN_DOWN(ei->addr + ei->size);

		/* the entry itself, unless it is RAM */
		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
			register_nosave_region(PFN_UP(ei->addr), pfn);

		if (pfn >= limit_pfn)
			break;
	}
}
#endif
704

H
Huang Ying 已提交
705
#ifdef CONFIG_ACPI
/**
 * Mark ACPI NVS memory region, so that we can save/restore it during
 * hibernation and the subsequent resume.
 *
 * Runs as a core_initcall and passes every E820_NVS entry of the live
 * map to acpi_nvs_register().  Always returns 0.
 */
static int __init e820_mark_nvs_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		if (ei->type == E820_NVS)
			acpi_nvs_register(ei->addr, ei->size);
	}

	return 0;
}
core_initcall(e820_mark_nvs_memory);
#endif

Y
Yinghai Lu 已提交
726
/*
727
 * pre allocated 4k and reserved it in memblock and e820_saved
Y
Yinghai Lu 已提交
728
 */
729
u64 __init early_reserve_e820(u64 size, u64 align)
Y
Yinghai Lu 已提交
730 731 732
{
	u64 addr;

733 734 735
	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
	if (addr) {
		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
736
		printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
737
		update_e820_saved();
738
	}
Y
Yinghai Lu 已提交
739 740 741 742

	return addr;
}

743 744 745 746 747 748 749
/*
 * MAX_ARCH_PFN: highest page frame number considered by e820_end_pfn().
 * The 64-bit expansion is parenthesized so that the macro behaves as a
 * single value inside larger expressions.
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
# else
#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
# define MAX_ARCH_PFN		(MAXMEM >> PAGE_SHIFT)
#endif

/*
 * Find the highest page frame number we have available
 */
756
static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
757
{
758 759
	int i;
	unsigned long last_pfn = 0;
760 761
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

762 763
	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
764
		unsigned long start_pfn;
765 766
		unsigned long end_pfn;

767 768 769 770 771
		/*
		 * Persistent memory is accounted as ram for purposes of
		 * establishing max_pfn and mem_map.
		 */
		if (ei->type != E820_RAM && ei->type != E820_PRAM)
772 773
			continue;

774
		start_pfn = ei->addr >> PAGE_SHIFT;
775
		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
776 777 778 779 780 781 782

		if (start_pfn >= limit_pfn)
			continue;
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}
783 784 785
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}
786 787 788 789

	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

790
	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
791 792 793
			 last_pfn, max_arch_pfn);
	return last_pfn;
}
794 795
/* Highest available page frame number, limited only by MAX_ARCH_PFN. */
unsigned long __init e820_end_of_ram_pfn(void)
{
	return e820_end_pfn(MAX_ARCH_PFN);
}
798

799 800
/* Highest available page frame number below the 4GB boundary. */
unsigned long __init e820_end_of_low_ram_pfn(void)
{
	return e820_end_pfn(1UL << (32-PAGE_SHIFT));
}
803

804 805 806 807 808 809
/*
 * Print @msg through early_printk() and then panic with the same text.
 *
 * @msg is passed as an argument to a constant "%s" format rather than as
 * the format string itself, so any '%' it may contain is printed
 * literally.
 */
static void early_panic(char *msg)
{
	early_printk("%s", msg);
	panic("%s", msg);
}

810 811
/*
 * Set to 1 by the "mem=" and "memmap=" option parsers; when set,
 * finish_e820_parsing() re-sanitizes and prints the live map.
 */
static int userdef __initdata;

812 813 814 815 816 817 818 819 820
/*
 * Handle the "mem=" early parameter.
 *
 * "mem=nopentium" disables the 4MB page tables (32-bit only; rejected
 * with a warning elsewhere).  Any other value is parsed as a size, and
 * all E820_RAM at or above that address is removed from the live map.
 *
 * Returns 0 on success and -EINVAL for a missing argument, for a size
 * that parses to 0, or for "nopentium" on non-32-bit builds.
 */
static int __init parse_memopt(char *p)
{
	u64 mem_size;

	if (!p)
		return -EINVAL;

	if (!strcmp(p, "nopentium")) {
#ifdef CONFIG_X86_32
		setup_clear_cpu_cap(X86_FEATURE_PSE);
		return 0;
#else
		printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
		return -EINVAL;
#endif
	}

	userdef = 1;
	mem_size = memparse(p, &p);
	/* don't remove all of memory when handling "mem={invalid}" param */
	if (mem_size == 0)
		return -EINVAL;
	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);

	return 0;
}
early_param("mem", parse_memopt);

841
/*
 * Handle one comma-separated element of the "memmap=" early parameter.
 *
 * Recognized forms:
 *   exactmap     - discard the current map (nr_map = 0)
 *   size@start   - add an E820_RAM region
 *   size#start   - add an E820_ACPI region
 *   size$start   - add an E820_RESERVED region
 *   size!start   - add an E820_PRAM region
 *   size         - remove all E820_RAM at or above 'size'
 *
 * Returns 0 on success, -EINVAL if @p is NULL, if no size could be
 * parsed, or if unparsed characters remain after the element.
 */
static int __init parse_memmap_one(char *p)
{
	char *oldp;
	u64 start_at, mem_size;

	if (!p)
		return -EINVAL;

	if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * If we are doing a crash dump, we still need to know
		 * the real mem size before original memory map is
		 * reset.
		 */
		saved_max_pfn = e820_end_of_ram_pfn();
#endif
		e820.nr_map = 0;
		userdef = 1;
		return 0;
	}

	oldp = p;
	mem_size = memparse(p, &p);
	/* memparse() consumed nothing: there was no number to parse */
	if (p == oldp)
		return -EINVAL;

	userdef = 1;
	if (*p == '@') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_RAM);
	} else if (*p == '#') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_ACPI);
	} else if (*p == '$') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_RESERVED);
	} else if (*p == '!') {
		start_at = memparse(p+1, &p);
		e820_add_region(start_at, mem_size, E820_PRAM);
	} else {
		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
	}

	return *p == '\0' ? 0 : -EINVAL;
}
886 887 888 889 890 891 892 893 894 895 896 897 898 899
/*
 * Split the "memmap=" argument at commas (in place) and pass each piece
 * to parse_memmap_one().  The result of the individual pieces is not
 * checked; this function always returns 0.
 */
static int __init parse_memmap_opt(char *str)
{
	char *next;

	for (; str; str = next) {
		next = strchr(str, ',');

		/* terminate the current piece and step past the comma */
		if (next) {
			*next = 0;
			next++;
		}

		parse_memmap_one(str);
	}

	return 0;
}
900 901 902 903 904
early_param("memmap", parse_memmap_opt);

/*
 * If a "mem=" or "memmap=" option was seen (userdef is set), sanitize
 * the resulting map and print it.  Panics if sanitizing fails.
 */
void __init finish_e820_parsing(void)
{
	if (userdef) {
		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
					&e820.nr_map) < 0)
			early_panic("Invalid user supplied memory map");

		printk(KERN_INFO "e820: user-defined physical RAM map:\n");
		e820_print_map("user");
	}
}
913

914 915 916 917 918 919 920
static inline const char *e820_type_to_string(int e820_type)
{
	switch (e820_type) {
	case E820_RESERVED_KERN:
	case E820_RAM:	return "System RAM";
	case E820_ACPI:	return "ACPI Tables";
	case E820_NVS:	return "ACPI Non-volatile Storage";
921
	case E820_UNUSABLE:	return "Unusable memory";
922 923
	case E820_PRAM: return "Persistent Memory (legacy)";
	case E820_PMEM: return "Persistent Memory";
924 925 926 927
	default:	return "reserved";
	}
}

928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947
/*
 * Decide whether e820_reserve_resources() should flag a resource as
 * IORESOURCE_BUSY and insert it into the resource tree right away.
 *
 * Everything starting below 1MB is marked busy.  Above that, reserved and
 * persistent-memory ranges are not; every other type is.
 */
static bool do_mark_busy(u32 type, struct resource *res)
{
	/* this is the legacy bios/dos rom-shadow + mmio region */
	if (res->start < (1ULL<<20))
		return true;

	/*
	 * Treat persistent memory like device memory, i.e. reserve it
	 * for exclusive use of a driver
	 */
	if (type == E820_RESERVED || type == E820_PRAM || type == E820_PMEM)
		return false;

	return true;
}

948 949 950
/*
 * Mark e820 reserved areas as busy for the resource manager.
 */
951
static struct resource __initdata *e820_res;
952 953 954
void __init e820_reserve_resources(void)
{
	int i;
955
	struct resource *res;
956
	u64 end;
957

958
	res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
959
	e820_res = res;
960
	for (i = 0; i < e820.nr_map; i++) {
961
		end = e820.map[i].addr + e820.map[i].size - 1;
962
		if (end != (resource_size_t)end) {
963 964 965
			res++;
			continue;
		}
966
		res->name = e820_type_to_string(e820.map[i].type);
967 968 969
		res->start = e820.map[i].addr;
		res->end = end;

970
		res->flags = IORESOURCE_MEM;
971 972 973 974 975 976

		/*
		 * don't register the region that could be conflicted with
		 * pci device BAR resource and insert them later in
		 * pcibios_resource_survey()
		 */
977
		if (do_mark_busy(e820.map[i].type, res)) {
978
			res->flags |= IORESOURCE_BUSY;
979
			insert_resource(&iomem_resource, res);
980
		}
981 982
		res++;
	}
983 984 985 986

	for (i = 0; i < e820_saved.nr_map; i++) {
		struct e820entry *entry = &e820_saved.map[i];
		firmware_map_add_early(entry->addr,
987
			entry->addr + entry->size,
988 989
			e820_type_to_string(entry->type));
	}
990 991
}

992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
/*
 * How much should we pad RAM ending depending on where it is?
 *
 * Returns the alignment, in bytes, to which the end of a RAM region at
 * address @pos is rounded up by e820_reserve_resources_late().
 */
static unsigned long ram_alignment(resource_size_t pos)
{
	unsigned long mb = pos >> 20;

	/* To 64kB in the first megabyte */
	if (!mb)
		return 64*1024;

	/* To 1MB in the first 16MB */
	if (mb < 16)
		return 1024*1024;

	/* To 64MB for anything above that */
	return 64*1024*1024;
}

1009 1010
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

1011 1012 1013 1014 1015 1016 1017
void __init e820_reserve_resources_late(void)
{
	int i;
	struct resource *res;

	res = e820_res;
	for (i = 0; i < e820.nr_map; i++) {
1018
		if (!res->parent && res->end)
1019
			insert_resource_expand_to_fit(&iomem_resource, res);
1020 1021
		res++;
	}
1022 1023 1024 1025 1026 1027

	/*
	 * Try to bump up RAM regions to reasonable boundaries to
	 * avoid stolen RAM:
	 */
	for (i = 0; i < e820.nr_map; i++) {
1028 1029
		struct e820entry *entry = &e820.map[i];
		u64 start, end;
1030 1031 1032 1033

		if (entry->type != E820_RAM)
			continue;
		start = entry->addr + entry->size;
1034 1035 1036 1037
		end = round_up(start, ram_alignment(start)) - 1;
		if (end > MAX_RESOURCE_SIZE)
			end = MAX_RESOURCE_SIZE;
		if (start >= end)
1038
			continue;
1039 1040 1041
		printk(KERN_DEBUG
		       "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
		       start, end);
1042 1043
		reserve_region_with_split(&iomem_resource, start, end,
					  "RAM buffer");
1044
	}
1045 1046
}

1047
/*
 * Build the initial live e820 map from boot_params.
 *
 * Returns the name of the source that was used ("BIOS-e820", "BIOS-88"
 * or "BIOS-e801"), which the caller uses as a log prefix.
 */
char *__init default_machine_specific_memory_setup(void)
{
	char *who = "BIOS-e820";
	u32 new_nr;
	/*
	 * Try to copy the BIOS-supplied E820-map.
	 *
	 * Otherwise fake a memory map; one section from 0k->640k,
	 * the next section from 1mb->appropriate_mem_k
	 */
	new_nr = boot_params.e820_entries;
	sanitize_e820_map(boot_params.e820_map,
			ARRAY_SIZE(boot_params.e820_map),
			&new_nr);
	boot_params.e820_entries = new_nr;
	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
	  < 0) {
		u64 mem_size;

		/* compare results from other methods and take the greater */
		if (boot_params.alt_mem_k
		    < boot_params.screen_info.ext_mem_k) {
			mem_size = boot_params.screen_info.ext_mem_k;
			who = "BIOS-88";
		} else {
			mem_size = boot_params.alt_mem_k;
			who = "BIOS-e801";
		}

		/* drop anything appended before the failure and start over */
		e820.nr_map = 0;
		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
	}

	/* In case someone cares... */
	return who;
}

/*
 * Populate the live e820 map through the platform's memory_setup hook,
 * snapshot it into e820_saved, and print it.
 */
void __init setup_memory_map(void)
{
	char *who;

	who = x86_init.resources.memory_setup();
	memcpy(&e820_saved, &e820, sizeof(struct e820map));
	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
	e820_print_map(who);
}
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104

/*
 * Hand every E820_RAM and E820_RESERVED_KERN entry of the live map to
 * memblock, then trim partial pages and dump the resulting layout.
 */
void __init memblock_x86_fill(void)
{
	int i;
	u64 end;

	/*
	 * EFI may have more than 128 entries
	 * We are safe to enable resizing, because memblock_x86_fill()
	 * is rather late for x86
	 */
	memblock_allow_resize();

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];

		end = ei->addr + ei->size;
		/* skip entries whose end does not fit in resource_size_t */
		if (end != (resource_size_t)end)
			continue;

		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
			continue;

		memblock_add(ei->addr, ei->size);
	}

	/* throw away partial pages */
	memblock_trim_memory(PAGE_SIZE);

	memblock_dump_all();
}
1125 1126 1127 1128

/*
 * Compute how many pages below MAX_DMA_PFN are already in use and pass
 * that number to set_dma_reserve().  Does nothing unless CONFIG_X86_64
 * is set.
 */
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
	u64 nr_pages = 0, nr_free_pages = 0;
	unsigned long start_pfn, end_pfn;
	phys_addr_t start, end;
	int i;
	u64 u;

	/*
	 * need to find out used area below MAX_DMA_PFN
	 * need to use memblock to get free size in [0, MAX_DMA_PFN]
	 * at first, and assume boot_mem will not take below MAX_DMA_PFN
	 */
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		start_pfn = min(start_pfn, MAX_DMA_PFN);
		end_pfn = min(end_pfn, MAX_DMA_PFN);
		nr_pages += end_pfn - start_pfn;
	}

	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
				NULL) {
		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
		if (start_pfn < end_pfn)
			nr_free_pages += end_pfn - start_pfn;
	}

	/* used pages = all memory pages below the limit minus the free ones */
	set_dma_reserve(nr_pages - nr_free_pages);
#endif
}