/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "vdso.h"
#include "mmu.h"

#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/* E820 map used during setting up memory. */
static struct e820entry xen_e820_map[E820MAX] __initdata;
static u32 xen_e820_map_entries __initdata;

/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;
	unsigned long	target_pfn;
	unsigned long	size;
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;

/* 
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 * 
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);

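/*
 * Parse "xen_512gb_limit[=<bool>]" from the Xen supplied command line:
 * a bare "xen_512gb_limit" turns the limit on, "xen_512gb_limit=<bool>"
 * sets it explicitly.
 */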
static void __init xen_parse_512gb(void)
{
	bool val = false;
	char *arg;

	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
	if (!arg)
		return;

	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
	if (!arg)
		val = true;
	else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
		return;

	xen_512gb_limit = val;
}

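/*
 * Record the range [start, start + size) as extra memory, merging it with
 * an adjacent region where possible, and keep it reserved in memblock for
 * the time being.
 */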
static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size  = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);
}

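/*
 * Remove the range [start, start + size) from the extra memory regions,
 * splitting a region if the range lies inside it, and return the pages to
 * the memblock allocator.
 */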
static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
{
	int i;
	phys_addr_t start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start;
		size_r = xen_extra_mem[i].size;

		/* Start of region. */
		if (start_r == start) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].start += size;
			xen_extra_mem[i].size -= size;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start + size) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].size -= size;
			break;
		}
		/* Mid of region. */
		if (start > start_r && start < start_r + size_r) {
			BUG_ON(start + size > start_r + size_r);
			xen_extra_mem[i].size = start - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start + size, start_r + size_r -
					  (start + size));
			break;
		}
	}
	memblock_free(start, size);
}

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	int i;
	phys_addr_t addr = PFN_PHYS(pfn);

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (addr >= xen_extra_mem[i].start &&
		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
			return INVALID_P2M_ENTRY;
	}

	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	unsigned long pfn, pfn_s, pfn_e;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (!xen_extra_mem[i].size)
			continue;
		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
		for (pfn = pfn_s; pfn < pfn_e; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
	const struct e820entry *entry = xen_e820_map;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

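/*
 * Return a single page, identified by its mfn, to the hypervisor.
 * Returns the number of extents released, i.e. 1 on success.
 */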
static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	set_xen_guest_handle(reservation.extent_start, &mfn);
	reservation.nr_extents = 1;

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
			unsigned long end_pfn, unsigned long nr_pages)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			xen_released_pages++;
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;

	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		      mfn, pfn);
		BUG();
	}
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of MFNs each containing
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	mfn_save = virt_to_mfn(buf);

	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long remap_pfn)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(&remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
						cur_pfn + left, nr_pages);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

static void __init xen_set_identity_and_remap(unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long last_pfn = nr_pages;
	const struct e820entry *entry = xen_e820_map;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * remap the memory in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				last_pfn = xen_set_identity_and_remap_chunk(
						start_pfn, end_pfn, nr_pages,
						last_pfn);
			start = end;
		}
	}

	pr_info("Released %ld page(s)\n", xen_released_pages);
}

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn is remapped to which pfn) is contained in
 * the memory to be remapped itself, in a linked list anchored at xen_remap_mfn.
 * This scheme allows the different chunks to be remapped in arbitrary order
 * while the resulting mapping is independent of the order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;
	unsigned long len = 0;

	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			mfn = xen_remap_buf.mfns[i];
			xen_update_mem_tables(pfn, mfn);
			remapped++;
			pfn++;
		}
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}

		mfn = xen_remap_mfn;
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));

	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

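/*
 * Upper bound on the number of pages usable by the domain: 64GB on 32-bit
 * kernels; on 64-bit either 512GB (for domUs with the limit enabled) or
 * effectively unlimited.
 */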
static unsigned long __init xen_get_pages_limit(void)
{
	unsigned long limit;

#ifdef CONFIG_X86_32
	limit = GB(64) / PAGE_SIZE;
#else
	limit = ~0ul;
	if (!xen_initial_domain() && xen_512gb_limit)
		limit = GB(512) / PAGE_SIZE;
#endif
	return limit;
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages, limit;
	domid_t domid = DOMID_SELF;
	int ret;

	limit = xen_get_pages_limit();
	max_pages = limit;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, limit);
}

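/*
 * Add a region to the kernel e820 map, trimming RAM regions inward to page
 * boundaries so no partially usable page is reported as RAM.
 */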
static void __init xen_align_and_add_e820_region(phys_addr_t start,
						 phys_addr_t size, int type)
{
	phys_addr_t end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((phys_addr_t)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

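/* Treat UNUSABLE regions as RAM; see the comment in xen_memory_setup(). */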
static void __init xen_ignore_unusable(void)
{
	struct e820entry *entry = xen_e820_map;
	unsigned int i;

	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

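/*
 * Count the pfns below max_pfn which lie in non-RAM E820 regions. Their
 * underlying RAM will be remapped and has to be accounted for as extra
 * memory.
 */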
static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
{
	unsigned long extra = 0;
	const struct e820entry *entry = xen_e820_map;
	int i;

	for (i = 0; i < xen_e820_map_entries; i++, entry++) {
		unsigned long start_pfn = PFN_DOWN(entry->addr);
		unsigned long end_pfn = PFN_UP(entry->addr + entry->size);

		if (start_pfn >= max_pfn)
			break;
		if (entry->type == E820_RAM)
			continue;
		if (end_pfn >= max_pfn)
			end_pfn = max_pfn;
		extra += end_pfn - start_pfn;
	}

	return extra;
}

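/*
 * Return false only if the complete range is covered by a single E820 RAM
 * region; anything else has to be treated as reserved.
 */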
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
	struct e820entry *entry;
	unsigned mapcnt;
	phys_addr_t end;

	if (!size)
		return false;

	end = start + size;
	entry = xen_e820_map;

	for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
		if (entry->type == E820_RAM && entry->addr <= start &&
		    (entry->addr + entry->size) >= end)
			return false;

		entry++;
	}

	return true;
}

/*
 * Find a free area in physical memory which is not yet reserved and is
 * compliant with the E820 map.
 * Used to relocate pre-allocated areas like the initrd or the p2m list which
 * conflict with the E820 map to be used.
 * In case no area is found, return 0. Otherwise return the physical address
 * of the area which is already reserved for convenience.
 */
phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
	unsigned mapcnt;
	phys_addr_t addr, start;
	struct e820entry *entry = xen_e820_map;

	for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
		if (entry->type != E820_RAM || entry->size < size)
			continue;
		start = entry->addr;
		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			if (!memblock_is_reserved(addr))
				continue;
			start = addr + PAGE_SIZE;
			if (start + size > entry->addr + entry->size)
				break;
		}
		if (addr >= start + size) {
			memblock_reserve(start, size);
			return start;
		}
	}

	return 0;
}

/*
 * Like memcpy, but with physical addresses for dest and src.
 */
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
				   phys_addr_t n)
{
	phys_addr_t dest_off, src_off, dest_len, src_len, len;
	void *from, *to;

	while (n) {
		dest_off = dest & ~PAGE_MASK;
		src_off = src & ~PAGE_MASK;
		dest_len = n;
		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
		src_len = n;
		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
		len = min(dest_len, src_len);
		to = early_memremap(dest - dest_off, dest_len + dest_off);
		from = early_memremap(src - src_off, src_len + src_off);
		memcpy(to, from, len);
		early_memunmap(to, dest_len + dest_off);
		early_memunmap(from, src_len + src_off);
		n -= len;
		dest += len;
		src += len;
	}
}

/*
 * Reserve Xen mfn_list.
 */
static void __init xen_reserve_xen_mfnlist(void)
{
	phys_addr_t start, size;

	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
		start = __pa(xen_start_info->mfn_list);
		size = PFN_ALIGN(xen_start_info->nr_pages *
				 sizeof(unsigned long));
	} else {
		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
	}

	if (!xen_is_e820_reserved(start, size)) {
		memblock_reserve(start, size);
		return;
	}

#ifdef CONFIG_X86_32
	/*
	 * Relocating the p2m on 32 bit system to an arbitrary virtual address
	 * is not supported, so just give up.
	 */
	xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
	BUG();
#else
	xen_relocate_p2m();
#endif
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	unsigned long max_pfn;
	phys_addr_t mem_end, addr, size, chunk_size;
	u32 type;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	int i;
	int op;

	xen_parse_512gb();
	max_pfn = xen_get_pages_limit();
	max_pfn = min(max_pfn, xen_start_info->nr_pages);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, xen_e820_map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		xen_e820_map[0].addr = 0ULL;
		xen_e820_map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		xen_e820_map[0].size += 8ULL << 20;
		xen_e820_map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);
	xen_e820_map_entries = memmap.nr_entries;

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable();

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
			  &xen_e820_map_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/* How many extra pages do we need due to remapping? */
	extra_pages += xen_count_remap_pages(max_pfn);

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * Make sure we have no memory above max_pages, as this area
	 * isn't handled by the p2m management.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			   extra_pages, max_pages - max_pfn);
	i = 0;
	addr = xen_e820_map[0].addr;
	size = xen_e820_map[0].size;
	while (i < xen_e820_map_entries) {
		chunk_size = size;
		type = xen_e820_map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				chunk_size = min(size, mem_end - addr);
			} else if (extra_pages) {
				chunk_size = min(size, PFN_PHYS(extra_pages));
				extra_pages -= PFN_DOWN(chunk_size);
				xen_add_extra_mem(addr, chunk_size);
				xen_max_p2m_pfn = PFN_DOWN(addr + chunk_size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, chunk_size, type);

		addr += chunk_size;
		size -= chunk_size;
		if (size == 0) {
			i++;
			if (i < xen_e820_map_entries) {
				addr = xen_e820_map[i].addr;
				size = xen_e820_map[i].size;
			}
		}
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 */
	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	/*
	 * Check whether the kernel itself conflicts with the target E820 map.
	 * Failing now is better than running into weird problems later due
	 * to relocating (and even reusing) pages with kernel text or data.
	 */
	if (xen_is_e820_reserved(__pa_symbol(_text),
			__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
		xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
		BUG();
	}

	/*
	 * Check for a conflict of the hypervisor supplied page tables with
	 * the target E820 map.
	 */
	xen_pt_check_e820();

	xen_reserve_xen_mfnlist();

	/* Check for a conflict of the initrd with the target E820 map. */
	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
				 boot_params.hdr.ramdisk_size)) {
		phys_addr_t new_area, start, size;

		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
		if (!new_area) {
			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
			BUG();
		}

		start = boot_params.hdr.ramdisk_image;
		size = boot_params.hdr.ramdisk_size;
		xen_phys_memcpy(new_area, start, size);
		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
			start, start + size, new_area, new_area + size);
		memblock_free(start, size);
		boot_params.hdr.ramdisk_image = new_area;
		boot_params.ext_ramdisk_image = new_area >> 32;
	}

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_set_identity_and_remap(max_pfn);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, xen_e820_map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	xen_e820_map_entries = memmap.nr_entries;

	sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
			  &xen_e820_map_entries);

	for (i = 0; i < xen_e820_map_entries; i++)
		e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
				xen_e820_map[i].type);

	/* Remove p2m info, it is not needed. */
	xen_start_info->mfn_list = 0;
	xen_start_info->first_p2m_pfn = 0;
	xen_start_info->nr_p2m_frames = 0;

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

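/* Register a Xen callback of the given type with events masked on entry. */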
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}