/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
#include "mmu.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;
	unsigned long	target_pfn;
	unsigned long	size;
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;

/* 
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 * 
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

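/*
 * Record a range as extra memory: start a new xen_extra_mem[] entry or
 * extend an adjacent one, and reserve the range in memblock so the kernel
 * does not allocate from it.
 */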
static void __init xen_add_extra_mem(u64 start, u64 size)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size  = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);
}

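/*
 * Remove a range from xen_extra_mem[], trimming or splitting the entry that
 * contains it, and hand the range back to memblock as free memory.
 */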
static void __init xen_del_extra_mem(u64 start, u64 size)
{
	int i;
	u64 start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start;
		size_r = xen_extra_mem[i].size;

		/* Start of region. */
		if (start_r == start) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].start += size;
			xen_extra_mem[i].size -= size;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start + size) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].size -= size;
			break;
		}
		/* Mid of region. */
		if (start > start_r && start < start_r + size_r) {
			BUG_ON(start + size > start_r + size_r);
			xen_extra_mem[i].size = start - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start + size, start_r + size_r -
					  (start + size));
			break;
		}
	}
	memblock_free(start, size);
}

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	int i;
	unsigned long addr = PFN_PHYS(pfn);

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (addr >= xen_extra_mem[i].start &&
		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
			return INVALID_P2M_ENTRY;
	}

	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	unsigned long pfn, pfn_s, pfn_e;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
		for (pfn = pfn_s; pfn < pfn_e; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(
	const struct e820entry *list, size_t map_size,
	unsigned long *min_pfn)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

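/*
 * Return a single frame to the hypervisor via XENMEM_decrease_reservation.
 * Returns the number of extents released, i.e. 1 on success.
 */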
static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	set_xen_guest_handle(reservation.extent_start, &mfn);
	reservation.nr_extents = 1;

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			(*released)++;
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
		return;

	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		      mfn, pfn);
		BUG();
	}
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of MFNs each containing
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	mfn_save = virt_to_mfn(buf);

	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
	unsigned long *released, unsigned long *remapped)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(list, map_size,
						      &remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
				cur_pfn + left, nr_pages, released);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
		*remapped += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

static void __init xen_set_identity_and_remap(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
	unsigned long *released, unsigned long *remapped)
{
	phys_addr_t start = 0;
	unsigned long last_pfn = nr_pages;
	const struct e820entry *entry;
	unsigned long num_released = 0;
	unsigned long num_remapped = 0;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * remap the memory in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				last_pfn = xen_set_identity_and_remap_chunk(
						list, map_size, start_pfn,
						end_pfn, nr_pages, last_pfn,
						&num_released, &num_remapped);
			start = end;
		}
	}

	*released = num_released;
	*remapped = num_remapped;

	pr_info("Released %ld page(s)\n", num_released);
}

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn is remapped to which pfn) is contained
 * in the memory to be remapped itself, in a linked list anchored at
 * xen_remap_mfn. This scheme allows the different chunks to be remapped in
 * arbitrary order, while the resulting mapping is independent of that order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;
	unsigned long len = 0;

	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			mfn = xen_remap_buf.mfns[i];
			xen_update_mem_tables(pfn, mfn);
			remapped++;
			pfn++;
		}
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}

		mfn = xen_remap_mfn;
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));

	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}

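/*
 * Add a region to the kernel e820 map, trimming RAM regions to whole pages
 * so no partial page is reported as usable RAM.
 */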
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

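/*
 * Treat E820_UNUSABLE entries as RAM; see the comment in xen_memory_setup()
 * on why this is done for the machine memory map.
 */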
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	unsigned long remapped_pages;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
				   &xen_released_pages, &remapped_pages);

	extra_pages += xen_released_pages;
	extra_pages += remapped_pages;

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
				xen_max_p2m_pfn = PFN_DOWN(addr + size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nr_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);

	for (i = 0; i < memmap.nr_entries; i++)
		e820_add_region(map[i].addr, map[i].size, map[i].type);

	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

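/* Register an entry point with the hypervisor, invoked with events masked. */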
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

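/*
 * Register the SYSENTER entry point if the CPU advertises it; clear the
 * feature bit if registration fails.
 */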
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

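/*
 * Register the 64-bit SYSCALL entry point and, if supported, the 32-bit
 * compat SYSCALL entry point, clearing the feature bit on failure.
 */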
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

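/*
 * PV MMU specific boot setup: enable the vm_assist modes we rely on,
 * register the event and failsafe callbacks, and hook the fast system
 * call entry points.
 */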
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}