/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
#include "mmu.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;
	unsigned long	target_pfn;
	unsigned long	size;
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;

/* 
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 * 
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

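/*
 * Record a range as extra memory, merging it with an adjacent entry when
 * possible, and memblock_reserve() it so the early allocator leaves it
 * alone until it is put to use.
 */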
static void __init xen_add_extra_mem(u64 start, u64 size)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size  = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);
}

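/*
 * Remove a range from the extra memory table (splitting an entry if the
 * range lies in its middle) and hand it back to memblock for normal use.
 */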
static void __init xen_del_extra_mem(u64 start, u64 size)
{
	int i;
	u64 start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start;
		size_r = xen_extra_mem[i].size;

		/* Start of region. */
		if (start_r == start) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].start += size;
			xen_extra_mem[i].size -= size;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start + size) {
			BUG_ON(size > size_r);
			xen_extra_mem[i].size -= size;
			break;
		}
		/* Middle of region. */
		if (start > start_r && start < start_r + size_r) {
			BUG_ON(start + size > start_r + size_r);
			xen_extra_mem[i].size = start - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start + size, start_r + size_r -
					  (start + size));
			break;
		}
	}
	memblock_free(start, size);
}

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	int i;
	phys_addr_t addr = PFN_PHYS(pfn);

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (addr >= xen_extra_mem[i].start &&
		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
			return INVALID_P2M_ENTRY;
	}

	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	unsigned long pfn, pfn_s, pfn_e;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (!xen_extra_mem[i].size)
			continue;
		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
		for (pfn = pfn_s; pfn < pfn_e; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(
	const struct e820entry *list, size_t map_size,
	unsigned long *min_pfn)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

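/*
 * Give a single machine frame back to the hypervisor.  The hypercall
 * returns the number of extents released, so 1 means success.
 */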
static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	set_xen_guest_handle(reservation.extent_start, &mfn);
	reservation.nr_extents = 1;

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			(*released)++;
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;

	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		      mfn, pfn);
		BUG();
	}
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of MFNs each containing
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	mfn_save = virt_to_mfn(buf);

	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
	unsigned long *released, unsigned long *remapped)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(list, map_size,
						      &remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
				cur_pfn + left, nr_pages, released);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
		*remapped += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

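/*
 * Walk the e820 map, identity-mapping the non-RAM regions and gaps while
 * queueing the RAM backing them to be remapped beyond the initial
 * allocation (see xen_remap_memory()).
 */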
static void __init xen_set_identity_and_remap(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
	unsigned long *released, unsigned long *remapped)
{
	phys_addr_t start = 0;
	unsigned long last_pfn = nr_pages;
	const struct e820entry *entry;
	unsigned long num_released = 0;
	unsigned long num_remapped = 0;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * remap the memory in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				last_pfn = xen_set_identity_and_remap_chunk(
						list, map_size, start_pfn,
						end_pfn, nr_pages, last_pfn,
						&num_released, &num_remapped);
			start = end;
		}
	}

	*released = num_released;
	*remapped = num_remapped;

	pr_info("Released %ld page(s)\n", num_released);
}

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn remap to which pfn) is contained in the
 * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
 * This scheme allows the different chunks to be remapped in arbitrary order
 * while the resulting mapping remains independent of the order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;
	unsigned long len = 0;

	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			mfn = xen_remap_buf.mfns[i];
			xen_update_mem_tables(pfn, mfn);
			remapped++;
			pfn++;
		}
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}

		mfn = xen_remap_mfn;
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));

	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

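/*
 * Highest number of pages this domain is allowed to use, clamped to
 * MAX_DOMAIN_PAGES.
 */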
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}

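/*
 * Add an e820 region, trimming RAM regions to whole pages so a partial
 * page at either end is not treated as usable RAM.
 */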
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

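/*
 * Turn E820_UNUSABLE regions into E820_RAM; see the comment in
 * xen_memory_setup() for why this is done for the initial domain.
 */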
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	unsigned long remapped_pages;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
				   &xen_released_pages, &remapped_pages);

	extra_pages += xen_released_pages;
	extra_pages += remapped_pages;

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
				xen_max_p2m_pfn = PFN_DOWN(addr + size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nr_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);

	for (i = 0; i < memmap.nr_entries; i++)
		e820_add_region(map[i].addr, map[i].size, map[i].type);

	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

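/*
 * Register an entry point with the hypervisor for the given callback
 * type, with events masked on entry to the callback.
 */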
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

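/*
 * Route the sysenter fast-syscall path through Xen.  If the callback
 * cannot be registered, clear the CPU feature bit so the sysenter path
 * is never used.
 */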
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

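/*
 * Register the 64-bit syscall entry point with Xen, and the 32-bit
 * compat one as well when the CPU supports it.
 */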
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

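/*
 * PV MMU setup: enable the vm-assists the kernel relies on and register
 * the event and failsafe callbacks.
 */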
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}