/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "vdso.h"
#include "mmu.h"

#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;

/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;
	unsigned long	target_pfn;
	unsigned long	size;
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;

/* 
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 * 
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);

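/* Parse the optional "xen_512gb_limit" parameter from the Xen command line. */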
static void __init xen_parse_512gb(void)
{
	bool val = false;
	char *arg;

	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
	if (!arg)
		return;

	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
	if (!arg)
		val = true;
	else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
		return;

	xen_512gb_limit = val;
}

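/*
 * Record a pfn range in xen_extra_mem[], merging it with an adjacent entry
 * where possible, and reserve it in memblock so it is not handed out by
 * early allocations.
 */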
static void __init xen_add_extra_mem(unsigned long start_pfn,
				     unsigned long n_pfns)
{
	int i;

	/*
	 * No need to check for a zero size: it should happen rarely and would
	 * only write a new entry that is still regarded as unused (zero size).
	 */
	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].n_pfns == 0) {
			xen_extra_mem[i].start_pfn = start_pfn;
			xen_extra_mem[i].n_pfns = n_pfns;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
		    start_pfn) {
			xen_extra_mem[i].n_pfns += n_pfns;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}

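/*
 * Remove a pfn range from xen_extra_mem[], splitting an entry if the range
 * lies in its middle, and return the range to memblock.
 */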
static void __init xen_del_extra_mem(unsigned long start_pfn,
				     unsigned long n_pfns)
{
	int i;
	unsigned long start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start_pfn;
		size_r = xen_extra_mem[i].n_pfns;

		/* Start of region. */
		if (start_r == start_pfn) {
			BUG_ON(n_pfns > size_r);
			xen_extra_mem[i].start_pfn += n_pfns;
			xen_extra_mem[i].n_pfns -= n_pfns;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start_pfn + n_pfns) {
			BUG_ON(n_pfns > size_r);
			xen_extra_mem[i].n_pfns -= n_pfns;
			break;
		}
		/* Middle of region. */
		if (start_pfn > start_r && start_pfn < start_r + size_r) {
			BUG_ON(start_pfn + n_pfns > start_r + size_r);
			xen_extra_mem[i].n_pfns = start_pfn - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
					  (start_pfn + n_pfns));
			break;
		}
	}
	memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (pfn >= xen_extra_mem[i].start_pfn &&
		    pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
			return INVALID_P2M_ENTRY;
	}

	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	unsigned long pfn, pfn_s, pfn_e;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		if (!xen_extra_mem[i].n_pfns)
			continue;
		pfn_s = xen_extra_mem[i].start_pfn;
		pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
		for (pfn = pfn_s; pfn < pfn_e; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
	const struct e820_entry *entry = xen_e820_table.entries;
	unsigned int i;
	unsigned long done = 0;

	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;

		if (entry->type != E820_TYPE_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after this */
		if (e_pfn <= *min_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
		 */
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
		} else {
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
		}
		break;
	}

	return done;
}

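/* Return a single mfn to the hypervisor via XENMEM_decrease_reservation. */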
static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	set_xen_guest_handle(reservation.extent_start, &mfn);
	reservation.nr_extents = 1;

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}

/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
			unsigned long end_pfn, unsigned long nr_pages)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			xen_released_pages++;
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;

	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		      mfn, pfn);
		BUG();
	}
}

/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of MFNs each containing
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

	mfn_save = virt_to_mfn(buf);

	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}

/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long remap_pfn)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	if (remap_pfn == 0)
		remap_pfn = nr_pages;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(&remap_pfn);
		if (!remap_range_size) {
			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
			xen_set_identity_and_release_chunk(cur_pfn,
						cur_pfn + left, nr_pages);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	return remap_pfn;
}

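/*
 * Helper for xen_foreach_remap_area(): count the pages of a non-RAM region
 * which are backed by the initial allocation and will have to be remapped.
 */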
static unsigned long __init xen_count_remap_pages(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long remap_pages)
{
	if (start_pfn >= nr_pages)
		return remap_pages;

	return remap_pages + min(end_pfn, nr_pages) - start_pfn;
}

static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
	unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
			      unsigned long nr_pages, unsigned long last_val))
{
	phys_addr_t start = 0;
	unsigned long ret_val = 0;
	const struct e820_entry *entry = xen_e820_table.entries;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then call the provided function
	 * to perform its duty on the non-RAM region.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_TYPE_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				ret_val = func(start_pfn, end_pfn, nr_pages,
					       ret_val);
			start = end;
		}
	}

	return ret_val;
}

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn remap to which pfn) is contained in the
 * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
 * This scheme allows the chunks to be remapped in arbitrary order while the
 * resulting mapping is independent of that order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;
	unsigned long len = 0;

	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			mfn = xen_remap_buf.mfns[i];
			xen_update_mem_tables(pfn, mfn);
			remapped++;
			pfn++;
		}
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			xen_del_extra_mem(pfn_s, len);
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}

		mfn = xen_remap_mfn;
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(pfn_s, len);

	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

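/*
 * Absolute upper bound on the number of pages this domain can use, based on
 * the architectural limit and the optional 512GB restriction for PV domUs.
 */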
static unsigned long __init xen_get_pages_limit(void)
{
	unsigned long limit;

#ifdef CONFIG_X86_32
	limit = GB(64) / PAGE_SIZE;
#else
	limit = MAXMEM / PAGE_SIZE;
	if (!xen_initial_domain() && xen_512gb_limit)
		limit = GB(512) / PAGE_SIZE;
#endif
	return limit;
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages, limit;
	domid_t domid = DOMID_SELF;
	long ret;

	limit = xen_get_pages_limit();
	max_pages = limit;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, limit);
}

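/* Add a region to the kernel E820 map, aligning RAM regions to page size. */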
static void __init xen_align_and_add_e820_region(phys_addr_t start,
						 phys_addr_t size, int type)
{
	phys_addr_t end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_TYPE_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((phys_addr_t)PAGE_SIZE - 1);
	}

	e820__range_add(start, end - start, type);
}

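/*
 * Treat UNUSABLE regions as RAM; see the comment at the call site in
 * xen_memory_setup() for the reasoning.
 */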
static void __init xen_ignore_unusable(void)
{
	struct e820_entry *entry = xen_e820_table.entries;
	unsigned int i;

	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
		if (entry->type == E820_TYPE_UNUSABLE)
			entry->type = E820_TYPE_RAM;
	}
}

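/*
 * Return true unless the range [start, start + size) is fully contained in a
 * single RAM entry of the E820 map.
 */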
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
	struct e820_entry *entry;
	unsigned mapcnt;
	phys_addr_t end;

	if (!size)
		return false;

	end = start + size;
	entry = xen_e820_table.entries;

	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
		if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
		    (entry->addr + entry->size) >= end)
			return false;

		entry++;
	}

	return true;
}

/*
 * Find a free area in physical memory not yet reserved and compliant with
 * E820 map.
 * Used to relocate pre-allocated areas like initrd or p2m list which are in
 * conflict with the to be used E820 map.
 * In case no area is found, return 0. Otherwise return the physical address
 * of the area which is already reserved for convenience.
 */
phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
	unsigned mapcnt;
	phys_addr_t addr, start;
	struct e820_entry *entry = xen_e820_table.entries;

	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
		if (entry->type != E820_TYPE_RAM || entry->size < size)
			continue;
		start = entry->addr;
		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			if (!memblock_is_reserved(addr))
				continue;
			start = addr + PAGE_SIZE;
			if (start + size > entry->addr + entry->size)
				break;
		}
		if (addr >= start + size) {
			memblock_reserve(start, size);
			return start;
		}
	}

	return 0;
}

/*
 * Like memcpy, but with physical addresses for dest and src.
 */
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
				   phys_addr_t n)
{
	phys_addr_t dest_off, src_off, dest_len, src_len, len;
	void *from, *to;

	while (n) {
		dest_off = dest & ~PAGE_MASK;
		src_off = src & ~PAGE_MASK;
		dest_len = n;
		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
		src_len = n;
		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
		len = min(dest_len, src_len);
		to = early_memremap(dest - dest_off, dest_len + dest_off);
		from = early_memremap(src - src_off, src_len + src_off);
		memcpy(to, from, len);
		early_memunmap(to, dest_len + dest_off);
		early_memunmap(from, src_len + src_off);
		n -= len;
		dest += len;
		src += len;
	}
}

/*
 * Reserve Xen mfn_list.
 */
static void __init xen_reserve_xen_mfnlist(void)
{
	phys_addr_t start, size;

	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
		start = __pa(xen_start_info->mfn_list);
		size = PFN_ALIGN(xen_start_info->nr_pages *
				 sizeof(unsigned long));
	} else {
		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
	}

	memblock_reserve(start, size);
	if (!xen_is_e820_reserved(start, size))
		return;

#ifdef CONFIG_X86_32
	/*
	 * Relocating the p2m on 32 bit system to an arbitrary virtual address
	 * is not supported, so just give up.
	 */
	xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
	BUG();
#else
	xen_relocate_p2m();
	memblock_free(start, size);
#endif
}

732 733 734 735 736
/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	unsigned long max_pfn, pfn_s, n_pfns;
	phys_addr_t mem_end, addr, size, chunk_size;
	u32 type;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	int i;
	int op;

	xen_parse_512gb();
	max_pfn = xen_get_pages_limit();
	max_pfn = min(max_pfn, xen_start_info->nr_pages);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		xen_e820_table.entries[0].addr = 0ULL;
		xen_e820_table.entries[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		xen_e820_table.entries[0].size += 8ULL << 20;
		xen_e820_table.entries[0].type = E820_TYPE_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);
	xen_e820_table.nr_entries = memmap.nr_entries;

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable();

	/* Make sure the Xen-supplied memory map is well-ordered. */
	e820__update_table(&xen_e820_table);

	max_pages = xen_get_max_pages();

	/* How many extra pages do we need due to remapping? */
	max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);

	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * Make sure we have no memory above max_pages, as this area
	 * isn't handled by the p2m management.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			   extra_pages, max_pages - max_pfn);
	i = 0;
	addr = xen_e820_table.entries[0].addr;
	size = xen_e820_table.entries[0].size;
	while (i < xen_e820_table.nr_entries) {
		bool discard = false;

		chunk_size = size;
		type = xen_e820_table.entries[i].type;

		if (type == E820_TYPE_RAM) {
			if (addr < mem_end) {
				chunk_size = min(size, mem_end - addr);
			} else if (extra_pages) {
				chunk_size = min(size, PFN_PHYS(extra_pages));
				pfn_s = PFN_UP(addr);
				n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
				extra_pages -= n_pfns;
				xen_add_extra_mem(pfn_s, n_pfns);
				xen_max_p2m_pfn = pfn_s + n_pfns;
			} else
				discard = true;
		}

		if (!discard)
			xen_align_and_add_e820_region(addr, chunk_size, type);

		addr += chunk_size;
		size -= chunk_size;
		if (size == 0) {
			i++;
			if (i < xen_e820_table.nr_entries) {
				addr = xen_e820_table.entries[i].addr;
				size = xen_e820_table.entries[i].size;
			}
		}
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 */
	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);

	e820__update_table(e820_table);

	/*
	 * Check whether the kernel itself conflicts with the target E820 map.
	 * Failing now is better than running into weird problems later due
	 * to relocating (and even reusing) pages with kernel text or data.
	 */
	if (xen_is_e820_reserved(__pa_symbol(_text),
			__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
		xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
		BUG();
	}

	/*
	 * Check for a conflict of the hypervisor supplied page tables with
	 * the target E820 map.
	 */
	xen_pt_check_e820();

	xen_reserve_xen_mfnlist();

	/* Check for a conflict of the initrd with the target E820 map. */
	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
				 boot_params.hdr.ramdisk_size)) {
		phys_addr_t new_area, start, size;

		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
		if (!new_area) {
			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
			BUG();
		}

		start = boot_params.hdr.ramdisk_image;
		size = boot_params.hdr.ramdisk_size;
		xen_phys_memcpy(new_area, start, size);
		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
			start, start + size, new_area, new_area + size);
		memblock_free(start, size);
		boot_params.hdr.ramdisk_image = new_area;
		boot_params.ext_ramdisk_image = new_area >> 32;
	}

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);

	pr_info("Released %ld page(s)\n", xen_released_pages);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	xen_e820_table.nr_entries = memmap.nr_entries;

	e820__update_table(&xen_e820_table);

	for (i = 0; i < xen_e820_table.nr_entries; i++)
		e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type);

	/* Remove p2m info, it is not needed. */
	xen_start_info->mfn_list = 0;
	xen_start_info->first_p2m_pfn = 0;
	xen_start_info->nr_p2m_frames = 0;

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask = vdso_image_32.data +
		vdso_image_32.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

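/* Register an entry point for a hypervisor callback of the given type. */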
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

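/* Register the sysenter entry point; clear the CPU feature on failure. */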
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

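/* Register the 64-bit syscall and 32-bit compat syscall entry points. */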
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/*
		 * Pretty fatal; 64-bit userspace has no other
		 * mechanism for syscalls.
		 */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

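/* PV MMU specific setup: VM assists and hypervisor callback registration. */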
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}