setup.c 26.6 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2 3 4 5 6 7
/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

8
#include <linux/init.h>
9 10 11
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
12
#include <linux/memblock.h>
13
#include <linux/cpuidle.h>
14
#include <linux/cpufreq.h>
15
#include <linux/memory_hotplug.h>
16 17

#include <asm/elf.h>
R
Roland McGrath 已提交
18
#include <asm/vdso.h>
19
#include <asm/e820/api.h>
20
#include <asm/setup.h>
21
#include <asm/acpi.h>
22
#include <asm/numa.h>
23
#include <asm/idtentry.h>
24 25 26
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

27
#include <xen/xen.h>
28
#include <xen/page.h>
29
#include <xen/interface/callback.h>
I
Ian Campbell 已提交
30
#include <xen/interface/memory.h>
31 32
#include <xen/interface/physdev.h>
#include <xen/features.h>
33
#include <xen/hvc-console.h>
34
#include "xen-ops.h"
35
#include "mmu.h"
36

37 38
/* Convert a count of gigabytes to bytes (64-bit safe). */
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;
47

48 49 50 51 52 53 54 55 56 57 58 59 60
/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 *
 * REMAP_SIZE leaves room for the three header words below so the whole
 * struct fits in one page.
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	unsigned long	next_area_mfn;	/* MFN of the next buffer in the chain */
	unsigned long	target_pfn;	/* first PFN the saved MFNs are remapped to */
	unsigned long	size;		/* number of valid entries in mfns[] */
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
/* Head of the chain of remap buffers; INVALID_P2M_ENTRY means "empty". */
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
61

62 63 64 65 66 67 68 69 70 71 72 73
/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);

/* Parse the "xen_512gb_limit[=<bool>]" option from the Xen command line. */
static void __init xen_parse_512gb(void)
{
	char *opt;
	bool enable = false;

	opt = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
	if (opt == NULL)
		return;

	opt = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
	if (opt == NULL) {
		/* Bare option without a value means "enable the limit". */
		enable = true;
	} else if (strtobool(opt + strlen("xen_512gb_limit="), &enable)) {
		/* Malformed boolean value: keep the compiled-in default. */
		return;
	}

	xen_512gb_limit = enable;
}

94 95
/*
 * Record [start_pfn, start_pfn + n_pfns) as extra memory and reserve it in
 * memblock so it is not handed out before the balloon driver takes over.
 */
static void __init xen_add_extra_mem(unsigned long start_pfn,
				     unsigned long n_pfns)
{
	int idx;

	/*
	 * A zero-sized range needs no special casing: it would merely write
	 * an entry that still looks unused (n_pfns == 0) to every reader.
	 */
	for (idx = 0; idx < XEN_EXTRA_MEM_MAX_REGIONS; idx++) {
		struct xen_memory_region *reg = &xen_extra_mem[idx];

		if (reg->n_pfns == 0) {
			/* Free slot found: start a new region here. */
			reg->start_pfn = start_pfn;
			reg->n_pfns = n_pfns;
			break;
		}
		if (reg->start_pfn + reg->n_pfns == start_pfn) {
			/* Range directly follows this region: extend it. */
			reg->n_pfns += n_pfns;
			break;
		}
	}
	if (idx == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	/* Keep early allocations away from the extra memory range. */
	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
122

123 124
/*
 * Remove [start_pfn, start_pfn + n_pfns) from the extra memory regions and
 * give the range back to memblock.  The range is expected to hit at most one
 * region, at its start, its end, or in the middle (which splits the region
 * by re-adding the trailing part via xen_add_extra_mem()).
 */
static void __init xen_del_extra_mem(unsigned long start_pfn,
				     unsigned long n_pfns)
{
	int i;
	unsigned long start_r, size_r;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		start_r = xen_extra_mem[i].start_pfn;
		size_r = xen_extra_mem[i].n_pfns;

		/* Start of region. */
		if (start_r == start_pfn) {
			BUG_ON(n_pfns > size_r);
			xen_extra_mem[i].start_pfn += n_pfns;
			xen_extra_mem[i].n_pfns -= n_pfns;
			break;
		}
		/* End of region. */
		if (start_r + size_r == start_pfn + n_pfns) {
			BUG_ON(n_pfns > size_r);
			xen_extra_mem[i].n_pfns -= n_pfns;
			break;
		}
		/* Mid of region. */
		if (start_pfn > start_r && start_pfn < start_r + size_r) {
			BUG_ON(start_pfn + n_pfns > start_r + size_r);
			/* Keep the leading part, re-add the trailing part. */
			xen_extra_mem[i].n_pfns = start_pfn - start_r;
			/* Calling memblock_reserve() again is okay. */
			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
					  (start_pfn + n_pfns));
			break;
		}
	}
	/* Make the removed range available for normal allocations again. */
	memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}

/*
 * Called during boot before the p2m list can take entries beyond the
 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
 * invalid.
 */
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
	const struct xen_memory_region *reg;

	for (reg = xen_extra_mem;
	     reg < xen_extra_mem + XEN_EXTRA_MEM_MAX_REGIONS; reg++) {
		if (pfn >= reg->start_pfn &&
		    pfn - reg->start_pfn < reg->n_pfns)
			return INVALID_P2M_ENTRY;
	}

	/* Not in extra memory: treat the frame as identity mapped. */
	return IDENTITY_FRAME(pfn);
}

/*
 * Mark all pfns of extra mem as invalid in p2m list.
 */
void __init xen_inv_extra_mem(void)
{
	int idx;

	for (idx = 0; idx < XEN_EXTRA_MEM_MAX_REGIONS; idx++) {
		unsigned long pfn, end;

		/* Skip unused slots. */
		if (xen_extra_mem[idx].n_pfns == 0)
			continue;

		end = xen_extra_mem[idx].start_pfn + xen_extra_mem[idx].n_pfns;
		for (pfn = xen_extra_mem[idx].start_pfn; pfn < end; pfn++)
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

195 196 197 198 199
/*
 * Finds the next RAM pfn available in the E820 map after min_pfn.
 * This function updates min_pfn with the pfn found and returns
 * the size of that range or zero if not found.
 */
200
static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
201
{
202
	const struct e820_entry *entry = xen_e820_table.entries;
203 204 205
	unsigned int i;
	unsigned long done = 0;

206
	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
207 208 209
		unsigned long s_pfn;
		unsigned long e_pfn;

210
		if (entry->type != E820_TYPE_RAM)
211 212
			continue;

213
		e_pfn = PFN_DOWN(entry->addr + entry->size);
214

215
		/* We only care about E820 after this */
216
		if (e_pfn <= *min_pfn)
217 218
			continue;

219
		s_pfn = PFN_UP(entry->addr);
220 221 222

		/* If min_pfn falls within the E820 entry, we want to start
		 * at the min_pfn PFN.
223
		 */
224 225
		if (s_pfn <= *min_pfn) {
			done = e_pfn - *min_pfn;
226
		} else {
227 228
			done = e_pfn - s_pfn;
			*min_pfn = s_pfn;
229
		}
230 231
		break;
	}
232

233 234
	return done;
}
235

236 237 238 239 240 241 242 243 244 245 246 247 248 249
/*
 * Hand a single machine frame back to the hypervisor.  The hypercall
 * returns the number of extents released (callers treat 1 as success).
 */
static int __init xen_free_mfn(unsigned long mfn)
{
	struct xen_memory_reservation rsv = {
		.nr_extents   = 1,
		.extent_order = 0,	/* a single 4k frame */
		.address_bits = 0,
		.domid        = DOMID_SELF
	};

	set_xen_guest_handle(rsv.extent_start, &mfn);

	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &rsv);
}

250
/*
 * This releases a chunk of memory and then does the identity map. It's used
 * as a fallback if the remapping fails.
 */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
			unsigned long end_pfn, unsigned long nr_pages)
{
	unsigned long pfn, end;
	int ret;

	WARN_ON(start_pfn > end_pfn);

	/* Release pages first. */
	/* Only pfns backed by the initial allocation can be released. */
	end = min(end_pfn, nr_pages);
	for (pfn = start_pfn; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		/* Give the frame back to the hypervisor. */
		ret = xen_free_mfn(mfn);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);

		if (ret == 1) {
			xen_released_pages++;
			/* Stop at the first p2m update failure. */
			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
				break;
		} else
			break;
	}

	/* Identity map the whole chunk, including non-released pfns. */
	set_phys_range_identity(start_pfn, end_pfn);
}

/*
 * Helper function to update the p2m and m2p tables and kernel mapping.
 * Any failure here is fatal (BUG), as a half-updated translation would
 * leave pfn/mfn views inconsistent.
 */
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
	struct mmu_update update = {
		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
		.val = pfn
	};

	/* Update p2m */
	if (!set_phys_to_machine(pfn, mfn)) {
		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
		     pfn, mfn);
		BUG();
	}

	/* Update m2p */
	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
		     mfn, pfn);
		BUG();
	}

	/* Update kernel mapping, but not for highmem. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;

	/* Re-point the direct-map virtual address at the new frame. */
	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
		      mfn, pfn);
		BUG();
	}
}
320

321 322
/*
 * This function updates the p2m and m2p tables with an identity map from
 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
 * original allocation at remap_pfn. The information needed for remapping is
 * saved in the memory itself to avoid the need for allocating buffers. The
 * complete remap information is contained in a list of MFNs each containing
 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
 * This enables us to preserve the original mfn sequence while doing the
 * remapping at a time when the memory management is capable of allocating
 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
 * its callers.
 */
static void __init xen_do_set_identity_and_remap_chunk(
        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, mfn;
	unsigned long ident_pfn_iter, remap_pfn_iter;
	unsigned long ident_end_pfn = start_pfn + size;
	unsigned long left = size;
	unsigned int i, chunk;

	WARN_ON(size == 0);

	/* Remember the frame currently behind the buffer VA to restore later. */
	mfn_save = virt_to_mfn(buf);

	/* Process the range in chunks of up to REMAP_SIZE pfns. */
	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
	     ident_pfn_iter < ident_end_pfn;
	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;

		/* Map first pfn to xen_remap_buf */
		mfn = pfn_to_mfn(ident_pfn_iter);
		set_pte_mfn(buf, mfn, PAGE_KERNEL);

		/* Save mapping information in page */
		xen_remap_buf.next_area_mfn = xen_remap_mfn;
		xen_remap_buf.target_pfn = remap_pfn_iter;
		xen_remap_buf.size = chunk;
		for (i = 0; i < chunk; i++)
			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);

		/* Put remap buf into list. */
		xen_remap_mfn = mfn;

		/* Set identity map */
		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);

		left -= chunk;
	}

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}

376 377 378 379 380 381 382 383 384 385 386
/*
 * This function takes a contiguous pfn range that needs to be identity mapped
 * and:
 *
 *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
 *  2) Calls the do_ function to actually do the mapping/remapping work.
 *
 * The goal is to not allocate additional memory but to remap the existing
 * pages. In the case of an error the underlying memory is simply released back
 * to Xen and not remapped.
 */
static unsigned long __init xen_set_identity_and_remap_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long remap_pfn)
{
	unsigned long pfn;
	unsigned long i = 0;
	unsigned long n = end_pfn - start_pfn;

	/* First call of a remap run: start remapping right above nr_pages. */
	if (remap_pfn == 0)
		remap_pfn = nr_pages;

	while (i < n) {
		unsigned long cur_pfn = start_pfn + i;
		unsigned long left = n - i;
		unsigned long size = left;
		unsigned long remap_range_size;

		/* Do not remap pages beyond the current allocation */
		if (cur_pfn >= nr_pages) {
			/* Identity map remaining pages */
			set_phys_range_identity(cur_pfn, cur_pfn + size);
			break;
		}
		if (cur_pfn + size > nr_pages)
			size = nr_pages - cur_pfn;

		remap_range_size = xen_find_pfn_range(&remap_pfn);
		if (!remap_range_size) {
			pr_warn("Unable to find available pfn range, not remapping identity pages\n");
			/* Fallback: release the memory instead of remapping. */
			xen_set_identity_and_release_chunk(cur_pfn,
						cur_pfn + left, nr_pages);
			break;
		}
		/* Adjust size to fit in current e820 RAM region */
		if (size > remap_range_size)
			size = remap_range_size;

		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);

		/* Update variables to reflect new mappings. */
		i += size;
		remap_pfn += size;
	}

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	/* Returned value is threaded into the next invocation as remap_pfn. */
	return remap_pfn;
}

443 444 445 446 447 448 449 450 451 452 453 454 455
/*
 * Accumulator for xen_foreach_remap_area(): add the number of pfns of
 * [start_pfn, end_pfn) that are backed by the initial allocation
 * (i.e. below nr_pages) to the running total in remap_pages.
 */
static unsigned long __init xen_count_remap_pages(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long remap_pages)
{
	unsigned long top;

	/* Entirely above the allocation: nothing to remap here. */
	if (start_pfn >= nr_pages)
		return remap_pages;

	top = (end_pfn < nr_pages) ? end_pfn : nr_pages;
	return remap_pages + top - start_pfn;
}

/*
 * Walk the Xen-supplied E820 map and invoke func() once per merged non-RAM
 * area (including gaps between entries).  The fourth argument of func()
 * receives the previous call's return value, so func can accumulate a
 * result across all areas; the final value is returned.
 */
static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
	unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
			      unsigned long nr_pages, unsigned long last_val))
{
	phys_addr_t start = 0;
	unsigned long ret_val = 0;
	const struct e820_entry *entry = xen_e820_table.entries;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then call the provided function
	 * to perform its duty on the non-RAM region.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			/* For RAM entries the area ends at the RAM start. */
			if (entry->type == E820_TYPE_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				ret_val = func(start_pfn, end_pfn, nr_pages,
					       ret_val);
			/* Next area begins past this entry. */
			start = end;
		}
	}

	return ret_val;
}
491 492 493 494 495 496

/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn remap to which pfn) is contained in the
 * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
 * This scheme allows to remap the different chunks in arbitrary order while
 * the resulting mapping will be independent from the order.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	unsigned long pfn_s = ~0UL;	/* start of the current coalesced range */
	unsigned long len = 0;		/* length of the current coalesced range */

	/* Remember the frame currently behind the buffer VA to restore later. */
	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		/* The chain head must be the first saved mfn of its chunk. */
		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
			remapped++;
			pfn++;
		}
		/*
		 * Coalesce adjacent target ranges so xen_del_extra_mem() is
		 * called once per contiguous range instead of per chunk.
		 */
		if (pfn_s == ~0UL || pfn == pfn_s) {
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			len += xen_remap_buf.size;
		} else {
			/* Discontiguous: flush the collected range. */
			xen_del_extra_mem(pfn_s, len);
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}
		/* Advance to the next buffer in the chain. */
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	/* Flush the last collected range. */
	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(pfn_s, len);

	/* Restore the original mapping of the buffer VA. */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}

543 544 545 546
/* Upper bound (in pages) this domain may ever use. */
static unsigned long __init xen_get_pages_limit(void)
{
	unsigned long max_pfns = MAXMEM / PAGE_SIZE;

	/* Unprivileged domains may opt into the tighter 512 GB limit. */
	if (!xen_initial_domain() && xen_512gb_limit)
		max_pfns = GB(512) / PAGE_SIZE;

	return max_pfns;
}

554 555
/* Number of pages to use for this domain, capped by xen_get_pages_limit(). */
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long limit = xen_get_pages_limit();
	unsigned long pages = limit;
	domid_t domid = DOMID_SELF;
	long ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			pages = ret;
	}

	return min(pages, limit);
}

581 582
/*
 * Add a region to the kernel E820 map.  RAM regions are shrunk to whole
 * pages (start rounded up, end rounded down); other types keep their exact
 * byte boundaries.
 */
static void __init xen_align_and_add_e820_region(phys_addr_t start,
						 phys_addr_t size, int type)
{
	phys_addr_t end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_TYPE_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((phys_addr_t)PAGE_SIZE - 1);
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Don't allow adding memory not in E820 map while booting the
		 * system. Once the balloon driver is up it will remove that
		 * restriction again.
		 */
		max_mem_size = end;
#endif
	}

	e820__range_add(start, end - start, type);
}

603
/*
 * Turn E820 "unusable" entries into RAM.  Only called for the initial
 * domain, where the machine memory map is in use (see xen_memory_setup()).
 */
static void __init xen_ignore_unusable(void)
{
	unsigned int idx;

	for (idx = 0; idx < xen_e820_table.nr_entries; idx++) {
		struct e820_entry *ent = &xen_e820_table.entries[idx];

		if (ent->type == E820_TYPE_UNUSABLE)
			ent->type = E820_TYPE_RAM;
	}
}

614 615
/*
 * Return false only if [start, start + size) is completely covered by a
 * single E820 RAM entry (an empty range is never considered reserved);
 * otherwise the range counts as reserved.
 */
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
	const struct e820_entry *ent = xen_e820_table.entries;
	phys_addr_t end = start + size;
	unsigned idx;

	if (size == 0)
		return false;

	for (idx = 0; idx < xen_e820_table.nr_entries; idx++, ent++) {
		if (ent->type != E820_TYPE_RAM)
			continue;
		if (ent->addr <= start && end <= ent->addr + ent->size)
			return false;
	}

	return true;
}

637 638 639 640 641 642 643 644 645 646 647 648
/*
 * Find a free area in physical memory not yet reserved and compliant with
 * E820 map.
 * Used to relocate pre-allocated areas like initrd or p2m list which are in
 * conflict with the to be used E820 map.
 * In case no area is found, return 0. Otherwise return the physical address
 * of the area which is already reserved for convenience.
 */
phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
	unsigned mapcnt;
	phys_addr_t addr, start;
	struct e820_entry *entry = xen_e820_table.entries;

	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
		/* Only RAM entries large enough can hold the area at all. */
		if (entry->type != E820_TYPE_RAM || entry->size < size)
			continue;
		start = entry->addr;
		/*
		 * Slide a window of 'size' bytes through the entry; on
		 * hitting a reserved page, restart the window right after it.
		 */
		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			if (!memblock_is_reserved(addr))
				continue;
			start = addr + PAGE_SIZE;
			/* Window would overrun this entry: give up on it. */
			if (start + size > entry->addr + entry->size)
				break;
		}
		/* Loop ran to the window end: all pages were free. */
		if (addr >= start + size) {
			memblock_reserve(start, size);
			return start;
		}
	}

	return 0;
}

671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
/*
 * Like memcpy, but with physical addresses for dest and src.
 * The copy proceeds in pieces bounded by the NR_FIX_BTMAPS early-remap
 * window, mapping each piece with early_memremap().
 *
 * NOTE(review): the remap calls pass the page-aligned base (dest - dest_off)
 * and memcpy() then writes from the mapping's start, which equals dest only
 * when dest_off == 0 (likewise for src).  Both advance by the same len each
 * pass, so this holds whenever dest and src start page aligned (typical for
 * the initrd image and xen_find_free_area() results) — confirm for unaligned
 * callers.
 */
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
				   phys_addr_t n)
{
	phys_addr_t dest_off, src_off, dest_len, src_len, len;
	void *from, *to;

	while (n) {
		/* Offsets of dest/src within their pages. */
		dest_off = dest & ~PAGE_MASK;
		src_off = src & ~PAGE_MASK;
		/* Clamp each side to what fits in the early-remap window. */
		dest_len = n;
		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
		src_len = n;
		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
		len = min(dest_len, src_len);
		to = early_memremap(dest - dest_off, dest_len + dest_off);
		from = early_memremap(src - src_off, src_len + src_off);
		memcpy(to, from, len);
		early_memunmap(to, dest_len + dest_off);
		early_memunmap(from, src_len + src_off);
		n -= len;
		dest += len;
		src += len;
	}
}

701 702 703 704 705
/*
 * Reserve Xen mfn_list.
 * The p2m list supplied by the hypervisor lives either in kernel virtual
 * space (mfn_list) or at a pfn range (first_p2m_pfn); reserve whichever is
 * in use, and relocate it if it conflicts with the target E820 map.
 */
static void __init xen_reserve_xen_mfnlist(void)
{
	phys_addr_t start, size;

	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
		/* List is mapped in kernel virtual address space. */
		start = __pa(xen_start_info->mfn_list);
		size = PFN_ALIGN(xen_start_info->nr_pages *
				 sizeof(unsigned long));
	} else {
		/* List is described as a pfn range. */
		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
	}

	memblock_reserve(start, size);
	if (!xen_is_e820_reserved(start, size))
		return;

	/* Conflicts with E820: move the p2m list and free the old area. */
	xen_relocate_p2m();
	memblock_free(start, size);
}

725 726 727 728 729
/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 *
 * Builds the kernel E820 map from the Xen-supplied memory map, limits the
 * domain to its allowed page count, sets up extra memory regions for
 * ballooning, identity maps non-RAM areas and resolves conflicts of the
 * kernel, page tables, p2m list and initrd with the target map.
 * Returns the memory-map origin string reported at boot ("Xen").
 */
char * __init xen_memory_setup(void)
{
	unsigned long max_pfn, pfn_s, n_pfns;
	phys_addr_t mem_end, addr, size, chunk_size;
	u32 type;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	int i;
	int op;

	/* Determine the usable page limit for this domain. */
	xen_parse_512gb();
	max_pfn = xen_get_pages_limit();
	max_pfn = min(max_pfn, xen_start_info->nr_pages);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);

#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
	xen_saved_max_mem_size = max_mem_size;
#endif

	/* Fetch the machine map (dom0) or the pseudo-physical map (domU). */
	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		/* Old hypervisor without memory map support: fake one entry. */
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		xen_e820_table.entries[0].addr = 0ULL;
		xen_e820_table.entries[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		xen_e820_table.entries[0].size += 8ULL << 20;
		xen_e820_table.entries[0].type = E820_TYPE_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);
	xen_e820_table.nr_entries = memmap.nr_entries;

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable();

	/* Make sure the Xen-supplied memory map is well-ordered. */
	e820__update_table(&xen_e820_table);

	max_pages = xen_get_max_pages();

	/* How many extra pages do we need due to remapping? */
	max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);

	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
	 * factor the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * Make sure we have no memory above max_pages, as this area
	 * isn't handled by the p2m management.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			   extra_pages, max_pages - max_pfn);
	/*
	 * Walk the map entry by entry, splitting RAM entries into the part
	 * backed by the initial allocation, extra (balloon) memory, and
	 * memory to be discarded.
	 */
	i = 0;
	addr = xen_e820_table.entries[0].addr;
	size = xen_e820_table.entries[0].size;
	while (i < xen_e820_table.nr_entries) {
		bool discard = false;

		chunk_size = size;
		type = xen_e820_table.entries[i].type;

		if (type == E820_TYPE_RAM) {
			if (addr < mem_end) {
				/* RAM backed by the initial allocation. */
				chunk_size = min(size, mem_end - addr);
			} else if (extra_pages) {
				/* RAM above mem_end becomes extra memory. */
				chunk_size = min(size, PFN_PHYS(extra_pages));
				pfn_s = PFN_UP(addr);
				n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
				extra_pages -= n_pfns;
				xen_add_extra_mem(pfn_s, n_pfns);
				xen_max_p2m_pfn = pfn_s + n_pfns;
			} else
				discard = true;
		}

		if (!discard)
			xen_align_and_add_e820_region(addr, chunk_size, type);

		addr += chunk_size;
		size -= chunk_size;
		if (size == 0) {
			i++;
			if (i < xen_e820_table.nr_entries) {
				addr = xen_e820_table.entries[i].addr;
				size = xen_e820_table.entries[i].size;
			}
		}
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 */
	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);

	e820__update_table(e820_table);

	/*
	 * Check whether the kernel itself conflicts with the target E820 map.
	 * Failing now is better than running into weird problems later due
	 * to relocating (and even reusing) pages with kernel text or data.
	 */
	if (xen_is_e820_reserved(__pa_symbol(_text),
			__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
		xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
		BUG();
	}

	/*
	 * Check for a conflict of the hypervisor supplied page tables with
	 * the target E820 map.
	 */
	xen_pt_check_e820();

	xen_reserve_xen_mfnlist();

	/* Check for a conflict of the initrd with the target E820 map. */
	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
				 boot_params.hdr.ramdisk_size)) {
		phys_addr_t new_area, start, size;

		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
		if (!new_area) {
			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
			BUG();
		}

		start = boot_params.hdr.ramdisk_image;
		size = boot_params.hdr.ramdisk_size;
		xen_phys_memcpy(new_area, start, size);
		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
			start, start + size, new_area, new_area + size);
		memblock_free(start, size);
		boot_params.hdr.ramdisk_image = new_area;
		boot_params.ext_ramdisk_image = new_area >> 32;
	}

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);

	pr_info("Released %ld page(s)\n", xen_released_pages);

	return "Xen";
}

911
static int register_callback(unsigned type, const void *func)
912
{
913 914 915
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
916 917 918
		.flags = CALLBACKF_mask_events,
	};

919 920 921
	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

922
void xen_enable_sysenter(void)
923
{
924
	int ret;
925
	unsigned sysenter_feature;
926

927
	sysenter_feature = X86_FEATURE_SYSENTER32;
928

929 930 931
	if (!boot_cpu_has(sysenter_feature))
		return;

932
	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
933 934
	if(ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
935 936
}

937
void xen_enable_syscall(void)
938 939 940 941 942
{
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
943
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
944 945 946 947 948
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
949 950
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
951
		if (ret != 0)
952
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
953 954
	}
}
955

956
/*
 * PV-MMU specific setup: enable hypervisor VM assists and register the
 * event and failsafe callbacks plus the fast system call entry points.
 * Failure to register the core callbacks is fatal.
 */
static void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event,
			      xen_asm_exc_xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
/*
 * Machine specific setup hook for PV domains: install the panic handler,
 * do the PV-MMU setup, disable ACPI for unprivileged domains, copy the
 * Xen-provided command line and configure idle/cpufreq/NUMA behavior.
 */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	/* Copy at most what fits into the kernel's command line buffer. */
	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
#ifdef CONFIG_NUMA
	/* NOTE(review): NUMA is disabled for PV domains here — confirm intent. */
	numa_off = 1;
#endif
}