/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)
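
/*
 * Example: with a ratio of 10, a domain booted with 512 MiB can be
 * granted at most 10 * 512 MiB = 5 GiB of extra memory before the
 * clamp in xen_memory_setup() applies.
 */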

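/*
 * Record a region of "extra" memory: start a new xen_extra_mem[]
 * entry, or grow the entry the region is contiguous with.  The range
 * is reserved with memblock and its p2m entries are marked invalid
 * until the pages are actually populated.
 */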
static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size  = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_x86_reserve_range(start, start + size, "XEN EXTRA");

	xen_max_p2m_pfn = PFN_DOWN(start + size);

	for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}

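/*
 * Give a pfn range back to the hypervisor, one page at a time, via
 * XENMEM_decrease_reservation.  Returns the number of pages actually
 * released.
 */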
static unsigned long __init xen_release_chunk(unsigned long start,
					      unsigned long end)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		set_xen_guest_handle(reservation.extent_start, &mfn);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					   &reservation);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
		if (ret == 1) {
			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
			len++;
		}
	}
	printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
	       start, end, len);

	return len;
}

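/*
 * Set the p2m to 1:1 for every non-RAM region and gap in the e820
 * map, first releasing any RAM pages that currently back them.
 * Returns the number of pages released.
 */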
static unsigned long __init xen_set_identity_and_release(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long released = 0;
	unsigned long identity = 0;
	const struct e820entry *entry;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * release the pages (if available) in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;

		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn) {
				if (start_pfn < nr_pages)
					released += xen_release_chunk(
						start_pfn, min(end_pfn, nr_pages));

				identity += set_phys_range_identity(
					start_pfn, end_pfn);
			}
			start = end;
		}
	}

	printk(KERN_INFO "Released %lu pages of unused memory\n", released);
	printk(KERN_INFO "Set %lu page(s) to 1-1 mapping\n", identity);

	return released;
}
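
/*
 * Ask the hypervisor for this domain's maximum reservation
 * (XENMEM_maximum_reservation) and clamp it to MAX_DOMAIN_PAGES.
 */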
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
	if (ret > 0)
		max_pages = ret;
	return min(max_pages, MAX_DOMAIN_PAGES);
}

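/*
 * Add a region to the kernel e820 map, trimming RAM regions inward
 * to page boundaries so no partial page is reported as usable RAM.
 */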
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

/**
 * xen_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);
	extra_pages += xen_released_pages;

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);

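	/*
	 * Walk the map, clipping RAM above mem_end: up to extra_pages
	 * of the clipped RAM is re-registered as extra memory, the
	 * remainder is marked E820_UNUSABLE.
	 */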
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
				   __pa(xen_start_info->pt_base),
				   "XEN START INFO");

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

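/*
 * Register an entry point with the hypervisor for the given callback
 * type; events are masked on entry (CALLBACKF_mask_events).
 */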
static int __cpuinit register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

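/*
 * Register xen_sysenter_target as the sysenter entry point, clearing
 * the CPU feature bit if the hypervisor refuses so the fast path is
 * never advertised to userspace.
 */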
void __cpuinit xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

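/*
 * Register the 64-bit syscall entry point and, if the CPU supports
 * it, the 32-bit compat syscall entry point.
 */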
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/*
		 * Pretty fatal; 64-bit userspace has no other
		 * mechanism for syscalls.
		 */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

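/*
 * Main Xen machine setup: enable the vm_assist features we rely on,
 * register the event and failsafe callbacks, wire up fast system
 * calls, and force the idle loop onto the safe_halt() pvop.
 */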
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable,
				     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
#ifdef CONFIG_X86_32
	boot_cpu_data.hlt_works_ok = 1;
#endif
	disable_cpuidle();
	boot_option_idle_override = IDLE_HALT;

	fiddle_vdso();
}