core.c 17.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
/*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/kernel.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
#include <linux/console.h>
#include <linux/screen_info.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/lguest_bus.h>
#include <asm/paravirt.h>
#include <asm/param.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/lguest.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include "../lg.h"

static int cpu_had_pge;

static struct {
	unsigned long offset;
	unsigned short segment;
} lguest_entry;

/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
}

/* This cpu's struct lguest_pages. */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
	return &(((struct lguest_pages *)
		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}

static DEFINE_PER_CPU(struct lguest *, last_guest);

/*S:010
 * We are getting close to the Switcher.
 *
 * Remember that each CPU has two pages which are visible to the Guest when it
 * runs on that CPU.  This has to contain the state for that Guest: we copy the
 * state in just before we run the Guest.
 *
 * Each Guest has "changed" flags which indicate what has changed in the Guest
 * since it last ran.  We saw this set in interrupts_and_traps.c and
 * segments.c.
 */
static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
{
	/* Copying all this data can be quite expensive.  We usually run the
	 * same Guest we ran last time (and that Guest hasn't run anywhere else
	 * meanwhile).  If that's not the case, we pretend everything in the
	 * Guest has changed. */
	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
		__get_cpu_var(last_guest) = lg;
		lg->last_pages = pages;
		lg->changed = CHANGED_ALL;
	}

	/* These copies are pretty cheap, so we do them unconditionally: */
	/* Save the current Host top-level page directory. */
	pages->state.host_cr3 = __pa(current->mm->pgd);
	/* Set up the Guest's page tables to see this CPU's pages (and no
	 * other CPU's pages). */
	map_switcher_in_guest(lg, pages);
	/* Set up the two "TSS" members which tell the CPU what stack to use
	 * for traps which do directly into the Guest (ie. traps at privilege
	 * level 1). */
	pages->state.guest_tss.esp1 = lg->esp1;
	pages->state.guest_tss.ss1 = lg->ss1;

	/* Copy direct-to-Guest trap entries. */
	if (lg->changed & CHANGED_IDT)
		copy_traps(lg, pages->state.guest_idt, default_idt_entries);

	/* Copy all GDT entries which the Guest can change. */
	if (lg->changed & CHANGED_GDT)
		copy_gdt(lg, pages->state.guest_gdt);
	/* If only the TLS entries have changed, copy them. */
	else if (lg->changed & CHANGED_GDT_TLS)
		copy_gdt_tls(lg, pages->state.guest_gdt);

	/* Mark the Guest as unchanged for next time. */
	lg->changed = 0;
}

/* Finally: the code to actually call into the Switcher to run the Guest. */
static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
{
	/* This is a dummy value we need for GCC's sake. */
	unsigned int clobber;

	/* Copy the guest-specific information into this CPU's "struct
	 * lguest_pages". */
	copy_in_guest_info(lg, pages);

	/* Set the trap number to 256 (impossible value).  If we fault while
	 * switching to the Guest (bad segment registers or bug), this will
	 * cause us to abort the Guest. */
	lg->regs->trapnum = 256;

	/* Now: we push the "eflags" register on the stack, then do an "lcall".
	 * This is how we change from using the kernel code segment to using
	 * the dedicated lguest code segment, as well as jumping into the
	 * Switcher.
	 *
	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
	 * stack, then the address of this call.  This stack layout happens to
	 * exactly match the stack of an interrupt... */
	asm volatile("pushf; lcall *lguest_entry"
		     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
		      * are changed by this routine.  The "=" means output. */
		     : "=a"(clobber), "=b"(clobber)
		     /* %eax contains the pages pointer.  ("0" refers to the
		      * 0-th argument above, ie "a").  %ebx contains the
		      * physical address of the Guest's top-level page
		      * directory. */
		     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
		     /* We tell gcc that all these registers could change,
		      * which means we don't have to save and restore them in
		      * the Switcher. */
		     : "memory", "%edx", "%ecx", "%edi", "%esi");
}
/*:*/

/*H:040 This is the i386-specific code to setup and run the Guest.  Interrupts
 * are disabled: we own the CPU. */
void lguest_arch_run_guest(struct lguest *lg)
{
	/* Remember the awfully-named TS bit?  If the Guest has asked
	 * to set it we set it now, so we can trap and pass that trap
	 * to the Guest if it uses the FPU. */
	if (lg->ts)
		lguest_set_ts();

	/* SYSENTER is an optimized way of doing system calls.  We
	 * can't allow it because it always jumps to privilege level 0.
	 * A normal Guest won't try it because we don't advertise it in
	 * CPUID, but a malicious Guest (or malicious Guest userspace
	 * program) could, so we tell the CPU to disable it before
	 * running the Guest. */
	if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

	/* Now we actually run the Guest.  It will pop back out when
	 * something interesting happens, and we can examine its
	 * registers to see what it was doing. */
	run_guest_once(lg, lguest_pages(raw_smp_processor_id()));

	/* The "regs" pointer contains two extra entries which are not
	 * really registers: a trap number which says what interrupt or
	 * trap made the switcher code come back, and an error code
	 * which some traps set.  */

	/* If the Guest page faulted, then the cr2 register will tell
	 * us the bad virtual address.  We have to grab this now,
	 * because once we re-enable interrupts an interrupt could
	 * fault and thus overwrite cr2, or we could even move off to a
	 * different CPU. */
	if (lg->regs->trapnum == 14)
		lg->arch.last_pagefault = read_cr2();
	/* Similarly, if we took a trap because the Guest used the FPU,
	 * we have to restore the FPU it expects to see. */
	else if (lg->regs->trapnum == 7)
		math_state_restore();

	/* Restore SYSENTER if it's supposed to be on. */
	if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
}

/*H:130 Our Guest is usually so well behaved; it never tries to do things it
 * isn't allowed to.  Unfortunately, Linux's paravirtual infrastructure isn't
 * quite complete, because it doesn't contain replacements for the Intel I/O
 * instructions.  As a result, the Guest sometimes fumbles across one during
 * the boot process as it probes for various things which are usually attached
 * to a PC.
 *
 * When the Guest uses one of these instructions, we get trap #13 (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did. */
static int emulate_insn(struct lguest *lg)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	/* The eip contains the *virtual* address of the Guest's instruction:
	 * guest_pa just subtracts the Guest's page_offset. */
	unsigned long physaddr = guest_pa(lg, lg->regs->eip);

	/* The guest_pa() function only works for Guest kernel addresses, but
	 * that's all we're trying to do anyway. */
	if (lg->regs->eip < lg->page_offset)
		return 0;

	/* Decoding x86 instructions is icky. */
	lgread(lg, &insn, physaddr, 1);

	/* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
	   of the eax register. */
	if (insn == 0x66) {
		shift = 16;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		lgread(lg, &insn, physaddr + insnlen, 1);
	}

	/* We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate. */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/* If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there". */
	if (in) {
		/* Lower bit tells is whether it's a 16 or 32 bit access */
		if (insn & 0x1)
			lg->regs->eax = 0xFFFFFFFF;
		else
			lg->regs->eax |= (0xFFFF << shift);
	}
	/* Finally, we've "done" the instruction, so move past it. */
	lg->regs->eip += insnlen;
	/* Success! */
	return 1;
}

/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lguest *lg)
{
	switch (lg->regs->trapnum) {
	case 13: /* We've intercepted a GPF. */
		 /* Check if this was one of those annoying IN or OUT
		  * instructions which we need to emulate.  If so, we
		  * just go back into the Guest after we've done it. */
		if (lg->regs->errcode == 0) {
			if (emulate_insn(lg))
				return;
		}
		break;
	case 14: /* We've intercepted a page fault. */
		 /* The Guest accessed a virtual address that wasn't
		  * mapped.  This happens a lot: we don't actually set
		  * up most of the page tables for the Guest at all when
		  * we start: as it runs it asks for more and more, and
		  * we set them up as required. In this case, we don't
		  * even tell the Guest that the fault happened.
		  *
		  * The errcode tells whether this was a read or a
		  * write, and whether kernel or userspace code. */
		if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
			return;

		 /* OK, it's really not there (or not OK): the Guest
		  * needs to know.  We write out the cr2 value so it
		  * knows where the fault occurred.
		  *
		  * Note that if the Guest were really messed up, this
		  * could happen before it's done the INITIALIZE
		  * hypercall, so lg->lguest_data will be NULL */
		if (lg->lguest_data &&
		    put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
			kill_guest(lg, "Writing cr2");
		break;
	case 7: /* We've intercepted a Device Not Available fault. */
		/* If the Guest doesn't want to know, we already
		 * restored the Floating Point Unit, so we just
		 * continue without telling it. */
		if (!lg->ts)
			return;
		break;
	case 32 ... 255:
319 320 321 322
		/* These values mean a real interrupt occurred, in which case
		 * the Host handler has already been run.  We just do a
		 * friendly check if another process should now be run, then
		 * return to run the Guest again */
323
		cond_resched();
324 325 326
		return;
	case LGUEST_TRAP_ENTRY:
		lg->hcall = lg->regs;
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477
		return;
	}

	/* We didn't handle the trap, so it needs to go to the Guest. */
	if (!deliver_trap(lg, lg->regs->trapnum))
		/* If the Guest doesn't have a handler (either it hasn't
		 * registered any yet, or it's one of the faults we don't let
		 * it handle), it dies with a cryptic error message. */
		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
			   lg->regs->trapnum, lg->regs->eip,
			   lg->regs->trapnum == 14 ? lg->arch.last_pagefault
			   : lg->regs->errcode);
}

/* Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the Host/Guest
 * duality will be complete. :*/
static void adjust_pge(void *on)
{
	if (on)
		write_cr4(read_cr4() | X86_CR4_PGE);
	else
		write_cr4(read_cr4() & ~X86_CR4_PGE);
}

/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do
 * some more i386-specific initialization. */
void __init lguest_arch_host_init(void)
{
	int i;

	/* Most of the i386/switcher.S doesn't care that it's been moved; on
	 * Intel, jumps are relative, and it doesn't access any references to
	 * external code or data.
	 *
	 * The only exception is the interrupt handlers in switcher.S: their
	 * addresses are placed in a table (default_idt_entries), so we need to
	 * update the table with the new addresses.  switcher_offset() is a
	 * convenience function which returns the distance between the builtin
	 * switcher code and the high-mapped copy we just made. */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] += switcher_offset();

	/*
	 * Set up the Switcher's per-cpu areas.
	 *
	 * Each CPU gets two pages of its own within the high-mapped region
	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
	 * but some depends on what Guest we are running (which is set up in
	 * copy_in_guest_info()).
	 */
	for_each_possible_cpu(i) {
		/* lguest_pages() returns this CPU's two pages. */
		struct lguest_pages *pages = lguest_pages(i);
		/* This is a convenience pointer to make the code fit one
		 * statement to a line. */
		struct lguest_ro_state *state = &pages->state;

		/* The Global Descriptor Table: the Host has a different one
		 * for each CPU.  We keep a descriptor for the GDT which says
		 * where it is and how big it is (the size is actually the last
		 * byte, not the size, hence the "-1"). */
		state->host_gdt_desc.size = GDT_SIZE-1;
		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);

		/* All CPUs on the Host use the same Interrupt Descriptor
		 * Table, so we just use store_idt(), which gets this CPU's IDT
		 * descriptor. */
		store_idt(&state->host_idt_desc);

		/* The descriptors for the Guest's GDT and IDT can be filled
		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
		 * ->guest_idt before actually running the Guest. */
		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
		state->guest_idt_desc.address = (long)&state->guest_idt;
		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
		state->guest_gdt_desc.address = (long)&state->guest_gdt;

		/* We know where we want the stack to be when the Guest enters
		 * the switcher: in pages->regs.  The stack grows upwards, so
		 * we start it at the end of that structure. */
		state->guest_tss.esp0 = (long)(&pages->regs + 1);
		/* And this is the GDT entry to use for the stack: we keep a
		 * couple of special LGUEST entries. */
		state->guest_tss.ss0 = LGUEST_DS;

		/* x86 can have a finegrained bitmap which indicates what I/O
		 * ports the process can use.  We set it to the end of our
		 * structure, meaning "none". */
		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);

		/* Some GDT entries are the same across all Guests, so we can
		 * set them up now. */
		setup_default_gdt_entries(state);
		/* Most IDT entries are the same for all Guests, too.*/
		setup_default_idt_entries(state, default_idt_entries);

		/* The Host needs to be able to use the LGUEST segments on this
		 * CPU, too, so put them in the Host GDT. */
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/* In the Switcher, we want the %cs segment register to use the
	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
	 * it will be undisturbed when we switch.  To change %cs and jump we
	 * need this structure to feed to Intel's "lcall" instruction. */
	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	/* Finally, we need to turn off "Page Global Enable".  PGE is an
	 * optimization where page table entries are specially marked to show
	 * they never change.  The Host kernel marks all the kernel pages this
	 * way because it's always present, even when userspace is running.
	 *
	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
	 * you'll get really weird bugs that you'll chase for two days.
	 *
	 * I used to turn PGE off every time we switched to the Guest and back
	 * on when we return, but that slowed the Switcher down noticibly. */

	/* We don't need the complexity of CPUs coming and going while we're
	 * doing this. */
	lock_cpu_hotplug();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		/* Remember that this was originally set (for cleanup). */
		cpu_had_pge = 1;
		/* adjust_pge is a helper function which sets or unsets the PGE
		 * bit on its CPU, depending on the argument (0 == unset). */
		on_each_cpu(adjust_pge, (void *)0, 0, 1);
		/* Turn off the feature in the global feature set. */
		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
	}
	unlock_cpu_hotplug();
};
/*:*/

void __exit lguest_arch_host_fini(void)
{
	/* If we had PGE before we started, turn it back on now. */
	lock_cpu_hotplug();
	if (cpu_had_pge) {
		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
		/* adjust_pge's argument "1" means set PGE. */
		on_each_cpu(adjust_pge, (void *)1, 0, 1);
	}
	unlock_cpu_hotplug();
}