/*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*P:450 This file contains the x86-specific lguest code.  It used to be all
 * mixed in with drivers/lguest/core.c but several foolhardy code slashers
 * wrestled most of the dependencies out to here in preparation for porting
 * lguest to other architectures (see what I mean by foolhardy?).
 *
 * This also contains a couple of non-obvious setup and teardown pieces which
 * were implemented after days of debugging pain. :*/

#include <linux/kernel.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
#include <linux/console.h>
#include <linux/screen_info.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <asm/paravirt.h>
#include <asm/param.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/lguest.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include "../lg.h"

/* Set to 1 by lguest_arch_host_init() when the Host had the PGE feature and we
 * turned it off, so that lguest_arch_host_fini() knows to turn it back on. */
static int cpu_had_pge;

/* The far-call target used by the "lcall *lguest_entry" in run_guest_once().
 * lguest_arch_host_init() fills in the offset (switch_to_guest plus
 * switcher_offset()) and the segment (LGUEST_CS). */
static struct {
	unsigned long offset;
	unsigned short segment;
} lguest_entry;

/* The Switcher text was compiled at start_switcher_text but is used from
 * SWITCHER_ADDR: return the distance between the two. */
static unsigned long switcher_offset(void)
{
	unsigned long compiled_at = (unsigned long)start_switcher_text;

	return SWITCHER_ADDR - compiled_at;
}

/* This cpu's struct lguest_pages. */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
	return &(((struct lguest_pages *)
		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}

70
static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
71 72

/*S:010
R
Rusty Russell 已提交
73
 * We approach the Switcher.
74 75 76 77 78 79 80 81 82
 *
 * Remember that each CPU has two pages which are visible to the Guest when it
 * runs on that CPU.  This has to contain the state for that Guest: we copy the
 * state in just before we run the Guest.
 *
 * Each Guest has "changed" flags which indicate what has changed in the Guest
 * since it last ran.  We saw this set in interrupts_and_traps.c and
 * segments.c.
 */
83
static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
84 85 86 87 88
{
	/* Copying all this data can be quite expensive.  We usually run the
	 * same Guest we ran last time (and that Guest hasn't run anywhere else
	 * meanwhile).  If that's not the case, we pretend everything in the
	 * Guest has changed. */
89
	if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
90
		__get_cpu_var(last_cpu) = cpu;
91
		cpu->last_pages = pages;
92
		cpu->changed = CHANGED_ALL;
93 94 95 96 97 98 99
	}

	/* These copies are pretty cheap, so we do them unconditionally: */
	/* Save the current Host top-level page directory. */
	pages->state.host_cr3 = __pa(current->mm->pgd);
	/* Set up the Guest's page tables to see this CPU's pages (and no
	 * other CPU's pages). */
100
	map_switcher_in_guest(cpu, pages);
101 102 103
	/* Set up the two "TSS" members which tell the CPU what stack to use
	 * for traps which do directly into the Guest (ie. traps at privilege
	 * level 1). */
104
	pages->state.guest_tss.sp1 = cpu->esp1;
105
	pages->state.guest_tss.ss1 = cpu->ss1;
106 107

	/* Copy direct-to-Guest trap entries. */
108
	if (cpu->changed & CHANGED_IDT)
109
		copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
110 111

	/* Copy all GDT entries which the Guest can change. */
112
	if (cpu->changed & CHANGED_GDT)
113
		copy_gdt(cpu, pages->state.guest_gdt);
114
	/* If only the TLS entries have changed, copy them. */
115
	else if (cpu->changed & CHANGED_GDT_TLS)
116
		copy_gdt_tls(cpu, pages->state.guest_gdt);
117 118

	/* Mark the Guest as unchanged for next time. */
119
	cpu->changed = 0;
120 121 122
}

/* Finally: the code to actually call into the Switcher to run the Guest. */
123
static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
124 125 126 127 128 129
{
	/* This is a dummy value we need for GCC's sake. */
	unsigned int clobber;

	/* Copy the guest-specific information into this CPU's "struct
	 * lguest_pages". */
130
	copy_in_guest_info(cpu, pages);
131 132 133 134

	/* Set the trap number to 256 (impossible value).  If we fault while
	 * switching to the Guest (bad segment registers or bug), this will
	 * cause us to abort the Guest. */
135
	cpu->regs->trapnum = 256;
136 137 138 139 140 141 142 143

	/* Now: we push the "eflags" register on the stack, then do an "lcall".
	 * This is how we change from using the kernel code segment to using
	 * the dedicated lguest code segment, as well as jumping into the
	 * Switcher.
	 *
	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
	 * stack, then the address of this call.  This stack layout happens to
R
Rusty Russell 已提交
144
	 * exactly match the stack layout created by an interrupt... */
145 146 147 148 149 150 151 152
	asm volatile("pushf; lcall *lguest_entry"
		     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
		      * are changed by this routine.  The "=" means output. */
		     : "=a"(clobber), "=b"(clobber)
		     /* %eax contains the pages pointer.  ("0" refers to the
		      * 0-th argument above, ie "a").  %ebx contains the
		      * physical address of the Guest's top-level page
		      * directory. */
153
		     : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
154 155 156 157 158 159 160
		     /* We tell gcc that all these registers could change,
		      * which means we don't have to save and restore them in
		      * the Switcher. */
		     : "memory", "%edx", "%ecx", "%edi", "%esi");
}
/*:*/

/*M:002 There are hooks in the scheduler which we can register to tell when we
 * get kicked off the CPU (preempt_notifier_register()).  This would allow us
 * to lazily disable SYSENTER which would regain some performance, and should
 * also simplify copy_in_guest_info().  Note that we'd still need to restore
 * things when we exit to Launcher userspace, but that's fairly easy.
 *
 * We could also try using these hooks for PGE, but that might be too expensive.
 *
 * The hooks were designed for KVM, but we can also put them to good use. :*/

171 172
/*H:040 This is the i386-specific code to setup and run the Guest.  Interrupts
 * are disabled: we own the CPU. */
173
void lguest_arch_run_guest(struct lg_cpu *cpu)
174
{
R
Rusty Russell 已提交
175 176 177
	/* Remember the awfully-named TS bit?  If the Guest has asked to set it
	 * we set it now, so we can trap and pass that trap to the Guest if it
	 * uses the FPU. */
178
	if (cpu->ts)
179
		unlazy_fpu(current);
180

R
Rusty Russell 已提交
181 182 183 184 185
	/* SYSENTER is an optimized way of doing system calls.  We can't allow
	 * it because it always jumps to privilege level 0.  A normal Guest
	 * won't try it because we don't advertise it in CPUID, but a malicious
	 * Guest (or malicious Guest userspace program) could, so we tell the
	 * CPU to disable it before running the Guest. */
186 187 188
	if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

R
Rusty Russell 已提交
189 190 191
	/* Now we actually run the Guest.  It will return when something
	 * interesting happens, and we can examine its registers to see what it
	 * was doing. */
192
	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
193

194
	/* Note that the "regs" structure contains two extra entries which are
R
Rusty Russell 已提交
195 196 197
	 * not really registers: a trap number which says what interrupt or
	 * trap made the switcher code come back, and an error code which some
	 * traps set.  */
198

199 200 201 202
	 /* Restore SYSENTER if it's supposed to be on. */
	 if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);

R
Rusty Russell 已提交
203 204 205 206
	/* If the Guest page faulted, then the cr2 register will tell us the
	 * bad virtual address.  We have to grab this now, because once we
	 * re-enable interrupts an interrupt could fault and thus overwrite
	 * cr2, or we could even move off to a different CPU. */
207
	if (cpu->regs->trapnum == 14)
208
		cpu->arch.last_pagefault = read_cr2();
209
	/* Similarly, if we took a trap because the Guest used the FPU,
210 211 212 213
	 * we have to restore the FPU it expects to see.
	 * math_state_restore() may sleep and we may even move off to
	 * a different CPU. So all the critical stuff should be done
	 * before this.  */
214
	else if (cpu->regs->trapnum == 7)
215 216 217
		math_state_restore();
}

R
Rusty Russell 已提交
218 219 220 221 222 223 224
/*H:130 Now we've examined the hypercall code; our Guest can make requests.
 * Our Guest is usually so well behaved; it never tries to do things it isn't
 * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
 * infrastructure isn't quite complete, because it doesn't contain replacements
 * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
 * across one during the boot process as it probes for various things which are
 * usually attached to a PC.
225
 *
R
Rusty Russell 已提交
226
 * When the Guest uses one of these instructions, we get a trap (General
227 228
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did. */
229
static int emulate_insn(struct lg_cpu *cpu)
230 231 232 233 234
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	/* The eip contains the *virtual* address of the Guest's instruction:
	 * guest_pa just subtracts the Guest's page_offset. */
235
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
236

237 238 239
	/* This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level. */
240
	if ((cpu->regs->cs & 3) != GUEST_PL)
241 242 243
		return 0;

	/* Decoding x86 instructions is icky. */
244
	insn = lgread(cpu, physaddr, u8);
245 246 247 248 249 250 251

	/* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
	   of the eax register. */
	if (insn == 0x66) {
		shift = 16;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
252
		insn = lgread(cpu, physaddr + insnlen, u8);
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
	}

	/* We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate. */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/* If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there". */
	if (in) {
		/* Lower bit tells is whether it's a 16 or 32 bit access */
		if (insn & 0x1)
283
			cpu->regs->eax = 0xFFFFFFFF;
284
		else
285
			cpu->regs->eax |= (0xFFFF << shift);
286 287
	}
	/* Finally, we've "done" the instruction, so move past it. */
288
	cpu->regs->eip += insnlen;
289 290 291 292
	/* Success! */
	return 1;
}

293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
/* Our hypercalls mechanism used to be based on direct software interrupts.
 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
 * change over to using kvm hypercalls.
 *
 * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
 * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
 * an *emulation approach*: if the fault was really produced by an hypercall
 * (is_hypercall() does exactly this check), we can just call the corresponding
 * hypercall host implementation function.
 *
 * But these invalid opcode faults are notably slower than software interrupts.
 * So we implemented the *patching (or rewriting) approach*: every time we hit
 * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
 * opcode, so next time the Guest calls this hypercall it will use the
 * faster trap mechanism.
 *
 * Matias even benchmarked it to convince you: this shows the average cycle
 * cost of a hypercall.  For each alternative solution mentioned above we've
 * made 5 runs of the benchmark:
 *
 * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
 * 2) emulation technique: 3410, 3681, 3466, 3392, 3780
 * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
 *
 * One two-line function is worth a 20% hypercall speed boost!
 */
static void rewrite_hypercall(struct lg_cpu *cpu)
{
	/* This are the opcodes we use to patch the Guest.  The opcode for "int
	 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
	 * complete the sequence with a NOP (0x90). */
	u8 insn[3] = {0xcd, 0x1f, 0x90};

	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
327 328 329 330 331
	/* The above write might have caused a copy of that page to be made
	 * (if it was read-only).  We need to make sure the Guest has
	 * up-to-date pagetables.  As this doesn't happen often, we can just
	 * drop them all. */
	guest_pagetable_clear_all(cpu);
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
}

/* Does the Guest instruction at eip look like a "vmcall" issued by the Guest
 * kernel?  Returns true only if the code segment's privilege level is
 * GUEST_PL and the three bytes at eip are 0x0f 0x01 0xc1. */
static bool is_hypercall(struct lg_cpu *cpu)
{
	u8 opcode[3];
	unsigned int privilege = cpu->regs->cs & 3;

	/* The bottom two bits of the CS segment register are the privilege
	 * level: anything other than the Guest kernel is not our business. */
	if (privilege != GUEST_PL)
		return false;

	/* Fetch the three bytes at the faulting instruction... */
	__lgread(cpu, opcode, guest_pa(cpu, cpu->regs->eip), sizeof(opcode));

	/* ...and compare them with the vmcall encoding, byte by byte. */
	if (opcode[0] != 0x0f)
		return false;
	if (opcode[1] != 0x01)
		return false;
	return opcode[2] == 0xc1;
}

349
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
350
void lguest_arch_handle_trap(struct lg_cpu *cpu)
351
{
352
	switch (cpu->regs->trapnum) {
R
Rusty Russell 已提交
353 354 355 356
	case 13: /* We've intercepted a General Protection Fault. */
		/* Check if this was one of those annoying IN or OUT
		 * instructions which we need to emulate.  If so, we just go
		 * back into the Guest after we've done it. */
357
		if (cpu->regs->errcode == 0) {
358
			if (emulate_insn(cpu))
359 360 361
				return;
		}
		break;
R
Rusty Russell 已提交
362 363
	case 14: /* We've intercepted a Page Fault. */
		/* The Guest accessed a virtual address that wasn't mapped.
364 365 366 367
		 * This happens a lot: we don't actually set up most of the page
		 * tables for the Guest at all when we start: as it runs it asks
		 * for more and more, and we set them up as required. In this
		 * case, we don't even tell the Guest that the fault happened.
R
Rusty Russell 已提交
368 369 370
		 *
		 * The errcode tells whether this was a read or a write, and
		 * whether kernel or userspace code. */
371 372
		if (demand_page(cpu, cpu->arch.last_pagefault,
				cpu->regs->errcode))
373 374
			return;

R
Rusty Russell 已提交
375 376 377 378 379 380 381
		/* OK, it's really not there (or not OK): the Guest needs to
		 * know.  We write out the cr2 value so it knows where the
		 * fault occurred.
		 *
		 * Note that if the Guest were really messed up, this could
		 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
		 * lg->lguest_data could be NULL */
382 383 384 385
		if (cpu->lg->lguest_data &&
		    put_user(cpu->arch.last_pagefault,
			     &cpu->lg->lguest_data->cr2))
			kill_guest(cpu, "Writing cr2");
386 387
		break;
	case 7: /* We've intercepted a Device Not Available fault. */
R
Rusty Russell 已提交
388 389 390
		/* If the Guest doesn't want to know, we already restored the
		 * Floating Point Unit, so we just continue without telling
		 * it. */
391
		if (!cpu->ts)
392 393 394
			return;
		break;
	case 32 ... 255:
395
		/* These values mean a real interrupt occurred, in which case
396
		 * the Host handler has already been run. We just do a
397 398
		 * friendly check if another process should now be run, then
		 * return to run the Guest again */
399
		cond_resched();
400 401
		return;
	case LGUEST_TRAP_ENTRY:
J
Jes Sorensen 已提交
402 403
		/* Our 'struct hcall_args' maps directly over our regs: we set
		 * up the pointer now to indicate a hypercall is pending. */
404
		cpu->hcall = (struct hcall_args *)cpu->regs;
405
		return;
406 407 408 409 410 411 412 413 414
	case 6:
		/* kvm hypercalls trigger an invalid opcode fault (6).
		 * We need to check if ring == GUEST_PL and
		 * faulting instruction == vmcall. */
		if (is_hypercall(cpu)) {
			rewrite_hypercall(cpu);
			return;
		}
		break;
415 416 417
	}

	/* We didn't handle the trap, so it needs to go to the Guest. */
418
	if (!deliver_trap(cpu, cpu->regs->trapnum))
419 420
		/* If the Guest doesn't have a handler (either it hasn't
		 * registered any yet, or it's one of the faults we don't let
421
		 * it handle), it dies with this cryptic error message. */
422
		kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
423
			   cpu->regs->trapnum, cpu->regs->eip,
424
			   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
425
			   : cpu->regs->errcode);
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
}

/* Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the Host/Guest
 * duality will be complete. :*/
/* Set (on != NULL) or clear (on == NULL) the PGE bit in cr4 on the CPU this
 * runs on.  The argument is a void pointer because this is handed to
 * on_each_cpu(). */
static void adjust_pge(void *on)
{
	unsigned long cr4 = read_cr4();

	if (on)
		cr4 |= X86_CR4_PGE;
	else
		cr4 &= ~X86_CR4_PGE;

	write_cr4(cr4);
}

/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do
 * some more i386-specific initialization. */
void __init lguest_arch_host_init(void)
{
	int i;

	/* Most of the i386/switcher.S doesn't care that it's been moved; on
	 * Intel, jumps are relative, and it doesn't access any references to
	 * external code or data.
	 *
	 * The only exception is the interrupt handlers in switcher.S: their
	 * addresses are placed in a table (default_idt_entries), so we need to
	 * update the table with the new addresses.  switcher_offset() is a
	 * convenience function which returns the distance between the
	 * compiled-in switcher code and the high-mapped copy we just made. */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] += switcher_offset();

	/*
	 * Set up the Switcher's per-cpu areas.
	 *
	 * Each CPU gets two pages of its own within the high-mapped region
	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
	 * but some depends on what Guest we are running (which is set up in
	 * copy_in_guest_info()).
	 */
	for_each_possible_cpu(i) {
		/* lguest_pages() returns this CPU's two pages. */
		struct lguest_pages *pages = lguest_pages(i);
		/* This is a convenience pointer to make the code fit one
		 * statement to a line. */
		struct lguest_ro_state *state = &pages->state;

		/* The Global Descriptor Table: the Host has a different one
		 * for each CPU.  We keep a descriptor for the GDT which says
		 * where it is and how big it is (the size is actually the last
		 * byte, not the size, hence the "-1"). */
		state->host_gdt_desc.size = GDT_SIZE-1;
		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);

		/* All CPUs on the Host use the same Interrupt Descriptor
		 * Table, so we just use store_idt(), which gets this CPU's IDT
		 * descriptor. */
		store_idt(&state->host_idt_desc);

		/* The descriptors for the Guest's GDT and IDT can be filled
		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
		 * ->guest_idt before actually running the Guest. */
		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
		state->guest_idt_desc.address = (long)&state->guest_idt;
		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
		state->guest_gdt_desc.address = (long)&state->guest_gdt;

		/* We know where we want the stack to be when the Guest enters
		 * the Switcher: in pages->regs.  The stack grows downwards
		 * (towards lower addresses), so we start it at the end of that
		 * structure. */
		state->guest_tss.sp0 = (long)(&pages->regs + 1);
		/* And this is the GDT entry to use for the stack: we keep a
		 * couple of special LGUEST entries. */
		state->guest_tss.ss0 = LGUEST_DS;

		/* x86 can have a finegrained bitmap which indicates what I/O
		 * ports the process can use.  We set it to the end of our
		 * structure, meaning "none". */
		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);

		/* Some GDT entries are the same across all Guests, so we can
		 * set them up now. */
		setup_default_gdt_entries(state);
		/* Most IDT entries are the same for all Guests, too.*/
		setup_default_idt_entries(state, default_idt_entries);

		/* The Host needs to be able to use the LGUEST segments on this
		 * CPU, too, so put them in the Host GDT. */
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/* In the Switcher, we want the %cs segment register to use the
	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
	 * it will be undisturbed when we switch.  To change %cs and jump we
	 * need this structure to feed to Intel's "lcall" instruction. */
	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	/* Finally, we need to turn off "Page Global Enable".  PGE is an
	 * optimization where page table entries are specially marked to show
	 * they never change.  The Host kernel marks all the kernel pages this
	 * way because it's always present, even when userspace is running.
	 *
	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
	 * you'll get really weird bugs that you'll chase for two days.
	 *
	 * I used to turn PGE off every time we switched to the Guest and back
	 * on when we return, but that slowed the Switcher down noticeably. */

	/* We don't need the complexity of CPUs coming and going while we're
	 * doing this. */
	get_online_cpus();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		/* Remember that this was originally set (for cleanup). */
		cpu_had_pge = 1;
		/* adjust_pge is a helper function which sets or unsets the PGE
		 * bit on its CPU, depending on the argument (0 == unset). */
		on_each_cpu(adjust_pge, (void *)0, 1);
		/* Turn off the feature in the global feature set. */
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
	}
	put_online_cpus();
}
/*:*/

/* Undo what lguest_arch_host_init() did to PGE: if the Host had it before we
 * started, re-advertise the feature and set the bit again on every CPU. */
void __exit lguest_arch_host_fini(void)
{
	/* If we had PGE before we started, turn it back on now. */
	get_online_cpus();
	if (cpu_had_pge) {
		set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
		/* adjust_pge's argument "1" means set PGE. */
		on_each_cpu(adjust_pge, (void *)1, 1);
	}
	put_online_cpus();
}


/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
568
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
J
Jes Sorensen 已提交
569 570 571
{
	switch (args->arg0) {
	case LHCALL_LOAD_GDT:
572
		load_guest_gdt(cpu, args->arg1, args->arg2);
J
Jes Sorensen 已提交
573 574
		break;
	case LHCALL_LOAD_IDT_ENTRY:
575
		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
J
Jes Sorensen 已提交
576 577
		break;
	case LHCALL_LOAD_TLS:
578
		guest_load_tls(cpu, args->arg1);
J
Jes Sorensen 已提交
579 580 581 582 583 584 585 586 587
		break;
	default:
		/* Bad Guest.  Bad! */
		return -EIO;
	}
	return 0;
}

/*H:126 i386-specific hypercall initialization: */
588
int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
J
Jes Sorensen 已提交
589 590 591
{
	u32 tsc_speed;

592 593
	/* The pointer to the Guest's "struct lguest_data" is the only argument.
	 * We check that address now. */
594 595
	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
			       sizeof(*cpu->lg->lguest_data)))
J
Jes Sorensen 已提交
596 597 598 599 600 601 602
		return -EFAULT;

	/* Having checked it, we simply set lg->lguest_data to point straight
	 * into the Launcher's memory at the right place and then use
	 * copy_to_user/from_user from now on, instead of lgread/write.  I put
	 * this in to show that I'm not immune to writing stupid
	 * optimizations. */
603
	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
J
Jes Sorensen 已提交
604 605 606 607 608 609 610 611 612 613 614 615

	/* We insist that the Time Stamp Counter exist and doesn't change with
	 * cpu frequency.  Some devious chip manufacturers decided that TSC
	 * changes could be handled in software.  I decided that time going
	 * backwards might be good for benchmarks, but it's bad for users.
	 *
	 * We also insist that the TSC be stable: the kernel detects unreliable
	 * TSCs for its own purposes, and we use that here. */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
		tsc_speed = tsc_khz;
	else
		tsc_speed = 0;
616
	if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
J
Jes Sorensen 已提交
617 618
		return -EFAULT;

619
	/* The interrupt code might not like the system call vector. */
620 621
	if (!check_syscall_vector(cpu->lg))
		kill_guest(cpu, "bad syscall vector");
622

J
Jes Sorensen 已提交
623 624
	return 0;
}
625
/*:*/
626 627 628 629 630

/*L:030 lguest_arch_setup_regs()
 *
 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
 * allocate the structure, so they will be 0. */
631
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
632
{
633
	struct lguest_regs *regs = cpu->regs;
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648

	/* There are four "segment" registers which the Guest needs to boot:
	 * The "code segment" register (cs) refers to the kernel code segment
	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
	 * refer to the kernel data segment __KERNEL_DS.
	 *
	 * The privilege level is packed into the lower bits.  The Guest runs
	 * at privilege level 1 (GUEST_PL).*/
	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
	regs->cs = __KERNEL_CS|GUEST_PL;

	/* The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
	 * interrupts are enabled.  We always leave interrupts enabled while
	 * running the Guest. */
649
	regs->eflags = X86_EFLAGS_IF | 0x2;
650 651 652 653 654 655 656

	/* The "Extended Instruction Pointer" register says where the Guest is
	 * running. */
	regs->eip = start;

	/* %esi points to our boot information, at physical address 0, so don't
	 * touch it. */
R
Rusty Russell 已提交
657

658 659
	/* There are a couple of GDT entries the Guest expects when first
	 * booting. */
660
	setup_guest_gdt(cpu);
661
}