提交 bff672e6 编写于 作者: R Rusty Russell 提交者: Linus Torvalds

lguest: documentation V: Host

Documentation: The Host
Signed-off-by: NRusty Russell <rusty@rustcorp.com.au>
Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
上级 dde79789
此差异已折叠。
......@@ -28,37 +28,63 @@
#include <irq_vectors.h>
#include "lg.h"
/*H:120 This is the core hypercall routine: where the Guest gets what it
* wants. Or gets killed. Or, in the case of LHCALL_CRASH, both.
*
* Remember from the Guest: %eax == which call to make, and the arguments are
* packed into %edx, %ebx and %ecx if needed. */
static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
{
switch (regs->eax) {
case LHCALL_FLUSH_ASYNC:
/* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls. */
break;
case LHCALL_LGUEST_INIT:
/* You can't get here unless you're already initialized. Don't
* do that. */
kill_guest(lg, "already have lguest_data");
break;
case LHCALL_CRASH: {
/* Crash is such a trivial hypercall that we do it in four
* lines right here. */
char msg[128];
/* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored. */
lgread(lg, msg, regs->edx, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
kill_guest(lg, "CRASH: %s", msg);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the
* argument: */
if (regs->edx)
guest_pagetable_clear_all(lg);
else
guest_pagetable_flush_user(lg);
break;
case LHCALL_GET_WALLCLOCK: {
/* The Guest wants to know the real time in seconds since 1970,
* in good Unix tradition. */
struct timespec ts;
ktime_get_real_ts(&ts);
regs->eax = ts.tv_sec;
break;
}
case LHCALL_BIND_DMA:
/* BIND_DMA really wants four arguments, but it's the only call
* which does. So the Guest packs the number of buffers and
* the interrupt number into the final argument, and we decode
* it here. This can legitimately fail, since we currently
* place a limit on the number of DMA pools a Guest can have.
* So we return true or false from this call. */
regs->eax = bind_dma(lg, regs->edx, regs->ebx,
regs->ecx >> 8, regs->ecx & 0xFF);
break;
/* All these calls simply pass the arguments through to the right
* routines. */
case LHCALL_SEND_DMA:
send_dma(lg, regs->edx, regs->ebx);
break;
......@@ -86,10 +112,13 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
case LHCALL_SET_CLOCKEVENT:
guest_set_clockevent(lg, regs->edx);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
lg->ts = regs->edx;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
lg->halted = 1;
break;
default:
......@@ -97,25 +126,42 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
}
}
/* We always do queued calls before actual hypercall. */
/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
* "struct lguest_data" and see if there are any new ones marked "ready".
*
* We are careful to do these in order: obviously we respect the order the
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall). */
static void do_async_hcalls(struct lguest *lg)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"s hcalls[] ring once. */
for (i = 0; i < ARRAY_SIZE(st); i++) {
struct lguest_regs regs;
/* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring. */
unsigned int n = lg->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
break;
/* OK, we have hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end. */
if (++lg->next_hcall == LHCALL_RING_SIZE)
lg->next_hcall = 0;
/* We copy the hypercall arguments into a fake register
* structure. This makes life simple for do_hcall(). */
if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
|| get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
|| get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
......@@ -124,74 +170,126 @@ static void do_async_hcalls(struct lguest *lg)
break;
}
/* Do the hypercall, same as a normal one. */
do_hcall(lg, &regs);
/* Mark the hypercall done. */
if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
kill_guest(lg, "Writing result for async hypercall");
break;
}
/* Stop doing hypercalls if we've just done a DMA to the
* Launcher: it needs to service this first. */
if (lg->dma_is_pending)
break;
}
}
/* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up: */
static void initialize(struct lguest *lg)
{
u32 tsc_speed;
/* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */
if (lg->regs->eax != LHCALL_LGUEST_INIT) {
kill_guest(lg, "hypercall %li before LGUEST_INIT",
lg->regs->eax);
return;
}
/* We only tell the guest to use the TSC if it's reliable. */
/* We insist that the Time Stamp Counter exist and doesn't change with
* cpu frequency. Some devious chip manufacturers decided that TSC
* changes could be handled in software. I decided that time going
* backwards might be good for benchmarks, but it's bad for users.
*
* We also insist that the TSC be stable: the kernel detects unreliable
* TSCs for its own purposes, and we use that here. */
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
tsc_speed = tsc_khz;
else
tsc_speed = 0;
/* The pointer to the Guest's "struct lguest_data" is the only
* argument. */
lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
/* We check here so we can simply copy_to_user/from_user */
/* If we check the address they gave is OK now, we can simply
* copy_to_user/from_user from now on rather than using lgread/lgwrite.
* I put this in to show that I'm not immune to writing stupid
* optimizations. */
if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
kill_guest(lg, "bad guest page %p", lg->lguest_data);
return;
}
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
/* We reserve the top pgd entry. */
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
|| put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
|| put_user(tsc_speed, &lg->lguest_data->tsc_khz)
/* We also give the Guest a unique id, as used in lguest_net.c. */
|| put_user(lg->guestid, &lg->lguest_data->guestid))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* This is the one case where the above accesses might have
* been the first write to a Guest page. This may have caused
* a copy-on-write fault, but the Guest might be referring to
* the old (read-only) page. */
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only)
* page. */
guest_pagetable_clear_all(lg);
}
/* Now we've examined the hypercall code; our Guest can make requests. There
* is one other way we can do things for the Guest, as we see in
* emulate_insn(). */
/* Even if we go out to userspace and come back, we don't want to do
* the hypercall again. */
/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
* Normally we don't need to do this: the Guest will run again and update the
* trap number before we come back around the run_guest() loop to
* do_hypercalls().
*
* However, if we are signalled or the Guest sends DMA to the Launcher, that
* loop will exit without running the Guest. When it comes back it would try
* to re-run the hypercall. */
static void clear_hcall(struct lguest *lg)
{
lg->regs->trapnum = 255;
}
/*H:100
* Hypercalls
*
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both of types.
*/
void do_hypercalls(struct lguest *lg)
{
/* Not initialized yet? */
if (unlikely(!lg->lguest_data)) {
/* Did the Guest make a hypercall? We might have come back for
* some other reason (an interrupt, a different trap). */
if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
/* Set up the "struct lguest_data" */
initialize(lg);
/* The hypercall is done. */
clear_hcall(lg);
}
return;
}
/* The Guest has initialized.
*
* Look in the hypercall ring for the async hypercalls: */
do_async_hcalls(lg);
/* If we stopped reading the hypercall ring because the Guest did a
* SEND_DMA to the Launcher, we want to return now. Otherwise if the
* Guest asked us to do a hypercall, we do it. */
if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
do_hcall(lg, lg->regs);
/* The hypercall is done. */
clear_hcall(lg);
}
}
......@@ -14,100 +14,147 @@
#include <linux/uaccess.h>
#include "lg.h"
/* The address of the interrupt handler is split into two bits: */
static unsigned long idt_address(u32 lo, u32 hi)
{
return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
}
/* The "type" of the interrupt handler is a 4 bit field: we only support a
* couple of types. */
static int idt_type(u32 lo, u32 hi)
{
return (hi >> 8) & 0xF;
}
/* An IDT entry can't be used unless the "present" bit is set. */
static int idt_present(u32 lo, u32 hi)
{
return (hi & 0x8000);
}
/* We need a helper to "push" a value onto the Guest's stack, since that's a
* big part of what delivering an interrupt does. */
static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
{
/* Stack grows upwards: move stack then write value. */
*gstack -= 4;
lgwrite_u32(lg, *gstack, val);
}
/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
* trap. The mechanics of delivering traps and interrupts to the Guest are the
* same, except some traps have an "error code" which gets pushed onto the
* stack as well: the caller tells us if this is one.
*
* "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
* interrupt or trap. It's split into two parts for traditional reasons: gcc
* on i386 used to be frightened by 64 bit numbers.
*
* We set up the stack just like the CPU does for a real interrupt, so it's
* identical for the Guest (and the standard "iret" instruction will undo
* it). */
static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
{
unsigned long gstack;
u32 eflags, ss, irq_enable;
/* If they want a ring change, we use new stack and push old ss/esp */
/* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
* userspace. We check the privilege level to find out. */
if ((lg->regs->ss&0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment */
gstack = guest_pa(lg, lg->esp1);
ss = lg->ss1;
/* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
* levels and expect these here. */
push_guest_stack(lg, &gstack, lg->regs->ss);
push_guest_stack(lg, &gstack, lg->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
gstack = guest_pa(lg, lg->regs->esp);
ss = lg->regs->ss;
}
/* We use IF bit in eflags to indicate whether irqs were enabled
(it's always 1, since irqs are enabled when guest is running). */
/* Remember that we never let the Guest actually disable interrupts, so
* the "Interrupt Flag" bit is always set. We copy that bit from the
* Guest's "irq_enabled" field into the eflags word: the Guest copies
* it back in "lguest_iret". */
eflags = lg->regs->eflags;
if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
&& !(irq_enable & X86_EFLAGS_IF))
eflags &= ~X86_EFLAGS_IF;
/* An interrupt is expected to push three things on the stack: the old
* "eflags" word, the old code segment, and the old instruction
* pointer. */
push_guest_stack(lg, &gstack, eflags);
push_guest_stack(lg, &gstack, lg->regs->cs);
push_guest_stack(lg, &gstack, lg->regs->eip);
/* For the six traps which supply an error code, we push that, too. */
if (has_err)
push_guest_stack(lg, &gstack, lg->regs->errcode);
/* Change the real stack so switcher returns to trap handler */
/* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute. */
lg->regs->ss = ss;
lg->regs->esp = gstack + lg->page_offset;
lg->regs->cs = (__KERNEL_CS|GUEST_PL);
lg->regs->eip = idt_address(lo, hi);
/* Disable interrupts for an interrupt gate. */
/* There are two kinds of interrupt handlers: 0xE is an "interrupt
* gate" which expects interrupts to be disabled on entry. */
if (idt_type(lo, hi) == 0xE)
if (put_user(0, &lg->lguest_data->irq_enabled))
kill_guest(lg, "Disabling interrupts");
}
/*H:200
* Virtual Interrupts.
*
* maybe_do_interrupt() gets called before every entry to the Guest, to see if
* we should divert the Guest to running an interrupt handler. */
void maybe_do_interrupt(struct lguest *lg)
{
unsigned int irq;
DECLARE_BITMAP(blk, LGUEST_IRQS);
struct desc_struct *idt;
/* If the Guest hasn't even initialized yet, we can do nothing. */
if (!lg->lguest_data)
return;
/* Mask out any interrupts they have blocked. */
/* Take our "irqs_pending" array and remove any interrupts the Guest
* wants blocked: the result ends up in "blk". */
if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
sizeof(blk)))
return;
bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
/* Find the first interrupt. */
irq = find_first_bit(blk, LGUEST_IRQS);
/* None? Nothing to do */
if (irq >= LGUEST_IRQS)
return;
/* They may be in the middle of an iret, where they asked us never to
* deliver interrupts. */
if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
return;
/* If they're halted, we re-enable interrupts. */
/* If they're halted, interrupts restart them. */
if (lg->halted) {
/* Re-enable interrupts. */
if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
kill_guest(lg, "Re-enabling interrupts");
lg->halted = 0;
} else {
/* Maybe they have interrupts disabled? */
/* Otherwise we check if they have interrupts disabled. */
u32 irq_enabled;
if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
irq_enabled = 0;
......@@ -115,112 +162,197 @@ void maybe_do_interrupt(struct lguest *lg)
return;
}
/* Look at the IDT entry the Guest gave us for this interrupt. The
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
* over them. */
idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt->a, idt->b)) {
/* OK, mark it no longer pending and deliver it. */
clear_bit(irq, lg->irqs_pending);
/* set_guest_interrupt() takes the interrupt descriptor and a
* flag to say whether this interrupt pushes an error code onto
* the stack as well: virtual interrupts never do. */
set_guest_interrupt(lg, idt->a, idt->b, 0);
}
}
/*H:220 Now we've got the routines to deliver interrupts, delivering traps
* like page fault is easy. The only trick is that Intel decided that some
* traps should have error codes: */
static int has_err(unsigned int trap)
{
return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
}
/* deliver_trap() returns true if it could deliver the trap. */
int deliver_trap(struct lguest *lg, unsigned int num)
{
u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
* bogus one in): if we fail here, the Guest will be killed. */
if (!idt_present(lo, hi))
return 0;
set_guest_interrupt(lg, lo, hi, has_err(num));
return 1;
}
/*H:250 Here's the hard part: returning to the Host every time a trap happens
* and then calling deliver_trap() and re-entering the Guest is slow.
* Particularly because Guest userspace system calls are traps (trap 128).
*
* So we'd like to set up the IDT to tell the CPU to deliver traps directly
* into the Guest. This is possible, but the complexities cause the size of
* this file to double! However, 150 lines of code is worth writing for taking
* system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
* the other hypervisors would tease it.
*
* This routine determines if a trap can be delivered directly. */
static int direct_trap(const struct lguest *lg,
const struct desc_struct *trap,
unsigned int num)
{
/* Hardware interrupts don't go to guest (except syscall). */
/* Hardware interrupts don't go to the Guest at all (except system
* call). */
if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
return 0;
/* We intercept page fault (demand shadow paging & cr2 saving)
protection fault (in/out emulation) and device not
available (TS handling), and hypercall */
/* The Host needs to see page faults (for shadow paging and to save the
* fault address), general protection faults (in/out emulation) and
* device not available (TS handling), and of course, the hypercall
* trap. */
if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
return 0;
/* Interrupt gates (0xE) or not present (0x0) can't go direct. */
/* Only trap gates (type 15) can go direct to the Guest. Interrupt
* gates (type 14) disable interrupts as they are entered, which we
* never let the Guest do. Not present entries (type 0x0) also can't
* go direct, of course 8) */
return idt_type(trap->a, trap->b) == 0xF;
}
/*H:260 When we make traps go directly into the Guest, we need to make sure
* the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
* CPU trying to deliver the trap will fault while trying to push the interrupt
* words on the stack: this is called a double fault, and it forces us to kill
* the Guest.
*
* Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
void pin_stack_pages(struct lguest *lg)
{
unsigned int i;
/* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
* two pages of stack space. */
for (i = 0; i < lg->stack_pages; i++)
/* The stack grows *upwards*, hence the subtraction */
pin_page(lg, lg->esp1 - i * PAGE_SIZE);
}
/* Direct traps also mean that we need to know whenever the Guest wants to use
* a different kernel stack, so we can change the IDT entries to use that
* stack. The IDT entries expect a virtual address, so unlike most addresses
* the Guest gives us, the "esp" (stack pointer) value here is virtual, not
* physical.
*
* In Linux each process has its own kernel stack, so this happens a lot: we
* change stacks on each context switch. */
void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
{
/* You cannot have a stack segment with priv level 0. */
/* You are not allowd have a stack segment with privilege level 0: bad
* Guest! */
if ((seg & 0x3) != GUEST_PL)
kill_guest(lg, "bad stack segment %i", seg);
/* We only expect one or two stack pages. */
if (pages > 2)
kill_guest(lg, "bad stack pages %u", pages);
/* Save where the stack is, and how many pages */
lg->ss1 = seg;
lg->esp1 = esp;
lg->stack_pages = pages;
/* Make sure the new stack pages are mapped */
pin_stack_pages(lg);
}
/* Set up trap in IDT. */
/* All this reference to mapping stacks leads us neatly into the other complex
* part of the Host: page table handling. */
/*H:235 This is the routine which actually checks the Guest's IDT entry and
* transfers it into our entry in "struct lguest": */
static void set_trap(struct lguest *lg, struct desc_struct *trap,
unsigned int num, u32 lo, u32 hi)
{
u8 type = idt_type(lo, hi);
/* We zero-out a not-present entry */
if (!idt_present(lo, hi)) {
trap->a = trap->b = 0;
return;
}
/* We only support interrupt and trap gates. */
if (type != 0xE && type != 0xF)
kill_guest(lg, "bad IDT type %i", type);
/* We only copy the handler address, present bit, privilege level and
* type. The privilege level controls where the trap can be triggered
* manually with an "int" instruction. This is usually GUEST_PL,
* except for system calls which userspace can use. */
trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
trap->b = (hi&0xFFFFEF00);
}
/*H:230 While we're here, dealing with delivering traps and interrupts to the
* Guest, we might as well complete the picture: how the Guest tells us where
* it wants them to go. This would be simple, except making traps fast
* requires some tricks.
*
* We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
* LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
{
/* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
/* Guest never handles: NMI, doublefault, spurious interrupt or
* hypercall. We ignore when it tries to set them. */
if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
return;
/* Mark the IDT as changed: next time the Guest runs we'll know we have
* to copy this again. */
lg->changed |= CHANGED_IDT;
/* The IDT which we keep in "struct lguest" only contains 32 entries
* for the traps and LGUEST_IRQS (32) entries for interrupts. We
* ignore attempts to set handlers for higher interrupt numbers, except
* for the system call "interrupt" at 128: we have a special IDT entry
* for that. */
if (num < ARRAY_SIZE(lg->idt))
set_trap(lg, &lg->idt[num], num, lo, hi);
else if (num == SYSCALL_VECTOR)
set_trap(lg, &lg->syscall_idt, num, lo, hi);
}
/* The default entry for each interrupt points into the Switcher routines which
* simply return to the Host. The run_guest() loop will then call
* deliver_trap() to bounce it back into the Guest. */
static void default_idt_entry(struct desc_struct *idt,
int trap,
const unsigned long handler)
{
/* A present interrupt gate. */
u32 flags = 0x8e00;
/* They can't "int" into any of them except hypercall. */
/* Set the privilege level on the entry for the hypercall: this allows
* the Guest to use the "int" instruction to trigger it. */
if (trap == LGUEST_TRAP_ENTRY)
flags |= (GUEST_PL << 13);
/* Now pack it into the IDT entry in its weird format. */
idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
idt->b = (handler&0xFFFF0000) | flags;
}
/* When the Guest first starts, we put default entries into the IDT. */
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def)
{
......@@ -230,19 +362,25 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
default_idt_entry(&state->guest_idt[i], i, def[i]);
}
/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
* we copy them into the IDT which we've set up for Guests on this CPU, just
* before we run the Guest. This routine does that copy. */
void copy_traps(const struct lguest *lg, struct desc_struct *idt,
const unsigned long *def)
{
unsigned int i;
/* All hardware interrupts are same whatever the guest: only the
* traps might be different. */
/* We can simply copy the direct traps, otherwise we use the default
* ones in the Switcher: they will return to the Host. */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
if (direct_trap(lg, &lg->idt[i], i))
idt[i] = lg->idt[i];
else
default_idt_entry(&idt[i], i, def[i]);
}
/* Don't forget the system call trap! The IDT entries for other
* interupts never change, so no need to copy them. */
i = SYSCALL_VECTOR;
if (direct_trap(lg, &lg->syscall_idt, i))
idt[i] = lg->syscall_idt;
......
......@@ -58,9 +58,18 @@ struct lguest_dma_info
u8 interrupt; /* 0 when not registered */
};
/* We have separate types for the guest's ptes & pgds and the shadow ptes &
* pgds. Since this host might use three-level pagetables and the guest and
* shadow pagetables don't, we can't use the normal pte_t/pgd_t. */
/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
* reviewed the original code which used "u32" for all page table entries, and
* insisted that it would be far clearer with explicit typing. I thought it
* was overkill, but he was right: it is much clearer than it was before.
*
* We have separate types for the Guest's ptes & pgds and the shadow ptes &
* pgds. There's already a Linux type for these (pte_t and pgd_t) but they
* change depending on kernel config options (PAE). */
/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
* "page frame number" (0 == first physical page, etc). They are different
* types so the compiler will warn us if we mix them improperly. */
typedef union {
struct { unsigned flags:12, pfn:20; };
struct { unsigned long val; } raw;
......@@ -77,8 +86,12 @@ typedef union {
struct { unsigned flags:12, pfn:20; };
struct { unsigned long val; } raw;
} gpte_t;
/* We have two convenient macros to convert a "raw" value as handed to us by
* the Guest into the correct Guest PGD or PTE type. */
#define mkgpte(_val) ((gpte_t){.raw.val = _val})
#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
/*:*/
struct pgdir
{
......
此差异已折叠。
......@@ -11,17 +11,58 @@
* from frolicking through its parklike serenity. :*/
#include "lg.h"
/*H:600
* We've almost completed the Host; there's just one file to go!
*
* Segments & The Global Descriptor Table
*
* (That title sounds like a bad Nerdcore group. Not to suggest that there are
* any good Nerdcore groups, but in high school a friend of mine had a band
* called Joe Fish and the Chips, so there are definitely worse band names).
*
* To refresh: the GDT is a table of 8-byte values describing segments. Once
* set up, these segments can be loaded into one of the 6 "segment registers".
*
* GDT entries are passed around as "struct desc_struct"s, which like IDT
* entries are split into two 32-bit members, "a" and "b". One day, someone
* will clean that up, and be declared a Hero. (No pressure, I'm just saying).
*
* Anyway, the GDT entry contains a base (the start address of the segment), a
* limit (the size of the segment - 1), and some flags. Sounds simple, and it
* would be, except those zany Intel engineers decided that it was too boring
* to put the base at one end, the limit at the other, and the flags in
* between. They decided to shotgun the bits at random throughout the 8 bytes,
* like so:
*
* 0 16 40 48 52 56 63
* [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
* mit ags part 2
* part 2
*
* As a result, this file contains a certain amount of magic numeracy. Let's
* begin.
*/
/* Is the descriptor the Guest wants us to put in OK?
*
* The flag which Intel says must be zero: must be zero. The descriptor must
* be present, (this is actually checked earlier but is here for thorougness),
* and the descriptor type must be 1 (a memory segment). */
static int desc_ok(const struct desc_struct *gdt)
{
/* MBZ=0, P=1, DT=1 */
return ((gdt->b & 0x00209000) == 0x00009000);
}
/* Is the segment present? (Otherwise it can't be used by the Guest). */
static int segment_present(const struct desc_struct *gdt)
{
return gdt->b & 0x8000;
}
/* There are several entries we don't let the Guest set. The TSS entry is the
* "Task State Segment" which controls all kinds of delicate things. The
* LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
* the Guest can't be trusted to deal with double faults. */
static int ignored_gdt(unsigned int num)
{
return (num == GDT_ENTRY_TSS
......@@ -30,9 +71,18 @@ static int ignored_gdt(unsigned int num)
|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
}
/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
/* If the Guest asks us to remove an entry from the GDT, we have to be careful.
* If one of the segment registers is pointing at that entry the Switcher will
* crash when it tries to reload the segment registers for the Guest.
*
* It doesn't make much sense for the Guest to try to remove its own code, data
* or stack segments while they're in use: assume that's a Guest bug. If it's
* one of the lesser segment registers using the removed entry, we simply set
* that register to 0 (unusable). */
static void check_segment_use(struct lguest *lg, unsigned int desc)
{
/* GDT entries are 8 bytes long, so we divide to get the index and
* ignore the bottom bits. */
if (lg->regs->gs / 8 == desc)
lg->regs->gs = 0;
if (lg->regs->fs / 8 == desc)
......@@ -45,12 +95,16 @@ static void check_segment_use(struct lguest *lg, unsigned int desc)
kill_guest(lg, "Removed live GDT entry %u", desc);
}
/*H:610 Once the GDT has been changed, we look through the changed entries and
* see if they're OK. If not, we'll call kill_guest() and the Guest will never
* get to use the invalid entries. */
static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
{
unsigned int i;
for (i = start; i < end; i++) {
/* We never copy these ones to real gdt */
/* We never copy these ones to real GDT, so we don't care what
* they say */
if (ignored_gdt(i))
continue;
......@@ -64,41 +118,57 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
if (!desc_ok(&lg->gdt[i]))
kill_guest(lg, "Bad GDT descriptor %i", i);
/* DPL 0 presumably means "for use by guest". */
/* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here. */
if ((lg->gdt[i].b & 0x00006000) == 0)
lg->gdt[i].b |= (GUEST_PL << 13);
/* Set accessed bit, since gdt isn't writable. */
/* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen. */
lg->gdt[i].b |= 0x00000100;
}
}
/* This routine is called at boot or modprobe time for each CPU to set up the
* "constant" GDT entries for Guests running on that CPU. */
void setup_default_gdt_entries(struct lguest_ro_state *state)
{
struct desc_struct *gdt = state->guest_gdt;
unsigned long tss = (unsigned long)&state->guest_tss;
/* Hypervisor segments. */
/* The hypervisor segments are full 0-4G segments, privilege level 0 */
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
/* This is the one which we *cannot* copy from guest, since tss
is depended on this lguest_ro_state, ie. this cpu. */
/* The TSS segment refers to the TSS entry for this CPU, so we cannot
* copy it from the Guest. Forgive the magic flags */
gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
| ((tss >> 16) & 0x000000FF);
}
/* This routine is called before the Guest is run for the first time. */
void setup_guest_gdt(struct lguest *lg)
{
/* Start with full 0-4G segments... */
lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
/* ...except the Guest is allowed to use them, so set the privilege
* level appropriately in the flags. */
lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
/* This is a fast version for the common case where only the three TLS entries
* have changed. */
/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
* GDTs for each CPU, then we copy across the entries each time we want to run
* a different Guest on that CPU. */
/* A partial GDT load, for the three "thead-local storage" entries. Otherwise
* it's just like load_guest_gdt(). So much, in fact, it would probably be
* neater to have a single hypercall to cover both. */
void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
{
unsigned int i;
......@@ -107,22 +177,31 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
gdt[i] = lg->gdt[i];
}
/* This is the full version */
void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
{
unsigned int i;
/* The default entries from setup_default_gdt_entries() are not
* replaced. See ignored_gdt() above. */
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
gdt[i] = lg->gdt[i];
}
/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
if (num > ARRAY_SIZE(lg->gdt))
kill_guest(lg, "too many gdt entries %i", num);
/* We read the whole thing in, then fix it up. */
lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
/* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU. */
lg->changed |= CHANGED_GDT;
}
......@@ -134,3 +213,13 @@ void guest_load_tls(struct lguest *lg, unsigned long gtls)
fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
lg->changed |= CHANGED_GDT_TLS;
}
/*
* With this, we have finished the Host.
*
* Five of the seven parts of our task are complete. You have made it through
* the Bit of Despair (I think that's somewhere in the page table code,
* myself).
*
* Next, we examine "make Switcher". It's short, but intense.
*/
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册