提交 ecda85e7 编写于 作者: J Juergen Gross 提交者: Ingo Molnar

x86/lguest: Remove lguest support

Lguest seems to be rather unused these days. It has seen only patches
ensuring it still builds the last two years and its official state is
"Odd Fixes".

Remove it in order to be able to clean up the paravirt code.
Signed-off-by: NJuergen Gross <jgross@suse.com>
Acked-by: NRusty Russell <rusty@rustcorp.com.au>
Acked-by: NThomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: boris.ostrovsky@oracle.com
Cc: lguest@lists.ozlabs.org
Cc: rusty@rustcorp.com.au
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20170816173157.8633-3-jgross@suse.comSigned-off-by: NIngo Molnar <mingo@kernel.org>
上级 edcb5cf8
......@@ -7640,17 +7640,6 @@ T: git git://linuxtv.org/mkrufky/tuners.git
S: Maintained
F: drivers/media/dvb-frontends/lgdt3305.*
LGUEST
M: Rusty Russell <rusty@rustcorp.com.au>
L: lguest@lists.ozlabs.org
W: http://lguest.ozlabs.org/
S: Odd Fixes
F: arch/x86/include/asm/lguest*.h
F: arch/x86/lguest/
F: drivers/lguest/
F: include/linux/lguest*.h
F: tools/lguest/
LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
M: Viresh Kumar <vireshk@kernel.org>
L: linux-ide@vger.kernel.org
......
......@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
# Hyper-V paravirtualization support
obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
# lguest paravirtualization support
obj-$(CONFIG_LGUEST_GUEST) += lguest/
obj-y += realmode/
obj-y += kernel/
obj-y += mm/
......
......@@ -777,8 +777,6 @@ config KVM_DEBUG_FS
Statistics are displayed in debugfs filesystem. Enabling this option
may incur significant overhead.
source "arch/x86/lguest/Kconfig"
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
......
#ifndef _ASM_X86_LGUEST_H
#define _ASM_X86_LGUEST_H
#define GDT_ENTRY_LGUEST_CS 10
#define GDT_ENTRY_LGUEST_DS 11
#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
#ifndef __ASSEMBLY__
#include <asm/desc.h>
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
#define SWITCHER_TEXT_PAGES (1)
#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
/* Declarations for definitions in arch/x86/lguest/head_32.S */
extern char lguest_noirq_iret[];
extern const char lgstart_cli[], lgend_cli[];
extern const char lgstart_pushf[], lgend_pushf[];
extern void lguest_iret(void);
extern void lguest_init(void);
struct lguest_regs {
/* Manually saved part. */
unsigned long eax, ebx, ecx, edx;
unsigned long esi, edi, ebp;
unsigned long gs;
unsigned long fs, ds, es;
unsigned long trapnum, errcode;
/* Trap pushed part */
unsigned long eip;
unsigned long cs;
unsigned long eflags;
unsigned long esp;
unsigned long ss;
};
/* This is a guest-specific page (mapped ro) into the guest. */
struct lguest_ro_state {
/* Host information we need to restore when we switch back. */
u32 host_cr3;
struct desc_ptr host_idt_desc;
struct desc_ptr host_gdt_desc;
u32 host_sp;
/* Fields which are used when guest is running. */
struct desc_ptr guest_idt_desc;
struct desc_ptr guest_gdt_desc;
struct x86_hw_tss guest_tss;
struct desc_struct guest_idt[IDT_ENTRIES];
struct desc_struct guest_gdt[GDT_ENTRIES];
};
struct lg_cpu_arch {
/* The GDT entries copied into lguest_ro_state when running. */
struct desc_struct gdt[GDT_ENTRIES];
/* The IDT entries: some copied into lguest_ro_state when running. */
struct desc_struct idt[IDT_ENTRIES];
/* The address of the last guest-visible pagefault (ie. cr2). */
unsigned long last_pagefault;
};
static inline void lguest_set_ts(void)
{
u32 cr0;
cr0 = read_cr0();
if (!(cr0 & 8))
write_cr0(cr0 | 8);
}
/* Full 4G segment descriptors, suitable for CS and DS. */
#define FULL_EXEC_SEGMENT \
((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_LGUEST_H */
/* Architecture specific portion of the lguest hypercalls */
#ifndef _ASM_X86_LGUEST_HCALL_H
#define _ASM_X86_LGUEST_HCALL_H
#define LHCALL_FLUSH_ASYNC 0
#define LHCALL_LGUEST_INIT 1
#define LHCALL_SHUTDOWN 2
#define LHCALL_NEW_PGTABLE 4
#define LHCALL_FLUSH_TLB 5
#define LHCALL_LOAD_IDT_ENTRY 6
#define LHCALL_SET_STACK 7
#define LHCALL_SET_CLOCKEVENT 9
#define LHCALL_HALT 10
#define LHCALL_SET_PMD 13
#define LHCALL_SET_PTE 14
#define LHCALL_SET_PGD 15
#define LHCALL_LOAD_TLS 16
#define LHCALL_LOAD_GDT_ENTRY 18
#define LHCALL_SEND_INTERRUPTS 19
#define LGUEST_TRAP_ENTRY 0x1F
/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
#define LGUEST_SHUTDOWN_POWEROFF 1
#define LGUEST_SHUTDOWN_RESTART 2
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
/*G:030
* But first, how does our Guest contact the Host to ask for privileged
* operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself.
*
* Our hypercall mechanism uses the highest unused trap code (traps 32 and
* above are used by real hardware interrupts). Seventeen hypercalls are
* available: the hypercall number is put in the %eax register, and the
* arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
* If a return value makes sense, it's returned in %eax.
*
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's
* definition of a gentleman: "someone who is only rude intentionally".
*/
static inline unsigned long
hcall(unsigned long call,
unsigned long arg1, unsigned long arg2, unsigned long arg3,
unsigned long arg4)
{
/* "int" is the Intel instruction to trigger a trap. */
asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
/* The call in %eax (aka "a") might be overwritten */
: "=a"(call)
/* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
: "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
/* "memory" means this might write somewhere in memory.
* This isn't true for all calls, but it's safe to tell
* gcc that it might happen so it doesn't get clever. */
: "memory");
return call;
}
/*:*/
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
#define LHCALL_RING_SIZE 64
struct hcall_args {
/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
unsigned long arg0, arg1, arg2, arg3, arg4;
};
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_LGUEST_HCALL_H */
......@@ -662,7 +662,7 @@ static inline void sync_core(void)
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
* but it will fault at CPL3 (i.e. Xen PV and lguest).
* but it will fault at CPL3 (i.e. Xen PV).
*
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
......
......@@ -201,7 +201,7 @@ struct boot_params {
*
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
* @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
......
......@@ -4,9 +4,6 @@
#include <asm/ucontext.h>
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
......@@ -62,23 +59,6 @@ void foo(void)
OFFSET(stack_canary_offset, stack_canary, canary);
#endif
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
......
......@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
nothing we can do at this point. */
......@@ -165,7 +164,6 @@ WEAK(xen_entry)
subarch_entries:
.long .Ldefault_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
......
......@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
......
......@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
source drivers/lguest/Kconfig
endif # VIRTUALIZATION
config LGUEST_GUEST
bool "Lguest guest support"
depends on X86_32 && PARAVIRT && PCI
select TTY
select VIRTUALIZATION
select VIRTIO
select VIRTIO_CONSOLE
help
Lguest is a tiny in-kernel hypervisor. Selecting this will
allow your kernel to boot under lguest. This option will increase
your kernel size by about 10k. If in doubt, say N.
If you say Y here, make sure you say Y (or M) to the virtio block
and net drivers which lguest needs.
obj-y := head_32.o boot.o
CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
此差异已折叠。
#include <linux/linkage.h>
#include <linux/lguest.h>
#include <asm/lguest_hcall.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
/*G:020
* Our story starts with the bzImage: booting starts at startup_32 in
* arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
* kernel in place and then jumps into it: startup_32 in
* arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
* register, which is created by the bootloader (the Launcher in our case).
*
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
* header and kernel command line somewhere safe, and populates some initial
* page tables. Finally it checks the 'hardware_subarch' field. This was
* introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
* assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
* addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
*
* The .section line puts this code in .init.text so it will be discarded after
* boot.
*/
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
/*
* We make the "initialization" hypercall now to tell the Host where
* our lguest_data struct is.
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY
/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
movl $LHCALL_NEW_PGTABLE, %eax
movl $(initial_page_table - __PAGE_OFFSET), %ebx
int $LGUEST_TRAP_ENTRY
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
/* Jumps are relative: we're running __PAGE_OFFSET too low. */
jmp lguest_init+__PAGE_OFFSET
/*G:055
* We create a macro which puts the assembler code between lgstart_ and lgend_
* markers. These templates are put in the .text section: they can't be
* discarded after boot as we may need to patch modules, too.
*/
.text
#define LGUEST_PATCH(name, insns...) \
lgstart_##name: insns; lgend_##name:; \
.globl lgstart_##name; .globl lgend_##name
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
/*G:033
* But using those wrappers is inefficient (we'll see why that doesn't matter
* for save_fl and irq_disable later). If we write our routines carefully in
* assembler, we can avoid clobbering any registers and avoid jumping through
* the wrapper functions.
*
* I skipped over our first piece of assembler, but this one is worth studying
* in a bit more detail so I'll describe in easy stages. First, the routine to
* enable interrupts:
*/
ENTRY(lg_irq_enable)
/*
* The reverse of irq_disable, this sets lguest_data.irq_enabled to
* X86_EFLAGS_IF (ie. "Interrupts enabled").
*/
movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
/*
* But now we need to check if the Host wants to know: there might have
* been interrupts waiting to be delivered, in which case it will have
* set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
* jump to send_interrupts, otherwise we're done.
*/
cmpl $0, lguest_data+LGUEST_DATA_irq_pending
jnz send_interrupts
/*
* One cool thing about x86 is that you can do many things without using
* a register. In this case, the normal path hasn't needed to save or
* restore any registers at all!
*/
ret
send_interrupts:
/*
* OK, now we need a register: eax is used for the hypercall number,
* which is LHCALL_SEND_INTERRUPTS.
*
* We used not to bother with this pending detection at all, which was
* much simpler. Sooner or later the Host would realize it had to
* send us an interrupt. But that turns out to make performance 7
* times worse on a simple tcp benchmark. So now we do this the hard
* way.
*/
pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax
/* This is the actual hypercall trap. */
int $LGUEST_TRAP_ENTRY
/* Put eax back the way we found it. */
popl %eax
ret
/*
* Finally, the "popf" or "restore flags" routine. The %eax register holds the
* flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
* enabling interrupts again, if it's 0 we're leaving them off.
*/
ENTRY(lg_restore_fl)
/* This is just "lguest_data.irq_enabled = flags;" */
movl %eax, lguest_data+LGUEST_DATA_irq_enabled
/*
* Now, if the %eax value has enabled interrupts and
* lguest_data.irq_pending is set, we want to tell the Host so it can
* deliver any outstanding interrupts. Fortunately, both values will
* be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
* instruction will AND them together for us. If both are set, we
* jump to send_interrupts.
*/
testl lguest_data+LGUEST_DATA_irq_pending, %eax
jnz send_interrupts
/* Again, the normal path has used no extra registers. Clever, huh? */
ret
/*:*/
/* These demark the EIP where host should never deliver interrupts. */
.global lguest_noirq_iret
/*M:004
* When the Host reflects a trap or injects an interrupt into the Guest, it
* sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
* so the Guest iret logic does the right thing when restoring it. However,
* when the Host sets the Guest up for direct traps, such as system calls, the
* processor is the one to push eflags onto the stack, and the interrupt bit
* will be 1 (in reality, interrupts are always enabled in the Guest).
*
* This turns out to be harmless: the only trap which should happen under Linux
* with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
* regions), which has to be reflected through the Host anyway. If another
* trap *does* go off when interrupts are disabled, the Guest will panic, and
* we'll never get to this iret!
:*/
/*G:045
* There is one final paravirt_op that the Guest implements, and glancing at it
* you can see why I left it to last. It's *cool*! It's in *assembler*!
*
* The "iret" instruction is used to return from an interrupt or trap. The
* stack looks like this:
* old address
* old code segment & privilege level
* old processor flags ("eflags")
*
* The "iret" instruction pops those values off the stack and restores them all
* at once. The only problem is that eflags includes the Interrupt Flag which
* the Guest can't change: the CPU will simply ignore it when we do an "iret".
* So we have to copy eflags from the stack to lguest_data.irq_enabled before
* we do the "iret".
*
* There are two problems with this: firstly, we can't clobber any registers
* and secondly, the whole thing needs to be atomic. The first problem
* is solved by using "push memory"/"pop memory" instruction pair for copying.
*
* The second is harder: copying eflags to lguest_data.irq_enabled will turn
* interrupts on before we're finished, so we could be interrupted before we
* return to userspace or wherever. Our solution to this is to tell the
* Host that it is *never* to interrupt us there, even if interrupts seem to be
* enabled. (It's not necessary to protect pop instruction, since
* data gets updated only after it completes, so we only need to protect
* one instruction, iret).
*/
ENTRY(lguest_iret)
pushl 2*4(%esp)
/*
* Note the %ss: segment prefix here. Normal data accesses use the
* "ds" segment, but that will have already been restored for whatever
* we're returning to (such as userspace): we can't trust it. The %ss:
* prefix makes sure we use the stack segment, which is still valid.
*/
popl %ss:lguest_data+LGUEST_DATA_irq_enabled
lguest_noirq_iret:
iret
......@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/
obj-$(CONFIG_ISDN) += isdn/
obj-$(CONFIG_EDAC) += edac/
obj-$(CONFIG_EISA) += eisa/
obj-y += lguest/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-y += mmc/
......
......@@ -470,7 +470,7 @@ config VIRTIO_BLK
depends on VIRTIO
---help---
This is the virtual block driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
QEMU based VMMs (like KVM or Xen). Say Y or M.
config VIRTIO_BLK_SCSI
bool "SCSI passthrough request for the Virtio block driver"
......
......@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
depends on VIRTIO && TTY
select HVC_DRIVER
help
Virtio console for use with lguest and other hypervisors.
Virtio console for use with hypervisors.
Also serves as a general-purpose serial device for data
transfer between the guest and host. Character devices at
......
......@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
* We turn the characters into a scatter-gather list, add it to the
* output queue and then kick the Host. Then we sit here waiting for
* it to finish: inefficient in theory, but in practice
* implementations will do it immediately (lguest's Launcher does).
* implementations will do it immediately.
*/
static int put_chars(u32 vtermno, const char *buf, int count)
{
......
config LGUEST
tristate "Linux hypervisor example code"
depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
"lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
segments.o lguest_user.o
lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
Preparation Preparation!: PREFIX=P
Guest: PREFIX=G
Drivers: PREFIX=D
Launcher: PREFIX=L
Host: PREFIX=H
Switcher: PREFIX=S
Mastery: PREFIX=M
Beer:
@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
Puppy:
@clear
@printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
@sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n"
@sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear
@printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n"
@sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear
Welcome, friend reader, to lguest.
Lguest is an adventure, with you, the reader, as Hero. I can't think of many
5000-line projects which offer both such capability and glimpses of future
potential; it is an exciting time to be delving into the source!
But be warned; this is an arduous journey of several hours or more! And as we
know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
equivalent) to anyone I meet who has completed this documentation.
So get comfortable and keep your wits about you (both quick and humorous).
Along your way to the Noble Goal, you will also gain masterly insight into
lguest, and hypervisors and x86 virtualization in general.
Our Quest is in seven parts: (best read with C highlighting turned on)
I) Preparation
- In which our potential hero is flown quickly over the landscape for a
taste of its scope. Suitable for the armchair coders and other such
persons of faint constitution.
II) Guest
- Where we encounter the first tantalising wisps of code, and come to
understand the details of the life of a Guest kernel.
III) Drivers
- Whereby the Guest finds its voice and become useful, and our
understanding of the Guest is completed.
IV) Launcher
- Where we trace back to the creation of the Guest, and thus begin our
understanding of the Host.
V) Host
- Where we master the Host code, through a long and tortuous journey.
Indeed, it is here that our hero is tested in the Bit of Despair.
VI) Switcher
- Where our understanding of the intertwined nature of Guests and Hosts
is completed.
VII) Mastery
- Where our fully fledged hero grapples with the Great Question:
"What next?"
make Preparation!
Rusty Russell.
/*P:400
* This contains run_guest() which actually calls into the Host<->Guest
* Switcher and analyzes the return, such as determining if the Guest wants the
* Host to do something. This file also contains useful helper routines.
:*/
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/stddef.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <asm/paravirt.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/poll.h>
#include <asm/asm-offsets.h>
#include "lg.h"
unsigned long switcher_addr;
struct page **lg_switcher_pages;
static struct vm_struct *switcher_text_vma;
static struct vm_struct *switcher_stacks_vma;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
/*H:010
* We need to set up the Switcher at a high virtual address. Remember the
* Switcher is a few hundred bytes of assembler code which actually changes the
* CPU to run the Guest, and then changes back to the Host when a trap or
* interrupt happens.
*
* The Switcher code must be at the same virtual address in the Guest as the
* Host since it will be running as the switchover occurs.
*
* Trying to map memory at a particular address is an unusual thing to do, so
* it's not a simple one-liner.
*/
static __init int map_switcher(void)
{
int i, err;
/*
* Map the Switcher in to high memory.
*
* It turns out that if we choose the address 0xFFC00000 (4MB under the
* top virtual address), it makes setting up the page tables really
* easy.
*/
/* We assume Switcher text fits into a single page. */
if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
end_switcher_text - start_switcher_text);
return -EINVAL;
}
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
* TOTAL_SWITCHER_PAGES,
GFP_KERNEL);
if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
/*
* Now we actually allocate the pages. The Guest will see these pages,
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
* It goes in the first page, which we map in momentarily.
*/
memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
end_switcher_text - start_switcher_text);
kunmap(lg_switcher_pages[0]);
/*
* We place the Switcher underneath the fixmap area, which is the
* highest virtual address we can get. This is important, since we
* tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/
switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
/*
* Now we reserve the "virtual memory area"s we want. We might
* not get them in theory, but in practice it's worked so far.
*
* We want the switcher text to be read-only and executable, and
* the stacks to be read-write and non-executable.
*/
switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
switcher_addr,
switcher_addr + PAGE_SIZE);
if (!switcher_text_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_pages;
}
switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
VM_ALLOC|VM_NO_GUARD,
switcher_addr + PAGE_SIZE,
switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
if (!switcher_stacks_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_text_vma;
}
/*
* This code actually sets up the pages we've allocated to appear at
* switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel text pages and kernel writable
* pages respectively), and a pointer to our array of struct pages.
*/
err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
if (err) {
printk("lguest: text map_vm_area failed: %i\n", err);
goto free_vmas;
}
err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
lg_switcher_pages + SWITCHER_TEXT_PAGES);
if (err) {
printk("lguest: stacks map_vm_area failed: %i\n", err);
goto free_vmas;
}
/*
* Now the Switcher is mapped at the right address, we can't fail!
*/
printk(KERN_INFO "lguest: mapped switcher at %p\n",
switcher_text_vma->addr);
/* And we succeeded... */
return 0;
free_vmas:
/* Undoes map_vm_area and __get_vm_area */
vunmap(switcher_stacks_vma->addr);
free_text_vma:
vunmap(switcher_text_vma->addr);
free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
__free_pages(lg_switcher_pages[i], 0);
kfree(lg_switcher_pages);
out:
return err;
}
/*:*/
/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
static void unmap_switcher(void)
{
unsigned int i;
/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
vunmap(switcher_text_vma->addr);
vunmap(switcher_stacks_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(lg_switcher_pages[i], 0);
kfree(lg_switcher_pages);
}
/*H:032
* Dealing With Guest Memory.
*
* Before we go too much further into the Host, we need to grok the routines
* we use to deal with Guest memory.
*
* When the Guest gives us (what it thinks is) a physical address, we can use
* the normal copy_from_user() & copy_to_user() on the corresponding place in
* the memory region allocated by the Launcher.
*
* But we can't trust the Guest: it might be trying to access the Launcher
* code. We have to check that the range is below the pfn_limit the Launcher
* gave us. We have to make sure that addr + len doesn't give us a false
* positive by overflowing, too.
*/
bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len)
{
return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
}
/*
* This routine copies memory from the Guest. Here we can see how useful the
* kill_lguest() routine we met in the Launcher can be: we return a random
* value (all zeroes) instead of needing to return an error.
*/
void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
{
if (!lguest_address_ok(cpu->lg, addr, bytes)
|| copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
/* copy_from_user should do this, but as we rely on it... */
memset(b, 0, bytes);
kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
}
}
/* This is the write (copy into Guest) version. */
void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
unsigned bytes)
{
if (!lguest_address_ok(cpu->lg, addr, bytes)
|| copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
}
/*:*/
/*H:030
* Let's jump straight to the the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
* going around and around until something interesting happens.
*/
int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
/* If the launcher asked for a register with LHREQ_GETREG */
if (cpu->reg_read) {
if (put_user(*cpu->reg_read, user))
return -EFAULT;
cpu->reg_read = NULL;
return sizeof(*cpu->reg_read);
}
/* We stop running once the Guest is dead. */
while (!cpu->lg->dead) {
unsigned int irq;
bool more;
/* First we run any hypercalls the Guest wants done. */
if (cpu->hcall)
do_hypercalls(cpu);
/* Do we have to tell the Launcher about a trap? */
if (cpu->pending.trap) {
if (copy_to_user(user, &cpu->pending,
sizeof(cpu->pending)))
return -EFAULT;
return sizeof(cpu->pending);
}
/*
* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
* it stops us.
*/
try_to_freeze();
/* Check for signals */
if (signal_pending(current))
return -ERESTARTSYS;
/*
* Check if there are any interrupts which can be delivered now:
* if so, this sets up the hander to be executed when we next
* run the Guest.
*/
irq = interrupt_pending(cpu, &more);
if (irq < LGUEST_IRQS)
try_deliver_interrupt(cpu, irq, more);
/*
* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example.
*/
if (cpu->lg->dead)
break;
/*
* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer will wake us.
*/
if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
/*
* Just before we sleep, make sure no interrupt snuck in
* which we should be doing.
*/
if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
set_current_state(TASK_RUNNING);
else
schedule();
continue;
}
/*
* OK, now we're ready to jump into the Guest. First we put up
* the "Do Not Disturb" sign:
*/
local_irq_disable();
/* Actually run the Guest until something happens. */
lguest_arch_run_guest(cpu);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
/* Now we deal with whatever happened to the Guest. */
lguest_arch_handle_trap(cpu);
}
/* Special case: Guest is 'dead' but wants a reboot. */
if (cpu->lg->dead == ERR_PTR(-ERESTART))
return -ERESTART;
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
/*H:000
* Welcome to the Host!
*
* By this point your brain has been tickled by the Guest code and numbed by
* the Launcher code; prepare for it to be stretched by the Host code. This is
* the heart. Let's begin at the initialization routine for the Host's lg
* module.
*/
static int __init init(void)
{
int err;
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (get_kernel_rpl() != 0) {
printk("lguest is afraid of being a guest\n");
return -EPERM;
}
/* First we put the Switcher up in very high virtual memory. */
err = map_switcher();
if (err)
goto out;
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
if (err)
goto free_interrupts;
/* Finally we do some architecture-specific setup. */
lguest_arch_host_init();
/* All good! */
return 0;
free_interrupts:
free_interrupts();
unmap:
unmap_switcher();
out:
return err;
}
/* Cleaning up is just the same code, backwards. With a little French. */
static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
unmap_switcher();
lguest_arch_host_fini();
}
/*:*/
/*
* The Host side of lguest can be a module. This is a nice way for people to
* play with it.
*/
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
/*P:500
* Just as userspace programs request kernel operations through a system
* call, the Guest requests Host operations through a "hypercall". You might
* notice this nomenclature doesn't really follow any logic, but the name has
* been around for long enough that we're stuck with it. As you'd expect, this
* code is basically a one big switch statement.
:*/
/* Copyright (C) 2006 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/ktime.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "lg.h"
/*H:120
* This is the core hypercall routine: where the Guest gets what it wants.
* Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
*/
static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_FLUSH_ASYNC:
/*
* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls.
*/
break;
case LHCALL_SEND_INTERRUPTS:
/*
* This call does nothing too, but by breaking out of the Guest
* it makes us process any pending interrupts.
*/
break;
case LHCALL_LGUEST_INIT:
/*
* You can't get here unless you're already initialized. Don't
* do that.
*/
kill_guest(cpu, "already have lguest_data");
break;
case LHCALL_SHUTDOWN: {
char msg[128];
/*
* Shutdown is such a trivial hypercall that we do it in five
* lines right here.
*
* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored.
*/
__lgread(cpu, msg, args->arg1, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
kill_guest(cpu, "CRASH: %s", msg);
if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
cpu->lg->dead = ERR_PTR(-ERESTART);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the argument: */
if (args->arg1)
guest_pagetable_clear_all(cpu);
else
guest_pagetable_flush_user(cpu);
break;
/*
* All these calls simply pass the arguments through to the right
* routines.
*/
case LHCALL_NEW_PGTABLE:
guest_new_pagetable(cpu, args->arg1);
break;
case LHCALL_SET_STACK:
guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
#ifdef CONFIG_X86_PAE
guest_set_pte(cpu, args->arg1, args->arg2,
__pte(args->arg3 | (u64)args->arg4 << 32));
#else
guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
#endif
break;
case LHCALL_SET_PGD:
guest_set_pgd(cpu->lg, args->arg1, args->arg2);
break;
#ifdef CONFIG_X86_PAE
case LHCALL_SET_PMD:
guest_set_pmd(cpu->lg, args->arg1, args->arg2);
break;
#endif
case LHCALL_SET_CLOCKEVENT:
guest_set_clockevent(cpu, args->arg1);
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
cpu->halted = 1;
break;
default:
/* It should be an architecture-specific hypercall. */
if (lguest_arch_do_hcall(cpu, args))
kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
}
}
/*H:124
* Asynchronous hypercalls are easy: we just look in the array in the
* Guest's "struct lguest_data" to see if any new ones are marked "ready".
*
* We are careful to do these in order: obviously we respect the order the
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall).
*/
static void do_async_hcalls(struct lg_cpu *cpu)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"s hcalls[] ring once. */
for (i = 0; i < ARRAY_SIZE(st); i++) {
struct hcall_args args;
/*
* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring.
*/
unsigned int n = cpu->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
break;
/*
* OK, we have hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end.
*/
if (++cpu->next_hcall == LHCALL_RING_SIZE)
cpu->next_hcall = 0;
/*
* Copy the hypercall arguments into a local copy of the
* hcall_args struct.
*/
if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
sizeof(struct hcall_args))) {
kill_guest(cpu, "Fetching async hypercalls");
break;
}
/* Do the hypercall, same as a normal one. */
do_hcall(cpu, &args);
/* Mark the hypercall done. */
if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
kill_guest(cpu, "Writing result for async hypercall");
break;
}
/*
* Stop doing hypercalls if they want to notify the Launcher:
* it needs to service this first.
*/
if (cpu->pending.trap)
break;
}
}
/*
* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up:
*/
static void initialize(struct lg_cpu *cpu)
{
/*
* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here.
*/
if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
return;
}
if (lguest_arch_init_hypercalls(cpu))
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/*
* The Guest tells us where we're not to deliver interrupts by putting
* the instruction address into "struct lguest_data".
*/
if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/*
* We write the current time into the Guest's data page once so it can
* set its clock.
*/
write_timestamp(cpu);
/* page_tables.c will also do some setup. */
page_table_guest_data_init(cpu);
/*
* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the old page might be (read-only) in the Guest
* pagetable.
*/
guest_pagetable_clear_all(cpu);
}
/*:*/
/*M:013
* If a Guest reads from a page (so creates a mapping) that it has never
* written to, and then the Launcher writes to it (ie. the output of a virtual
* device), the Guest will still see the old page. In practice, this never
* happens: why would the Guest read a page which it has never written to? But
* a similar scenario might one day bite us, so it's worth mentioning.
*
* Note that if we used a shared anonymous mapping in the Launcher instead of
* mapping /dev/zero private, we wouldn't worry about cop-on-write. And we
* need that to switch the Launcher to processes (away from threads) anyway.
:*/
/*H:100
* Hypercalls
*
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both of types.
*/
void do_hypercalls(struct lg_cpu *cpu)
{
/* Not initialized yet? This hypercall must do it. */
if (unlikely(!cpu->lg->lguest_data)) {
/* Set up the "struct lguest_data" */
initialize(cpu);
/* Hcall is done. */
cpu->hcall = NULL;
return;
}
/*
* The Guest has initialized.
*
* Look in the hypercall ring for the async hypercalls:
*/
do_async_hcalls(cpu);
/*
* If we stopped reading the hypercall ring because the Guest did a
* NOTIFY to the Launcher, we want to return now. Otherwise we do
* the hypercall.
*/
if (!cpu->pending.trap) {
do_hcall(cpu, cpu->hcall);
/*
* Tricky point: we reset the hcall pointer to mark the
* hypercall as "done". We use the hcall pointer rather than
* the trap number to indicate a hypercall is pending.
* Normally it doesn't matter: the Guest will run again and
* update the trap number before we come back here.
*
* However, if we are signalled or the Guest sends I/O to the
* Launcher, the run_guest() loop will exit without running the
* Guest. When it comes back it would try to re-run the
* hypercall. Finding that bug sucked.
*/
cpu->hcall = NULL;
}
}
/*
* This routine supplies the Guest with time: it's used for wallclock time at
* initial boot and as a rough time source if the TSC isn't available.
*/
void write_timestamp(struct lg_cpu *cpu)
{
struct timespec now;
ktime_get_real_ts(&now);
if (copy_to_user(&cpu->lg->lguest_data->time,
&now, sizeof(struct timespec)))
kill_guest(cpu, "Writing timestamp");
}
此差异已折叠。
#ifndef _LGUEST_H
#define _LGUEST_H
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/init.h>
#include <linux/stringify.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
#include <linux/hrtimer.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <asm/lguest.h>
struct pgdir {
unsigned long gpgdir;
bool switcher_mapped;
int last_host_cpu;
pgd_t *pgdir;
};
/* We have two pages shared with guests, per cpu. */
struct lguest_pages {
/* This is the stack page mapped rw in guest */
char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
struct lguest_regs regs;
/* This is the host state & guest descriptor page, ro in guest */
struct lguest_ro_state state;
} __attribute__((aligned(PAGE_SIZE)));
#define CHANGED_IDT 1
#define CHANGED_GDT 2
#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
#define CHANGED_ALL 3
struct lg_cpu {
unsigned int id;
struct lguest *lg;
struct task_struct *tsk;
struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
u32 cr2;
u32 esp1;
u16 ss1;
/* Bitmap of what has changed: see CHANGED_* above. */
int changed;
/* Pending operation. */
struct lguest_pending pending;
unsigned long *reg_read; /* register from LHREQ_GETREG */
/* At end of a page shared mapped over lguest_pages in guest. */
unsigned long regs_page;
struct lguest_regs *regs;
struct lguest_pages *last_pages;
/* Initialization mode: linear map everything. */
bool linear_pages;
int cpu_pgd; /* Which pgd this cpu is currently using */
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
u32 next_hcall;
/* Virtual clock device */
struct hrtimer hrt;
/* Did the Guest tell us to halt? */
int halted;
/* Pending virtual interrupts */
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
struct lg_cpu_arch arch;
};
/* The private info the thread maintains about the guest. */
struct lguest {
struct lguest_data __user *lguest_data;
struct lg_cpu cpus[NR_CPUS];
unsigned int nr_cpus;
/* Valid guest memory pages must be < this. */
u32 pfn_limit;
/* Device memory is >= pfn_limit and < device_limit. */
u32 device_limit;
/*
* This provides the offset to the base of guest-physical memory in the
* Launcher.
*/
void __user *mem_base;
unsigned long kernel_address;
struct pgdir pgdirs[4];
unsigned long noirq_iret;
unsigned int stack_pages;
u32 tsc_khz;
/* Dead? */
const char *dead;
};
extern struct mutex lguest_lock;
/* core.c: */
bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvient, so we
* have the following helper macros which read and write a specific type (often
* an unsigned long).
*
* This reads into a variable of the given type then returns that.
*/
#define lgread(cpu, addr, type) \
({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
/* This checks that the variable is of the given type, then writes it out. */
#define lgwrite(cpu, addr, type, val) \
do { \
typecheck(type, val); \
__lgwrite((cpu), (addr), &(val), sizeof(val)); \
} while(0)
/* (end of memory access helper routines) :*/
int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
/*
* Helper macros to obtain the first 12 or the last 20 bits, this is only the
* first step in the migration to the kernel types. pte_pfn is already defined
* in the kernel.
*/
#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
/* interrupts_and_traps.c: */
unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
u32 low, u32 hi);
void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
void pin_stack_pages(struct lg_cpu *cpu);
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def);
void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
bool could_be_syscall(unsigned int num);
int init_interrupts(void);
void free_interrupts(void);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
void setup_guest_gdt(struct lg_cpu *cpu);
void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
u32 low, u32 hi);
void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
/* page_tables.c: */
int init_guest_pagetable(struct lguest *lg);
void free_guest_pagetable(struct lguest *lg);
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
#ifdef CONFIG_X86_PAE
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
#endif
void guest_pagetable_clear_all(struct lg_cpu *cpu);
void guest_pagetable_flush_user(struct lg_cpu *cpu);
void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
unsigned long vaddr, pte_t val);
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
unsigned long *iomem);
void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
void page_table_guest_data_init(struct lg_cpu *cpu);
/* <arch>/core.c: */
void lguest_arch_host_init(void);
void lguest_arch_host_fini(void);
void lguest_arch_run_guest(struct lg_cpu *cpu);
void lguest_arch_handle_trap(struct lg_cpu *cpu);
int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
/* <arch>/switcher.S: */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
/* lguest_user.c: */
int lguest_device_init(void);
void lguest_device_remove(void);
/* hypercalls.c: */
void do_hypercalls(struct lg_cpu *cpu);
void write_timestamp(struct lg_cpu *cpu);
/*L:035
* Let's step aside for the moment, to study one important routine that's used
* widely in the Host code.
*
* There are many cases where the Guest can do something invalid, like pass crap
* to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
* acceptable to simply terminate the Guest and give the Launcher a nicely
* formatted reason. It's also simpler for the Guest itself, which doesn't
* need to check most hypercalls for "success"; if you're still running, it
* succeeded.
*
* Once this is called, the Guest will never run again, so most Host code can
* call this then continue as if nothing had happened. This means many
* functions don't have to explicitly return an error code, which keeps the
* code simple.
*
* It also means that this can be called more than once: only the first one is
* remembered. The only trick is that we still need to kill the Guest even if
* we can't allocate memory to store the reason. Linux has a neat way of
* packing error codes into invalid pointers, so we use that here.
*
* Like any macro which uses an "if", it is safely wrapped in a run-once "do {
* } while(0)".
*/
#define kill_guest(cpu, fmt...) \
do { \
if (!(cpu)->lg->dead) { \
(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
if (!(cpu)->lg->dead) \
(cpu)->lg->dead = ERR_PTR(-ENOMEM); \
} \
} while(0)
/* (End of aside) :*/
#endif /* __ASSEMBLY__ */
#endif /* _LGUEST_H */
/*P:200 This contains all the /dev/lguest code, whereby the userspace
* launcher controls and communicates with the Guest. For example,
* the first write will tell us the Guest's memory layout and entry
* point. A read will run the Guest until something happens, such as
* a signal or the Guest accessing a device.
:*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/export.h>
#include "lg.h"
/*L:052
The Launcher can get the registers, and also set some of them.
*/
static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long which;
/* We re-use the ptrace structure to specify which register to read. */
if (get_user(which, input) != 0)
return -EFAULT;
/*
* We set up the cpu register pointer, and their next read will
* actually get the value (instead of running the guest).
*
* The last argument 'true' says we can access any register.
*/
cpu->reg_read = lguest_arch_regptr(cpu, which, true);
if (!cpu->reg_read)
return -ENOENT;
/* And because this is a write() call, we return the length used. */
return sizeof(unsigned long) * 2;
}
static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long which, value, *reg;
/* We re-use the ptrace structure to specify which register to read. */
if (get_user(which, input) != 0)
return -EFAULT;
input++;
if (get_user(value, input) != 0)
return -EFAULT;
/* The last argument 'false' means we can't access all registers. */
reg = lguest_arch_regptr(cpu, which, false);
if (!reg)
return -ENOENT;
*reg = value;
/* And because this is a write() call, we return the length used. */
return sizeof(unsigned long) * 3;
}
/*L:050
* Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest.
*/
static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long irq;
if (get_user(irq, input) != 0)
return -EFAULT;
if (irq >= LGUEST_IRQS)
return -EINVAL;
/*
* Next time the Guest runs, the core code will see if it can deliver
* this interrupt.
*/
set_interrupt(cpu, irq);
return 0;
}
/*L:053
* Deliver a trap: this is used by the Launcher if it can't emulate
* an instruction.
*/
static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long trapnum;
if (get_user(trapnum, input) != 0)
return -EFAULT;
if (!deliver_trap(cpu, trapnum))
return -EINVAL;
return 0;
}
/*L:040
* Once our Guest is initialized, the Launcher makes it run by reading
* from /dev/lguest.
*/
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
{
struct lguest *lg = file->private_data;
struct lg_cpu *cpu;
unsigned int cpu_id = *o;
/* You must write LHREQ_INITIALIZE first! */
if (!lg)
return -EINVAL;
/* Watch out for arbitrary vcpu indexes! */
if (cpu_id >= lg->nr_cpus)
return -EINVAL;
cpu = &lg->cpus[cpu_id];
/* If you're not the task which owns the Guest, go away. */
if (current != cpu->tsk)
return -EPERM;
/* If the Guest is already dead, we indicate why */
if (lg->dead) {
size_t len;
/* lg->dead either contains an error code, or a string. */
if (IS_ERR(lg->dead))
return PTR_ERR(lg->dead);
/* We can only return as much as the buffer they read with. */
len = min(size, strlen(lg->dead)+1);
if (copy_to_user(user, lg->dead, len) != 0)
return -EFAULT;
return len;
}
/*
* If we returned from read() last time because the Guest sent I/O,
* clear the flag.
*/
if (cpu->pending.trap)
cpu->pending.trap = 0;
/* Run the Guest until something interesting happens. */
return run_guest(cpu, (unsigned long __user *)user);
}
/*L:025
* This actually initializes a CPU. For the moment, a Guest is only
* uniprocessor, so "id" is always 0.
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
/* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
init_clockdev(cpu);
/*
* We need a complete page for the Guest registers: they are accessible
* to the Guest and we can only grant it access to whole pages.
*/
cpu->regs_page = get_zeroed_page(GFP_KERNEL);
if (!cpu->regs_page)
return -ENOMEM;
/* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
* Now we initialize the Guest's registers, handing it the start
* address.
*/
lguest_arch_setup_regs(cpu, start_ip);
/*
* We keep a pointer to the Launcher task (ie. current task) for when
* other Guests want to wake this one (eg. console input).
*/
cpu->tsk = current;
/*
* We need to keep a pointer to the Launcher's memory map, because if
* the Launcher dies we need to clean it up. If we don't keep a
* reference, it is destroyed before close() is called.
*/
cpu->mm = get_task_mm(cpu->tsk);
/*
* We remember which CPU's pages this Guest used last, for optimization
* when the same Guest runs on the same CPU twice.
*/
cpu->last_pages = NULL;
/* No error == success. */
return 0;
}
/*L:020
* The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
* addition to the LHREQ_INITIALIZE value). These are:
*
* base: The start of the Guest-physical memory inside the Launcher memory.
*
* pfnlimit: The highest (Guest-physical) page number the Guest should be
* allowed to access. The Guest memory lives inside the Launcher, so it sets
* this to ensure the Guest can only reach its own memory.
*
* start: The first instruction to execute ("eip" in x86-speak).
*/
static int initialize(struct file *file, const unsigned long __user *input)
{
/* "struct lguest" contains all we (the Host) know about a Guest. */
struct lguest *lg;
int err;
unsigned long args[4];
/*
* We grab the Big Lguest lock, which protects against multiple
* simultaneous initializations.
*/
mutex_lock(&lguest_lock);
/* You can't initialize twice! Close the device and start again... */
if (file->private_data) {
err = -EBUSY;
goto unlock;
}
if (copy_from_user(args, input, sizeof(args)) != 0) {
err = -EFAULT;
goto unlock;
}
lg = kzalloc(sizeof(*lg), GFP_KERNEL);
if (!lg) {
err = -ENOMEM;
goto unlock;
}
/* Populate the easy fields of our "struct lguest" */
lg->mem_base = (void __user *)args[0];
lg->pfn_limit = args[1];
lg->device_limit = args[3];
/* This is the first cpu (cpu 0) and it will start booting at args[2] */
err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
if (err)
goto free_lg;
/*
* Initialize the Guest's shadow page tables. This allocates
* memory, so can fail.
*/
err = init_guest_pagetable(lg);
if (err)
goto free_regs;
/* We keep our "struct lguest" in the file's private_data. */
file->private_data = lg;
mutex_unlock(&lguest_lock);
/* And because this is a write() call, we return the length used. */
return sizeof(args);
free_regs:
/* FIXME: This should be in free_vcpu */
free_page(lg->cpus[0].regs_page);
free_lg:
kfree(lg);
unlock:
mutex_unlock(&lguest_lock);
return err;
}
/*L:010
* The first operation the Launcher does must be a write. All writes
* start with an unsigned long number: for the first write this must be
* LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
* writes of other values to send interrupts or set up receipt of notifications.
*
* Note that we overload the "offset" in the /dev/lguest file to indicate what
* CPU number we're dealing with. Currently this is always 0 since we only
* support uniprocessor Guests, but you can see the beginnings of SMP support
* here.
*/
static ssize_t write(struct file *file, const char __user *in,
size_t size, loff_t *off)
{
/*
* Once the Guest is initialized, we hold the "struct lguest" in the
* file private data.
*/
struct lguest *lg = file->private_data;
const unsigned long __user *input = (const unsigned long __user *)in;
unsigned long req;
struct lg_cpu *uninitialized_var(cpu);
unsigned int cpu_id = *off;
/* The first value tells us what this request is. */
if (get_user(req, input) != 0)
return -EFAULT;
input++;
/* If you haven't initialized, you must do that first. */
if (req != LHREQ_INITIALIZE) {
if (!lg || (cpu_id >= lg->nr_cpus))
return -EINVAL;
cpu = &lg->cpus[cpu_id];
/* Once the Guest is dead, you can only read() why it died. */
if (lg->dead)
return -ENOENT;
}
switch (req) {
case LHREQ_INITIALIZE:
return initialize(file, input);
case LHREQ_IRQ:
return user_send_irq(cpu, input);
case LHREQ_GETREG:
return getreg_setup(cpu, input);
case LHREQ_SETREG:
return setreg(cpu, input);
case LHREQ_TRAP:
return trap(cpu, input);
default:
return -EINVAL;
}
}
static int open(struct inode *inode, struct file *file)
{
file->private_data = NULL;
return 0;
}
/*L:060
* The final piece of interface code is the close() routine. It reverses
* everything done in initialize(). This is usually called because the
* Launcher exited.
*
* Note that the close routine returns 0 or a negative error number: it can't
* really fail, but it can whine. I blame Sun for this wart, and K&R C for
* letting them do it.
:*/
static int close(struct inode *inode, struct file *file)
{
struct lguest *lg = file->private_data;
unsigned int i;
/* If we never successfully initialized, there's nothing to clean up */
if (!lg)
return 0;
/*
* We need the big lock, to protect from inter-guest I/O and other
* Launchers initializing guests.
*/
mutex_lock(&lguest_lock);
/* Free up the shadow page tables for the Guest. */
free_guest_pagetable(lg);
for (i = 0; i < lg->nr_cpus; i++) {
/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
hrtimer_cancel(&lg->cpus[i].hrt);
/* We can free up the register page we allocated. */
free_page(lg->cpus[i].regs_page);
/*
* Now all the memory cleanups are done, it's safe to release
* the Launcher's memory management structure.
*/
mmput(lg->cpus[i].mm);
}
/*
* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree().
*/
if (!IS_ERR(lg->dead))
kfree(lg->dead);
/* Free the memory allocated to the lguest_struct */
kfree(lg);
/* Release lock and exit. */
mutex_unlock(&lguest_lock);
return 0;
}
/*L:000
* Welcome to our journey through the Launcher!
*
* The Launcher is the Host userspace program which sets up, runs and services
* the Guest. In fact, many comments in the Drivers which refer to "the Host"
* doing things are inaccurate: the Launcher does all the device handling for
* the Guest, but the Guest can't know that.
*
* Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
* shall see more of that later.
*
* We begin our understanding with the Host kernel interface which the Launcher
* uses: reading and writing a character device called /dev/lguest. All the
* work happens in the read(), write() and close() routines:
*/
static const struct file_operations lguest_fops = {
.owner = THIS_MODULE,
.open = open,
.release = close,
.write = write,
.read = read,
.llseek = default_llseek,
};
/*:*/
/*
* This is a textbook example of a "misc" character device. Populate a "struct
* miscdevice" and register it with misc_register().
*/
static struct miscdevice lguest_dev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "lguest",
.fops = &lguest_fops,
};
int __init lguest_device_init(void)
{
return misc_register(&lguest_dev);
}
void __exit lguest_device_remove(void)
{
misc_deregister(&lguest_dev);
}
此差异已折叠。
/*P:600
* The x86 architecture has segments, which involve a table of descriptors
* which can be used to do funky things with virtual address interpretation.
* We originally used to use segments so the Guest couldn't alter the
* Guest<->Host Switcher, and then we had to trim Guest segments, and restore
* for userspace per-thread segments, but trim again for on userspace->kernel
* transitions... This nightmarish creation was contained within this file,
* where we knew not to tread without heavy armament and a change of underwear.
*
* In these modern times, the segment handling code consists of simple sanity
* checks, and the worst you'll experience reading this code is butterfly-rash
* from frolicking through its parklike serenity.
:*/
#include "lg.h"
/*H:600
* Segments & The Global Descriptor Table
*
* (That title sounds like a bad Nerdcore group. Not to suggest that there are
* any good Nerdcore groups, but in high school a friend of mine had a band
* called Joe Fish and the Chips, so there are definitely worse band names).
*
* To refresh: the GDT is a table of 8-byte values describing segments. Once
* set up, these segments can be loaded into one of the 6 "segment registers".
*
* GDT entries are passed around as "struct desc_struct"s, which like IDT
* entries are split into two 32-bit members, "a" and "b". One day, someone
* will clean that up, and be declared a Hero. (No pressure, I'm just saying).
*
* Anyway, the GDT entry contains a base (the start address of the segment), a
* limit (the size of the segment - 1), and some flags. Sounds simple, and it
* would be, except those zany Intel engineers decided that it was too boring
* to put the base at one end, the limit at the other, and the flags in
* between. They decided to shotgun the bits at random throughout the 8 bytes,
* like so:
*
* 0 16 40 48 52 56 63
* [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
* mit ags part 2
* part 2
*
* As a result, this file contains a certain amount of magic numeracy. Let's
* begin.
*/
/*
* There are several entries we don't let the Guest set. The TSS entry is the
* "Task State Segment" which controls all kinds of delicate things. The
* LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
* the Guest can't be trusted to deal with double faults.
*/
static bool ignored_gdt(unsigned int num)
{
return (num == GDT_ENTRY_TSS
|| num == GDT_ENTRY_LGUEST_CS
|| num == GDT_ENTRY_LGUEST_DS
|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
}
/*H:630
* Once the Guest gave us new GDT entries, we fix them up a little. We
* don't care if they're invalid: the worst that can happen is a General
* Protection Fault in the Switcher when it restores a Guest segment register
* which tries to use that entry. Then we kill the Guest for causing such a
* mess: the message will be "unhandled trap 256".
*/
static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
{
unsigned int i;
for (i = start; i < end; i++) {
/*
* We never copy these ones to real GDT, so we don't care what
* they say
*/
if (ignored_gdt(i))
continue;
/*
* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here.
*/
if (cpu->arch.gdt[i].dpl == 0)
cpu->arch.gdt[i].dpl |= GUEST_PL;
/*
* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen.
*/
cpu->arch.gdt[i].type |= 0x1;
}
}
/*H:610
* Like the IDT, we never simply use the GDT the Guest gives us. We keep
* a GDT for each CPU, and copy across the Guest's entries each time we want to
* run the Guest on that CPU.
*
* This routine is called at boot or modprobe time for each CPU to set up the
* constant GDT entries: the ones which are the same no matter what Guest we're
* running.
*/
void setup_default_gdt_entries(struct lguest_ro_state *state)
{
struct desc_struct *gdt = state->guest_gdt;
unsigned long tss = (unsigned long)&state->guest_tss;
/* The Switcher segments are full 0-4G segments, privilege level 0 */
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
/*
* The TSS segment refers to the TSS entry for this particular CPU.
*/
gdt[GDT_ENTRY_TSS].a = 0;
gdt[GDT_ENTRY_TSS].b = 0;
gdt[GDT_ENTRY_TSS].limit0 = 0x67;
gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF;
gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF;
gdt[GDT_ENTRY_TSS].base2 = tss >> 24;
gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */
gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */
gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */
gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */
}
/*
* This routine sets up the initial Guest GDT for booting. All entries start
* as 0 (unusable).
*/
void setup_guest_gdt(struct lg_cpu *cpu)
{
/*
* Start with full 0-4G segments...except the Guest is allowed to use
* them, so set the privilege level appropriately in the flags.
*/
cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
}
/*H:650
* An optimization of copy_gdt(), for just the three "thead-local storage"
* entries.
*/
void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
gdt[i] = cpu->arch.gdt[i];
}
/*H:640
* When the Guest is run on a different CPU, or the GDT entries have changed,
* copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
* GDT.
*/
void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
/*
* The default entries from setup_default_gdt_entries() are not
* replaced. See ignored_gdt() above.
*/
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
gdt[i] = cpu->arch.gdt[i];
}
/*H:620
* This is where the Guest asks us to load a new GDT entry
* (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in.
*/
void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
{
/*
* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT.
*/
if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
kill_guest(cpu, "too many gdt entries %i", num);
return;
}
/* Set it up, then fix it. */
cpu->arch.gdt[num].a = lo;
cpu->arch.gdt[num].b = hi;
fixup_gdt_table(cpu, num, num+1);
/*
* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU.
*/
cpu->changed |= CHANGED_GDT;
}
/*
* This is the fast-track version for just changing the three TLS entries.
* Remember that this happens on every context switch, so it's worth
* optimizing. But wouldn't it be neater to have a single hypercall to cover
* both cases?
*/
void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
{
struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
/* Note that just the TLS entries have changed. */
cpu->changed |= CHANGED_GDT_TLS;
}
/*H:660
* With this, we have finished the Host.
*
* Five of the seven parts of our task are complete. You have made it through
* the Bit of Despair (I think that's somewhere in the page table code,
* myself).
*
* Next, we examine "make Switcher". It's short, but intense.
*/
此差异已折叠。
/*P:900
* This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
* both the Host and Guest to do the low-level Guest<->Host switch. It is as
* simple as it can be made, but it's naturally very specific to x86.
*
* You have now completed Preparation. If this has whet your appetite; if you
* are feeling invigorated and refreshed then the next, more challenging stage
* can be found in "make Guest".
:*/
/*M:012
* Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
* gain at least 1% more performance. Since neither LOC nor performance can be
* measured beforehand, it generally means implementing a feature then deciding
* if it's worth it. And once it's implemented, who can say no?
*
* This is why I haven't implemented this idea myself. I want to, but I
* haven't. You could, though.
*
* The main place where lguest performance sucks is Guest page faulting. When
* a Guest userspace process hits an unmapped page we switch back to the Host,
* walk the page tables, find it's not mapped, switch back to the Guest page
* fault handler, which calls a hypercall to set the page table entry, then
* finally returns to userspace. That's two round-trips.
*
* If we had a small walker in the Switcher, we could quickly check the Guest
* page table and if the page isn't mapped, immediately reflect the fault back
* into the Guest. This means the Switcher would have to know the top of the
* Guest page table and the page fault handler address.
*
* For simplicity, the Guest should only handle the case where the privilege
* level of the fault is 3 and probably only not present or write faults. It
* should also detect recursive faults, and hand the original fault to the
* Host (which is actually really easy).
*
* Two questions remain. Would the performance gain outweigh the complexity?
* And who would write the verse documenting it?
:*/
/*M:011
* Lguest64 handles NMI. This gave me NMI envy (until I looked at their
* code). It's worth doing though, since it would let us use oprofile in the
* Host when a Guest is running.
:*/
/*S:100
* Welcome to the Switcher itself!
*
* This file contains the low-level code which changes the CPU to run the Guest
* code, and returns to the Host when something happens. Understand this, and
* you understand the heart of our journey.
*
* Because this is in assembler rather than C, our tale switches from prose to
* verse. First I tried limericks:
*
* There once was an eax reg,
* To which our pointer was fed,
* It needed an add,
* Which asm-offsets.h had
* But this limerick is hurting my head.
*
* Next I tried haikus, but fitting the required reference to the seasons in
* every stanza was quickly becoming tiresome:
*
* The %eax reg
* Holds "struct lguest_pages" now:
* Cherry blossoms fall.
*
* Then I started with Heroic Verse, but the rhyming requirement leeched away
* the content density and led to some uniquely awful oblique rhymes:
*
* These constants are coming from struct offsets
* For use within the asm switcher text.
*
* Finally, I settled for something between heroic hexameter, and normal prose
* with inappropriate linebreaks. Anyway, it aint no Shakespeare.
*/
// Not all kernel headers work from assembler
// But these ones are needed: the ENTRY() define
// And constants extracted from struct offsets
// To avoid magic numbers and breakage:
// Should they change the compiler can't save us
// Down here in the depths of assembler code.
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
#include <asm/segment.h>
#include <asm/lguest.h>
// We mark the start of the code to copy
// It's placed in .text tho it's never run here
// You'll see the trick macro at the end
// Which interleaves data and text to effect.
.text
ENTRY(start_switcher_text)
// When we reach switch_to_guest we have just left
// The safe and comforting shores of C code
// %eax has the "struct lguest_pages" to use
// Where we save state and still see it from the Guest
// And %ebx holds the Guest shadow pagetable:
// Once set we have truly left Host behind.
ENTRY(switch_to_guest)
// We told gcc all its regs could fade,
// Clobbered by our journey into the Guest
// We could have saved them, if we tried
// But time is our master and cycles count.
// Segment registers must be saved for the Host
// We push them on the Host stack for later
pushl %es
pushl %ds
pushl %gs
pushl %fs
// But the compiler is fickle, and heeds
// No warning of %ebp clobbers
// When frame pointers are used. That register
// Must be saved and restored or chaos strikes.
pushl %ebp
// The Host's stack is done, now save it away
// In our "struct lguest_pages" at offset
// Distilled into asm-offsets.h
movl %esp, LGUEST_PAGES_host_sp(%eax)
// All saved and there's now five steps before us:
// Stack, GDT, IDT, TSS
// Then last of all the page tables are flipped.
// Yet beware that our stack pointer must be
// Always valid lest an NMI hits
// %edx does the duty here as we juggle
// %eax is lguest_pages: our stack lies within.
movl %eax, %edx
addl $LGUEST_PAGES_regs, %edx
movl %edx, %esp
// The Guest's GDT we so carefully
// Placed in the "struct lguest_pages" before
lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
// The Guest's IDT we did partially
// Copy to "struct lguest_pages" as well.
lidt LGUEST_PAGES_guest_idt_desc(%eax)
// The TSS entry which controls traps
// Must be loaded up with "ltr" now:
// The GDT entry that TSS uses
// Changes type when we load it: damn Intel!
// For after we switch over our page tables
// That entry will be read-only: we'd crash.
movl $(GDT_ENTRY_TSS*8), %edx
ltr %dx
// Look back now, before we take this last step!
// The Host's TSS entry was also marked used;
// Let's clear it again for our return.
// The GDT descriptor of the Host
// Points to the table after two "size" bytes
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
// Clear "used" from type field (byte 5, bit 2)
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
// Once our page table's switched, the Guest is live!
// The Host fades as we run this final step.
// Our "struct lguest_pages" is now read-only.
movl %ebx, %cr3
// The page table change did one tricky thing:
// The Guest's register page has been mapped
// Writable under our %esp (stack) --
// We can simply pop off all Guest regs.
popl %eax
popl %ebx
popl %ecx
popl %edx
popl %esi
popl %edi
popl %ebp
popl %gs
popl %fs
popl %ds
popl %es
// Near the base of the stack lurk two strange fields
// Which we fill as we exit the Guest
// These are the trap number and its error
// We can simply step past them on our way.
addl $8, %esp
// The last five stack slots hold return address
// And everything needed to switch privilege
// From Switcher's level 0 to Guest's 1,
// And the stack where the Guest had last left it.
// Interrupts are turned back on: we are Guest.
iret
// We tread two paths to switch back to the Host
// Yet both must save Guest state and restore Host
// So we put the routine in a macro.
#define SWITCH_TO_HOST \
/* We save the Guest state: all registers first \
* Laid out just as "struct lguest_regs" defines */ \
pushl %es; \
pushl %ds; \
pushl %fs; \
pushl %gs; \
pushl %ebp; \
pushl %edi; \
pushl %esi; \
pushl %edx; \
pushl %ecx; \
pushl %ebx; \
pushl %eax; \
/* Our stack and our code are using segments \
* Set in the TSS and IDT \
* Yet if we were to touch data we'd use \
* Whatever data segment the Guest had. \
* Load the lguest ds segment for now. */ \
movl $(LGUEST_DS), %eax; \
movl %eax, %ds; \
/* So where are we? Which CPU, which struct? \
* The stack is our clue: our TSS starts \
* It at the end of "struct lguest_pages". \
* Or we may have stumbled while restoring \
* Our Guest segment regs while in switch_to_guest, \
* The fault pushed atop that part-unwound stack. \
* If we round the stack down to the page start \
* We're at the start of "struct lguest_pages". */ \
movl %esp, %eax; \
andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
/* Save our trap number: the switch will obscure it \
* (In the Host the Guest regs are not mapped here) \
* %ebx holds it safe for deliver_to_host */ \
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
/* The Host GDT, IDT and stack! \
* All these lie safely hidden from the Guest: \
* We must return to the Host page tables \
* (Hence that was saved in struct lguest_pages) */ \
movl LGUEST_PAGES_host_cr3(%eax), %edx; \
movl %edx, %cr3; \
/* As before, when we looked back at the Host \
* As we left and marked TSS unused \
* So must we now for the Guest left behind. */ \
andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
/* Switch to Host's GDT, IDT. */ \
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
lidt LGUEST_PAGES_host_idt_desc(%eax); \
/* Restore the Host's stack where its saved regs lie */ \
movl LGUEST_PAGES_host_sp(%eax), %esp; \
/* Last the TSS: our Host is returned */ \
movl $(GDT_ENTRY_TSS*8), %edx; \
ltr %dx; \
/* Restore now the regs saved right at the first. */ \
popl %ebp; \
popl %fs; \
popl %gs; \
popl %ds; \
popl %es
// The first path is trod when the Guest has trapped:
// (Which trap it was has been pushed on the stack).
// We need only switch back, and the Host will decode
// Why we came home, and what needs to be done.
return_to_host:
SWITCH_TO_HOST
iret
// We are lead to the second path like so:
// An interrupt, with some cause external
// Has ajerked us rudely from the Guest's code
// Again we must return home to the Host
deliver_to_host:
SWITCH_TO_HOST
// But now we must go home via that place
// Where that interrupt was supposed to go
// Had we not been ensconced, running the Guest.
// Here we see the trickness of run_guest_once():
// The Host stack is formed like an interrupt
// With EIP, CS and EFLAGS layered.
// Interrupt handlers end with "iret"
// And that will take us home at long long last.
// But first we must find the handler to call!
// The IDT descriptor for the Host
// Has two bytes for size, and four for address:
// %edx will hold it for us for now.
movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
// We now know the table address we need,
// And saved the trap's number inside %ebx.
// Yet the pointer to the handler is smeared
// Across the bits of the table entry.
// What oracle can tell us how to extract
// From such a convoluted encoding?
// I consulted gcc, and it gave
// These instructions, which I gladly credit:
leal (%edx,%ebx,8), %eax
movzwl (%eax),%edx
movl 4(%eax), %eax
xorw %ax, %ax
orl %eax, %edx
// Now the address of the handler's in %edx
// We call it now: its "iret" drops us home.
jmp *%edx
// Every interrupt can come to us here
// But we must truly tell each apart.
// They number two hundred and fifty six
// And each must land in a different spot,
// Push its number on stack, and join the stream.
// And worse, a mere six of the traps stand apart
// And push on their stack an addition:
// An error number, thirty two bits long
// So we punish the other two fifty
// And make them push a zero so they match.
// Yet two fifty six entries is long
// And all will look most the same as the last
// So we create a macro which can make
// As many entries as we need to fill.
// Note the change to .data then .text:
// We plant the address of each entry
// Into a (data) table for the Host
// To know where each Guest interrupt should go.
.macro IRQ_STUB N TARGET
.data; .long 1f; .text; 1:
// Trap eight, ten through fourteen and seventeen
// Supply an error number. Else zero.
.if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
pushl $0
.endif
pushl $\N
jmp \TARGET
ALIGN
.endm
// This macro creates numerous entries
// Using GAS macros which out-power C's.
.macro IRQ_STUBS FIRST LAST TARGET
irq=\FIRST
.rept \LAST-\FIRST+1
IRQ_STUB irq \TARGET
irq=irq+1
.endr
.endm
// Here's the marker for our pointer table
// Laid in the data section just before
// Each macro places the address of code
// Forming an array: each one points to text
// Which handles interrupt in its turn.
.data
.global default_idt_entries
default_idt_entries:
.text
// The first two traps go straight back to the Host
IRQ_STUBS 0 1 return_to_host
// We'll say nothing, yet, about NMI
IRQ_STUB 2 handle_nmi
// Other traps also return to the Host
IRQ_STUBS 3 31 return_to_host
// All interrupts go via their handlers
IRQ_STUBS 32 127 deliver_to_host
// 'Cept system calls coming from userspace
// Are to go to the Guest, never the Host.
IRQ_STUB 128 return_to_host
IRQ_STUBS 129 255 deliver_to_host
// The NMI, what a fabulous beast
// Which swoops in and stops us no matter that
// We're suspended between heaven and hell,
// (Or more likely between the Host and Guest)
// When in it comes! We are dazed and confused
// So we do the simplest thing which one can.
// Though we've pushed the trap number and zero
// We discard them, return, and hope we live.
handle_nmi:
addl $8, %esp
iret
// We are done; all that's left is Mastery
// And "make Mastery" is a journey long
// Designed to make your fingers itch to code.
// Here ends the text, the file and poem.
ENTRY(end_switcher_text)
......@@ -333,7 +333,7 @@ config VIRTIO_NET
depends on VIRTIO
---help---
This is the virtual network driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
QEMU based VMMs (like KVM or Xen). Say Y or M.
config NLMON
tristate "Virtual netlink monitoring device"
......
......@@ -4,7 +4,7 @@ config HVC_DRIVER
bool
help
Generic "hypervisor virtual console" infrastructure for various
hypervisors (pSeries, iSeries, Xen, lguest).
hypervisors (pSeries, iSeries, Xen).
It will automatically be selected if one of the back-end console drivers
is selected.
......
......@@ -2,8 +2,8 @@ config VIRTIO
tristate
---help---
This option is selected by any driver which implements the virtio
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
CONFIG_RPMSG or CONFIG_S390_GUEST.
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
or CONFIG_S390_GUEST.
menu "Virtio drivers"
......
/*
* Things the lguest guest needs to know. Note: like all lguest interfaces,
* this is subject to wild and random change between versions.
*/
#ifndef _LINUX_LGUEST_H
#define _LINUX_LGUEST_H
#ifndef __ASSEMBLY__
#include <linux/time.h>
#include <asm/irq.h>
#include <asm/lguest_hcall.h>
#define LG_CLOCK_MIN_DELTA 100UL
#define LG_CLOCK_MAX_DELTA ULONG_MAX
/*G:031
* The second method of communicating with the Host is to via "struct
* lguest_data". Once the Guest's initialization hypercall tells the Host where
* this is, the Guest and Host both publish information in it.
:*/
struct lguest_data {
/*
* 512 == enabled (same as eflags in normal hardware). The Guest
* changes interrupts so often that a hypercall is too slow.
*/
unsigned int irq_enabled;
/* Fine-grained interrupt disabling by the Guest */
DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
/*
* The Host writes the virtual address of the last page fault here,
* which saves the Guest a hypercall. CR2 is the native register where
* this address would normally be found.
*/
unsigned long cr2;
/* Wallclock time set by the Host. */
struct timespec time;
/*
* Interrupt pending set by the Host. The Guest should do a hypercall
* if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
*/
int irq_pending;
/*
* Async hypercall ring. Instead of directly making hypercalls, we can
* place them in here for processing the next time the Host wants.
* This batching can be quite efficient.
*/
/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
u8 hcall_status[LHCALL_RING_SIZE];
/* The actual registers for the hypercalls. */
struct hcall_args hcalls[LHCALL_RING_SIZE];
/* Fields initialized by the Host at boot: */
/* Memory not to try to access */
unsigned long reserve_mem;
/* KHz for the TSC clock. */
u32 tsc_khz;
/* Fields initialized by the Guest at boot: */
/* Instruction to suppress interrupts even if enabled */
unsigned long noirq_iret;
/* Address above which page tables are all identical. */
unsigned long kernel_address;
/* The vector to try to use for system calls (0x40 or 0x80). */
unsigned int syscall_vec;
};
extern struct lguest_data lguest_data;
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_LGUEST_H */
此差异已折叠。
#ifndef _UAPI_LINUX_VIRTIO_RING_H
#define _UAPI_LINUX_VIRTIO_RING_H
/* An interface for efficient virtio implementation, currently for use by KVM
* and lguest, but hopefully others soon. Do NOT change this since it will
/* An interface for efficient virtio implementation, currently for use by KVM,
* but hopefully others soon. Do NOT change this since it will
* break existing servers and clients.
*
* This header is BSD licensed so anyone can use the definitions to implement
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册