提交 5cc97bf2 编写于 作者: L Linus Torvalds

Merge branch 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen

* 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (44 commits)
  xen: disable all non-virtual drivers
  xen: use iret directly when possible
  xen: suppress abs symbol warnings for unused reloc pointers
  xen: Attempt to patch inline versions of common operations
  xen: Place vcpu_info structure into per-cpu memory
  xen: handle external requests for shutdown, reboot and sysrq
  xen: machine operations
  xen: add virtual network device driver
  xen: add virtual block device driver.
  xen: add the Xenbus sysfs and virtual device hotplug driver
  xen: Add grant table support
  xen: use the hvc console infrastructure for Xen console
  xen: hack to prevent bad segment register reload
  xen: lazy-mmu operations
  xen: Add support for preemption
  xen: SMP guest support
  xen: Implement sched_clock
  xen: Account for stolen time
  xen: ignore RW mapping of RO pages in pagetable_init
  xen: Complete pagetable pinning
  ...
...@@ -222,6 +222,8 @@ config PARAVIRT ...@@ -222,6 +222,8 @@ config PARAVIRT
However, when run without a hypervisor the kernel is However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N. theoretically slower. If in doubt, say N.
source "arch/i386/xen/Kconfig"
config VMI config VMI
bool "VMI Paravirt-ops support" bool "VMI Paravirt-ops support"
depends on PARAVIRT depends on PARAVIRT
......
...@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 ...@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
mcore-$(CONFIG_X86_ES7000) := mach-default mcore-$(CONFIG_X86_ES7000) := mach-default
core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
# Xen paravirtualization support
core-$(CONFIG_XEN) += arch/i386/xen/
# default subarch .h files # default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default mflags-y += -Iinclude/asm-i386/mach-default
......
...@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = { ...@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
"__kernel_rt_sigreturn", "__kernel_rt_sigreturn",
"__kernel_sigreturn", "__kernel_sigreturn",
"SYSENTER_RETURN", "SYSENTER_RETURN",
"xen_irq_disable_direct_reloc",
"xen_save_fl_direct_reloc",
}; };
static int is_safe_abs_reloc(const char* sym_name) static int is_safe_abs_reloc(const char* sym_name)
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <asm/thread_info.h> #include <asm/thread_info.h>
#include <asm/elf.h> #include <asm/elf.h>
#include <xen/interface/xen.h>
#define DEFINE(sym, val) \ #define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val)) asm volatile("\n->" #sym " %0 " #val : : "i" (val))
...@@ -59,6 +61,7 @@ void foo(void) ...@@ -59,6 +61,7 @@ void foo(void)
OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_sysenter_return, thread_info, sysenter_return);
OFFSET(TI_cpu, thread_info, cpu);
BLANK(); BLANK();
OFFSET(GDS_size, Xgt_desc_struct, size); OFFSET(GDS_size, Xgt_desc_struct, size);
...@@ -115,4 +118,10 @@ void foo(void) ...@@ -115,4 +118,10 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif #endif
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
} }
...@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper) ...@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC CFI_ENDPROC
ENDPROC(kernel_thread_helper) ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
TRACE_IRQS_OFF
/* Check to see if we got the event in the critical
region in xen_iret_direct, after we've reenabled
events and checked for pending events. This simulates
iret instruction's behaviour where it delivers a
pending interrupt when enabling interrupts. */
movl PT_EIP(%esp),%eax
cmpl $xen_iret_start_crit,%eax
jb 1f
cmpl $xen_iret_end_crit,%eax
jae 1f
call xen_iret_crit_fixup
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
.section .fixup,"ax"
6: xorl %eax,%eax
movl %eax,4(%esp)
jmp 1b
7: xorl %eax,%eax
movl %eax,8(%esp)
jmp 2b
8: xorl %eax,%eax
movl %eax,12(%esp)
jmp 3b
9: xorl %eax,%eax
movl %eax,16(%esp)
jmp 4b
.previous
.section __ex_table,"a"
.align 4
.long 1b,6b
.long 2b,7b
.long 3b,8b
.long 4b,9b
.previous
ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
.section .rodata,"a" .section .rodata,"a"
#include "syscall_table.S" #include "syscall_table.S"
......
...@@ -510,7 +510,8 @@ ENTRY(_stext) ...@@ -510,7 +510,8 @@ ENTRY(_stext)
/* /*
* BSS section * BSS section
*/ */
.section ".bss.page_aligned","w" .section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir) ENTRY(swapper_pg_dir)
.fill 1024,4,0 .fill 1024,4,0
ENTRY(swapper_pg_pmd) ENTRY(swapper_pg_pmd)
...@@ -538,6 +539,8 @@ fault_msg: ...@@ -538,6 +539,8 @@ fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n"
#include "../xen/xen-head.S"
/* /*
* The IDT and GDT 'descriptors' are a strange 48-bit object * The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not * only used by the lidt and lgdt instructions. They are not
......
...@@ -228,6 +228,41 @@ static int __init print_banner(void) ...@@ -228,6 +228,41 @@ static int __init print_banner(void)
} }
core_initcall(print_banner); core_initcall(print_banner);
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
.name = "paravirt-ioport",
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};
static struct resource reserve_iomem = {
.start = 0,
.end = -1,
.name = "paravirt-iomem",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
/*
* Reserve the whole legacy IO space to prevent any legacy drivers
* from wasting time probing for their hardware. This is a fairly
* brute-force approach to disabling all non-virtual drivers.
*
* Note that this must be called very early to have any effect.
*/
int paravirt_disable_iospace(void)
{
int ret;
ret = request_resource(&ioport_resource, &reserve_ioports);
if (ret == 0) {
ret = request_resource(&iomem_resource, &reserve_iomem);
if (ret)
release_resource(&reserve_ioports);
}
return ret;
}
struct paravirt_ops paravirt_ops = { struct paravirt_ops paravirt_ops = {
.name = "bare hardware", .name = "bare hardware",
.paravirt_enabled = 0, .paravirt_enabled = 0,
...@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = { ...@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr_safe, .write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc, .read_tsc = native_read_tsc,
.read_pmc = native_read_pmc, .read_pmc = native_read_pmc,
.get_scheduled_cycles = native_read_tsc, .sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz, .get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc, .load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt, .set_ldt = native_set_ldt,
......
...@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p) ...@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
* NOTE: at this point the bootmem allocator is fully available. * NOTE: at this point the bootmem allocator is fully available.
*/ */
paravirt_post_allocator_init();
dmi_scan_machine(); dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH #ifdef CONFIG_X86_GENERICARCH
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <asm/mtrr.h> #include <asm/mtrr.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <mach_apic.h> #include <mach_apic.h>
/* /*
...@@ -249,13 +250,13 @@ static unsigned long flush_va; ...@@ -249,13 +250,13 @@ static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock); static DEFINE_SPINLOCK(tlbstate_lock);
/* /*
* We cannot call mmdrop() because we are in interrupt context, * We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask. * instead update mm->cpu_vm_mask.
* *
* We need to reload %cr3 since the page tables may be going * We need to reload %cr3 since the page tables may be going
* away from under us.. * away from under us..
*/ */
static inline void leave_mm (unsigned long cpu) void leave_mm(unsigned long cpu)
{ {
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
BUG(); BUG();
......
...@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) ...@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
* a given CPU * a given CPU
*/ */
static void __cpuinit smp_store_cpu_info(int id) void __cpuinit smp_store_cpu_info(int id)
{ {
struct cpuinfo_x86 *c = cpu_data + id; struct cpuinfo_x86 *c = cpu_data + id;
...@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) ...@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
/* representing cpus for which sibling maps can be computed */ /* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map; static cpumask_t cpu_sibling_setup_map;
static inline void void set_cpu_sibling_map(int cpu)
set_cpu_sibling_map(int cpu)
{ {
int i; int i;
struct cpuinfo_x86 *c = cpu_data; struct cpuinfo_x86 *c = cpu_data;
...@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) ...@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
} }
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
static void void remove_siblinginfo(int cpu)
remove_siblinginfo(int cpu)
{ {
int sibling; int sibling;
struct cpuinfo_x86 *c = cpu_data; struct cpuinfo_x86 *c = cpu_data;
......
...@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void) ...@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
* *
* -johnstul@us.ibm.com "math is hard, lets go shopping!" * -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/ */
static unsigned long cyc2ns_scale __read_mostly; unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
...@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) ...@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
} }
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
/* /*
* Scheduler clock - returns current time in nanosec units. * Scheduler clock - returns current time in nanosec units.
*/ */
unsigned long long sched_clock(void) unsigned long long native_sched_clock(void)
{ {
unsigned long long this_offset; unsigned long long this_offset;
...@@ -118,12 +113,24 @@ unsigned long long sched_clock(void) ...@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */ /* read the Time Stamp Counter: */
get_scheduled_cycles(this_offset); rdtscll(this_offset);
/* return the value in ns */ /* return the value in ns */
return cycles_2_ns(this_offset); return cycles_2_ns(this_offset);
} }
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
#else
unsigned long long sched_clock(void)
__attribute__((alias("native_sched_clock")));
#endif
unsigned long native_calculate_cpu_khz(void) unsigned long native_calculate_cpu_khz(void)
{ {
unsigned long long start, end; unsigned long long start, end;
......
...@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) ...@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
} }
#endif #endif
static void vmi_allocate_pt(u32 pfn) static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{ {
vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
...@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) ...@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif #endif
paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; paravirt_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz; paravirt_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */ /* We have true wallclock functions; disable CMOS clock sync */
......
...@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now) ...@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
return 0; return 0;
} }
/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ /* paravirt_ops.sched_clock = vmi_sched_clock */
unsigned long long vmi_get_sched_cycles(void) unsigned long long vmi_sched_clock(void)
{ {
return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
} }
/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
......
...@@ -88,6 +88,7 @@ SECTIONS ...@@ -88,6 +88,7 @@ SECTIONS
. = ALIGN(4096); . = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
*(.data.idt) *(.data.idt)
} }
......
...@@ -3,23 +3,40 @@ ...@@ -3,23 +3,40 @@
* Here we can supply some information useful to userland. * Here we can supply some information useful to userland.
*/ */
#include <linux/uts.h>
#include <linux/version.h> #include <linux/version.h>
#include <linux/elfnote.h>
#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ /* Ideally this would use UTS_NAME, but using a quoted string here
.section name, flags; \ doesn't work. Remember to change this when changing the
.balign 4; \ kernel's name. */
.long 1f - 0f; /* name length */ \ ELFNOTE_START(Linux, 0, "a")
.long 3f - 2f; /* data length */ \ .long LINUX_VERSION_CODE
.long type; /* note type */ \ ELFNOTE_END
0: .asciz vendor; /* vendor name */ \
1: .balign 4; \
2:
#define ASM_ELF_NOTE_END \ #ifdef CONFIG_XEN
3: .balign 4; /* pad out section */ \
.previous
ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) /*
.long LINUX_VERSION_CODE * Add a special note telling glibc's dynamic linker a fake hardware
ASM_ELF_NOTE_END * flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*/
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1
ELFNOTE_START(GNU, 2, "a")
.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT /* ncaps, mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif
...@@ -52,7 +52,7 @@ execute(const char *string) ...@@ -52,7 +52,7 @@ execute(const char *string)
NULL, NULL,
}; };
if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
string, ret); string, ret);
} }
......
...@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) ...@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0)); BUG_ON(page_table != pte_offset_kernel(pmd, 0));
} }
...@@ -473,6 +473,7 @@ void zap_low_mappings (void) ...@@ -473,6 +473,7 @@ void zap_low_mappings (void)
static int disable_nx __initdata = 0; static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* /*
* noexec = on|off * noexec = on|off
......
...@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, ...@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
address = __pa(address); address = __pa(address);
addr = address & LARGE_PAGE_MASK; addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base); pbase = (pte_t *)page_address(base);
paravirt_alloc_pt(page_to_pfn(base)); paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot)); addr == address ? prot : ref_prot));
......
#
# This Kconfig describes xen options
#
config XEN
bool "Enable support for Xen hypervisor"
depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
events.o time.o manage.o xen-asm.o
obj-$(CONFIG_SMP) += smp.o
此差异已折叠。
/*
* Xen event channels
*
* Xen models interrupts with abstract event channels. Because each
* domain gets 1024 event channels, but NR_IRQ is not that large, we
* must dynamically map irqs<->event channels. The event channels
* interface with the rest of the kernel by defining a xen interrupt
* chip. When an event is recieved, it is mapped to an irq and sent
* through the normal interrupt processing path.
*
* There are four kinds of events which can be mapped to an event
* channel:
*
* 1. Inter-domain notifications. This includes all the virtual
* device events, since they're driven by front-ends in another domain
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
* 4. Hardware interrupts. Not supported at present.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/linkage.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/string.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include "xen-ops.h"
/*
* This lock protects updates to the following mapping and reference-count
* arrays. The lock does not need to be acquired to read the mapping tables.
*/
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Packed IRQ information: binding type, sub-type index, and event channel. */
struct packed_irq
{
unsigned short evtchn;
unsigned char index;
unsigned char type;
};
static struct packed_irq irq_info[NR_IRQS];
/* Binding types. */
enum {
IRQT_UNBOUND,
IRQT_PIRQ,
IRQT_VIRQ,
IRQT_IPI,
IRQT_EVTCHN
};
/* Convenient shorthand for packed representation of an unbound IRQ. */
#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
[0 ... NR_EVENT_CHANNELS-1] = -1
};
static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
static u8 cpu_evtchn[NR_EVENT_CHANNELS];
/* Reference counts for bindings to IRQs. */
static int irq_bindcount[NR_IRQS];
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
/*
* Force a proper event-channel callback from Xen after clearing the
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
void force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
EXPORT_SYMBOL_GPL(force_evtchn_callback);
static struct irq_chip xen_dynamic_chip;
/* Constructor for packed IRQ information. */
static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
{
return (struct packed_irq) { evtchn, index, type };
}
/*
* Accessors for packed IRQ information.
*/
static inline unsigned int evtchn_from_irq(int irq)
{
return irq_info[irq].evtchn;
}
static inline unsigned int index_from_irq(int irq)
{
return irq_info[irq].index;
}
static inline unsigned int type_from_irq(int irq)
{
return irq_info[irq].type;
}
static inline unsigned long active_evtchns(unsigned int cpu,
struct shared_info *sh,
unsigned int idx)
{
return (sh->evtchn_pending[idx] &
cpu_evtchn_mask[cpu][idx] &
~sh->evtchn_mask[idx]);
}
static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
{
int irq = evtchn_to_irq[chn];
BUG_ON(irq == -1);
#ifdef CONFIG_SMP
irq_desc[irq].affinity = cpumask_of_cpu(cpu);
#endif
__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
__set_bit(chn, cpu_evtchn_mask[cpu]);
cpu_evtchn[chn] = cpu;
}
static void init_evtchn_cpu_bindings(void)
{
#ifdef CONFIG_SMP
int i;
/* By default all event channels notify CPU#0. */
for (i = 0; i < NR_IRQS; i++)
irq_desc[i].affinity = cpumask_of_cpu(0);
#endif
memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
}
static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
{
return cpu_evtchn[evtchn];
}
static inline void clear_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_clear_bit(port, &s->evtchn_pending[0]);
}
static inline void set_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, &s->evtchn_pending[0]);
}
/**
* notify_remote_via_irq - send event to remote end of event channel via irq
* @irq: irq of event channel to send event to
*
* Unlike notify_remote_via_evtchn(), this is safe to use across
* save/restore. Notifications on a broken connection are silently
* dropped.
*/
void notify_remote_via_irq(int irq)
{
int evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
notify_remote_via_evtchn(evtchn);
}
EXPORT_SYMBOL_GPL(notify_remote_via_irq);
static void mask_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, &s->evtchn_mask[0]);
}
static void unmask_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
unsigned int cpu = get_cpu();
BUG_ON(!irqs_disabled());
/* Slow path (hypercall) if this is a non-local port. */
if (unlikely(cpu != cpu_from_evtchn(port))) {
struct evtchn_unmask unmask = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
} else {
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
sync_clear_bit(port, &s->evtchn_mask[0]);
/*
* The following is basically the equivalent of
* 'hw_resend_irq'. Just like a real IO-APIC we 'lose
* the interrupt edge' if the channel is masked.
*/
if (sync_test_bit(port, &s->evtchn_pending[0]) &&
!sync_test_and_set_bit(port / BITS_PER_LONG,
&vcpu_info->evtchn_pending_sel))
vcpu_info->evtchn_upcall_pending = 1;
}
put_cpu();
}
static int find_unbound_irq(void)
{
int irq;
/* Only allocate from dynirq range */
for (irq = 0; irq < NR_IRQS; irq++)
if (irq_bindcount[irq] == 0)
break;
if (irq == NR_IRQS)
panic("No available IRQ to bind to: increase NR_IRQS!\n");
return irq;
}
int bind_evtchn_to_irq(unsigned int evtchn)
{
int irq;
spin_lock(&irq_mapping_update_lock);
irq = evtchn_to_irq[evtchn];
if (irq == -1) {
irq = find_unbound_irq();
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
}
irq_bindcount[irq]++;
spin_unlock(&irq_mapping_update_lock);
return irq;
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
int evtchn, irq;
spin_lock(&irq_mapping_update_lock);
irq = per_cpu(ipi_to_irq, cpu)[ipi];
if (irq == -1) {
irq = find_unbound_irq();
if (irq < 0)
goto out;
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "ipi");
bind_ipi.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi) != 0)
BUG();
evtchn = bind_ipi.port;
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
per_cpu(ipi_to_irq, cpu)[ipi] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
irq_bindcount[irq]++;
out:
spin_unlock(&irq_mapping_update_lock);
return irq;
}
static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
int evtchn, irq;
spin_lock(&irq_mapping_update_lock);
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq) != 0)
BUG();
evtchn = bind_virq.port;
irq = find_unbound_irq();
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "virq");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
per_cpu(virq_to_irq, cpu)[virq] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
irq_bindcount[irq]++;
spin_unlock(&irq_mapping_update_lock);
return irq;
}
static void unbind_from_irq(unsigned int irq)
{
struct evtchn_close close;
int evtchn = evtchn_from_irq(irq);
spin_lock(&irq_mapping_update_lock);
if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
close.port = evtchn;
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
BUG();
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
[index_from_irq(irq)] = -1;
break;
default:
break;
}
/* Closed ports are implicitly re-bound to VCPU0. */
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
irq_info[irq] = IRQ_UNBOUND;
dynamic_irq_init(irq);
}
spin_unlock(&irq_mapping_update_lock);
}
int bind_evtchn_to_irqhandler(unsigned int evtchn,
irqreturn_t (*handler)(int, void *),
unsigned long irqflags,
const char *devname, void *dev_id)
{
unsigned int irq;
int retval;
irq = bind_evtchn_to_irq(evtchn);
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irqreturn_t (*handler)(int, void *),
unsigned long irqflags, const char *devname, void *dev_id)
{
unsigned int irq;
int retval;
irq = bind_virq_to_irq(virq, cpu);
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
void *dev_id)
{
int irq, retval;
irq = bind_ipi_to_irq(ipi, cpu);
if (irq < 0)
return irq;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
free_irq(irq, dev_id);
unbind_from_irq(irq);
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
int irq = per_cpu(ipi_to_irq, cpu)[vector];
BUG_ON(irq < 0);
notify_remote_via_irq(irq);
}
/*
* Search the CPUs pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
* handling.
*
* Xen uses a two-level bitmap to speed searching. The first level is
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
{
int cpu = get_cpu();
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
unsigned long pending_words;
vcpu_info->evtchn_upcall_pending = 0;
/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
while (pending_words != 0) {
unsigned long pending_bits;
int word_idx = __ffs(pending_words);
pending_words &= ~(1UL << word_idx);
while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
int bit_idx = __ffs(pending_bits);
int port = (word_idx * BITS_PER_LONG) + bit_idx;
int irq = evtchn_to_irq[port];
if (irq != -1) {
regs->orig_eax = ~irq;
do_IRQ(regs);
}
}
}
put_cpu();
}
/* Rebind an evtchn so that it gets delivered to a specific cpu */
static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
{
struct evtchn_bind_vcpu bind_vcpu;
int evtchn = evtchn_from_irq(irq);
if (!VALID_EVTCHN(evtchn))
return;
/* Send future instances of this interrupt to other vcpu. */
bind_vcpu.port = evtchn;
bind_vcpu.vcpu = tcpu;
/*
* If this fails, it usually just indicates that we're dealing with a
* virq or IPI channel, which don't actually need to be rebound. Ignore
* it, but don't do the xenlinux-level rebind in that case.
*/
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
bind_evtchn_to_cpu(evtchn, tcpu);
}
static void set_affinity_irq(unsigned irq, cpumask_t dest)
{
unsigned tcpu = first_cpu(dest);
rebind_irq_to_cpu(irq, tcpu);
}
static void enable_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
}
static void disable_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
mask_evtchn(evtchn);
}
static void ack_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
move_native_irq(irq);
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
static int retrigger_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
int ret = 0;
if (VALID_EVTCHN(evtchn)) {
set_evtchn(evtchn);
ret = 1;
}
return ret;
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
.name = "xen-dyn",
.mask = disable_dynirq,
.unmask = enable_dynirq,
.ack = ack_dynirq,
.set_affinity = set_affinity_irq,
.retrigger = retrigger_dynirq,
};
void __init xen_init_IRQ(void)
{
int i;
init_evtchn_cpu_bindings();
/* No event channels are 'live' right now. */
for (i = 0; i < NR_EVENT_CHANNELS; i++)
mask_evtchn(i);
/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
for (i = 0; i < NR_IRQS; i++)
irq_bindcount[i] = 0;
irq_ctx_init(smp_processor_id());
}
/******************************************************************************
* features.c
*
* Xen feature flags.
*
* Copyright (c) 2006, Ian Campbell, XenSource Inc.
*/
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <asm/xen/hypervisor.h>
#include <xen/features.h>
u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
EXPORT_SYMBOL_GPL(xen_features);
void xen_setup_features(void)
{
struct xen_feature_info fi;
int i, j;
for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
fi.submap_idx = i;
if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
break;
for (j = 0; j < 32; j++)
xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
}
}
/*
* Handle extern requests for shutdown, reboot and sysrq
*/
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <xen/xenbus.h>
#define SHUTDOWN_INVALID -1
#define SHUTDOWN_POWEROFF 0
#define SHUTDOWN_SUSPEND 2
/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
* report a crash, not be instructed to crash!
* HALT is the same as POWEROFF, as far as we're concerned. The tools use
* the distinction when we return the reason code to them.
*/
#define SHUTDOWN_HALT 4
/* Ignore multiple shutdown requests. */
static int shutting_down = SHUTDOWN_INVALID;
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
{
char *str;
struct xenbus_transaction xbt;
int err;
if (shutting_down != SHUTDOWN_INVALID)
return;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
/* Ignore read errors and empty reads. */
if (XENBUS_IS_ERR_READ(str)) {
xenbus_transaction_end(xbt, 1);
return;
}
xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
kfree(str);
goto again;
}
if (strcmp(str, "poweroff") == 0 ||
strcmp(str, "halt") == 0)
orderly_poweroff(false);
else if (strcmp(str, "reboot") == 0)
ctrl_alt_del();
else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
}
kfree(str);
}
static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
unsigned int len)
{
char sysrq_key = '\0';
struct xenbus_transaction xbt;
int err;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
printk(KERN_ERR "Unable to read sysrq code in "
"control/sysrq\n");
xenbus_transaction_end(xbt, 1);
return;
}
if (sysrq_key != '\0')
xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
goto again;
if (sysrq_key != '\0')
handle_sysrq(sysrq_key, NULL);
}
static struct xenbus_watch shutdown_watch = {
.node = "control/shutdown",
.callback = shutdown_handler
};
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
static int setup_shutdown_watcher(void)
{
int err;
err = register_xenbus_watch(&shutdown_watch);
if (err) {
printk(KERN_ERR "Failed to set shutdown watcher\n");
return err;
}
err = register_xenbus_watch(&sysrq_watch);
if (err) {
printk(KERN_ERR "Failed to set sysrq watcher\n");
return err;
}
return 0;
}
static int shutdown_event(struct notifier_block *notifier,
unsigned long event,
void *data)
{
setup_shutdown_watcher();
return NOTIFY_DONE;
}
static int __init setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
register_xenstore_notifier(&xenstore_notifier);
return 0;
}
subsys_initcall(setup_shutdown_event);
/*
* Xen mmu operations
*
* This file contains the various mmu fetch and update operations.
* The most important job they must perform is the mapping between the
* domain's pfn and the overall machine mfns.
*
* Xen allows guests to directly update the pagetable, in a controlled
* fashion. In other words, the guest modifies the same pagetable
* that the CPU actually uses, which eliminates the overhead of having
* a separate shadow pagetable.
*
* In order to allow this, it falls on the guest domain to map its
* notion of a "physical" pfn - which is just a domain-local linear
* address - into a real "machine address" which the CPU's MMU can
* use.
*
* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
* inserted directly into the pagetable. When creating a new
* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
* when reading the content back with __(pgd|pmd|pte)_val, it converts
* the mfn back into a pfn.
*
* The other constraint is that all pages which make up a pagetable
* must be mapped read-only in the guest. This prevents uncontrolled
* guest updates to the pagetable. Xen strictly enforces this, and
* will disallow any pagetable update which will end up mapping a
* pagetable page RW, and will disallow using any writable page as a
* pagetable.
*
* Naively, when loading %cr3 with the base of a new pagetable, Xen
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
* when it is not actively in use. This menas that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>
#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include "multicalls.h"
#include "mmu.h"
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
pte_t *pte = lookup_address(address);
unsigned offset = address & PAGE_MASK;
BUG_ON(pte == NULL);
return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
void make_lowmem_page_readonly(void *vaddr)
{
pte_t *pte, ptev;
unsigned long address = (unsigned long)vaddr;
pte = lookup_address(address);
BUG_ON(pte == NULL);
ptev = pte_wrprotect(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
}
void make_lowmem_page_readwrite(void *vaddr)
{
pte_t *pte, ptev;
unsigned long address = (unsigned long)vaddr;
pte = lookup_address(address);
BUG_ON(pte == NULL);
ptev = pte_mkwrite(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
}
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
struct multicall_space mcs;
struct mmu_update *u;
preempt_disable();
mcs = xen_mc_entry(sizeof(*u));
u = mcs.args;
u->ptr = virt_to_machine(ptr).maddr;
u->val = pmd_val_ma(val);
MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
/* <mfn,flags> stored as-is, to permit clearing entries */
xen_set_pte(pte, mfn_pte(mfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
if (mm == current->mm || mm == &init_mm) {
if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs;
mcs = xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
xen_mc_issue(PARAVIRT_LAZY_MMU);
return;
} else
if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
return;
}
xen_set_pte(ptep, pteval);
}
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
struct multicall_space mcs;
struct mmu_update *u;
preempt_disable();
mcs = xen_mc_entry(sizeof(*u));
u = mcs.args;
u->ptr = virt_to_machine(ptr).maddr;
u->val = pud_val_ma(val);
MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
void xen_set_pte(pte_t *ptep, pte_t pte)
{
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
}
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((u64 *)ptep, pte_val_ma(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
ptep->pte_low = 0;
smp_wmb(); /* make sure low gets written first */
ptep->pte_high = 0;
}
void xen_pmd_clear(pmd_t *pmdp)
{
xen_set_pmd(pmdp, __pmd(0));
}
unsigned long long xen_pte_val(pte_t pte)
{
unsigned long long ret = 0;
if (pte.pte_low) {
ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
}
return ret;
}
unsigned long long xen_pmd_val(pmd_t pmd)
{
unsigned long long ret = pmd.pmd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
unsigned long long xen_pgd_val(pgd_t pgd)
{
unsigned long long ret = pgd.pgd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
pte_t xen_make_pte(unsigned long long pte)
{
if (pte & 1)
pte = phys_to_machine(XPADDR(pte)).maddr;
return (pte_t){ pte, pte >> 32 };
}
pmd_t xen_make_pmd(unsigned long long pmd)
{
if (pmd & 1)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
return (pmd_t){ pmd };
}
pgd_t xen_make_pgd(unsigned long long pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
return (pgd_t){ pgd };
}
#else /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
}
unsigned long xen_pte_val(pte_t pte)
{
unsigned long ret = pte.pte_low;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr;
return ret;
}
unsigned long xen_pgd_val(pgd_t pgd)
{
unsigned long ret = pgd.pgd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
pte_t xen_make_pte(unsigned long pte)
{
if (pte & _PAGE_PRESENT)
pte = phys_to_machine(XPADDR(pte)).maddr;
return (pte_t){ pte };
}
pgd_t xen_make_pgd(unsigned long pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
return (pgd_t){ pgd };
}
#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
pagetable. This means that it walks a pagetable and calls the
callback function on each page it finds making up the page table,
at every level. It walks the entire pagetable, but it only bothers
pinning pte pages which are below pte_limit. In the normal case
this will be TASK_SIZE, but at boot we need to pin up to
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
int flush = 0;
unsigned long addr = 0;
unsigned long pgd_next;
BUG_ON(limit > FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
pud_t *pud;
unsigned long pud_limit, pud_next;
pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
if (!pgd_val(*pgd))
continue;
pud = pud_offset(pgd, 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), 0);
for (; addr != pud_limit; pud++, addr = pud_next) {
pmd_t *pmd;
unsigned long pmd_limit;
pud_next = pud_addr_end(addr, pud_limit);
if (pud_next < limit)
pmd_limit = pud_next;
else
pmd_limit = limit;
if (pud_none(*pud))
continue;
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), 0);
for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE);
if ((pmd_limit-1) < (addr-1)) {
addr = pmd_limit;
break;
}
if (pmd_none(*pmd))
continue;
flush |= (*func)(pmd_page(*pmd), 0);
}
}
}
flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
return flush;
}
static int pin_page(struct page *page, unsigned flags)
{
unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
int flush;
if (pgfl)
flush = 0; /* already pinned */
else if (PageHighMem(page))
/* kmaps need flushing if we found an unpinned
highpage */
flush = 1;
else {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0);
flush = 0;
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO),
flags);
}
return flush;
}
/* This is called just after a mm has been created, but it has not
been used yet. We need to make sure that its pagetable is all
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
struct multicall_space mcs;
struct mmuext_op *op;
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
#ifdef CONFIG_X86_PAE
op->cmd = MMUEXT_PIN_L3_TABLE;
#else
op->cmd = MMUEXT_PIN_L2_TABLE;
#endif
op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(0);
}
/* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all
the book-keeping now. */
static __init int mark_pinned(struct page *page, unsigned flags)
{
SetPagePinned(page);
return 0;
}
void __init xen_mark_init_mm_pinned(void)
{
pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}
static int unpin_page(struct page *page, unsigned flags)
{
unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL),
flags);
}
return 0; /* never need to flush on unpin */
}
/* Release a pagetables pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
struct mmuext_op *op;
struct multicall_space mcs;
xen_mc_batch();
mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_UNPIN_TABLE;
op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
pgd_walk(pgd, unpin_page, TASK_SIZE);
xen_mc_issue(0);
}
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
xen_pgd_pin(next->pgd);
spin_unlock(&next->page_table_lock);
}
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm->pgd);
spin_unlock(&mm->page_table_lock);
}
#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
leave_mm(smp_processor_id());
}
static void drop_mm_ref(struct mm_struct *mm)
{
if (current->active_mm == mm) {
if (current->mm == mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
}
if (!cpus_empty(mm->cpu_vm_mask))
xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
if (current->active_mm == mm)
load_cr3(swapper_pg_dir);
}
#endif
/*
* While a process runs, Xen pins its pagetables, which means that the
* hypervisor forces it to be read-only, and it controls all updates
* to it. This means that all pagetable updates have to go via the
* hypervisor, which is moderately expensive.
*
* Since we're pulling the pagetable down, we switch to use init_mm,
* unpin old process pagetable and mark it all read-write, which
* allows further operations on it to be simple memory accesses.
*
* The only subtle point is that another CPU may be still using the
* pagetable because of lazy tlb flushing. This means we need need to
* switch all CPUs off this pagetable before we can unpin it.
*/
void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
drop_mm_ref(mm);
put_cpu();
spin_lock(&mm->page_table_lock);
xen_pgd_unpin(mm->pgd);
spin_unlock(&mm->page_table_lock);
}
#ifndef _XEN_MMU_H
#include <linux/linkage.h>
#include <asm/page.h>
/*
* Page-directory addresses above 4GB do not fit into architectural %cr3.
* When accessing %cr3, or equivalent field in vcpu_guest_context, guests
* must use the following accessor macros to pack/unpack valid MFNs.
*
* Note that Xen is using the fact that the pagetable base is always
* page-aligned, and putting the 12 MSB of the address into the 12 LSB
* of cr3.
*/
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
#ifdef CONFIG_X86_PAE
unsigned long long xen_pte_val(pte_t);
unsigned long long xen_pmd_val(pmd_t);
unsigned long long xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long long);
pmd_t xen_make_pmd(unsigned long long);
pgd_t xen_make_pgd(unsigned long long);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#else
unsigned long xen_pte_val(pte_t);
unsigned long xen_pmd_val(pmd_t);
unsigned long xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long);
pmd_t xen_make_pmd(unsigned long);
pgd_t xen_make_pgd(unsigned long);
#endif
#endif /* _XEN_MMU_H */
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if any of the multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
#define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
u64 args[MC_ARGS];
unsigned mcidx, argidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0;
unsigned long flags;
BUG_ON(preemptible());
/* Disable interrupts in case someone comes in and queues
something in the middle */
local_irq_save(flags);
if (b->mcidx) {
int i;
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
b->mcidx = 0;
b->argidx = 0;
} else
BUG_ON(b->argidx != 0);
local_irq_restore(flags);
BUG_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret;
unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
BUG_ON(preemptible());
BUG_ON(argspace > MC_ARGS);
if (b->mcidx == MC_BATCH ||
(b->argidx + argspace) > MC_ARGS)
xen_mc_flush();
ret.mc = &b->entries[b->mcidx];
b->mcidx++;
ret.args = &b->args[b->argidx];
b->argidx += argspace;
return ret;
}
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include "xen-ops.h"
/* Multicalls */
struct multicall_space
{
struct multicall_entry *mc;
void *args;
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/* Call to start a batch of multiple __xen_mc_entry()s. Must be
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
/* need to disable interrupts until this entry is complete */
local_irq_save(__get_cpu_var(xen_mc_irq_flags));
}
static inline struct multicall_space xen_mc_entry(size_t args)
{
xen_mc_batch();
return __xen_mc_entry(args);
}
/* Flush all pending multicalls */
void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
if ((xen_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}
#endif /* _XEN_MULTICALLS_H */
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <asm/elf.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
unsigned long *phys_to_machine_mapping;
EXPORT_SYMBOL(phys_to_machine_mapping);
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
unsigned long max_pfn = xen_start_info->nr_pages;
e820.nr_map = 0;
add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
return "Xen";
}
static void xen_idle(void)
{
local_irq_disable();
if (need_resched())
local_irq_enable();
else {
current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
safe_halt();
current_thread_info()->status |= TS_POLLING;
}
}
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
int rc;
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
printk(KERN_INFO "physdev_op failed %d\n", rc);
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
pm_idle = xen_idle;
#ifdef CONFIG_SMP
/* fill cpus_possible with all available cpus */
xen_fill_possible_map();
#endif
paravirt_disable_iospace();
}
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*
* This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "mmu.h"
static cpumask_t cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
/*
* Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
*/
static DEFINE_SPINLOCK(call_lock);
struct call_data_struct {
void (*func) (void *info);
void *info;
atomic_t started;
atomic_t finished;
int wait;
};
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static struct call_data_struct *call_data;
/*
* Reschedule call back. Nothing to do,
* all the work is done automatically when
* we return from the interrupt.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
return IRQ_HANDLED;
}
static __cpuinit void cpu_bringup_and_idle(void)
{
int cpu = smp_processor_id();
cpu_init();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
xen_setup_cpu_clockevents();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
cpu_idle();
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name;
per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
return 0;
fail:
if (per_cpu(resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
return rc;
}
void __init xen_fill_possible_map(void)
{
int i, rc;
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0)
cpu_set(i, cpu_possible_map);
}
}
void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu__gdt_page);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
xen_setup_vcpu_info_placement();
}
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
smp_store_cpu_info(0);
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
cpu_initialized_map = cpumask_of_cpu(0);
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
continue;
cpu_clear(cpu, cpu_possible_map);
}
for_each_possible_cpu (cpu) {
struct task_struct *idle;
if (cpu == 0)
continue;
idle = fork_idle(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
cpu_set(cpu, cpu_present_map);
}
//init_xenbus_allowed_cpumask();
}
static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
if (cpu_test_and_set(cpu, cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt->gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.esp0;
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
kfree(ctxt);
return 0;
}
int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
#if 0
rc = cpu_up_check(cpu);
if (rc)
return rc;
#endif
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
xen_setup_timer(cpu);
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
if (num_online_cpus() == 1)
alternatives_smp_switch(1);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
smp_store_cpu_info(cpu);
set_cpu_sibling_map(cpu);
/* This must be done before setting cpu_online_map */
wmb();
cpu_set(cpu, cpu_online_map);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
return 0;
}
void xen_smp_cpus_done(unsigned int max_cpus)
{
}
static void stop_self(void *v)
{
int cpu = smp_processor_id();
/* make sure we're not pinning something down */
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0, 0);
}
void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
{
unsigned cpu;
cpus_and(mask, mask, cpu_online_map);
for_each_cpu_mask(cpu, mask)
xen_send_IPI_one(cpu, vector);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
/*
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
*/
mb();
atomic_inc(&call_data->started);
/*
* At this point the info structure may be out of scope unless wait==1
*/
irq_enter();
(*func)(info);
irq_exit();
if (wait) {
mb(); /* commit everything before setting finished */
atomic_inc(&call_data->finished);
}
return IRQ_HANDLED;
}
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait)
{
struct call_data_struct data;
int cpus;
/* Holding any lock stops cpus from going down. */
spin_lock(&call_lock);
cpu_clear(smp_processor_id(), mask);
cpus = cpus_weight(mask);
if (!cpus) {
spin_unlock(&call_lock);
return 0;
}
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
data.func = func;
data.info = info;
atomic_set(&data.started, 0);
data.wait = wait;
if (wait)
atomic_set(&data.finished, 0);
call_data = &data;
mb(); /* write everything before IPI */
/* Send a message to other CPUs and wait for them to respond */
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run.
XXX too severe? Maybe we should check the other CPU's states? */
HYPERVISOR_sched_op(SCHEDOP_yield, 0);
/* Wait for response */
while (atomic_read(&data.started) != cpus ||
(wait && atomic_read(&data.finished) != cpus))
cpu_relax();
spin_unlock(&call_lock);
return 0;
}
/*
* Xen time implementation.
*
* This is implemented in terms of a clocksource driver which uses
* the hypervisor clock as a nanosecond timebase, and a clockevent
* driver which uses the hypervisor's timer mechanism.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
#define XEN_SHIFT 22
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
static cycle_t xen_clocksource_read(void);
/* These are perodically updated in shared_info, and then copied here. */
struct shadow_time_info {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
u64 ret;
if (BITS_PER_LONG < 64) {
u32 *p32 = (u32 *)p;
u32 h, l;
/*
* Read high then low, and then make sure high is
* still the same; this will only loop if low wraps
* and carries into high.
* XXX some clean way to make this endian-proof?
*/
do {
h = p32[1];
barrier();
l = p32[0];
barrier();
} while (p32[1] != h);
ret = (((u64)h) << 32) | l;
} else
ret = *p;
return ret;
}
/*
* Runstate accounting
*/
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
u64 state_time;
struct vcpu_runstate_info *state;
BUG_ON(preemptible());
state = &__get_cpu_var(runstate);
/*
* The runstate info is always updated by the hypervisor on
* the current CPU, so there's no need to use anything
* stronger than a compiler barrier when fetching it.
*/
do {
state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
} while (get64(&state->state_entry_time) != state_time);
}
static void setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
area.addr.v = &per_cpu(runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
BUG();
}
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
s64 blocked, runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
snap = &__get_cpu_var(runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
*snap = state;
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. Passing NULL to
account_steal_time accounts the time as stolen. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = 0;
while (stolen >= NS_PER_TICK) {
ticks++;
stolen -= NS_PER_TICK;
}
__get_cpu_var(residual_stolen) = stolen;
account_steal_time(NULL, ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. Passing idle to
account_steal_time accounts the time as idle/wait. */
blocked += __get_cpu_var(residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = 0;
while (blocked >= NS_PER_TICK) {
ticks++;
blocked -= NS_PER_TICK;
}
__get_cpu_var(residual_blocked) = blocked;
account_steal_time(idle_task(smp_processor_id()), ticks);
}
/*
* Xen sched_clock implementation. Returns the number of unstolen
* nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
* states.
*/
unsigned long long xen_sched_clock(void)
{
struct vcpu_runstate_info state;
cycle_t now;
u64 ret;
s64 offset;
/*
* Ideally sched_clock should be called on a per-cpu basis
* anyway, so preempt should already be disabled, but that's
* not current practice at the moment.
*/
preempt_disable();
now = xen_clocksource_read();
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
offset = now - state.state_entry_time;
if (offset < 0)
offset = 0;
ret = state.time[RUNSTATE_blocked] +
state.time[RUNSTATE_running] +
offset;
preempt_enable();
return ret;
}
/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
u64 cpu_khz = 1000000ULL << 32;
const struct vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(cpu_khz, info->tsc_to_system_mul);
if (info->tsc_shift < 0)
cpu_khz <<= -info->tsc_shift;
else
cpu_khz >>= info->tsc_shift;
return cpu_khz;
}
/*
* Reads a consistent set of time-base values from Xen, into a shadow data
* area.
*/
static unsigned get_time_values_from_xen(void)
{
struct vcpu_time_info *src;
struct shadow_time_info *dst;
/* src is shared memory with the hypervisor, so we need to
make sure we get a consistent snapshot, even in the face of
being preempted. */
src = &__get_cpu_var(xen_vcpu)->time;
dst = &__get_cpu_var(shadow_time);
do {
dst->version = src->version;
rmb(); /* fetch version before data */
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb(); /* test version after fetching data */
} while ((src->version & 1) | (dst->version ^ src->version));
return dst->version;
}
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
u64 now, delta;
now = native_read_tsc();
delta = now - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
static cycle_t xen_clocksource_read(void)
{
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
cycle_t ret;
unsigned version;
do {
version = get_time_values_from_xen();
barrier();
ret = shadow->system_timestamp + get_nsec_offset(shadow);
barrier();
} while (version != __get_cpu_var(xen_vcpu)->time.version);
put_cpu_var(shadow_time);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
const struct shared_info *s = HYPERVISOR_shared_info;
u32 version;
u64 delta;
struct timespec now;
/* get wallclock at system boot */
do {
version = s->wc_version;
rmb(); /* fetch version before time */
now.tv_sec = s->wc_sec;
now.tv_nsec = s->wc_nsec;
rmb(); /* fetch time before checking version */
} while ((s->wc_version & 1) | (version ^ s->wc_version));
delta = xen_clocksource_read(); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
int xen_set_wallclock(unsigned long now)
{
/* do nothing for domU */
return -1;
}
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_read,
.mask = ~0,
.mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
.shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
Xen clockevent implementation
Xen has two clockevent implementations:
The old timer_op one works with all released versions of Xen prior
to version 3.0.4. This version of the hypervisor provides a
single-shot timer with nanosecond resolution. However, sharing the
same event channel is a 100Hz tick which is delivered while the
vcpu is running. We don't care about or use this tick, but it will
cause the core time code to think the timer fired too soon, and
will end up resetting it each time. It could be filtered, but
doing so has complications when the ktime clocksource is not yet
the xen clocksource (ie, at boot time).
The new vcpu_op-based timer interface allows the tick timer period
to be changed or turned off. The tick timer is not useful as a
periodic timer because events are only delivered to running vcpus.
The one-shot timer can report when a timeout is in the past, so
set_next_event is capable of returning -ETIME when appropriate.
This interface is used when available.
*/
/*
Get a hypervisor absolute time. In theory we could maintain an
offset between the kernel's time and the hypervisor's time, and
apply that to a kernel's absolute timeout. Unfortunately the
hypervisor and kernel times can drift even if the kernel is using
the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
return xen_clocksource_read() + delta;
}
static void xen_timerop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* unsupported */
WARN_ON(1);
break;
case CLOCK_EVT_MODE_ONESHOT:
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
HYPERVISOR_set_timer_op(0); /* cancel timeout */
break;
}
}
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
/* We may have missed the deadline, but there's no real way of
knowing for sure. If the event was in the past, then we'll
get an immediate interrupt. */
return 0;
}
static const struct clock_event_device xen_timerop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_timerop_set_mode,
.set_next_event = xen_timerop_set_next_event,
};
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
WARN_ON(1); /* unsupported */
break;
case CLOCK_EVT_MODE_ONESHOT:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
}
}
static int xen_vcpuop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
struct vcpu_set_singleshot_timer single;
int ret;
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
BUG_ON(ret != 0 && ret != -ETIME);
return ret;
}
static const struct clock_event_device xen_vcpuop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_vcpuop_set_mode,
.set_next_event = xen_vcpuop_set_next_event,
};
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
irqreturn_t ret;
ret = IRQ_NONE;
if (evt->event_handler) {
evt->event_handler(evt);
ret = IRQ_HANDLED;
}
do_stolen_accounting();
return ret;
}
void xen_setup_timer(int cpu)
{
const char *name;
struct clock_event_device *evt;
int irq;
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
if (!name)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name, NULL);
evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of_cpu(cpu);
evt->irq = irq;
setup_runstate_info(cpu);
}
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
__init void xen_time_init(void)
{
int cpu = smp_processor_id();
get_time_values_from_xen();
clocksource_register(&xen_clocksource);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
vcpuop-based timer interface */
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
xen_clockevent = &xen_vcpuop_clockevent;
}
/* Set initial system time with full resolution */
xen_read_wallclock(&xtime);
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
tsc_disable = 0;
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Clear mask and test pending */
andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
This is run where a normal iret would be run, with the same stack setup:
8: eflags
4: cs
esp-> 0: eip
This attempts to make sure that any pending events are dealt
with on return to usermode, but there is a small window in
which an event can happen just before entering usermode. If
the nested interrupt ends up setting one of the TIF_WORK_MASK
pending work flags, they will not be tested again before
returning to usermode. This means that a process can end up
with pending work, which will be unprocessed until the process
enters and leaves the kernel again, which could be an
unbounded amount of time. This means that a pending signal or
reschedule event could be indefinitely delayed.
The fix is to notice a nested interrupt in the critical
window, and if one occurs, then fold the nested interrupt into
the current interrupt stack frame, and re-process it
iteratively rather than recursively. This means that it will
exit via the normal path, and all pending work will be dealt
with appropriately.
Because the nested interrupt handler needs to deal with the
current stack state in whatever form its in, we keep things
simple by only using a single register which is pushed/popped
on the stack.
Non-direct iret could be done in the same way, but it would
require an annoying amount of code duplication. We'll assume
that direct mode will be the common case once the hypervisor
support becomes commonplace.
*/
ENTRY(xen_iret_direct)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/* Store vcpu_info pointer for easy access. Do it this
way to avoid having to reload %fs */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax),%eax
movl __per_cpu_offset(,%eax,4),%eax
lea per_cpu__xen_vcpu_info(%eax),%eax
#else
movl $per_cpu__xen_vcpu_info, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/* Maybe enable events. Once this happens we could get a
recursive event, so the critical region starts immediately
afterwards. However, if that happens we don't end up
resuming the code, so we don't have to be worried about
being preempted to another CPU. */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/* If there's something pending, mask events again so we
can jump back into xen_hypervisor_callback */
sete XEN_vcpu_info_mask(%eax)
popl %eax
/* From this point on the registers are restored and the stack
updated, so we don't need to worry about it if we're preempted */
iret_restore_end:
/* Jump to hypervisor_callback after fixing up the stack.
Events are masked, so jumping out of the critical
region is OK. */
je xen_hypervisor_callback
iret
xen_iret_end_crit:
hyper_iret:
/* put this out of line since its very rarely used */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
This is called by xen_hypervisor_callback in entry.S when it sees
that the EIP at the time of interrupt was between xen_iret_start_crit
and xen_iret_end_crit. We're passed the EIP in %eax so we can do
a more refined determination of what to do.
The stack format at this point is:
----------------
ss : (ss/esp may be present if we came from usermode)
esp :
eflags } outer exception info
cs }
eip }
---------------- <- edi (copy dest)
eax : outer eax if it hasn't been restored
----------------
eflags } nested exception info
cs } (no ss/esp because we're nested
eip } from the same ring)
orig_eax }<- esi (copy src)
- - - - - - - -
fs }
es }
ds } SAVE_ALL state
eax }
: :
ebx }
----------------
return addr <- esp
----------------
In order to deliver the nested exception properly, we need to shift
everything from the return addr up to the error code so it
sits just under the outer exception info. This means that when we
handle the exception, we do it in the context of the outer exception
rather than starting a new one.
The only caveat is that if the outer eax hasn't been
restored yet (ie, it's still on stack), we need to insert
its value into the SAVE_ALL state before going on, since
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/* offsets +4 for return address */
/*
Paranoia: Make sure we're really coming from userspace.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
Definitely. Easy to avoid? Yes. The Intel documents
explicitly say that the reported EIP for a bad jump is the
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
movl PT_CS+4(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX+4(%esp), %esi
lea PT_EFLAGS+4(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi),%eax /* copy EAX */
movl %eax, PT_EAX+4(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi),%esp /* point esp to new frame */
2: ret
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
push %eax
push %ecx
push %edx
call force_evtchn_callback
pop %edx
pop %ecx
pop %eax
ret
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <asm/boot.h>
#include <xen/interface/elfnote.h>
ENTRY(startup_xen)
movl %esi,xen_start_info
cld
movl $(init_thread_union+THREAD_SIZE),%esp
jmp xen_start_kernel
.pushsection ".bss.page_aligned"
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
#else
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
unsigned long xen_cpu_khz(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
void xen_mark_init_mm_pinned(void);
DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
static inline unsigned xen_get_lazy_mode(void)
{
return x86_read_percpu(xen_lazy_mode);
}
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
int wait);
int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int nonatomic, int wait);
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait);
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
ret name(__VA_ARGS__); \
extern char name##_end[]; \
extern char name##_reloc[] \
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
void xen_iret_direct(void);
#endif /* XEN_OPS_H */
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <asm/io.h> #include <asm/io.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/fcntl.h> #include <asm/fcntl.h>
#include <xen/hvc-console.h>
/* Simple VGA output */ /* Simple VGA output */
...@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf) ...@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6); simnow_init(buf + 6);
early_console = &simnow_console; early_console = &simnow_console;
keep_early = 1; keep_early = 1;
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
#endif
} }
if (keep_early) if (keep_early)
......
...@@ -174,7 +174,7 @@ static void do_mce_trigger(void) ...@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
if (events != atomic_read(&mce_logged) && trigger[0]) { if (events != atomic_read(&mce_logged) && trigger[0]) {
/* Small race window, but should be harmless. */ /* Small race window, but should be harmless. */
atomic_set(&mce_logged, events); atomic_set(&mce_logged, events);
call_usermodehelper(trigger, trigger_argv, NULL, -1); call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
} }
} }
......
...@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/ ...@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_PNP) += pnp/ obj-$(CONFIG_PNP) += pnp/
obj-$(CONFIG_ARM_AMBA) += amba/ obj-$(CONFIG_ARM_AMBA) += amba/
obj-$(CONFIG_XEN) += xen/
# char/ comes before serial/ etc so that the VT console is the boot-time # char/ comes before serial/ etc so that the VT console is the boot-time
# default. # default.
obj-y += char/ obj-y += char/
......
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#include <linux/jiffies.h> #include <linux/jiffies.h>
#include <linux/kmod.h> #include <linux/kmod.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/reboot.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <acpi/acpi_bus.h> #include <acpi/acpi_bus.h>
...@@ -59,7 +60,6 @@ ...@@ -59,7 +60,6 @@
#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0
#define ACPI_THERMAL_NOTIFY_HOT 0xF1 #define ACPI_THERMAL_NOTIFY_HOT 0xF1
#define ACPI_THERMAL_MODE_ACTIVE 0x00 #define ACPI_THERMAL_MODE_ACTIVE 0x00
#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff"
#define ACPI_THERMAL_MAX_ACTIVE 10 #define ACPI_THERMAL_MAX_ACTIVE 10
#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
...@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz) ...@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
return 0; return 0;
} }
static int acpi_thermal_call_usermode(char *path)
{
char *argv[2] = { NULL, NULL };
char *envp[3] = { NULL, NULL, NULL };
if (!path)
return -EINVAL;
argv[0] = path;
/* minimal command environment */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
call_usermodehelper(argv[0], argv, envp, 0);
return 0;
}
static int acpi_thermal_critical(struct acpi_thermal *tz) static int acpi_thermal_critical(struct acpi_thermal *tz)
{ {
if (!tz || !tz->trips.critical.flags.valid) if (!tz || !tz->trips.critical.flags.valid)
...@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz) ...@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
tz->trips.critical.flags.enabled); tz->trips.critical.flags.enabled);
acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); orderly_poweroff(true);
return 0; return 0;
} }
......
...@@ -427,4 +427,13 @@ config XILINX_SYSACE ...@@ -427,4 +427,13 @@ config XILINX_SYSACE
help help
Include support for the Xilinx SystemACE CompactFlash interface Include support for the Xilinx SystemACE CompactFlash interface
config XEN_BLKDEV_FRONTEND
tristate "Xen virtual block device support"
depends on XEN
default y
help
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
in another domain which drives the actual block device.
endif # BLK_DEV endif # BLK_DEV
...@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o ...@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
此差异已折叠。
...@@ -604,6 +604,14 @@ config HVC_BEAT ...@@ -604,6 +604,14 @@ config HVC_BEAT
help help
Toshiba's Cell Reference Set Beat Console device driver Toshiba's Cell Reference Set Beat Console device driver
config HVC_XEN
bool "Xen Hypervisor Console support"
depends on XEN
select HVC_DRIVER
default y
help
Xen virtual console device driver
config HVCS config HVCS
tristate "IBM Hypervisor Virtual Console Server support" tristate "IBM Hypervisor Virtual Console Server support"
depends on PPC_PSERIES depends on PPC_PSERIES
......
...@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o ...@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_RAW_DRIVER) += raw.o
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
obj-$(CONFIG_MSPEC) += mspec.o obj-$(CONFIG_MSPEC) += mspec.o
......
/*
* xen console driver interface to hvc_console.c
*
* (c) 2007 Gerd Hoffmann <kraxel@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/types.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/interface/io/console.h>
#include <xen/hvc-console.h>
#include "hvc_console.h"
#define HVC_COOKIE 0x58656e /* "Xen" in hex */
static struct hvc_struct *hvc;
static int xencons_irq;
/* ------------------------------------------------------------------ */
static inline struct xencons_interface *xencons_interface(void)
{
return mfn_to_virt(xen_start_info->console.domU.mfn);
}
static inline void notify_daemon(void)
{
/* Use evtchn: this is called early, before irq is set up. */
notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
}
static int write_console(uint32_t vtermno, const char *data, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int sent = 0;
cons = intf->out_cons;
prod = intf->out_prod;
mb(); /* update queue values before going on */
BUG_ON((prod - cons) > sizeof(intf->out));
while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
wmb(); /* write ring before updating pointer */
intf->out_prod = prod;
notify_daemon();
return sent;
}
static int read_console(uint32_t vtermno, char *buf, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int recv = 0;
cons = intf->in_cons;
prod = intf->in_prod;
mb(); /* get pointers before reading ring */
BUG_ON((prod - cons) > sizeof(intf->in));
while (cons != prod && recv < len)
buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
mb(); /* read ring before consuming */
intf->in_cons = cons;
notify_daemon();
return recv;
}
static struct hv_ops hvc_ops = {
.get_chars = read_console,
.put_chars = write_console,
};
static int __init xen_init(void)
{
struct hvc_struct *hp;
if (!is_running_on_xen())
return 0;
xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
if (xencons_irq < 0)
xencons_irq = 0 /* NO_IRQ */;
hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
if (IS_ERR(hp))
return PTR_ERR(hp);
hvc = hp;
return 0;
}
static void __exit xen_fini(void)
{
if (hvc)
hvc_remove(hvc);
}
static int xen_cons_init(void)
{
if (!is_running_on_xen())
return 0;
hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
return 0;
}
module_init(xen_init);
module_exit(xen_fini);
console_initcall(xen_cons_init);
static void xenboot_write_console(struct console *console, const char *string,
unsigned len)
{
unsigned int linelen, off = 0;
const char *pos;
while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
linelen = pos-string+off;
if (off + linelen > len)
break;
write_console(0, string+off, linelen);
write_console(0, "\r\n", 2);
off += linelen + 1;
}
if (off < len)
write_console(0, string+off, len-off);
}
struct console xenboot_console = {
.name = "xenboot",
.write = xenboot_write_console,
.flags = CON_PRINTBUFFER | CON_BOOT,
};
...@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void) ...@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin", "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL }; NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0); return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
} }
......
...@@ -80,7 +80,8 @@ int wf_critical_overtemp(void) ...@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin", "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL }; NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0); return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
} }
EXPORT_SYMBOL_GPL(wf_critical_overtemp); EXPORT_SYMBOL_GPL(wf_critical_overtemp);
......
...@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig" ...@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
source "drivers/s390/net/Kconfig" source "drivers/s390/net/Kconfig"
config XEN_NETDEV_FRONTEND
tristate "Xen network device frontend driver"
depends on XEN
default y
help
The network device frontend driver allows the kernel to
access network devices exported exported by a virtual
machine containing a physical network device driver. The
frontend driver is intended for unprivileged guest domains;
if you are compiling a kernel for a Xen guest, you almost
certainly want to enable this.
config ISERIES_VETH config ISERIES_VETH
tristate "iSeries Virtual Ethernet driver support" tristate "iSeries Virtual Ethernet driver support"
depends on PPC_ISERIES depends on PPC_ISERIES
......
...@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o ...@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
obj-$(CONFIG_SLIP) += slip.o obj-$(CONFIG_SLIP) += slip.o
obj-$(CONFIG_SLHC) += slhc.o obj-$(CONFIG_SLHC) += slhc.o
obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o obj-$(CONFIG_MACVLAN) += macvlan.o
......
...@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc) ...@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
sprintf(portarg, "%ld", bc->pdev->port->base); sprintf(portarg, "%ld", bc->pdev->port->base);
printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
return call_usermodehelper(eppconfig_path, argv, envp, 1); return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
......
此差异已折叠。
...@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info) ...@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
info->location_id, info->serial, info->capabilities); info->location_id, info->serial, info->capabilities);
envp[i] = NULL; envp[i] = NULL;
value = call_usermodehelper (argv [0], argv, envp, 0); value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
kfree (buf); kfree (buf);
kfree (envp); kfree (envp);
return 0; return 0;
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/kmod.h> #include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/oplib.h> #include <asm/oplib.h>
#include <asm/ebus.h> #include <asm/ebus.h>
...@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp) ...@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
{ {
static int shutting_down = 0; static int shutting_down = 0;
static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
char *type = "???"; char *type = "???";
s8 val = -1; s8 val = -1;
...@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) ...@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
shutting_down = 1; shutting_down = 1;
if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0) if (orderly_poweroff(true) < 0)
printk(KERN_CRIT "envctrl: shutdown execution failed\n"); printk(KERN_CRIT "envctrl: shutdown execution failed\n");
} }
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <linux/ioport.h> #include <linux/ioport.h>
#include <linux/miscdevice.h> #include <linux/miscdevice.h>
#include <linux/kmod.h> #include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/ebus.h> #include <asm/ebus.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) ...@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
static void envctrl_do_shutdown(void) static void envctrl_do_shutdown(void)
{ {
static int inprog = 0; static int inprog = 0;
static char *envp[] = {
"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = {
"/sbin/shutdown", "-h", "now", NULL };
int ret; int ret;
if (inprog != 0) if (inprog != 0)
...@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void) ...@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
inprog = 1; inprog = 1;
printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0); ret = orderly_poweroff(true);
if (ret < 0) { if (ret < 0) {
printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n");
inprog = 0; /* unlikely to succeed, but we could try again */ inprog = 0; /* unlikely to succeed, but we could try again */
......
obj-y += grant-table.o
obj-y += xenbus/
此差异已折叠。
obj-y += xenbus.o
xenbus-objs =
xenbus-objs += xenbus_client.o
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册