提交 84621c9b 编写于 作者: L Linus Torvalds

Merge tag 'stable/for-linus-3.14-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull Xen updates from Konrad Rzeszutek Wilk:
 "Two major features that Xen community is excited about:

  The first is event channel scalability by David Vrabel - we switch
  over from an two-level per-cpu bitmap of events (IRQs) - to an FIFO
  queue with priorities.  This lets us be able to handle more events,
  have lower latency, and better scalability.  Good stuff.

  The other is PVH by Mukesh Rathor.  In short, PV is a mode where the
  kernel lets the hypervisor program page-tables, segments, etc.  With
  EPT/NPT capabilities in current processors, the overhead of doing this
  in an HVM (Hardware Virtual Machine) container is much lower than the
  hypervisor doing it for us.

  In short we let a PV guest run without doing page-table, segment,
  syscall, etc updates through the hypervisor - instead it is all done
  within the guest container.  It is a "hybrid" PV - hence the 'PVH'
  name - a PV guest within an HVM container.

  The major benefits are less code to deal with - for example we only
  use one function from the the pv_mmu_ops (which has 39 function
  calls); faster performance for syscall (no context switches into the
  hypervisor); less traps on various operations; etc.

  It is still being baked - the ABI is not yet set in stone.  But it is
  pretty awesome and we are excited about it.

  Lastly, there are some changes to ARM code - you should get a simple
  conflict which has been resolved in #linux-next.

  In short, this pull has awesome features.

  Features:
   - FIFO event channels.  Key advantages: support for over 100,000
     events (2^17), 16 different event priorities, improved fairness in
     event latency through the use of FIFOs.
   - Xen PVH support.  "It’s a fully PV kernel mode, running with
     paravirtualized disk and network, paravirtualized interrupts and
     timers, no emulated devices of any kind (and thus no qemu), no BIOS
     or legacy boot — but instead of requiring PV MMU, it uses the HVM
     hardware extensions to virtualize the pagetables, as well as system
     calls and other privileged operations." (from "The
     Paravirtualization Spectrum, Part 2: From poles to a spectrum")

  Bug-fixes:
   - Fixes in balloon driver (refactor and make it work under ARM)
   - Allow xenfb to be used in HVM guests.
   - Allow xen_platform_pci=0 to work properly.
   - Refactors in event channels"

* tag 'stable/for-linus-3.14-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (52 commits)
  xen/pvh: Set X86_CR0_WP and others in CR0 (v2)
  MAINTAINERS: add git repository for Xen
  xen/pvh: Use 'depend' instead of 'select'.
  xen: delete new instances of __cpuinit usage
  xen/fb: allow xenfb initialization for hvm guests
  xen/evtchn_fifo: fix error return code in evtchn_fifo_setup()
  xen-platform: fix error return code in platform_pci_init()
  xen/pvh: remove duplicated include from enlighten.c
  xen/pvh: Fix compile issues with xen_pvh_domain()
  xen: Use dev_is_pci() to check whether it is pci device
  xen/grant-table: Force to use v1 of grants.
  xen/pvh: Support ParaVirtualized Hardware extensions (v3).
  xen/pvh: Piggyback on PVHVM XenBus.
  xen/pvh: Piggyback on PVHVM for grant driver (v4)
  xen/grant: Implement an grant frame array struct (v3).
  xen/grant-table: Refactor gnttab_init
  xen/grants: Remove gnttab_max_grant_frames dependency on gnttab_init.
  xen/pvh: Piggyback on PVHVM for event channels (v2)
  xen/pvh: Update E820 to work with PVH (v2)
  xen/pvh: Secondary VCPU bringup (non-bootup CPUs)
  ...
......@@ -9559,6 +9559,7 @@ M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
M: Boris Ostrovsky <boris.ostrovsky@oracle.com>
M: David Vrabel <david.vrabel@citrix.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)
T: git git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git
S: Supported
F: arch/x86/xen/
F: drivers/*/xen-*front.c
......
......@@ -117,6 +117,7 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return __set_phys_to_machine(pfn, mfn);
}
#define xen_remap(cookie, size) ioremap_cache((cookie), (size));
#define xen_remap(cookie, size) ioremap_cache((cookie), (size))
#define xen_unmap(cookie) iounmap((cookie))
#endif /* _ASM_ARM_XEN_PAGE_H */
......@@ -208,6 +208,7 @@ static int __init xen_guest_init(void)
const char *version = NULL;
const char *xen_prefix = "xen,xen-";
struct resource res;
unsigned long grant_frames;
node = of_find_compatible_node(NULL, NULL, "xen,xen");
if (!node) {
......@@ -224,10 +225,10 @@ static int __init xen_guest_init(void)
}
if (of_address_to_resource(node, GRANT_TABLE_PHYSADDR, &res))
return 0;
xen_hvm_resume_frames = res.start;
grant_frames = res.start;
xen_events_irq = irq_of_parse_and_map(node, 0);
pr_info("Xen %s support found, events_irq=%d gnttab_frame_pfn=%lx\n",
version, xen_events_irq, (xen_hvm_resume_frames >> PAGE_SHIFT));
version, xen_events_irq, (grant_frames >> PAGE_SHIFT));
xen_domain_type = XEN_HVM_DOMAIN;
xen_setup_features();
......@@ -265,6 +266,10 @@ static int __init xen_guest_init(void)
if (xen_vcpu_info == NULL)
return -ENOMEM;
if (gnttab_setup_auto_xlat_frames(grant_frames)) {
free_percpu(xen_vcpu_info);
return -ENOMEM;
}
gnttab_init();
if (!xen_initial_domain())
xenbus_probe(NULL);
......
......@@ -167,7 +167,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
*/
static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
{
unsigned long pfn = mfn_to_pfn(mfn);
unsigned long pfn;
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
pfn = mfn_to_pfn(mfn);
if (get_phys_to_machine(pfn) != mfn)
return -1; /* force !pfn_valid() */
return pfn;
......@@ -222,5 +227,6 @@ void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
#define xen_remap(cookie, size) ioremap((cookie), (size));
#define xen_unmap(cookie) iounmap((cookie))
#endif /* _ASM_X86_XEN_PAGE_H */
......@@ -51,3 +51,7 @@ config XEN_DEBUG_FS
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
config XEN_PVH
bool "Support for running as a PVH guest"
depends on X86_64 && XEN && XEN_PVHVM
def_bool n
......@@ -262,8 +262,9 @@ static void __init xen_banner(void)
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
pr_info("Booting paravirtualized kernel %son %s\n",
xen_feature(XENFEAT_auto_translated_physmap) ?
"with PVH extensions " : "", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
......@@ -433,7 +434,7 @@ static void __init xen_init_cpuid_mask(void)
ax = 1;
cx = 0;
xen_cpuid(&ax, &bx, &cx, &dx);
cpuid(1, &ax, &bx, &cx, &dx);
xsave_mask =
(1 << (X86_FEATURE_XSAVE % 32)) |
......@@ -1142,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)
xen_vcpu_setup(cpu);
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
if (have_vcpu_info_placement) {
* percpu area for all cpus, so make use of it. Note that for
* PVH we want to use native IRQ mechanism. */
if (have_vcpu_info_placement && !xen_pvh_domain()) {
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
......@@ -1407,9 +1409,49 @@ static void __init xen_boot_params_init_edd(void)
* Set up the GDT and segment registers for -fstack-protector. Until
* we do this, we have to be careful not to call any stack-protected
* function, which is most of the kernel.
*
* Note, that it is __ref because the only caller of this after init
* is PVH which is not going to use xen_load_gdt_boot or other
* __init functions.
*/
static void __init xen_setup_stackprotector(void)
static void __ref xen_setup_gdt(int cpu)
{
if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_X86_64
unsigned long dummy;
load_percpu_segment(cpu); /* We need to access per-cpu area */
switch_to_new_gdt(cpu); /* GDT and GS set */
/* We are switching of the Xen provided GDT to our HVM mode
* GDT. The new GDT has __KERNEL_CS with CS.L = 1
* and we are jumping to reload it.
*/
asm volatile ("pushq %0\n"
"leaq 1f(%%rip),%0\n"
"pushq %0\n"
"lretq\n"
"1:\n"
: "=&r" (dummy) : "0" (__KERNEL_CS));
/*
* While not needed, we also set the %es, %ds, and %fs
* to zero. We don't care about %ss as it is NULL.
* Strictly speaking this is not needed as Xen zeros those
* out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
*
* Linux zeros them in cpu_init() and in secondary_startup_64
* (for BSP).
*/
loadsegment(es, 0);
loadsegment(ds, 0);
loadsegment(fs, 0);
#else
/* PVH: TODO Implement. */
BUG();
#endif
return; /* PVH does not need any PV GDT ops. */
}
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
pv_cpu_ops.load_gdt = xen_load_gdt_boot;
......@@ -1420,6 +1462,46 @@ static void __init xen_setup_stackprotector(void)
pv_cpu_ops.load_gdt = xen_load_gdt;
}
/*
* A PV guest starts with default flags that are not set for PVH, set them
* here asap.
*/
static void xen_pvh_set_cr_flags(int cpu)
{
/* Some of these are setup in 'secondary_startup_64'. The others:
* X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
* (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
}
/*
* Note, that it is ref - because the only caller of this after init
* is PVH which is not going to use xen_load_gdt_boot or other
* __init functions.
*/
void __ref xen_pvh_secondary_vcpu_init(int cpu)
{
xen_setup_gdt(cpu);
xen_pvh_set_cr_flags(cpu);
}
static void __init xen_pvh_early_guest_init(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap))
return;
if (!xen_feature(XENFEAT_hvm_callback_vector))
return;
xen_have_vector_callback = 1;
xen_pvh_set_cr_flags(0);
#ifdef CONFIG_X86_32
BUG(); /* PVH: Implement proper support. */
#endif
}
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
......@@ -1431,13 +1513,16 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
xen_setup_features();
xen_pvh_early_guest_init();
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
if (!xen_pvh_domain())
pv_cpu_ops = xen_cpu_ops;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
......@@ -1469,17 +1554,14 @@ asmlinkage void __init xen_start_kernel(void)
/* Work out if we support NX */
x86_configure_nx();
xen_setup_features();
/* Get mfn list */
if (!xen_feature(XENFEAT_auto_translated_physmap))
xen_build_dynamic_phys_to_machine();
xen_build_dynamic_phys_to_machine();
/*
* Set up kernel GDT and segment registers, mainly so that
* -fstack-protector code can be executed.
*/
xen_setup_stackprotector();
xen_setup_gdt(0);
xen_init_irq_ops();
xen_init_cpuid_mask();
......@@ -1548,14 +1630,18 @@ asmlinkage void __init xen_start_kernel(void)
/* set the limit of our address space */
xen_reserve_top();
/* We used to do this in xen_arch_setup, but that is too late on AMD
* were early_cpu_init (run before ->arch_setup()) calls early_amd_init
* which pokes 0xcf8 port.
*/
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
xen_raw_printk("physdev_op failed %d\n", rc);
/* PVH: runs at default kernel iopl of 0 */
if (!xen_pvh_domain()) {
/*
* We used to do this in xen_arch_setup, but that is too late
* on AMD were early_cpu_init (run before ->arch_setup()) calls
* early_amd_init which pokes 0xcf8 port.
*/
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
xen_raw_printk("physdev_op failed %d\n", rc);
}
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
......
......@@ -125,3 +125,66 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
}
#ifdef CONFIG_XEN_PVH
#include <xen/balloon.h>
#include <xen/events.h>
#include <xen/xen.h>
#include <linux/slab.h>
static int __init xlated_setup_gnttab_pages(void)
{
struct page **pages;
xen_pfn_t *pfns;
int rc;
unsigned int i;
unsigned long nr_grant_frames = gnttab_max_grant_frames();
BUG_ON(nr_grant_frames == 0);
pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
if (!pages)
return -ENOMEM;
pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
if (!pfns) {
kfree(pages);
return -ENOMEM;
}
rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
if (rc) {
pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
kfree(pages);
kfree(pfns);
return rc;
}
for (i = 0; i < nr_grant_frames; i++)
pfns[i] = page_to_pfn(pages[i]);
rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
&xen_auto_xlat_grant_frames.vaddr);
kfree(pages);
if (rc) {
pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
free_xenballooned_pages(nr_grant_frames, pages);
kfree(pfns);
return rc;
}
xen_auto_xlat_grant_frames.pfn = pfns;
xen_auto_xlat_grant_frames.count = nr_grant_frames;
return 0;
}
static int __init xen_pvh_gnttab_setup(void)
{
if (!xen_pvh_domain())
return -ENODEV;
return xlated_setup_gnttab_pages();
}
/* Call it _before_ __gnttab_init as we need to initialize the
* xen_auto_xlat_grant_frames first. */
core_initcall(xen_pvh_gnttab_setup);
#endif
......@@ -5,6 +5,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
#include <xen/features.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
......@@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
void __init xen_init_irq_ops(void)
{
pv_irq_ops = xen_irq_ops;
/* For PVH we use default pv_irq_ops settings. */
if (!xen_feature(XENFEAT_hvm_callback_vector))
pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
}
......@@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
* instead of somewhere later and be confusing. */
xen_mc_flush();
}
#endif
static void __init xen_pagetable_init(void)
static void __init xen_pagetable_p2m_copy(void)
{
#ifdef CONFIG_X86_64
unsigned long size;
unsigned long addr;
#endif
paging_init();
xen_setup_shared_info();
#ifdef CONFIG_X86_64
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
unsigned long new_mfn_list;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
/* On 32-bit, we get zero so this never gets executed. */
new_mfn_list = xen_revector_p2m_tree();
if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
/* We should be in __ka space. */
BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
/* We roundup to the PMD, which means that if anybody at this stage is
* using the __ka address of xen_start_info or xen_start_info->shared_info
* they are in going to crash. Fortunatly we have already revectored
* in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
size = roundup(size, PMD_SIZE);
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
memblock_free(__pa(xen_start_info->mfn_list), size);
/* And revector! Bye bye old array */
xen_start_info->mfn_list = new_mfn_list;
} else
goto skip;
}
unsigned long new_mfn_list;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
new_mfn_list = xen_revector_p2m_tree();
/* No memory or already called. */
if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
return;
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
/* We should be in __ka space. */
BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
/* We roundup to the PMD, which means that if anybody at this stage is
* using the __ka address of xen_start_info or xen_start_info->shared_info
* they are in going to crash. Fortunatly we have already revectored
* in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
size = roundup(size, PMD_SIZE);
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
memblock_free(__pa(xen_start_info->mfn_list), size);
/* And revector! Bye bye old array */
xen_start_info->mfn_list = new_mfn_list;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
* the ramdisk). We continue on, erasing PMD entries that point to page
......@@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void)
* anything at this stage. */
xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
#endif
skip:
}
#endif
static void __init xen_pagetable_init(void)
{
paging_init();
xen_setup_shared_info();
#ifdef CONFIG_X86_64
xen_pagetable_p2m_copy();
#endif
xen_post_allocator_init();
}
......@@ -1753,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
/* For PVH no need to set R/O or R/W to pin them or unpin them. */
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
BUG();
}
......@@ -1863,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
* but that's enough to get __va working. We need to fill in the rest
* of the physical mapping once some sort of allocator has been set
* up.
* NOTE: for PVH, the page tables are native.
*/
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
......@@ -1884,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Zap identity mapping */
init_level4_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
/* L4[272] -> level3_ident_pgt
* L4[511] -> level3_kernel_pgt */
convert_pfn_mfn(init_level4_pgt);
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
/* L3_k[510] -> level2_kernel_pgt
* L3_i[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Pre-constructed entries are in pfn, so convert to mfn */
/* L4[272] -> level3_ident_pgt
* L4[511] -> level3_kernel_pgt */
convert_pfn_mfn(init_level4_pgt);
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
/* L3_k[510] -> level2_kernel_pgt
* L3_i[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
}
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
......@@ -1918,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
copy_page(level2_fixmap_pgt, l2);
/* Note that we don't do anything with level1_fixmap_pgt which
* we don't need. */
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
/*
* At this stage there can be no user pgd, and no page
* structure to attach it to, so make sure we just set kernel
* pgd.
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(init_level4_pgt));
xen_mc_issue(PARAVIRT_LAZY_CPU);
/*
* At this stage there can be no user pgd, and no page
* structure to attach it to, so make sure we just set kernel
* pgd.
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(init_level4_pgt));
xen_mc_issue(PARAVIRT_LAZY_CPU);
} else
native_write_cr3(__pa(init_level4_pgt));
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
......@@ -2103,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
static void __init xen_post_allocator_init(void)
{
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
......@@ -2207,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
/* Optimization - we can use the HVM one but it has no idea which
* VCPUs are descheduled - which means that it will needlessly IPI
* them. Xen knows so let it do the job.
*/
if (xen_feature(XENFEAT_auto_translated_physmap)) {
pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
return;
}
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
......
......@@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void)
{
unsigned long pfn;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
......@@ -336,6 +339,9 @@ void __ref xen_build_mfn_list_list(void)
void xen_setup_mfn_list_list(void)
{
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
......@@ -346,10 +352,15 @@ void xen_setup_mfn_list_list(void)
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned long *mfn_list;
unsigned long max_pfn;
unsigned long pfn;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
mfn_list = (unsigned long *)xen_start_info->mfn_list;
max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
xen_max_p2m_pfn = max_pfn;
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
......
......@@ -30,10 +30,9 @@
#define XEN_PLATFORM_ERR_PROTOCOL -2
#define XEN_PLATFORM_ERR_BLACKLIST -3
/* store the value of xen_emul_unplug after the unplug is done */
int xen_platform_pci_unplug;
EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
/* store the value of xen_emul_unplug after the unplug is done */
static int xen_platform_pci_unplug;
static int xen_emul_unplug;
static int check_platform_magic(void)
......@@ -69,6 +68,80 @@ static int check_platform_magic(void)
return 0;
}
bool xen_has_pv_devices()
{
if (!xen_domain())
return false;
/* PV domains always have them. */
if (xen_pv_domain())
return true;
/* And user has xen_platform_pci=0 set in guest config as
* driver did not modify the value. */
if (xen_platform_pci_unplug == 0)
return false;
if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
return false;
if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
return true;
/* This is an odd one - we are going to run legacy
* and PV drivers at the same time. */
if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
return true;
/* And the caller has to follow with xen_pv_{disk,nic}_devices
* to be certain which driver can load. */
return false;
}
EXPORT_SYMBOL_GPL(xen_has_pv_devices);
static bool __xen_has_pv_device(int state)
{
/* HVM domains might or might not */
if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
return true;
return xen_has_pv_devices();
}
bool xen_has_pv_nic_devices(void)
{
return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
}
EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
bool xen_has_pv_disk_devices(void)
{
return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
}
EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
/*
* This one is odd - it determines whether you want to run PV _and_
* legacy (IDE) drivers together. This combination is only possible
* under HVM.
*/
bool xen_has_pv_and_legacy_disk_devices(void)
{
if (!xen_domain())
return false;
/* N.B. This is only ever used in HVM mode */
if (xen_pv_domain())
return false;
if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
return true;
return false;
}
EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
void xen_unplug_emulated_devices(void)
{
int r;
......
......@@ -27,6 +27,7 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "mmu.h"
#include "xen-ops.h"
#include "vdso.h"
......@@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
......@@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
.domid = DOMID_SELF
};
unsigned long len = 0;
int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
unsigned long pfn;
int ret;
......@@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
continue;
frame = mfn;
} else {
if (mfn != INVALID_P2M_ENTRY)
if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
continue;
frame = pfn;
}
......@@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start,
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
/*
* Xen already ballooned out the E820 non RAM regions for us
* and set them up properly in EPT.
*/
if (xen_feature(XENFEAT_auto_translated_physmap))
return end - start;
return xen_do_chunk(start, end, true);
}
......@@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk(
* (except for the ISA region which must be 1:1 mapped) to
* release the refcounts (in Xen) on the original frames.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
/*
* PVH E820 matches the hypervisor's P2M which means we need to
* account for the proper values of *release and *identity.
*/
for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
pte_t pte = __pte_ma(0);
if (pfn < PFN_UP(ISA_END_ADDRESS))
......@@ -563,16 +581,13 @@ void xen_enable_nmi(void)
BUG();
#endif
}
void __init xen_arch_setup(void)
void __init xen_pvmmu_arch_setup(void)
{
xen_panic_handler_init();
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
......@@ -581,6 +596,15 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
xen_enable_nmi();
}
/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
xen_panic_handler_init();
if (!xen_feature(XENFEAT_auto_translated_physmap))
xen_pvmmu_arch_setup();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
......
......@@ -73,9 +73,11 @@ static void cpu_bringup(void)
touch_softlockup_watchdog();
preempt_disable();
xen_enable_sysenter();
xen_enable_syscall();
/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
xen_enable_sysenter();
xen_enable_syscall();
}
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
......@@ -97,8 +99,14 @@ static void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
static void cpu_bringup_and_idle(void)
/* Note: cpu parameter is only relevant for PVH */
static void cpu_bringup_and_idle(int cpu)
{
#ifdef CONFIG_X86_64
if (xen_feature(XENFEAT_auto_translated_physmap) &&
xen_feature(XENFEAT_supervisor_mode_kernel))
xen_pvh_secondary_vcpu_init(cpu);
#endif
cpu_bringup();
cpu_startup_entry(CPUHP_ONLINE);
}
......@@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void)
native_smp_prepare_boot_cpu();
if (xen_pv_domain()) {
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
if (!xen_feature(XENFEAT_writable_page_tables))
/* We've switched to the "real" per-cpu gdt, so make
* sure the old memory can be recycled. */
make_lowmem_page_readwrite(xen_initial_gdt);
#ifdef CONFIG_X86_32
/*
......@@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
/* Note: PVH is not yet supported on x86_32. */
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
xen_copy_trap_info(ctxt->trap_ctxt);
......@@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->event_callback_eip =
(unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
#ifdef CONFIG_X86_32
}
ctxt->user_regs.cs = __KERNEL_CS;
#else
} else
/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
* %rdi having the cpu number - which means are passing in
* as the first parameter the cpu. Subtle!
*/
ctxt->user_regs.rdi = cpu;
#endif
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
......
......@@ -446,6 +446,7 @@ void xen_setup_timer(int cpu)
IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
IRQF_FORCE_RESUME,
name, NULL);
(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
memcpy(evt, xen_clockevent, sizeof(*evt));
......
......@@ -11,8 +11,28 @@
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
#include <xen/interface/features.h>
#include <asm/xen/interface.h>
#ifdef CONFIG_XEN_PVH
#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
* balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
* XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
*/
#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
(1 << XENFEAT_auto_translated_physmap) | \
(1 << XENFEAT_supervisor_mode_kernel) | \
(1 << XENFEAT_hvm_callback_vector))
/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
* up regardless whether this CONFIG option is enabled or not, but it
* clarifies what the right flags need to be.
*/
#else
#define PVH_FEATURES_STR ""
#define PVH_FEATURES (0)
#endif
__INIT
ENTRY(startup_xen)
cld
......@@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
(1 << XENFEAT_writable_page_tables) |
(1 << XENFEAT_dom0))
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
......
......@@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
void xen_pvh_secondary_vcpu_init(int cpu);
#endif /* XEN_OPS_H */
......@@ -1356,7 +1356,7 @@ static int blkfront_probe(struct xenbus_device *dev,
char *type;
int len;
/* no unplug has been done: do not hook devices != xen vbds */
if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
if (xen_has_pv_and_legacy_disk_devices()) {
int major;
if (!VDEV_IS_EXTENDED(vdevice))
......@@ -2079,7 +2079,7 @@ static int __init xlblk_init(void)
if (!xen_domain())
return -ENODEV;
if (xen_hvm_domain() && !xen_platform_pci_unplug)
if (!xen_has_pv_disk_devices())
return -ENODEV;
if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
......
......@@ -17,6 +17,7 @@
#include <xen/xenbus.h>
#include <xen/page.h>
#include "tpm.h"
#include <xen/platform_pci.h>
struct tpm_private {
struct tpm_chip *chip;
......@@ -378,6 +379,9 @@ static int __init xen_tpmfront_init(void)
if (!xen_domain())
return -ENODEV;
if (!xen_has_pv_devices())
return -ENODEV;
return xenbus_register_frontend(&tpmfront_driver);
}
module_init(xen_tpmfront_init);
......
......@@ -29,6 +29,7 @@
#include <xen/interface/io/fbif.h>
#include <xen/interface/io/kbdif.h>
#include <xen/xenbus.h>
#include <xen/platform_pci.h>
struct xenkbd_info {
struct input_dev *kbd;
......@@ -380,6 +381,9 @@ static int __init xenkbd_init(void)
if (xen_initial_domain())
return -ENODEV;
if (!xen_has_pv_devices())
return -ENODEV;
return xenbus_register_frontend(&xenkbd_driver);
}
......
......@@ -2115,7 +2115,7 @@ static int __init netif_init(void)
if (!xen_domain())
return -ENODEV;
if (xen_hvm_domain() && !xen_platform_pci_unplug)
if (!xen_has_pv_nic_devices())
return -ENODEV;
pr_info("Initialising Xen virtual ethernet driver\n");
......
......@@ -20,6 +20,7 @@
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/time.h>
#include <xen/platform_pci.h>
#include <asm/xen/swiotlb-xen.h>
#define INVALID_GRANT_REF (0)
......@@ -1146,6 +1147,9 @@ static int __init pcifront_init(void)
if (!xen_pv_domain() || xen_initial_domain())
return -ENODEV;
if (!xen_has_pv_devices())
return -ENODEV;
pci_frontend_registrar(1 /* enable */);
return xenbus_register_frontend(&xenpci_driver);
......
......@@ -35,6 +35,7 @@
#include <xen/interface/io/fbif.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus.h>
#include <xen/platform_pci.h>
struct xenfb_info {
unsigned char *fb;
......@@ -692,13 +693,16 @@ static DEFINE_XENBUS_DRIVER(xenfb, ,
static int __init xenfb_init(void)
{
if (!xen_pv_domain())
if (!xen_domain())
return -ENODEV;
/* Nothing to do if running in dom0. */
if (xen_initial_domain())
return -ENODEV;
if (!xen_has_pv_devices())
return -ENODEV;
return xenbus_register_frontend(&xenfb_driver);
}
......
......@@ -3,7 +3,6 @@ menu "Xen driver support"
config XEN_BALLOON
bool "Xen memory balloon driver"
depends on !ARM
default y
help
The balloon driver allows the Xen domain to request more memory from
......
......@@ -2,7 +2,8 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),)
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
endif
obj-$(CONFIG_X86) += fallback.o
obj-y += grant-table.o features.o events.o balloon.o manage.o
obj-y += grant-table.o features.o balloon.o manage.o
obj-y += events/
obj-y += xenbus/
nostackp := $(call cc-option, -fno-stack-protector)
......
......@@ -157,13 +157,6 @@ static struct page *balloon_retrieve(bool prefer_highmem)
return page;
}
static struct page *balloon_first_page(void)
{
if (list_empty(&ballooned_pages))
return NULL;
return list_entry(ballooned_pages.next, struct page, lru);
}
static struct page *balloon_next_page(struct page *page)
{
struct list_head *next = page->lru.next;
......@@ -328,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
page = balloon_first_page();
page = list_first_entry_or_null(&ballooned_pages, struct page, lru);
for (i = 0; i < nr_pages; i++) {
if (!page) {
nr_pages = i;
......
......@@ -19,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op)
dbgp.op = op;
#ifdef CONFIG_PCI
if (ctrlr->bus == &pci_bus_type) {
if (dev_is_pci(ctrlr)) {
const struct pci_dev *pdev = to_pci_dev(ctrlr);
dbgp.u.pci.seg = pci_domain_nr(pdev->bus);
......
obj-y += events.o
events-y += events_base.o
events-y += events_2l.o
events-y += events_fifo.o
/*
* Xen event channels (2-level ABI)
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
#include <linux/linkage.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/xen.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include "events_internal.h"
/*
* Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be
* careful to only use bitops which allow for this (e.g
* test_bit/find_first_bit and friends but not __ffs) and to pass
* BITS_PER_EVTCHN_WORD as the bitmask length.
*/
#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8)
/*
* Make a bitmask (i.e. unsigned long *) of a xen_ulong_t
* array. Primarily to avoid long lines (hence the terse name).
*/
#define BM(x) (unsigned long *)(x)
/* Find the first set bit in a evtchn mask */
#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD],
cpu_evtchn_mask);
static unsigned evtchn_2l_max_channels(void)
{
return EVTCHN_2L_NR_CHANNELS;
}
static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu)
{
clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu)));
set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu)));
}
static void evtchn_2l_clear_pending(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_clear_bit(port, BM(&s->evtchn_pending[0]));
}
static void evtchn_2l_set_pending(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, BM(&s->evtchn_pending[0]));
}
static bool evtchn_2l_is_pending(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
return sync_test_bit(port, BM(&s->evtchn_pending[0]));
}
static bool evtchn_2l_test_and_set_mask(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
}
static void evtchn_2l_mask(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, BM(&s->evtchn_mask[0]));
}
static void evtchn_2l_unmask(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
unsigned int cpu = get_cpu();
int do_hypercall = 0, evtchn_pending = 0;
BUG_ON(!irqs_disabled());
if (unlikely((cpu != cpu_from_evtchn(port))))
do_hypercall = 1;
else {
/*
* Need to clear the mask before checking pending to
* avoid a race with an event becoming pending.
*
* EVTCHNOP_unmask will only trigger an upcall if the
* mask bit was set, so if a hypercall is needed
* remask the event.
*/
sync_clear_bit(port, BM(&s->evtchn_mask[0]));
evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0]));
if (unlikely(evtchn_pending && xen_hvm_domain())) {
sync_set_bit(port, BM(&s->evtchn_mask[0]));
do_hypercall = 1;
}
}
/* Slow path (hypercall) if this is a non-local port or if this is
* an hvm domain and an event is pending (hvm domains don't have
* their own implementation of irq_enable). */
if (do_hypercall) {
struct evtchn_unmask unmask = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
} else {
struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
/*
* The following is basically the equivalent of
* 'hw_resend_irq'. Just like a real IO-APIC we 'lose
* the interrupt edge' if the channel is masked.
*/
if (evtchn_pending &&
!sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
BM(&vcpu_info->evtchn_pending_sel)))
vcpu_info->evtchn_upcall_pending = 1;
}
put_cpu();
}
static DEFINE_PER_CPU(unsigned int, current_word_idx);
static DEFINE_PER_CPU(unsigned int, current_bit_idx);
/*
* Mask out the i least significant bits of w
*/
#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i))
static inline xen_ulong_t active_evtchns(unsigned int cpu,
struct shared_info *sh,
unsigned int idx)
{
return sh->evtchn_pending[idx] &
per_cpu(cpu_evtchn_mask, cpu)[idx] &
~sh->evtchn_mask[idx];
}
/*
* Search the CPU's pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for handling.
*
* Xen uses a two-level bitmap to speed searching. The first level is
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
static void evtchn_2l_handle_events(unsigned cpu)
{
int irq;
xen_ulong_t pending_words;
xen_ulong_t pending_bits;
int start_word_idx, start_bit_idx;
int word_idx, bit_idx;
int i;
struct irq_desc *desc;
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
/* Timer interrupt has highest priority. */
irq = irq_from_virq(cpu, VIRQ_TIMER);
if (irq != -1) {
unsigned int evtchn = evtchn_from_irq(irq);
word_idx = evtchn / BITS_PER_LONG;
bit_idx = evtchn % BITS_PER_LONG;
if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) {
desc = irq_to_desc(irq);
if (desc)
generic_handle_irq_desc(irq, desc);
}
}
/*
* Master flag must be cleared /before/ clearing
* selector flag. xchg_xen_ulong must contain an
* appropriate barrier.
*/
pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0);
start_word_idx = __this_cpu_read(current_word_idx);
start_bit_idx = __this_cpu_read(current_bit_idx);
word_idx = start_word_idx;
for (i = 0; pending_words != 0; i++) {
xen_ulong_t words;
words = MASK_LSBS(pending_words, word_idx);
/*
* If we masked out all events, wrap to beginning.
*/
if (words == 0) {
word_idx = 0;
bit_idx = 0;
continue;
}
word_idx = EVTCHN_FIRST_BIT(words);
pending_bits = active_evtchns(cpu, s, word_idx);
bit_idx = 0; /* usually scan entire word from start */
/*
* We scan the starting word in two parts.
*
* 1st time: start in the middle, scanning the
* upper bits.
*
* 2nd time: scan the whole word (not just the
* parts skipped in the first pass) -- if an
* event in the previously scanned bits is
* pending again it would just be scanned on
* the next loop anyway.
*/
if (word_idx == start_word_idx) {
if (i == 0)
bit_idx = start_bit_idx;
}
do {
xen_ulong_t bits;
int port;
bits = MASK_LSBS(pending_bits, bit_idx);
/* If we masked out all events, move on. */
if (bits == 0)
break;
bit_idx = EVTCHN_FIRST_BIT(bits);
/* Process port. */
port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx;
irq = get_evtchn_to_irq(port);
if (irq != -1) {
desc = irq_to_desc(irq);
if (desc)
generic_handle_irq_desc(irq, desc);
}
bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
/* Next caller starts at last processed + 1 */
__this_cpu_write(current_word_idx,
bit_idx ? word_idx :
(word_idx+1) % BITS_PER_EVTCHN_WORD);
__this_cpu_write(current_bit_idx, bit_idx);
} while (bit_idx != 0);
/* Scan start_l1i twice; all others once. */
if ((word_idx != start_word_idx) || (i != 0))
pending_words &= ~(1UL << word_idx);
word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD;
}
}
irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
{
struct shared_info *sh = HYPERVISOR_shared_info;
int cpu = smp_processor_id();
xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
int i;
unsigned long flags;
static DEFINE_SPINLOCK(debug_lock);
struct vcpu_info *v;
spin_lock_irqsave(&debug_lock, flags);
printk("\nvcpu %d\n ", cpu);
for_each_online_cpu(i) {
int pending;
v = per_cpu(xen_vcpu, i);
pending = (get_irq_regs() && i == cpu)
? xen_irqs_disabled(get_irq_regs())
: v->evtchn_upcall_mask;
printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i,
pending, v->evtchn_upcall_pending,
(int)(sizeof(v->evtchn_pending_sel)*2),
v->evtchn_pending_sel);
}
v = per_cpu(xen_vcpu, cpu);
printk("\npending:\n ");
for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
printk("%0*"PRI_xen_ulong"%s",
(int)sizeof(sh->evtchn_pending[0])*2,
sh->evtchn_pending[i],
i % 8 == 0 ? "\n " : " ");
printk("\nglobal mask:\n ");
for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
printk("%0*"PRI_xen_ulong"%s",
(int)(sizeof(sh->evtchn_mask[0])*2),
sh->evtchn_mask[i],
i % 8 == 0 ? "\n " : " ");
printk("\nglobally unmasked:\n ");
for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
printk("%0*"PRI_xen_ulong"%s",
(int)(sizeof(sh->evtchn_mask[0])*2),
sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
i % 8 == 0 ? "\n " : " ");
printk("\nlocal cpu%d mask:\n ", cpu);
for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2),
cpu_evtchn[i],
i % 8 == 0 ? "\n " : " ");
printk("\nlocally unmasked:\n ");
for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
xen_ulong_t pending = sh->evtchn_pending[i]
& ~sh->evtchn_mask[i]
& cpu_evtchn[i];
printk("%0*"PRI_xen_ulong"%s",
(int)(sizeof(sh->evtchn_mask[0])*2),
pending, i % 8 == 0 ? "\n " : " ");
}
printk("\npending list:\n");
for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) {
if (sync_test_bit(i, BM(sh->evtchn_pending))) {
int word_idx = i / BITS_PER_EVTCHN_WORD;
printk(" %d: event %d -> irq %d%s%s%s\n",
cpu_from_evtchn(i), i,
get_evtchn_to_irq(i),
sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
? "" : " l2-clear",
!sync_test_bit(i, BM(sh->evtchn_mask))
? "" : " globally-masked",
sync_test_bit(i, BM(cpu_evtchn))
? "" : " locally-masked");
}
}
spin_unlock_irqrestore(&debug_lock, flags);
return IRQ_HANDLED;
}
static const struct evtchn_ops evtchn_ops_2l = {
.max_channels = evtchn_2l_max_channels,
.nr_channels = evtchn_2l_max_channels,
.bind_to_cpu = evtchn_2l_bind_to_cpu,
.clear_pending = evtchn_2l_clear_pending,
.set_pending = evtchn_2l_set_pending,
.is_pending = evtchn_2l_is_pending,
.test_and_set_mask = evtchn_2l_test_and_set_mask,
.mask = evtchn_2l_mask,
.unmask = evtchn_2l_unmask,
.handle_events = evtchn_2l_handle_events,
};
void __init xen_evtchn_2l_init(void)
{
pr_info("Using 2-level ABI\n");
evtchn_ops = &evtchn_ops_2l;
}
/*
* Xen event channels (FIFO-based ABI)
*
* Copyright (C) 2013 Citrix Systems R&D ltd.
*
* This source code is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* Or, when distributed separately from the Linux kernel or
* incorporated into other software packages, subject to the following
* license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
#include <linux/linkage.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/page.h>
#include <xen/xen.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include "events_internal.h"
#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
struct evtchn_fifo_queue {
uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
};
static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly;
static unsigned event_array_pages __read_mostly;
#define BM(w) ((unsigned long *)(w))
static inline event_word_t *event_word_from_port(unsigned port)
{
unsigned i = port / EVENT_WORDS_PER_PAGE;
return event_array[i] + port % EVENT_WORDS_PER_PAGE;
}
static unsigned evtchn_fifo_max_channels(void)
{
return EVTCHN_FIFO_NR_CHANNELS;
}
static unsigned evtchn_fifo_nr_channels(void)
{
return event_array_pages * EVENT_WORDS_PER_PAGE;
}
static void free_unused_array_pages(void)
{
unsigned i;
for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) {
if (!event_array[i])
break;
free_page((unsigned long)event_array[i]);
event_array[i] = NULL;
}
}
static void init_array_page(event_word_t *array_page)
{
unsigned i;
for (i = 0; i < EVENT_WORDS_PER_PAGE; i++)
array_page[i] = 1 << EVTCHN_FIFO_MASKED;
}
static int evtchn_fifo_setup(struct irq_info *info)
{
unsigned port = info->evtchn;
unsigned new_array_pages;
int ret;
new_array_pages = port / EVENT_WORDS_PER_PAGE + 1;
if (new_array_pages > MAX_EVENT_ARRAY_PAGES)
return -EINVAL;
while (event_array_pages < new_array_pages) {
void *array_page;
struct evtchn_expand_array expand_array;
/* Might already have a page if we've resumed. */
array_page = event_array[event_array_pages];
if (!array_page) {
array_page = (void *)__get_free_page(GFP_KERNEL);
if (array_page == NULL) {
ret = -ENOMEM;
goto error;
}
event_array[event_array_pages] = array_page;
}
/* Mask all events in this page before adding it. */
init_array_page(array_page);
expand_array.array_gfn = virt_to_mfn(array_page);
ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
if (ret < 0)
goto error;
event_array_pages++;
}
return 0;
error:
if (event_array_pages == 0)
panic("xen: unable to expand event array with initial page (%d)\n", ret);
else
pr_err("unable to expand event array (%d)\n", ret);
free_unused_array_pages();
return ret;
}
static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
{
/* no-op */
}
static void evtchn_fifo_clear_pending(unsigned port)
{
event_word_t *word = event_word_from_port(port);
sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word));
}
static void evtchn_fifo_set_pending(unsigned port)
{
event_word_t *word = event_word_from_port(port);
sync_set_bit(EVTCHN_FIFO_PENDING, BM(word));
}
static bool evtchn_fifo_is_pending(unsigned port)
{
event_word_t *word = event_word_from_port(port);
return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word));
}
static bool evtchn_fifo_test_and_set_mask(unsigned port)
{
event_word_t *word = event_word_from_port(port);
return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word));
}
static void evtchn_fifo_mask(unsigned port)
{
event_word_t *word = event_word_from_port(port);
sync_set_bit(EVTCHN_FIFO_MASKED, BM(word));
}
/*
* Clear MASKED, spinning if BUSY is set.
*/
static void clear_masked(volatile event_word_t *word)
{
event_word_t new, old, w;
w = *word;
do {
old = w & ~(1 << EVTCHN_FIFO_BUSY);
new = old & ~(1 << EVTCHN_FIFO_MASKED);
w = sync_cmpxchg(word, old, new);
} while (w != old);
}
static void evtchn_fifo_unmask(unsigned port)
{
event_word_t *word = event_word_from_port(port);
BUG_ON(!irqs_disabled());
clear_masked(word);
if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))) {
struct evtchn_unmask unmask = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
}
}
static uint32_t clear_linked(volatile event_word_t *word)
{
event_word_t new, old, w;
w = *word;
do {
old = w;
new = (w & ~((1 << EVTCHN_FIFO_LINKED)
| EVTCHN_FIFO_LINK_MASK));
} while ((w = sync_cmpxchg(word, old, new)) != old);
return w & EVTCHN_FIFO_LINK_MASK;
}
static void handle_irq_for_port(unsigned port)
{
int irq;
struct irq_desc *desc;
irq = get_evtchn_to_irq(port);
if (irq != -1) {
desc = irq_to_desc(irq);
if (desc)
generic_handle_irq_desc(irq, desc);
}
}
static void consume_one_event(unsigned cpu,
struct evtchn_fifo_control_block *control_block,
unsigned priority, uint32_t *ready)
{
struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
uint32_t head;
unsigned port;
event_word_t *word;
head = q->head[priority];
/*
* Reached the tail last time? Read the new HEAD from the
* control block.
*/
if (head == 0) {
rmb(); /* Ensure word is up-to-date before reading head. */
head = control_block->head[priority];
}
port = head;
word = event_word_from_port(port);
head = clear_linked(word);
/*
* If the link is non-zero, there are more events in the
* queue, otherwise the queue is empty.
*
* If the queue is empty, clear this priority from our local
* copy of the ready word.
*/
if (head == 0)
clear_bit(priority, BM(ready));
if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))
&& !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word)))
handle_irq_for_port(port);
q->head[priority] = head;
}
static void evtchn_fifo_handle_events(unsigned cpu)
{
struct evtchn_fifo_control_block *control_block;
uint32_t ready;
unsigned q;
control_block = per_cpu(cpu_control_block, cpu);
ready = xchg(&control_block->ready, 0);
while (ready) {
q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES);
consume_one_event(cpu, control_block, q, &ready);
ready |= xchg(&control_block->ready, 0);
}
}
static void evtchn_fifo_resume(void)
{
unsigned cpu;
for_each_possible_cpu(cpu) {
void *control_block = per_cpu(cpu_control_block, cpu);
struct evtchn_init_control init_control;
int ret;
if (!control_block)
continue;
/*
* If this CPU is offline, take the opportunity to
* free the control block while it is not being
* used.
*/
if (!cpu_online(cpu)) {
free_page((unsigned long)control_block);
per_cpu(cpu_control_block, cpu) = NULL;
continue;
}
init_control.control_gfn = virt_to_mfn(control_block);
init_control.offset = 0;
init_control.vcpu = cpu;
ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
&init_control);
if (ret < 0)
BUG();
}
/*
* The event array starts out as empty again and is extended
* as normal when events are bound. The existing pages will
* be reused.
*/
event_array_pages = 0;
}
static const struct evtchn_ops evtchn_ops_fifo = {
.max_channels = evtchn_fifo_max_channels,
.nr_channels = evtchn_fifo_nr_channels,
.setup = evtchn_fifo_setup,
.bind_to_cpu = evtchn_fifo_bind_to_cpu,
.clear_pending = evtchn_fifo_clear_pending,
.set_pending = evtchn_fifo_set_pending,
.is_pending = evtchn_fifo_is_pending,
.test_and_set_mask = evtchn_fifo_test_and_set_mask,
.mask = evtchn_fifo_mask,
.unmask = evtchn_fifo_unmask,
.handle_events = evtchn_fifo_handle_events,
.resume = evtchn_fifo_resume,
};
static int evtchn_fifo_init_control_block(unsigned cpu)
{
struct page *control_block = NULL;
struct evtchn_init_control init_control;
int ret = -ENOMEM;
control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (control_block == NULL)
goto error;
init_control.control_gfn = virt_to_mfn(page_address(control_block));
init_control.offset = 0;
init_control.vcpu = cpu;
ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
if (ret < 0)
goto error;
per_cpu(cpu_control_block, cpu) = page_address(control_block);
return 0;
error:
__free_page(control_block);
return ret;
}
static int evtchn_fifo_cpu_notification(struct notifier_block *self,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
int ret = 0;
switch (action) {
case CPU_UP_PREPARE:
if (!per_cpu(cpu_control_block, cpu))
ret = evtchn_fifo_init_control_block(cpu);
break;
default:
break;
}
return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
}
static struct notifier_block evtchn_fifo_cpu_notifier = {
.notifier_call = evtchn_fifo_cpu_notification,
};
int __init xen_evtchn_fifo_init(void)
{
int cpu = get_cpu();
int ret;
ret = evtchn_fifo_init_control_block(cpu);
if (ret < 0)
goto out;
pr_info("Using FIFO-based ABI\n");
evtchn_ops = &evtchn_ops_fifo;
register_cpu_notifier(&evtchn_fifo_cpu_notifier);
out:
put_cpu();
return ret;
}
/*
* Xen Event Channels (internal header)
*
* Copyright (C) 2013 Citrix Systems R&D Ltd.
*
* This source code is licensed under the GNU General Public License,
* Version 2 or later. See the file COPYING for more details.
*/
#ifndef __EVENTS_INTERNAL_H__
#define __EVENTS_INTERNAL_H__
/* Interrupt types. */
enum xen_irq_type {
IRQT_UNBOUND = 0,
IRQT_PIRQ,
IRQT_VIRQ,
IRQT_IPI,
IRQT_EVTCHN
};
/*
* Packed IRQ information:
* type - enum xen_irq_type
* event channel - irq->event channel mapping
* cpu - cpu this event channel is bound to
* index - type-specific information:
* PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
* guest, or GSI (real passthrough IRQ) of the device.
* VIRQ - virq number
* IPI - IPI vector
* EVTCHN -
*/
struct irq_info {
struct list_head list;
int refcnt;
enum xen_irq_type type; /* type */
unsigned irq;
unsigned int evtchn; /* event channel */
unsigned short cpu; /* cpu bound */
union {
unsigned short virq;
enum ipi_vector ipi;
struct {
unsigned short pirq;
unsigned short gsi;
unsigned char vector;
unsigned char flags;
uint16_t domid;
} pirq;
} u;
};
#define PIRQ_NEEDS_EOI (1 << 0)
#define PIRQ_SHAREABLE (1 << 1)
struct evtchn_ops {
unsigned (*max_channels)(void);
unsigned (*nr_channels)(void);
int (*setup)(struct irq_info *info);
void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
void (*clear_pending)(unsigned port);
void (*set_pending)(unsigned port);
bool (*is_pending)(unsigned port);
bool (*test_and_set_mask)(unsigned port);
void (*mask)(unsigned port);
void (*unmask)(unsigned port);
void (*handle_events)(unsigned cpu);
void (*resume)(void);
};
extern const struct evtchn_ops *evtchn_ops;
extern int **evtchn_to_irq;
int get_evtchn_to_irq(unsigned int evtchn);
struct irq_info *info_for_irq(unsigned irq);
unsigned cpu_from_irq(unsigned irq);
unsigned cpu_from_evtchn(unsigned int evtchn);
static inline unsigned xen_evtchn_max_channels(void)
{
return evtchn_ops->max_channels();
}
/*
* Do any ABI specific setup for a bound event channel before it can
* be unmasked and used.
*/
static inline int xen_evtchn_port_setup(struct irq_info *info)
{
if (evtchn_ops->setup)
return evtchn_ops->setup(info);
return 0;
}
static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
unsigned cpu)
{
evtchn_ops->bind_to_cpu(info, cpu);
}
static inline void clear_evtchn(unsigned port)
{
evtchn_ops->clear_pending(port);
}
static inline void set_evtchn(unsigned port)
{
evtchn_ops->set_pending(port);
}
static inline bool test_evtchn(unsigned port)
{
return evtchn_ops->is_pending(port);
}
static inline bool test_and_set_mask(unsigned port)
{
return evtchn_ops->test_and_set_mask(port);
}
static inline void mask_evtchn(unsigned port)
{
return evtchn_ops->mask(port);
}
static inline void unmask_evtchn(unsigned port)
{
return evtchn_ops->unmask(port);
}
static inline void xen_evtchn_handle_events(unsigned cpu)
{
return evtchn_ops->handle_events(cpu);
}
static inline void xen_evtchn_resume(void)
{
if (evtchn_ops->resume)
evtchn_ops->resume();
}
void xen_evtchn_2l_init(void);
int xen_evtchn_fifo_init(void);
#endif /* #ifndef __EVENTS_INTERNAL_H__ */
......@@ -417,7 +417,7 @@ static long evtchn_ioctl(struct file *file,
break;
rc = -EINVAL;
if (unbind.port >= NR_EVENT_CHANNELS)
if (unbind.port >= xen_evtchn_nr_channels())
break;
rc = -ENOTCONN;
......
......@@ -846,7 +846,7 @@ static int __init gntdev_init(void)
if (!xen_domain())
return -ENODEV;
use_ptemod = xen_pv_domain();
use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);
err = misc_register(&gntdev_miscdev);
if (err != 0) {
......
......@@ -62,12 +62,10 @@
static grant_ref_t **gnttab_list;
static unsigned int nr_grant_frames;
static unsigned int boot_max_nr_grant_frames;
static int gnttab_free_count;
static grant_ref_t gnttab_free_head;
static DEFINE_SPINLOCK(gnttab_list_lock);
unsigned long xen_hvm_resume_frames;
EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
struct grant_frames xen_auto_xlat_grant_frames;
static union {
struct grant_entry_v1 *v1;
......@@ -827,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void)
unsigned int gnttab_max_grant_frames(void)
{
unsigned int xen_max = __max_nr_grant_frames();
static unsigned int boot_max_nr_grant_frames;
/* First time, initialize it properly. */
if (!boot_max_nr_grant_frames)
boot_max_nr_grant_frames = __max_nr_grant_frames();
if (xen_max > boot_max_nr_grant_frames)
return boot_max_nr_grant_frames;
......@@ -834,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void)
}
EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
int gnttab_setup_auto_xlat_frames(unsigned long addr)
{
xen_pfn_t *pfn;
unsigned int max_nr_gframes = __max_nr_grant_frames();
unsigned int i;
void *vaddr;
if (xen_auto_xlat_grant_frames.count)
return -EINVAL;
vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes);
if (vaddr == NULL) {
pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n",
addr);
return -ENOMEM;
}
pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL);
if (!pfn) {
xen_unmap(vaddr);
return -ENOMEM;
}
for (i = 0; i < max_nr_gframes; i++)
pfn[i] = PFN_DOWN(addr) + i;
xen_auto_xlat_grant_frames.vaddr = vaddr;
xen_auto_xlat_grant_frames.pfn = pfn;
xen_auto_xlat_grant_frames.count = max_nr_gframes;
return 0;
}
EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames);
void gnttab_free_auto_xlat_frames(void)
{
if (!xen_auto_xlat_grant_frames.count)
return;
kfree(xen_auto_xlat_grant_frames.pfn);
xen_unmap(xen_auto_xlat_grant_frames.vaddr);
xen_auto_xlat_grant_frames.pfn = NULL;
xen_auto_xlat_grant_frames.count = 0;
xen_auto_xlat_grant_frames.vaddr = NULL;
}
EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames);
/* Handling of paged out grant targets (GNTST_eagain) */
#define MAX_DELAY 256
static inline void
......@@ -1060,10 +1108,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
unsigned int nr_gframes = end_idx + 1;
int rc;
if (xen_hvm_domain()) {
if (xen_feature(XENFEAT_auto_translated_physmap)) {
struct xen_add_to_physmap xatp;
unsigned int i = end_idx;
rc = 0;
BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes);
/*
* Loop backwards, so that the first hypercall has the largest
* index, ensuring that the table will grow only once.
......@@ -1072,7 +1121,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
xatp.domid = DOMID_SELF;
xatp.idx = i;
xatp.space = XENMAPSPACE_grant_table;
xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i];
rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
if (rc != 0) {
pr_warn("grant table add_to_physmap failed, err=%d\n",
......@@ -1135,10 +1184,8 @@ static void gnttab_request_version(void)
int rc;
struct gnttab_set_version gsv;
if (xen_hvm_domain())
gsv.version = 1;
else
gsv.version = 2;
gsv.version = 1;
rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
if (rc == 0 && gsv.version == 2) {
grant_table_version = 2;
......@@ -1169,22 +1216,15 @@ static int gnttab_setup(void)
if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
if (xen_pv_domain())
return gnttab_map(0, nr_grant_frames - 1);
if (gnttab_shared.addr == NULL) {
gnttab_shared.addr = xen_remap(xen_hvm_resume_frames,
PAGE_SIZE * max_nr_gframes);
if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) {
gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;
if (gnttab_shared.addr == NULL) {
pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n",
xen_hvm_resume_frames);
pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n",
(unsigned long)xen_auto_xlat_grant_frames.vaddr);
return -ENOMEM;
}
}
gnttab_map(0, nr_grant_frames - 1);
return 0;
return gnttab_map(0, nr_grant_frames - 1);
}
int gnttab_resume(void)
......@@ -1227,13 +1267,12 @@ int gnttab_init(void)
gnttab_request_version();
nr_grant_frames = 1;
boot_max_nr_grant_frames = __max_nr_grant_frames();
/* Determine the maximum number of frames required for the
* grant reference free list on the current hypervisor.
*/
BUG_ON(grefs_per_grant_frame == 0);
max_nr_glist_frames = (boot_max_nr_grant_frames *
max_nr_glist_frames = (gnttab_max_grant_frames() *
grefs_per_grant_frame / RPP);
gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
......@@ -1286,5 +1325,6 @@ static int __gnttab_init(void)
return gnttab_init();
}
core_initcall(__gnttab_init);
/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called
* beforehand to initialize xen_auto_xlat_grant_frames. */
core_initcall_sync(__gnttab_init);
......@@ -26,7 +26,9 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include "../pci/pci.h"
#ifdef CONFIG_PCI_MMCONFIG
#include <asm/pci_x86.h>
#endif
static bool __read_mostly pci_seg_supported = true;
......
......@@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev,
long ioaddr;
long mmio_addr, mmio_len;
unsigned int max_nr_gframes;
unsigned long grant_frames;
if (!xen_domain())
return -ENODEV;
......@@ -154,13 +155,17 @@ static int platform_pci_init(struct pci_dev *pdev,
}
max_nr_gframes = gnttab_max_grant_frames();
xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
ret = gnttab_init();
grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
ret = gnttab_setup_auto_xlat_frames(grant_frames);
if (ret)
goto out;
ret = gnttab_init();
if (ret)
goto grant_out;
xenbus_probe(NULL);
return 0;
grant_out:
gnttab_free_auto_xlat_frames();
out:
pci_release_region(pdev, 0);
mem_out:
......
......@@ -45,6 +45,7 @@
#include <xen/grant_table.h>
#include <xen/xenbus.h>
#include <xen/xen.h>
#include <xen/features.h>
#include "xenbus_probe.h"
......@@ -743,7 +744,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = {
void __init xenbus_ring_ops_init(void)
{
if (xen_pv_domain())
if (!xen_feature(XENFEAT_auto_translated_physmap))
ring_ops = &ring_ops_pv;
else
ring_ops = &ring_ops_hvm;
......
......@@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init);
#ifndef MODULE
static int __init boot_wait_for_devices(void)
{
if (xen_hvm_domain() && !xen_platform_pci_unplug)
if (!xen_has_pv_devices())
return -ENODEV;
ready_to_wait_for_devices = 1;
......
......@@ -7,6 +7,8 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/events.h>
unsigned xen_evtchn_nr_channels(void);
int bind_evtchn_to_irq(unsigned int evtchn);
int bind_evtchn_to_irqhandler(unsigned int evtchn,
irq_handler_t handler,
......@@ -37,6 +39,11 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
*/
void unbind_from_irqhandler(unsigned int irq, void *dev_id);
#define XEN_IRQ_PRIORITY_MAX EVTCHN_FIFO_PRIORITY_MAX
#define XEN_IRQ_PRIORITY_DEFAULT EVTCHN_FIFO_PRIORITY_DEFAULT
#define XEN_IRQ_PRIORITY_MIN EVTCHN_FIFO_PRIORITY_MIN
int xen_set_irq_priority(unsigned irq, unsigned priority);
/*
* Allow extra references to event channels exposed to userspace by evtchn
*/
......@@ -73,6 +80,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout);
/* Determine the IRQ which is bound to an event channel */
unsigned irq_from_evtchn(unsigned int evtchn);
int irq_from_virq(unsigned int cpu, unsigned int virq);
unsigned int evtchn_from_irq(unsigned irq);
/* Xen HVM evtchn vector callback */
void xen_hvm_callback_vector(void);
......
......@@ -178,8 +178,15 @@ int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
grant_status_t **__shared);
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
extern unsigned long xen_hvm_resume_frames;
struct grant_frames {
xen_pfn_t *pfn;
unsigned int count;
void *vaddr;
};
extern struct grant_frames xen_auto_xlat_grant_frames;
unsigned int gnttab_max_grant_frames(void);
int gnttab_setup_auto_xlat_frames(unsigned long addr);
void gnttab_free_auto_xlat_frames(void);
#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
......
......@@ -140,6 +140,19 @@
*/
#define XEN_ELFNOTE_SUSPEND_CANCEL 14
/*
* The features supported by this kernel (numeric).
*
* Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a
* kernel to specify support for features that older hypervisors don't
* know about. The set of features 4.2 and newer hypervisors will
* consider supported by the kernel is the combination of the sets
* specified through this and the string note.
*
* LEGACY: FEATURES
*/
#define XEN_ELFNOTE_SUPPORTED_FEATURES 17
#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
/*
......
......@@ -190,6 +190,39 @@ struct evtchn_reset {
};
typedef struct evtchn_reset evtchn_reset_t;
/*
* EVTCHNOP_init_control: initialize the control block for the FIFO ABI.
*/
#define EVTCHNOP_init_control 11
struct evtchn_init_control {
/* IN parameters. */
uint64_t control_gfn;
uint32_t offset;
uint32_t vcpu;
/* OUT parameters. */
uint8_t link_bits;
uint8_t _pad[7];
};
/*
* EVTCHNOP_expand_array: add an additional page to the event array.
*/
#define EVTCHNOP_expand_array 12
struct evtchn_expand_array {
/* IN parameters. */
uint64_t array_gfn;
};
/*
* EVTCHNOP_set_priority: set the priority for an event channel.
*/
#define EVTCHNOP_set_priority 13
struct evtchn_set_priority {
/* IN parameters. */
uint32_t port;
uint32_t priority;
};
struct evtchn_op {
uint32_t cmd; /* EVTCHNOP_* */
union {
......@@ -207,4 +240,39 @@ struct evtchn_op {
};
DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
/*
* 2-level ABI
*/
#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
/*
* FIFO ABI
*/
/* Events may have priorities from 0 (highest) to 15 (lowest). */
#define EVTCHN_FIFO_PRIORITY_MAX 0
#define EVTCHN_FIFO_PRIORITY_DEFAULT 7
#define EVTCHN_FIFO_PRIORITY_MIN 15
#define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1)
typedef uint32_t event_word_t;
#define EVTCHN_FIFO_PENDING 31
#define EVTCHN_FIFO_MASKED 30
#define EVTCHN_FIFO_LINKED 29
#define EVTCHN_FIFO_BUSY 28
#define EVTCHN_FIFO_LINK_BITS 17
#define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1)
#define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS)
struct evtchn_fifo_control_block {
uint32_t ready;
uint32_t _rsvd;
event_word_t head[EVTCHN_FIFO_MAX_QUEUES];
};
#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
......@@ -281,12 +281,6 @@ struct multicall_entry {
};
DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
/*
* Event channel endpoints per domain:
* 1024 if a long is 32 bits; 4096 if a long is 64 bits.
*/
#define NR_EVENT_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
struct vcpu_time_info {
/*
* Updates to the following values are preceded and followed
......
......@@ -46,6 +46,27 @@ static inline int xen_must_unplug_disks(void) {
#endif
}
extern int xen_platform_pci_unplug;
#if defined(CONFIG_XEN_PVHVM)
extern bool xen_has_pv_devices(void);
extern bool xen_has_pv_disk_devices(void);
extern bool xen_has_pv_nic_devices(void);
extern bool xen_has_pv_and_legacy_disk_devices(void);
#else
static inline bool xen_has_pv_devices(void)
{
return IS_ENABLED(CONFIG_XEN);
}
static inline bool xen_has_pv_disk_devices(void)
{
return IS_ENABLED(CONFIG_XEN);
}
static inline bool xen_has_pv_nic_devices(void)
{
return IS_ENABLED(CONFIG_XEN);
}
static inline bool xen_has_pv_and_legacy_disk_devices(void)
{
return false;
}
#endif
#endif /* _XEN_PLATFORM_PCI_H */
......@@ -29,4 +29,18 @@ extern enum xen_domain_type xen_domain_type;
#define xen_initial_domain() (0)
#endif /* CONFIG_XEN_DOM0 */
#ifdef CONFIG_XEN_PVH
/* This functionality exists only for x86. The XEN_PVHVM support exists
* only in x86 world - hence on ARM it will be always disabled.
* N.B. ARM guests are neither PV nor HVM nor PVHVM.
* It's a bit like PVH but is different also (it's further towards the H
* end of the spectrum than even PVH).
*/
#include <xen/features.h>
#define xen_pvh_domain() (xen_pv_domain() && \
xen_feature(XENFEAT_auto_translated_physmap) && \
xen_have_vector_callback)
#else
#define xen_pvh_domain() (0)
#endif
#endif /* _XEN_XEN_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册