Merge tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross: "Xen fixes and featrues for 4.12. The main changes are: - enable building the kernel with Xen support but without enabling paravirtualized mode (Vitaly Kuznetsov) - add a new 9pfs xen frontend driver (Stefano Stabellini) - simplify Xen's cpuid handling by making use of cpu capabilities (Juergen Gross) - add/modify some headers for new Xen paravirtualized devices (Oleksandr Andrushchenko) - EFI reset_system support under Xen (Julien Grall) - and the usual cleanups and corrections" * tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (57 commits) xen: Move xen_have_vector_callback definition to enlighten.c xen: Implement EFI reset_system callback arm/xen: Consolidate calls to shutdown hypercall in a single helper xen: Export xen_reboot xen/x86: Call xen_smp_intr_init_pv() on BSP xen: Revert commits da72ff5b and 72a9b186 xen/pvh: Do not fill kernel's e820 map in init_pvh_bootparams() xen/scsifront: use offset_in_page() macro xen/arm,arm64: rename __generic_dma_ops to xen_get_dma_ops xen/arm,arm64: fix xen_dma_ops after 815dd187 "Consolidate get_dma_ops..." xen/9pfs: select CONFIG_XEN_XENBUS_FRONTEND x86/cpu: remove hypervisor specific set_cpu_features vmware: set cpu capabilities during platform initialization x86/xen: use capabilities instead of fake cpuid values for xsave x86/xen: use capabilities instead of fake cpuid values for x2apic x86/xen: use capabilities instead of fake cpuid values for mwait x86/xen: use capabilities instead of fake cpuid values for acpi x86/xen: use capabilities instead of fake cpuid values for acc x86/xen: use capabilities instead of fake cpuid values for mtrr x86/xen: use capabilities instead of fake cpuid values for aperf ...

Merge tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from Juergen Gross: "Xen fixes and featrues for 4.12. The main changes are: - enable building the kernel with Xen support but without enabling paravirtualized mode (Vitaly Kuznetsov) - add a new 9pfs xen frontend driver (Stefano Stabellini) - simplify Xen's cpuid handling by making use of cpu capabilities (Juergen Gross) - add/modify some headers for new Xen paravirtualized devices (Oleksandr Andrushchenko) - EFI reset_system support under Xen (Julien Grall) - and the usual cleanups and corrections" * tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (57 commits) xen: Move xen_have_vector_callback definition to enlighten.c xen: Implement EFI reset_system callback arm/xen: Consolidate calls to shutdown hypercall in a single helper xen: Export xen_reboot xen/x86: Call xen_smp_intr_init_pv() on BSP xen: Revert commits da72ff5b and 72a9b186 xen/pvh: Do not fill kernel's e820 map in init_pvh_bootparams() xen/scsifront: use offset_in_page() macro xen/arm,arm64: rename __generic_dma_ops to xen_get_dma_ops xen/arm,arm64: fix xen_dma_ops after 815dd187 "Consolidate get_dma_ops..." xen/9pfs: select CONFIG_XEN_XENBUS_FRONTEND x86/cpu: remove hypervisor specific set_cpu_features vmware: set cpu capabilities during platform initialization x86/xen: use capabilities instead of fake cpuid values for xsave x86/xen: use capabilities instead of fake cpuid values for x2apic x86/xen: use capabilities instead of fake cpuid values for mwait x86/xen: use capabilities instead of fake cpuid values for acpi x86/xen: use capabilities instead of fake cpuid values for acc x86/xen: use capabilities instead of fake cpuid values for mtrr x86/xen: use capabilities instead of fake cpuid values for aperf ...
a9648072 · Linus Torvalds · a1be8edd · 3dbd8204 · a9648072 · a9648072
53 changed file
--- a/arch/arm/include/asm/device.h
+++ b/arch/arm/include/asm/device.h
@@ -15,6 +15,9 @@ struct dev_archdata {
 #endif
 #ifdef CONFIG_ARM_DMA_USE_IOMMU
 	struct dma_iommu_mapping	*mapping;
+#endif
+#ifdef CONFIG_XEN
+	const struct dma_map_ops *dev_dma_ops;
 #endif
 	bool dma_coherent;
 };

--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -16,19 +16,9 @@
 extern const struct dma_map_ops arm_dma_ops;
 extern const struct dma_map_ops arm_coherent_dma_ops;

-static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
-{
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
-	return &arm_dma_ops;
-}
-
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (xen_initial_domain())
-		return xen_dma_ops;
-	else
-		return __generic_dma_ops(NULL);
+	return &arm_dma_ops;
 }

 #define HAVE_ARCH_DMA_SUPPORTED 1

--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -2414,6 +2414,13 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		dma_ops = arm_get_dma_map_ops(coherent);

 	set_dma_ops(dev, dma_ops);
+
+#ifdef CONFIG_XEN
+	if (xen_initial_domain()) {
+		dev->archdata.dev_dma_ops = dev->dma_ops;
+		dev->dma_ops = xen_dma_ops;
+	}
+#endif
 }

 void arch_teardown_dma_ops(struct device *dev)

--- a/arch/arm/xen/efi.c
+++ b/arch/arm/xen/efi.c
@@ -35,6 +35,6 @@ void __init xen_efi_runtime_setup(void)
 	efi.update_capsule           = xen_efi_update_capsule;
 	efi.query_capsule_caps       = xen_efi_query_capsule_caps;
 	efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
-	efi.reset_system             = NULL; /* Functionality provided by Xen. */
+	efi.reset_system             = xen_efi_reset_system;
 }
 EXPORT_SYMBOL_GPL(xen_efi_runtime_setup);
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -191,20 +191,24 @@ static int xen_dying_cpu(unsigned int cpu)
 	return 0;
 }

-static void xen_restart(enum reboot_mode reboot_mode, const char *cmd)
+void xen_reboot(int reason)
 {
-	struct sched_shutdown r = { .reason = SHUTDOWN_reboot };
+	struct sched_shutdown r = { .reason = reason };
 	int rc;
+
 	rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
 	BUG_ON(rc);
 }

+static void xen_restart(enum reboot_mode reboot_mode, const char *cmd)
+{
+	xen_reboot(SHUTDOWN_reboot);
+}
+
+
 static void xen_power_off(void)
 {
-	struct sched_shutdown r = { .reason = SHUTDOWN_poweroff };
-	int rc;
-	rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
-	BUG_ON(rc);
+	xen_reboot(SHUTDOWN_poweroff);
 }

 static irqreturn_t xen_arm_callback(int irq, void *arg)

--- a/arch/arm64/include/asm/device.h
+++ b/arch/arm64/include/asm/device.h
@@ -19,6 +19,9 @@
 struct dev_archdata {
 #ifdef CONFIG_IOMMU_API
 	void *iommu;			/* private IOMMU data */
+#endif
+#ifdef CONFIG_XEN
+	const struct dma_map_ops *dev_dma_ops;
 #endif
 	bool dma_coherent;
 };

--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -27,11 +27,8 @@
 #define DMA_ERROR_CODE	(~(dma_addr_t)0)
 extern const struct dma_map_ops dummy_dma_ops;

-static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
-
 	/*
 	 * We expect no ISA devices, and all other DMA masters are expected to
 	 * have someone call arch_setup_dma_ops at device creation time.
@@ -39,14 +36,6 @@ static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 	return &dummy_dma_ops;
 }

-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-	if (xen_initial_domain())
-		return xen_dma_ops;
-	else
-		return __generic_dma_ops(NULL);
-}
-
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent);
 #define arch_setup_dma_ops	arch_setup_dma_ops

--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -977,4 +977,11 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,

 	dev->archdata.dma_coherent = coherent;
 	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
+
+#ifdef CONFIG_XEN
+	if (xen_initial_domain()) {
+		dev->archdata.dev_dma_ops = dev->dma_ops;
+		dev->dma_ops = xen_dma_ops;
+	}
+#endif
 }
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -35,9 +35,6 @@ struct hypervisor_x86 {
 	/* Detection routine */
 	uint32_t	(*detect)(void);

-	/* Adjust CPU feature bits (run once per CPU) */
-	void		(*set_cpu_features)(struct cpuinfo_x86 *);
-
 	/* Platform setup (run once per boot) */
 	void		(*init_platform)(void);

@@ -53,15 +50,14 @@ extern const struct hypervisor_x86 *x86_hyper;
 /* Recognized hypervisors */
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
-extern const struct hypervisor_x86 x86_hyper_xen;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
 extern const struct hypervisor_x86 x86_hyper_kvm;

-extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 extern bool hypervisor_x2apic_available(void);
 extern void hypervisor_pin_vcpu(int cpu);
 #else
-static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
 static inline void init_hypervisor_platform(void) { }
 static inline bool hypervisor_x2apic_available(void) { return false; }
 #endif /* CONFIG_HYPERVISOR_GUEST */

--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))

+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+	return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
 #endif /* _ASM_X86_XEN_EVENTS_H */
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -52,12 +52,30 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 						    unsigned long pfn_e);

+#ifdef CONFIG_XEN_PV
 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 				   struct gnttab_map_grant_ref *kmap_ops,
 				   struct page **pages, unsigned int count);
 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 				     struct gnttab_unmap_grant_ref *kunmap_ops,
 				     struct page **pages, unsigned int count);
+#else
+static inline int
+set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+			struct gnttab_map_grant_ref *kmap_ops,
+			struct page **pages, unsigned int count)
+{
+	return 0;
+}
+
+static inline int
+clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+			  struct gnttab_unmap_grant_ref *kunmap_ops,
+			  struct page **pages, unsigned int count)
+{
+	return 0;
+}
+#endif

 /*
 * Helper functions to write or read unsigned long values to/from
@@ -73,6 +91,7 @@ static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
 	return __get_user(*val, (unsigned long __user *)addr);
 }

+#ifdef CONFIG_XEN_PV
 /*
 * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
 * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
@@ -99,6 +118,12 @@ static inline unsigned long __pfn_to_mfn(unsigned long pfn)

 	return mfn;
 }
+#else
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+	return pfn;
+}
+#endif

 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1149,7 +1149,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	detect_ht(c);
 #endif

-	init_hypervisor(c);
 	x86_init_rdrand(c);
 	x86_init_cache_qos(c);
 	setup_pku(c);

--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -28,8 +28,11 @@

 static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-#ifdef CONFIG_XEN
-	&x86_hyper_xen,
+#ifdef CONFIG_XEN_PV
+	&x86_hyper_xen_pv,
+#endif
+#ifdef CONFIG_XEN_PVHVM
+	&x86_hyper_xen_hvm,
 #endif
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
@@ -60,12 +63,6 @@ detect_hypervisor_vendor(void)
 		pr_info("Hypervisor detected: %s\n", x86_hyper->name);
 }

-void init_hypervisor(struct cpuinfo_x86 *c)
-{
-	if (x86_hyper && x86_hyper->set_cpu_features)
-		x86_hyper->set_cpu_features(c);
-}
-
 void __init init_hypervisor_platform(void)
 {

@@ -74,8 +71,6 @@ void __init init_hypervisor_platform(void)
 	if (!x86_hyper)
 		return;

-	init_hypervisor(&boot_cpu_data);
-
 	if (x86_hyper->init_platform)
 		x86_hyper->init_platform();
 }

--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -113,6 +113,24 @@ static void __init vmware_paravirt_ops_setup(void)
 #define vmware_paravirt_ops_setup() do {} while (0)
 #endif

+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other).  Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+static void __init vmware_set_capabilities(void)
+{
+	setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+}
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -152,6 +170,8 @@ static void __init vmware_platform_setup(void)
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
 #endif
+
+	vmware_set_capabilities();
 }

 /*
@@ -176,24 +196,6 @@ static uint32_t __init vmware_platform(void)
 	return 0;
 }

-/*
- * VMware hypervisor takes care of exporting a reliable TSC to the guest.
- * Still, due to timing difference when running on virtual cpus, the TSC can
- * be marked as unstable in some cases. For example, the TSC sync check at
- * bootup can fail due to a marginal offset between vcpus' TSCs (though the
- * TSCs do not drift from each other).  Also, the ACPI PM timer clocksource
- * is not suitable as a watchdog when running on a hypervisor because the
- * kernel may miss a wrap of the counter if the vcpu is descheduled for a
- * long time. To skip these checks at runtime we set these capability bits,
- * so that the kernel could just trust the hypervisor with providing a
- * reliable virtual TSC that is suitable for timekeeping.
- */
-static void vmware_set_cpu_features(struct cpuinfo_x86 *c)
-{
-	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-	set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
-}
-
 /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
 static bool __init vmware_legacy_x2apic_available(void)
 {
@@ -206,7 +208,6 @@ static bool __init vmware_legacy_x2apic_available(void)
 const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.name			= "VMware",
 	.detect			= vmware_platform,
-	.set_cpu_features	= vmware_set_cpu_features,
 	.init_platform		= vmware_platform_setup,
 	.x2apic_available	= vmware_legacy_x2apic_available,
 };

--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -446,7 +446,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);

-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
 	/*
 	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
 	 * current_pt_regs()->flags may not match the current task's

--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -447,7 +447,7 @@ void __init xen_msi_init(void)

 int __init pci_xen_hvm_init(void)
 {
-	if (!xen_feature(XENFEAT_hvm_pirqs))
+	if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
 		return 0;

 #ifdef CONFIG_ACPI

--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,6 @@ config XEN
 	bool "Xen guest support"
 	depends on PARAVIRT
 	select PARAVIRT_CLOCK
-	select XEN_HAVE_PVMMU
-	select XEN_HAVE_VPMU
 	depends on X86_64 || (X86_32 && X86_PAE)
 	depends on X86_LOCAL_APIC && X86_TSC
 	help
@@ -15,18 +13,41 @@ config XEN
 	  kernel to boot in a paravirtualized environment under the
 	  Xen hypervisor.

-config XEN_DOM0
+config XEN_PV
+	bool "Xen PV guest support"
+	default y
+	depends on XEN
+	select XEN_HAVE_PVMMU
+	select XEN_HAVE_VPMU
+	help
+	  Support running as a Xen PV guest.
+
+config XEN_PV_SMP
 	def_bool y
-	depends on XEN && PCI_XEN && SWIOTLB_XEN
+	depends on XEN_PV && SMP
+
+config XEN_DOM0
+	bool "Xen PV Dom0 support"
+	default y
+	depends on XEN_PV && PCI_XEN && SWIOTLB_XEN
 	depends on X86_IO_APIC && ACPI && PCI
+	help
+	  Support running as a Xen PV Dom0 guest.

 config XEN_PVHVM
-	def_bool y
+	bool "Xen PVHVM guest support"
+	default y
 	depends on XEN && PCI && X86_LOCAL_APIC
+	help
+	  Support running as a Xen PVHVM guest.
+
+config XEN_PVHVM_SMP
+	def_bool y
+	depends on XEN_PVHVM && SMP

 config XEN_512GB
 	bool "Limit Xen pv-domain memory to 512GB"
-	depends on XEN && X86_64
+	depends on XEN_PV && X86_64
 	default y
 	help
 	  Limit paravirtualized user domains to 512GB of RAM.

--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -7,17 +7,23 @@ endif

 # Make sure early boot has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_enlighten.o		:= $(nostackp)
-CFLAGS_mmu.o			:= $(nostackp)
+CFLAGS_enlighten_pv.o		:= $(nostackp)
+CFLAGS_mmu_pv.o		:= $(nostackp)

-obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
+obj-y		:= enlighten.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
-			grant-table.o suspend.o platform-pci-unplug.o \
-			p2m.o apic.o pmu.o
+			grant-table.o suspend.o platform-pci-unplug.o
+
+obj-$(CONFIG_XEN_PVHVM)		+= enlighten_hvm.o mmu_hvm.o suspend_hvm.o
+obj-$(CONFIG_XEN_PV)			+= setup.o apic.o pmu.o suspend_pv.o \
+						p2m.o enlighten_pv.o mmu_pv.o
+obj-$(CONFIG_XEN_PVH)			+= enlighten_pvh.o

 obj-$(CONFIG_EVENT_TRACING) += trace.o

 obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_XEN_PV_SMP)  	+= smp_pv.o
+obj-$(CONFIG_XEN_PVHVM_SMP)  	+= smp_hvm.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= vga.o

--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -81,7 +81,7 @@ static const struct efi efi_xen __initconst = {
 	.update_capsule           = xen_efi_update_capsule,
 	.query_capsule_caps       = xen_efi_query_capsule_caps,
 	.get_next_high_mono_count = xen_efi_get_next_high_mono_count,
-	.reset_system             = NULL, /* Functionality provided by Xen. */
+	.reset_system             = xen_efi_reset_system,
 	.set_virtual_address_map  = NULL, /* Not used under Xen. */
 	.flags			  = 0     /* Initialized later. */
 };

--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
+#include <linux/cpu.h>
+#include <linux/kexec.h>
+
+#include <xen/features.h>
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+
+#include <asm/xen/cpuid.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "smp.h"
+
+void __ref xen_hvm_init_shared_info(void)
+{
+	int cpu;
+	struct xen_add_to_physmap xatp;
+	static struct shared_info *shared_info_page;
+
+	if (!shared_info_page)
+		shared_info_page = (struct shared_info *)
+			extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+
+	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+	 * page, we use it in the event channel upcall and in some pvclock
+	 * related functions. We don't need the vcpu_info placement
+	 * optimizations because we don't use any pv_mmu or pv_irq op on
+	 * HVM.
+	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_init_shared_info is run at resume time too and
+	 * in that case multiple vcpus might be online. */
+	for_each_online_cpu(cpu) {
+		/* Leave it to be NULL. */
+		if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS)
+			continue;
+		per_cpu(xen_vcpu, cpu) =
+			&HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+	}
+}
+
+static void __init init_hvm_pv_info(void)
+{
+	int major, minor;
+	uint32_t eax, ebx, ecx, edx, base;
+
+	base = xen_cpuid_base();
+	eax = cpuid_eax(base + 1);
+
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+
+	/* PVH set up hypercall page in xen_prepare_pvh(). */
+	if (xen_pvh_domain())
+		pv_info.name = "Xen PVH";
+	else {
+		u64 pfn;
+		uint32_t msr;
+
+		pv_info.name = "Xen HVM";
+		msr = cpuid_ebx(base + 2);
+		pfn = __pa(hypercall_page);
+		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	}
+
+	xen_setup_features();
+
+	cpuid(base + 4, &eax, &ebx, &ecx, &edx);
+	if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT)
+		this_cpu_write(xen_vcpu_id, ebx);
+	else
+		this_cpu_write(xen_vcpu_id, smp_processor_id());
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void xen_hvm_shutdown(void)
+{
+	native_machine_shutdown();
+	if (kexec_in_progress)
+		xen_reboot(SHUTDOWN_soft_reset);
+}
+
+static void xen_hvm_crash_shutdown(struct pt_regs *regs)
+{
+	native_machine_crash_shutdown(regs);
+	xen_reboot(SHUTDOWN_soft_reset);
+}
+#endif
+
+static int xen_cpu_up_prepare_hvm(unsigned int cpu)
+{
+	int rc;
+
+	/*
+	 * This can happen if CPU was offlined earlier and
+	 * offlining timed out in common_cpu_die().
+	 */
+	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+	}
+
+	if (cpu_acpi_id(cpu) != U32_MAX)
+		per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
+	else
+		per_cpu(xen_vcpu_id, cpu) = cpu;
+	xen_vcpu_setup(cpu);
+
+	if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
+		xen_setup_timer(cpu);
+
+	rc = xen_smp_intr_init(cpu);
+	if (rc) {
+		WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+		     cpu, rc);
+		return rc;
+	}
+	return 0;
+}
+
+static int xen_cpu_dead_hvm(unsigned int cpu)
+{
+	xen_smp_intr_free(cpu);
+
+	if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
+		xen_teardown_timer(cpu);
+
+       return 0;
+}
+
+static void __init xen_hvm_guest_init(void)
+{
+	if (xen_pv_domain())
+		return;
+
+	init_hvm_pv_info();
+
+	xen_hvm_init_shared_info();
+
+	xen_panic_handler_init();
+
+	if (xen_feature(XENFEAT_hvm_callback_vector))
+		xen_have_vector_callback = 1;
+
+	xen_hvm_smp_init();
+	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm));
+	xen_unplug_emulated_devices();
+	x86_init.irqs.intr_init = xen_init_IRQ;
+	xen_hvm_init_time_ops();
+	xen_hvm_init_mmu_ops();
+
+	if (xen_pvh_domain())
+		machine_ops.emergency_restart = xen_emergency_restart;
+#ifdef CONFIG_KEXEC_CORE
+	machine_ops.shutdown = xen_hvm_shutdown;
+	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
+#endif
+}
+
+static bool xen_nopv;
+static __init int xen_parse_nopv(char *arg)
+{
+       xen_nopv = true;
+       return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
+bool xen_hvm_need_lapic(void)
+{
+	if (xen_nopv)
+		return false;
+	if (xen_pv_domain())
+		return false;
+	if (!xen_hvm_domain())
+		return false;
+	if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+static uint32_t __init xen_platform_hvm(void)
+{
+	if (xen_pv_domain() || xen_nopv)
+		return 0;
+
+	return xen_cpuid_base();
+}
+
+const struct hypervisor_x86 x86_hyper_xen_hvm = {
+	.name                   = "Xen HVM",
+	.detect                 = xen_platform_hvm,
+	.init_platform          = xen_hvm_guest_init,
+	.pin_vcpu               = xen_pin_vcpu,
+	.x2apic_available       = xen_x2apic_para_available,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
+#include <linux/acpi.h>
+
+#include <xen/hvc-console.h>
+
+#include <asm/io_apic.h>
+#include <asm/hypervisor.h>
+#include <asm/e820/api.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/memory.h>
+#include <xen/interface/hvm/start_info.h>
+
+/*
+ * PVH variables.
+ *
+ * xen_pvh and pvh_bootparams need to live in data segment since they
+ * are used after startup_{32|64}, which clear .bss, are invoked.
+ */
+bool xen_pvh __attribute__((section(".data"))) = 0;
+struct boot_params pvh_bootparams __attribute__((section(".data")));
+
+struct hvm_start_info pvh_start_info;
+unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+
+static void xen_pvh_arch_setup(void)
+{
+	/* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */
+	if (nr_ioapics == 0)
+		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
+}
+
+static void __init init_pvh_bootparams(void)
+{
+	struct xen_memory_map memmap;
+	int rc;
+
+	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+
+	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
+	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc) {
+		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+		BUG();
+	}
+	pvh_bootparams.e820_entries = memmap.nr_entries;
+
+	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
+		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
+			ISA_START_ADDRESS;
+		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].size =
+			ISA_END_ADDRESS - ISA_START_ADDRESS;
+		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
+			E820_TYPE_RESERVED;
+		pvh_bootparams.e820_entries++;
+	} else
+		xen_raw_printk("Warning: Can fit ISA range into e820\n");
+
+	pvh_bootparams.hdr.cmd_line_ptr =
+		pvh_start_info.cmdline_paddr;
+
+	/* The first module is always ramdisk. */
+	if (pvh_start_info.nr_modules) {
+		struct hvm_modlist_entry *modaddr =
+			__va(pvh_start_info.modlist_paddr);
+		pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
+		pvh_bootparams.hdr.ramdisk_size = modaddr->size;
+	}
+
+	/*
+	 * See Documentation/x86/boot.txt.
+	 *
+	 * Version 2.12 supports Xen entry point but we will use default x86/PC
+	 * environment (i.e. hardware_subarch 0).
+	 */
+	pvh_bootparams.hdr.version = 0x212;
+	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+}
+
+/*
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later.
+ */
+void __init xen_prepare_pvh(void)
+{
+	u32 msr;
+	u64 pfn;
+
+	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
+		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+				pvh_start_info.magic);
+		BUG();
+	}
+
+	xen_pvh = 1;
+
+	msr = cpuid_ebx(xen_cpuid_base() + 2);
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	init_pvh_bootparams();
+
+	x86_init.oem.arch_setup = xen_pvh_arch_setup;
+}
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
+#include <linux/types.h>
+#include <linux/crash_dump.h>
+
+#include <xen/interface/xen.h>
+#include <xen/hvm.h>
+
+#include "mmu.h"
+
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * This function is used in two contexts:
+ * - the kdump kernel has to check whether a pfn of the crashed kernel
+ *   was a ballooned page. vmcore is using this function to decide
+ *   whether to access a pfn of the crashed kernel.
+ * - the kexec kernel has to check whether a pfn was ballooned by the
+ *   previous kernel. If the pfn is ballooned, handle it properly.
+ * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * handle the pfn special in this case.
+ */
+static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+{
+	struct xen_hvm_get_mem_type a = {
+		.domid = DOMID_SELF,
+		.pfn = pfn,
+	};
+	int ram;
+
+	if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
+		return -ENXIO;
+
+	switch (a.mem_type) {
+	case HVMMEM_mmio_dm:
+		ram = 0;
+		break;
+	case HVMMEM_ram_rw:
+	case HVMMEM_ram_ro:
+	default:
+		ram = 1;
+		break;
+	}
+
+	return ram;
+}
+#endif
+
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+	struct xen_hvm_pagetable_dying a;
+	int rc;
+
+	a.domid = DOMID_SELF;
+	a.gpa = __pa(mm->pgd);
+	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+	WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+	struct xen_hvm_pagetable_dying a;
+	int rc = 0;
+
+	a.domid = DOMID_SELF;
+	a.gpa = 0x00;
+	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+	if (rc < 0) {
+		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+		return 0;
+	}
+	return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+	if (is_pagetable_dying_supported())
+		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+#ifdef CONFIG_PROC_VMCORE
+	register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
+#endif
+}
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
--- a/arch/x86/xen/pmu.h
+++ b/arch/x86/xen/pmu.h
@@ -4,8 +4,13 @@
 #include <xen/interface/xenpmu.h>

 irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+#ifdef CONFIG_XEN_HAVE_VPMU
 void xen_pmu_init(int cpu);
 void xen_pmu_finish(int cpu);
+#else
+static inline void xen_pmu_init(int cpu) {}
+static inline void xen_pmu_finish(int cpu) {}
+#endif
 bool is_xen_pmu(int cpu);
 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);

--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
-/*
- * Xen SMP support
- *
- * This file implements the Xen versions of smp_ops.  SMP under Xen is
- * very straightforward.  Bringing a CPU up is simply a matter of
- * loading its initial context and setting it running.
- *
- * IPIs are handled through the Xen event mechanism.
- *
- * Because virtual CPUs can be scheduled onto any real CPU, there's no
- * useful topology information for the kernel to make use of.  As a
- * result, all CPUs are treated as if they're single-core and
- * single-threaded.
- */
-#include <linux/sched.h>
-#include <linux/err.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
-#include <linux/irq_work.h>
-#include <linux/tick.h>
-#include <linux/nmi.h>
-
-#include <asm/paravirt.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/cpu.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-#include <xen/interface/xenpmu.h>
-
-#include <asm/xen/interface.h>
-#include <asm/xen/hypercall.h>
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>

-#include <xen/xen.h>
-#include <xen/page.h>
 #include <xen/events.h>

 #include <xen/hvc-console.h>
 #include "xen-ops.h"
-#include "mmu.h"
 #include "smp.h"
-#include "pmu.h"
-
-cpumask_var_t xen_cpu_initialized_map;

-struct xen_common_irq {
-	int irq;
-	char *name;
-};
 static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
-static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
-static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };

 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
-static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);

 /*
 * Reschedule call back.
@@ -70,42 +28,6 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }

-static void cpu_bringup(void)
-{
-	int cpu;
-
-	cpu_init();
-	touch_softlockup_watchdog();
-	preempt_disable();
-
-	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
-	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
-		xen_enable_sysenter();
-		xen_enable_syscall();
-	}
-	cpu = smp_processor_id();
-	smp_store_cpu_info(cpu);
-	cpu_data(cpu).x86_max_cores = 1;
-	set_cpu_sibling_map(cpu);
-
-	xen_setup_cpu_clockevents();
-
-	notify_cpu_starting(cpu);
-
-	set_cpu_online(cpu, true);
-
-	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
-
-	/* We can take interrupts now: we're officially "up". */
-	local_irq_enable();
-}
-
-asmlinkage __visible void cpu_bringup_and_idle(void)
-{
-	cpu_bringup();
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-}
-
 void xen_smp_intr_free(unsigned int cpu)
 {
 	if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
@@ -133,27 +55,12 @@ void xen_smp_intr_free(unsigned int cpu)
 		kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
 		per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
 	}
-	if (xen_hvm_domain())
-		return;
-
-	if (per_cpu(xen_irq_work, cpu).irq >= 0) {
-		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
-		per_cpu(xen_irq_work, cpu).irq = -1;
-		kfree(per_cpu(xen_irq_work, cpu).name);
-		per_cpu(xen_irq_work, cpu).name = NULL;
-	}
+}

-	if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
-		unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
-		per_cpu(xen_pmu_irq, cpu).irq = -1;
-		kfree(per_cpu(xen_pmu_irq, cpu).name);
-		per_cpu(xen_pmu_irq, cpu).name = NULL;
-	}
-};
 int xen_smp_intr_init(unsigned int cpu)
 {
 	int rc;
-	char *resched_name, *callfunc_name, *debug_name, *pmu_name;
+	char *resched_name, *callfunc_name, *debug_name;

 	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
 	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -200,37 +107,6 @@ int xen_smp_intr_init(unsigned int cpu)
 	per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
 	per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;

-	/*
-	 * The IRQ worker on PVHVM goes through the native path and uses the
-	 * IPI mechanism.
-	 */
-	if (xen_hvm_domain())
-		return 0;
-
-	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
-	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
-				    cpu,
-				    xen_irq_work_interrupt,
-				    IRQF_PERCPU|IRQF_NOBALANCING,
-				    callfunc_name,
-				    NULL);
-	if (rc < 0)
-		goto fail;
-	per_cpu(xen_irq_work, cpu).irq = rc;
-	per_cpu(xen_irq_work, cpu).name = callfunc_name;
-
-	if (is_xen_pmu(cpu)) {
-		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
-		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
-					     xen_pmu_irq_handler,
-					     IRQF_PERCPU|IRQF_NOBALANCING,
-					     pmu_name, NULL);
-		if (rc < 0)
-			goto fail;
-		per_cpu(xen_pmu_irq, cpu).irq = rc;
-		per_cpu(xen_pmu_irq, cpu).name = pmu_name;
-	}
-
 	return 0;

 fail:
@@ -238,333 +114,7 @@ int xen_smp_intr_init(unsigned int cpu)
 	return rc;
 }

-static void __init xen_fill_possible_map(void)
-{
-	int i, rc;
-
-	if (xen_initial_domain())
-		return;
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0) {
-			num_processors++;
-			set_cpu_possible(i, true);
-		}
-	}
-}
-
-static void __init xen_filter_cpu_maps(void)
-{
-	int i, rc;
-	unsigned int subtract = 0;
-
-	if (!xen_initial_domain())
-		return;
-
-	num_processors = 0;
-	disabled_cpus = 0;
-	for (i = 0; i < nr_cpu_ids; i++) {
-		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0) {
-			num_processors++;
-			set_cpu_possible(i, true);
-		} else {
-			set_cpu_possible(i, false);
-			set_cpu_present(i, false);
-			subtract++;
-		}
-	}
-#ifdef CONFIG_HOTPLUG_CPU
-	/* This is akin to using 'nr_cpus' on the Linux command line.
-	 * Which is OK as when we use 'dom0_max_vcpus=X' we can only
-	 * have up to X, while nr_cpu_ids is greater than X. This
-	 * normally is not a problem, except when CPU hotplugging
-	 * is involved and then there might be more than X CPUs
-	 * in the guest - which will not work as there is no
-	 * hypercall to expand the max number of VCPUs an already
-	 * running guest has. So cap it up to X. */
-	if (subtract)
-		nr_cpu_ids = nr_cpu_ids - subtract;
-#endif
-
-}
-
-static void __init xen_smp_prepare_boot_cpu(void)
-{
-	BUG_ON(smp_processor_id() != 0);
-	native_smp_prepare_boot_cpu();
-
-	if (xen_pv_domain()) {
-		if (!xen_feature(XENFEAT_writable_page_tables))
-			/* We've switched to the "real" per-cpu gdt, so make
-			 * sure the old memory can be recycled. */
-			make_lowmem_page_readwrite(xen_initial_gdt);
-
-#ifdef CONFIG_X86_32
-		/*
-		 * Xen starts us with XEN_FLAT_RING1_DS, but linux code
-		 * expects __USER_DS
-		 */
-		loadsegment(ds, __USER_DS);
-		loadsegment(es, __USER_DS);
-#endif
-
-		xen_filter_cpu_maps();
-		xen_setup_vcpu_info_placement();
-	}
-
-	/*
-	 * Setup vcpu_info for boot CPU.
-	 */
-	if (xen_hvm_domain())
-		xen_vcpu_setup(0);
-
-	/*
-	 * The alternative logic (which patches the unlock/lock) runs before
-	 * the smp bootup up code is activated. Hence we need to set this up
-	 * the core kernel is being patched. Otherwise we will have only
-	 * modules patched but not core code.
-	 */
-	xen_init_spinlocks();
-}
-
-static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
-{
-	unsigned cpu;
-	unsigned int i;
-
-	if (skip_ioapic_setup) {
-		char *m = (max_cpus == 0) ?
-			"The nosmp parameter is incompatible with Xen; " \
-			"use Xen dom0_max_vcpus=1 parameter" :
-			"The noapic parameter is incompatible with Xen";
-
-		xen_raw_printk(m);
-		panic(m);
-	}
-	xen_init_lock_cpu(0);
-
-	smp_store_boot_cpu_info();
-	cpu_data(0).x86_max_cores = 1;
-
-	for_each_possible_cpu(i) {
-		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
-	}
-	set_cpu_sibling_map(0);
-
-	xen_pmu_init(0);
-
-	if (xen_smp_intr_init(0))
-		BUG();
-
-	if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
-		panic("could not allocate xen_cpu_initialized_map\n");
-
-	cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
-
-	/* Restrict the possible_map according to max_cpus. */
-	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
-			continue;
-		set_cpu_possible(cpu, false);
-	}
-
-	for_each_possible_cpu(cpu)
-		set_cpu_present(cpu, true);
-}
-
-static int
-cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
-{
-	struct vcpu_guest_context *ctxt;
-	struct desc_struct *gdt;
-	unsigned long gdt_mfn;
-
-	/* used to tell cpu_init() that it can proceed with initialization */
-	cpumask_set_cpu(cpu, cpu_callout_mask);
-	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
-		return 0;
-
-	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-	if (ctxt == NULL)
-		return -ENOMEM;
-
-	gdt = get_cpu_gdt_rw(cpu);
-
-#ifdef CONFIG_X86_32
-	ctxt->user_regs.fs = __KERNEL_PERCPU;
-	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
-#endif
-	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
-
-	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-	ctxt->flags = VGCF_IN_KERNEL;
-	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-	ctxt->user_regs.ds = __USER_DS;
-	ctxt->user_regs.es = __USER_DS;
-	ctxt->user_regs.ss = __KERNEL_DS;
-
-	xen_copy_trap_info(ctxt->trap_ctxt);
-
-	ctxt->ldt_ents = 0;
-
-	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
-
-	gdt_mfn = arbitrary_virt_to_mfn(gdt);
-	make_lowmem_page_readonly(gdt);
-	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
-
-	ctxt->gdt_frames[0] = gdt_mfn;
-	ctxt->gdt_ents      = GDT_ENTRIES;
-
-	ctxt->kernel_ss = __KERNEL_DS;
-	ctxt->kernel_sp = idle->thread.sp0;
-
-#ifdef CONFIG_X86_32
-	ctxt->event_callback_cs     = __KERNEL_CS;
-	ctxt->failsafe_callback_cs  = __KERNEL_CS;
-#else
-	ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
-	ctxt->event_callback_eip    =
-		(unsigned long)xen_hypervisor_callback;
-	ctxt->failsafe_callback_eip =
-		(unsigned long)xen_failsafe_callback;
-	ctxt->user_regs.cs = __KERNEL_CS;
-	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-
-	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
-	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
-		BUG();
-
-	kfree(ctxt);
-	return 0;
-}
-
-static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
-{
-	int rc;
-
-	common_cpu_up(cpu, idle);
-
-	xen_setup_runstate_info(cpu);
-
-	/*
-	 * PV VCPUs are always successfully taken down (see 'while' loop
-	 * in xen_cpu_die()), so -EBUSY is an error.
-	 */
-	rc = cpu_check_up_prepare(cpu);
-	if (rc)
-		return rc;
-
-	/* make sure interrupts start blocked */
-	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
-
-	rc = cpu_initialize_context(cpu, idle);
-	if (rc)
-		return rc;
-
-	xen_pmu_init(cpu);
-
-	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
-	BUG_ON(rc);
-
-	while (cpu_report_state(cpu) != CPU_ONLINE)
-		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
-
-	return 0;
-}
-
-static void xen_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int xen_cpu_disable(void)
-{
-	unsigned int cpu = smp_processor_id();
-	if (cpu == 0)
-		return -EBUSY;
-
-	cpu_disable_common();
-
-	load_cr3(swapper_pg_dir);
-	return 0;
-}
-
-static void xen_cpu_die(unsigned int cpu)
-{
-	while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up,
-						     xen_vcpu_nr(cpu), NULL)) {
-		__set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ/10);
-	}
-
-	if (common_cpu_die(cpu) == 0) {
-		xen_smp_intr_free(cpu);
-		xen_uninit_lock_cpu(cpu);
-		xen_teardown_timer(cpu);
-		xen_pmu_finish(cpu);
-	}
-}
-
-static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
-{
-	play_dead_common();
-	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
-	cpu_bringup();
-	/*
-	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
-	 * clears certain data that the cpu_idle loop (which called us
-	 * and that we return from) expects. The only way to get that
-	 * data back is to call:
-	 */
-	tick_nohz_idle_enter();
-
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static int xen_cpu_disable(void)
-{
-	return -ENOSYS;
-}
-
-static void xen_cpu_die(unsigned int cpu)
-{
-	BUG();
-}
-
-static void xen_play_dead(void)
-{
-	BUG();
-}
-
-#endif
-static void stop_self(void *v)
-{
-	int cpu = smp_processor_id();
-
-	/* make sure we're not pinning something down */
-	load_cr3(swapper_pg_dir);
-	/* should set up a minimal gdt */
-
-	set_cpu_online(cpu, false);
-
-	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
-	BUG();
-}
-
-static void xen_stop_other_cpus(int wait)
-{
-	smp_call_function(stop_self, NULL, wait);
-}
-
-static void xen_smp_send_reschedule(int cpu)
+void xen_smp_send_reschedule(int cpu)
 {
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
@@ -578,7 +128,7 @@ static void __xen_send_IPI_mask(const struct cpumask *mask,
 		xen_send_IPI_one(cpu, vector);
 }

-static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
+void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
 	int cpu;

@@ -593,7 +143,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 	}
 }

-static void xen_smp_send_call_function_single_ipi(int cpu)
+void xen_smp_send_call_function_single_ipi(int cpu)
 {
 	__xen_send_IPI_mask(cpumask_of(cpu),
 			  XEN_CALL_FUNCTION_SINGLE_VECTOR);
@@ -698,54 +248,3 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)

 	return IRQ_HANDLED;
 }
-
-static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
-{
-	irq_enter();
-	irq_work_run();
-	inc_irq_stat(apic_irq_work_irqs);
-	irq_exit();
-
-	return IRQ_HANDLED;
-}
-
-static const struct smp_ops xen_smp_ops __initconst = {
-	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-	.smp_prepare_cpus = xen_smp_prepare_cpus,
-	.smp_cpus_done = xen_smp_cpus_done,
-
-	.cpu_up = xen_cpu_up,
-	.cpu_die = xen_cpu_die,
-	.cpu_disable = xen_cpu_disable,
-	.play_dead = xen_play_dead,
-
-	.stop_other_cpus = xen_stop_other_cpus,
-	.smp_send_reschedule = xen_smp_send_reschedule,
-
-	.send_call_func_ipi = xen_smp_send_call_function_ipi,
-	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-
-void __init xen_smp_init(void)
-{
-	smp_ops = xen_smp_ops;
-	xen_fill_possible_map();
-}
-
-static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
-{
-	native_smp_prepare_cpus(max_cpus);
-	WARN_ON(xen_smp_intr_init(0));
-
-	xen_init_lock_cpu(0);
-}
-
-void __init xen_hvm_smp_init(void)
-{
-	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
-	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
-	smp_ops.cpu_die = xen_cpu_die;
-	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
-	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
-	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
-}
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -11,7 +11,17 @@ extern void xen_send_IPI_self(int vector);

 extern int xen_smp_intr_init(unsigned int cpu);
 extern void xen_smp_intr_free(unsigned int cpu);
+int xen_smp_intr_init_pv(unsigned int cpu);
+void xen_smp_intr_free_pv(unsigned int cpu);

+void xen_smp_send_reschedule(int cpu);
+void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+void xen_smp_send_call_function_single_ipi(int cpu);
+
+struct xen_common_irq {
+	int irq;
+	char *name;
+};
 #else /* CONFIG_SMP */

 static inline int xen_smp_intr_init(unsigned int cpu)
@@ -19,6 +29,12 @@ static inline int xen_smp_intr_init(unsigned int cpu)
 	return 0;
 }
 static inline void xen_smp_intr_free(unsigned int cpu) {}
+
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+	return 0;
+}
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
 #endif /* CONFIG_SMP */

 #endif
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
+#include <linux/types.h>
+
+#include <xen/xen.h>
+#include <xen/features.h>
+#include <xen/interface/features.h>
+
+#include "xen-ops.h"
+
+void xen_hvm_post_suspend(int suspend_cancelled)
+{
+	int cpu;
+
+	if (!suspend_cancelled)
+		xen_hvm_init_shared_info();
+	xen_callback_vector();
+	xen_unplug_emulated_devices();
+	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+		for_each_online_cpu(cpu) {
+			xen_setup_runstate_info(cpu);
+		}
+	}
+}
--- a/arch/x86/xen/suspend_pv.c
+++ b/arch/x86/xen/suspend_pv.c
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -436,6 +436,14 @@ static void xen_hvm_setup_cpu_clockevents(void)

 void __init xen_hvm_init_time_ops(void)
 {
+	/*
+	 * vector callback is needed otherwise we cannot receive interrupts
+	 * on cpu > 0 and at this point we don't know how many cpus are
+	 * available.
+	 */
+	if (!xen_have_vector_callback)
+		return;
+
 	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 		printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
 				"disable pv timer\n");

--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
--- a/drivers/scsi/xen-scsifront.c
+++ b/drivers/scsi/xen-scsifront.c
@@ -434,7 +434,7 @@ static int map_data_for_request(struct vscsifrnt_info *info,

 	if (seg_grants) {
 		page = virt_to_page(seg);
-		off = (unsigned long)seg & ~PAGE_MASK;
+		off = offset_in_page(seg);
 		len = sizeof(struct scsiif_request_segment) * data_grants;
 		while (len > 0) {
 			bytes = min_t(unsigned int, len, PAGE_SIZE - off);

--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
--- a/drivers/xen/efi.c
+++ b/drivers/xen/efi.c
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
--- a/include/xen/arm/page-coherent.h
+++ b/include/xen/arm/page-coherent.h
--- a/include/xen/interface/io/9pfs.h
+++ b/include/xen/interface/io/9pfs.h
--- a/include/xen/interface/io/displif.h
+++ b/include/xen/interface/io/displif.h
--- a/include/xen/interface/io/kbdif.h
+++ b/include/xen/interface/io/kbdif.h
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
--- a/include/xen/interface/io/sndif.h
+++ b/include/xen/interface/io/sndif.h
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c