Merge branch 'x86/paravirt' into x86/apic

Conflicts: arch/x86/mach-voyager/voyager_smp.c

Merge branch 'x86/paravirt' into x86/apic
Conflicts: arch/x86/mach-voyager/voyager_smp.c
eca217b3 · Ingo Molnar · 54a353a0 · e4d04071 · eca217b3 · eca217b3
27 changed file
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -12,21 +12,38 @@
 #define CLBR_EAX  (1 << 0)
 #define CLBR_ECX  (1 << 1)
 #define CLBR_EDX  (1 << 2)
+#define CLBR_EDI  (1 << 3)

-#ifdef CONFIG_X86_64
-#define CLBR_RSI  (1 << 3)
-#define CLBR_RDI  (1 << 4)
+#ifdef CONFIG_X86_32
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY  ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS	(CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG	(CLBR_EAX | CLBR_EDX)
+#define CLBR_SCRATCH	(0)
+#else
+#define CLBR_RAX  CLBR_EAX
+#define CLBR_RCX  CLBR_ECX
+#define CLBR_RDX  CLBR_EDX
+#define CLBR_RDI  CLBR_EDI
+#define CLBR_RSI  (1 << 4)
 #define CLBR_R8   (1 << 5)
 #define CLBR_R9   (1 << 6)
 #define CLBR_R10  (1 << 7)
 #define CLBR_R11  (1 << 8)
+
 #define CLBR_ANY  ((1 << 9) - 1)
+
+#define CLBR_ARG_REGS	(CLBR_RDI | CLBR_RSI | CLBR_RDX | \
+			 CLBR_RCX | CLBR_R8 | CLBR_R9)
+#define CLBR_RET_REG	(CLBR_RAX)
+#define CLBR_SCRATCH	(CLBR_R10 | CLBR_R11)
+
 #include <asm/desc_defs.h>
-#else
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY  ((1 << 3) - 1)
 #endif /* X86_64 */

+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
 struct mm_struct;
 struct desc_struct;

+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention.  See PV_CALL_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+	void *func;
+};
+
 /* general info */
 struct pv_info {
 	unsigned int kernel_rpl;
@@ -189,11 +214,15 @@ struct pv_irq_ops {
 	 * expected to use X86_EFLAGS_IF; all other bits
 	 * returned from save_fl are undefined, and may be ignored by
 	 * restore_fl.
+	 *
+	 * NOTE: These functions callers expect the callee to preserve
+	 * more registers than the standard C calling convention.
 	 */
-	unsigned long (*save_fl)(void);
-	void (*restore_fl)(unsigned long);
-	void (*irq_disable)(void);
-	void (*irq_enable)(void);
+	struct paravirt_callee_save save_fl;
+	struct paravirt_callee_save restore_fl;
+	struct paravirt_callee_save irq_disable;
+	struct paravirt_callee_save irq_enable;
+
 	void (*safe_halt)(void);
 	void (*halt)(void);

@@ -279,11 +308,11 @@ struct pv_mmu_ops {
 	void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
 					pte_t *ptep, pte_t pte);

-	pteval_t (*pte_val)(pte_t);
-	pte_t (*make_pte)(pteval_t pte);
+	struct paravirt_callee_save pte_val;
+	struct paravirt_callee_save make_pte;

-	pgdval_t (*pgd_val)(pgd_t);
-	pgd_t (*make_pgd)(pgdval_t pgd);
+	struct paravirt_callee_save pgd_val;
+	struct paravirt_callee_save make_pgd;

 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {

 	void (*set_pud)(pud_t *pudp, pud_t pudval);

-	pmdval_t (*pmd_val)(pmd_t);
-	pmd_t (*make_pmd)(pmdval_t pmd);
+	struct paravirt_callee_save pmd_val;
+	struct paravirt_callee_save make_pmd;

 #if PAGETABLE_LEVELS == 4
-	pudval_t (*pud_val)(pud_t);
-	pud_t (*make_pud)(pudval_t pud);
+	struct paravirt_callee_save pud_val;
+	struct paravirt_callee_save make_pud;

 	void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
 #endif	/* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
 	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")

 unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
 unsigned paravirt_patch_ignore(unsigned len);
 unsigned paravirt_patch_call(void *insnbuf,
 			     const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
 * makes sure the incoming and outgoing types are always correct.
 */
 #ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS			unsigned long __eax, __edx, __ecx
+#define PVOP_VCALL_ARGS				\
+	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS			PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x)		"a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)		"d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)		"c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS		"=a" (__eax), "=d" (__edx),	\
 					"=c" (__ecx)
 #define PVOP_CALL_CLOBBERS		PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS		"=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS		PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS
 #define VEXTRA_CLOBBERS
-#else
-#define PVOP_VCALL_ARGS		unsigned long __edi, __esi, __edx, __ecx
+#else  /* CONFIG_X86_64 */
+#define PVOP_VCALL_ARGS					\
+	unsigned long __edi = __edi, __esi = __esi,	\
+		__edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS		PVOP_VCALL_ARGS, __eax
+
+#define PVOP_CALL_ARG1(x)		"D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)		"S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)		"d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x)		"c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS	"=D" (__edi),				\
 				"=S" (__esi), "=d" (__edx),		\
 				"=c" (__ecx)
-
 #define PVOP_CALL_CLOBBERS	PVOP_VCALL_CLOBBERS, "=a" (__eax)

+#define PVOP_VCALLEE_CLOBBERS	"=a" (__eax)
+#define PVOP_CALLEE_CLOBBERS	PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS	 , "r8", "r9", "r10", "r11"
 #define VEXTRA_CLOBBERS	 , "rax", "r8", "r9", "r10", "r11"
-#endif
+#endif	/* CONFIG_X86_32 */

 #ifdef CONFIG_PARAVIRT_DEBUG
 #define PVOP_TEST_NULL(op)	BUG_ON(op == NULL)
@@ -505,7 +556,8 @@ int paravirt_disable_iospace(void);
 #define PVOP_TEST_NULL(op)	((void)op)
 #endif

-#define __PVOP_CALL(rettype, op, pre, post, ...)			\
+#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr,		\
+		      pre, post, ...)					\
 	({								\
 		rettype __ret;						\
 		PVOP_CALL_ARGS;						\
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : PVOP_CALL_CLOBBERS		\
+				     : call_clbr			\
 				     : paravirt_type(op),		\
-				       paravirt_clobber(CLBR_ANY),	\
+				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
-				     : "memory", "cc" EXTRA_CLOBBERS);	\
+				     : "memory", "cc" extra_clbr);	\
 			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
 		} else {						\
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : PVOP_CALL_CLOBBERS		\
+				     : call_clbr			\
 				     : paravirt_type(op),		\
-				       paravirt_clobber(CLBR_ANY),	\
+				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
-				     : "memory", "cc" EXTRA_CLOBBERS);	\
+				     : "memory", "cc" extra_clbr);	\
 			__ret = (rettype)__eax;				\
 		}							\
 		__ret;							\
 	})
-#define __PVOP_VCALL(op, pre, post, ...)				\
+
+#define __PVOP_CALL(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS,	\
+		      EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op.func, CLBR_RET_REG,			\
+		      PVOP_CALLEE_CLOBBERS, ,				\
+		      pre, post, ##__VA_ARGS__)
+
+
+#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...)	\
 	({								\
 		PVOP_VCALL_ARGS;					\
 		PVOP_TEST_NULL(op);					\
 		asm volatile(pre					\
 			     paravirt_alt(PARAVIRT_CALL)		\
 			     post					\
-			     : PVOP_VCALL_CLOBBERS			\
+			     : call_clbr				\
 			     : paravirt_type(op),			\
-			       paravirt_clobber(CLBR_ANY),		\
+			       paravirt_clobber(clbr),			\
 			       ##__VA_ARGS__				\
-			     : "memory", "cc" VEXTRA_CLOBBERS);		\
+			     : "memory", "cc" extra_clbr);		\
 	})

+#define __PVOP_VCALL(op, pre, post, ...)				\
+	____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS,		\
+		       VEXTRA_CLOBBERS,					\
+		       pre, post, ##__VA_ARGS__)
+
+#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op.func, CLBR_RET_REG,			\
+		      PVOP_VCALLEE_CLOBBERS, ,				\
+		      pre, post, ##__VA_ARGS__)
+
+
+
 #define PVOP_CALL0(rettype, op)						\
 	__PVOP_CALL(rettype, op, "", "")
 #define PVOP_VCALL0(op)							\
 	__PVOP_VCALL(op, "", "")

+#define PVOP_CALLEE0(rettype, op)					\
+	__PVOP_CALLEESAVE(rettype, op, "", "")
+#define PVOP_VCALLEE0(op)						\
+	__PVOP_VCALLEESAVE(op, "", "")
+
+
 #define PVOP_CALL1(rettype, op, arg1)					\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
 #define PVOP_VCALL1(op, arg1)						\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1)					\
+	__PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1)						\
+	__PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+

 #define PVOP_CALL2(rettype, op, arg1, arg2)				\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), 	\
-	"1" ((unsigned long)(arg2)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),		\
+		    PVOP_CALL_ARG2(arg2))
 #define PVOP_VCALL2(op, arg1, arg2)					\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), 		\
-	"1" ((unsigned long)(arg2)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),			\
+		     PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALLEE2(rettype, op, arg1, arg2)				\
+	__PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1),	\
+			  PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALLEE2(op, arg1, arg2)					\
+	__PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1),		\
+			   PVOP_CALL_ARG2(arg2))
+

 #define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
-	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),		\
+		    PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
 #define PVOP_VCALL3(op, arg1, arg2, arg3)				\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
-	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),			\
+		     PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))

 /* This is the only difference in x86_64. We can make it much simpler */
 #ifdef CONFIG_X86_32
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
 	__PVOP_CALL(rettype, op,					\
 		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
-		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
-		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+		    PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),		\
+		    PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
 	__PVOP_VCALL(op,						\
 		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
 		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
 #else
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
-	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),		\
-	"3"((unsigned long)(arg4)))
+	__PVOP_CALL(rettype, op, "", "",				\
+		    PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),		\
+		    PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
-	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),		\
-	"3"((unsigned long)(arg4)))
+	__PVOP_VCALL(op, "", "",					\
+		     PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),	\
+		     PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #endif

 static inline int paravirt_enabled(void)
@@ -1060,11 +1155,11 @@ static inline pte_t __pte(pteval_t val)
 	pteval_t ret;

 	if (sizeof(pteval_t) > sizeof(long))
-		ret = PVOP_CALL2(pteval_t,
+		ret = PVOP_CALLEE2(pteval_t,
 				   pv_mmu_ops.make_pte,
 				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pteval_t,
+		ret = PVOP_CALLEE1(pteval_t,
 				   pv_mmu_ops.make_pte,
 				   val);

@@ -1076,10 +1171,10 @@ static inline pteval_t pte_val(pte_t pte)
 	pteval_t ret;

 	if (sizeof(pteval_t) > sizeof(long))
-		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
+		ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
 				   pte.pte, (u64)pte.pte >> 32);
 	else
-		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
+		ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
 				   pte.pte);

 	return ret;
@@ -1090,10 +1185,10 @@ static inline pgd_t __pgd(pgdval_t val)
 	pgdval_t ret;

 	if (sizeof(pgdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
+		ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
 				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
+		ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
 				   val);

 	return (pgd_t) { ret };
@@ -1104,10 +1199,10 @@ static inline pgdval_t pgd_val(pgd_t pgd)
 	pgdval_t ret;

 	if (sizeof(pgdval_t) > sizeof(long))
-		ret =  PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
+		ret =  PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
 				    pgd.pgd, (u64)pgd.pgd >> 32);
 	else
-		ret =  PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
+		ret =  PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
 				    pgd.pgd);

 	return ret;
@@ -1172,10 +1267,10 @@ static inline pmd_t __pmd(pmdval_t val)
 	pmdval_t ret;

 	if (sizeof(pmdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
+		ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
 				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
+		ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
 				   val);

 	return (pmd_t) { ret };
@@ -1186,10 +1281,10 @@ static inline pmdval_t pmd_val(pmd_t pmd)
 	pmdval_t ret;

 	if (sizeof(pmdval_t) > sizeof(long))
-		ret =  PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
+		ret =  PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
 				    pmd.pmd, (u64)pmd.pmd >> 32);
 	else
-		ret =  PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
+		ret =  PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
 				    pmd.pmd);

 	return ret;
@@ -1212,10 +1307,10 @@ static inline pud_t __pud(pudval_t val)
 	pudval_t ret;

 	if (sizeof(pudval_t) > sizeof(long))
-		ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
+		ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
 				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
+		ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
 				   val);

 	return (pud_t) { ret };
@@ -1226,10 +1321,10 @@ static inline pudval_t pud_val(pud_t pud)
 	pudval_t ret;

 	if (sizeof(pudval_t) > sizeof(long))
-		ret =  PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
+		ret =  PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
 				    pud.pud, (u64)pud.pud >> 32);
 	else
-		ret =  PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
+		ret =  PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
 				    pud.pud);

 	return ret;
@@ -1371,6 +1466,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 }

 void _paravirt_nop(void);
+u32 _paravirt_ident_32(u32);
+u64 _paravirt_ident_64(u64);
+
 #define paravirt_nop	((void *)_paravirt_nop)

 #ifdef CONFIG_SMP
@@ -1420,12 +1518,37 @@ extern struct paravirt_patch_site __parainstructions[],
 	__parainstructions_end[];

 #ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS		"pushl %ecx;"
+#define PV_RESTORE_ALL_CALLER_REGS	"popl  %ecx;"
+
 #define PV_FLAGS_ARG "0"
 #define PV_EXTRA_CLOBBERS
 #define PV_VEXTRA_CLOBBERS
 #else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS						\
+	"push %rcx;"							\
+	"push %rdx;"							\
+	"push %rsi;"							\
+	"push %rdi;"							\
+	"push %r8;"							\
+	"push %r9;"							\
+	"push %r10;"							\
+	"push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS					\
+	"pop %r11;"							\
+	"pop %r10;"							\
+	"pop %r9;"							\
+	"pop %r8;"							\
+	"pop %rdi;"							\
+	"pop %rsi;"							\
+	"pop %rdx;"							\
+	"pop %rcx;"
+
 /* We save some registers, but all of them, that's too much. We clobber all
 * caller saved registers but the argument parameter */
 #define PV_SAVE_REGS "pushq %%rdi;"
@@ -1435,52 +1558,76 @@ extern struct paravirt_patch_site __parainstructions[],
 #define PV_FLAGS_ARG "D"
 #endif

+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value.  This allows C functions to
+ * be called from assembler code where fewer than normal registers are
+ * available.  It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_CALLEE_SAVE_REGS_THUNK(func)					\
+	extern typeof(func) __raw_callee_save_##func;			\
+	static void *__##func##__ __used = func;			\
+									\
+	asm(".pushsection .text;"					\
+	    "__raw_callee_save_" #func ": "				\
+	    PV_SAVE_ALL_CALLER_REGS					\
+	    "call " #func ";"						\
+	    PV_RESTORE_ALL_CALLER_REGS					\
+	    "ret;"							\
+	    ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func)						\
+	((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func)			\
+	((struct paravirt_callee_save) { func })
+
 static inline unsigned long __raw_local_save_flags(void)
 {
 	unsigned long f;

-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     : "=a"(f)
 		     : paravirt_type(pv_irq_ops.save_fl),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_VEXTRA_CLOBBERS);
+		     : "memory", "cc");
 	return f;
 }

 static inline void raw_local_irq_restore(unsigned long f)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     : "=a"(f)
 		     : PV_FLAGS_ARG(f),
 		       paravirt_type(pv_irq_ops.restore_fl),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "cc");
 }

 static inline void raw_local_irq_disable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     :
 		     : paravirt_type(pv_irq_ops.irq_disable),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }

 static inline void raw_local_irq_enable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     :
 		     : paravirt_type(pv_irq_ops.irq_enable),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }

 static inline unsigned long __raw_local_irq_save(void)
@@ -1523,33 +1670,49 @@ static inline unsigned long __raw_local_irq_save(void)
 	.popsection


+#define COND_PUSH(set, mask, reg)			\
+	.if ((~(set)) & mask); push %reg; .endif
+#define COND_POP(set, mask, reg)			\
+	.if ((~(set)) & mask); pop %reg; .endif
+
 #ifdef CONFIG_X86_64
-#define PV_SAVE_REGS				\
-	push %rax;				\
-	push %rcx;				\
-	push %rdx;				\
-	push %rsi;				\
-	push %rdi;				\
-	push %r8;				\
-	push %r9;				\
-	push %r10;				\
-	push %r11
-#define PV_RESTORE_REGS				\
-	pop %r11;				\
-	pop %r10;				\
-	pop %r9;				\
-	pop %r8;				\
-	pop %rdi;				\
-	pop %rsi;				\
-	pop %rdx;				\
-	pop %rcx;				\
-	pop %rax
+
+#define PV_SAVE_REGS(set)			\
+	COND_PUSH(set, CLBR_RAX, rax);		\
+	COND_PUSH(set, CLBR_RCX, rcx);		\
+	COND_PUSH(set, CLBR_RDX, rdx);		\
+	COND_PUSH(set, CLBR_RSI, rsi);		\
+	COND_PUSH(set, CLBR_RDI, rdi);		\
+	COND_PUSH(set, CLBR_R8, r8);		\
+	COND_PUSH(set, CLBR_R9, r9);		\
+	COND_PUSH(set, CLBR_R10, r10);		\
+	COND_PUSH(set, CLBR_R11, r11)
+#define PV_RESTORE_REGS(set)			\
+	COND_POP(set, CLBR_R11, r11);		\
+	COND_POP(set, CLBR_R10, r10);		\
+	COND_POP(set, CLBR_R9, r9);		\
+	COND_POP(set, CLBR_R8, r8);		\
+	COND_POP(set, CLBR_RDI, rdi);		\
+	COND_POP(set, CLBR_RSI, rsi);		\
+	COND_POP(set, CLBR_RDX, rdx);		\
+	COND_POP(set, CLBR_RCX, rcx);		\
+	COND_POP(set, CLBR_RAX, rax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
 #define PARA_INDIRECT(addr)	*addr(%rip)
 #else
-#define PV_SAVE_REGS   pushl %eax; pushl %edi; pushl %ecx; pushl %edx
-#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PV_SAVE_REGS(set)			\
+	COND_PUSH(set, CLBR_EAX, eax);		\
+	COND_PUSH(set, CLBR_EDI, edi);		\
+	COND_PUSH(set, CLBR_ECX, ecx);		\
+	COND_PUSH(set, CLBR_EDX, edx)
+#define PV_RESTORE_REGS(set)			\
+	COND_POP(set, CLBR_EDX, edx);		\
+	COND_POP(set, CLBR_ECX, ecx);		\
+	COND_POP(set, CLBR_EDI, edi);		\
+	COND_POP(set, CLBR_EAX, eax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
 #define PARA_INDIRECT(addr)	*%cs:addr
@@ -1561,15 +1724,15 @@ static inline unsigned long __raw_local_irq_save(void)

 #define DISABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
-		  PV_SAVE_REGS;						\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
-		  PV_RESTORE_REGS;)			\
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)

 #define ENABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
-		  PV_SAVE_REGS;						\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
-		  PV_RESTORE_REGS;)
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)

 #define USERGS_SYSRET32							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),	\
@@ -1599,11 +1762,15 @@ static inline unsigned long __raw_local_irq_save(void)
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
 		  swapgs)

+/*
+ * Note: swapgs is very special, and in practise is either going to be
+ * implemented with a single "swapgs" instruction or something very
+ * special.  Either way, we don't need to save any registers for
+ * it.
+ */
 #define SWAPGS								\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
-		  PV_SAVE_REGS;						\
-		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);		\
-		  PV_RESTORE_REGS					\
+		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)		\
 		 )

 #define GET_CR2_INTO_RCX				\

--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -768,7 +768,8 @@ extern int sysenter_setup(void);
 extern struct desc_ptr		early_gdt_descr;

 extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
 extern void cpu_init(void);

 static inline unsigned long get_debugctlmsr(void)

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -296,23 +296,28 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)

 __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;

+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+	loadsegment(fs, __KERNEL_PERCPU);
+#else
+	loadsegment(gs, 0);
+	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#endif
+}
+
 /* Current gdt points %fs at the "master" per-cpu area: after this,
 * it's on the real one. */
-void switch_to_new_gdt(void)
+void switch_to_new_gdt(int cpu)
 {
 	struct desc_ptr gdt_descr;
-	int cpu = smp_processor_id();

 	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 	/* Reload the per-cpu base */
-#ifdef CONFIG_X86_32
-	loadsegment(fs, __KERNEL_PERCPU);
-#else
-	loadsegment(gs, 0);
-	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
-#endif
+
+	load_percpu_segment(cpu);
 }

 static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -1029,7 +1034,7 @@ void __cpuinit cpu_init(void)
 	 * and set up the GDT descriptor:
 	 */

-	switch_to_new_gdt();
+	switch_to_new_gdt(cpu);
 	loadsegment(fs, 0);

 	load_idt((const struct desc_ptr *)&idt_descr);
@@ -1131,7 +1136,7 @@ void __cpuinit cpu_init(void)
 		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);

 	load_idt(&idt_descr);
-	switch_to_new_gdt();
+	switch_to_new_gdt(cpu);

 	/*
 	 * Set up and load the per-CPU TSS and LDT

--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1143,7 +1143,7 @@ ENTRY(native_load_gs_index)
 	CFI_STARTPROC
 	pushf
 	CFI_ADJUST_CFA_OFFSET 8
-	DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
 gs_change:
 	movl %edi,%gs

--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
 {
 }

+/* identity function, which can be inlined */
+u32 _paravirt_ident_32(u32 x)
+{
+	return x;
+}
+
+u64 _paravirt_ident_64(u64 x)
+{
+	return x;
+}
+
 static void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 	if (opfunc == NULL)
 		/* If there's no function, patch it with a ud2a (BUG) */
 		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
-	else if (opfunc == paravirt_nop)
+	else if (opfunc == _paravirt_nop)
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
+
+	/* identity functions just return their single argument */
+	else if (opfunc == _paravirt_ident_32)
+		ret = paravirt_patch_ident_32(insnbuf, len);
+	else if (opfunc == _paravirt_ident_64)
+		ret = paravirt_patch_ident_64(insnbuf, len);
+
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {

 struct pv_irq_ops pv_irq_ops = {
 	.init_IRQ = native_init_IRQ,
-	.save_fl = native_save_fl,
-	.restore_fl = native_restore_fl,
-	.irq_disable = native_irq_disable,
-	.irq_enable = native_irq_enable,
+	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
 #ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
 #endif
 };

+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
 struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
 	.pagetable_setup_start = native_pagetable_setup_start,
@@ -424,21 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pmd_clear = native_pmd_clear,
 #endif
 	.set_pud = native_set_pud,
-	.pmd_val = native_pmd_val,
-	.make_pmd = native_make_pmd,
+
+	.pmd_val = PTE_IDENT,
+	.make_pmd = PTE_IDENT,

 #if PAGETABLE_LEVELS == 4
-	.pud_val = native_pud_val,
-	.make_pud = native_make_pud,
+	.pud_val = PTE_IDENT,
+	.make_pud = PTE_IDENT,
+
 	.set_pgd = native_set_pgd,
 #endif
 #endif /* PAGETABLE_LEVELS >= 3 */

-	.pte_val = native_pte_val,
-	.pgd_val = native_pgd_val,
+	.pte_val = PTE_IDENT,
+	.pgd_val = PTE_IDENT,

-	.make_pte = native_make_pte,
-	.make_pgd = native_make_pgd,
+	.make_pte = PTE_IDENT,
+	.make_pgd = PTE_IDENT,

 	.dup_mmap = paravirt_nop,
 	.exit_mmap = paravirt_nop,

--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");

+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+	/* arg in %eax, return in %eax */
+	return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+	/* arg in %edx:%eax, return in %edx:%eax */
+	return 0;
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {

--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
 DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");

+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov64, end__mov64);
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {

--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -122,7 +122,7 @@ void __init setup_per_cpu_areas(void)
 		 * area.  Reload any changed state for the boot CPU.
 		 */
 		if (cpu == boot_cpu_id)
-			switch_to_new_gdt();
+			switch_to_new_gdt(cpu);

 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}

--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1188,7 +1188,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 void __init native_smp_prepare_boot_cpu(void)
 {
 	int me = smp_processor_id();
-	switch_to_new_gdt();
+	switch_to_new_gdt(me);
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
 	per_cpu(cpu_state, me) = CPU_ONLINE;

--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -259,7 +259,7 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
 		 * the cpu's, all of which are still in the mask.
 		 */
 		__get_cpu_var(ptcstats).ptc_i++;
-		return 0;
+		return flush_mask;
 	}

 	/*

--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -670,10 +670,11 @@ static inline int __init activate_vmi(void)
 	para_fill(pv_mmu_ops.write_cr2, SetCR2);
 	para_fill(pv_mmu_ops.write_cr3, SetCR3);
 	para_fill(pv_cpu_ops.write_cr4, SetCR4);
-	para_fill(pv_irq_ops.save_fl, GetInterruptMask);
-	para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
-	para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
-	para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+	para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
+	para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
+	para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
+	para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);

 	para_fill(pv_cpu_ops.wbinvd, WBINVD);
 	para_fill(pv_cpu_ops.read_tsc, RDTSC);

--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -22,6 +22,7 @@ PHDRS {
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(7);	/* RWE */
 #endif
+	data.init2 PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
@@ -215,7 +216,7 @@ SECTIONS
  /*
   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
   * output PHDR, so the next output section - __data_nosave - should
-   * switch it back to data.init.  Also, pda should be at the head of
+   * start another section data.init2.  Also, pda should be at the head of
   * percpu area.  Preallocate it and define the percpu offset symbol
   * so that it can be accessed as a percpu variable.
   */
@@ -232,7 +233,7 @@ SECTIONS
  __nosave_begin = .;
  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
      *(.data.nosave)
-  } :data.init	/* switch back to data.init, see PERCPU_VADDR() above */
+  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
  . = ALIGN(PAGE_SIZE);
  __nosave_end = .;


--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
 		flags &= ~X86_EFLAGS_IF;
 	return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);

 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
 		flags |= X86_EFLAGS_AC;
 	native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);

 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)

 	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);

 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)

 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);

 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 				  unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
 	       cap, ctl);
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */
-		pv_irq_ops.irq_disable = vsmp_irq_disable;
-		pv_irq_ops.irq_enable  = vsmp_irq_enable;
-		pv_irq_ops.save_fl  = vsmp_save_fl;
-		pv_irq_ops.restore_fl  = vsmp_restore_fl;
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;

 		ctl &= ~(1 << 4);

--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);

 /* restore_flags() just sets the flags back to the value given. */
 static void restore_fl(unsigned long flags)
 {
 	lguest_data.irq_enabled = flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(restore_fl);

 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);

 /* Interrupts go on... */
 static void irq_enable(void)
 {
 	lguest_data.irq_enabled = X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
 /*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
 * them (or when we unmask an interrupt).  This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)

 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
-	pv_irq_ops.save_fl = save_fl;
-	pv_irq_ops.restore_fl = restore_fl;
-	pv_irq_ops.irq_disable = irq_disable;
-	pv_irq_ops.irq_enable = irq_enable;
+	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;

 	/* init-time operations */

--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1744,13 +1744,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)

 static void __cpuinit voyager_smp_prepare_boot_cpu(void)
 {
-	switch_to_new_gdt();
+	int cpu = smp_processor_id();
+	switch_to_new_gdt(cpu);

 	cpu_online_map = cpumask_of_cpu(smp_processor_id());
 	cpu_callout_map = cpumask_of_cpu(smp_processor_id());
 	cpu_callin_map = CPU_MASK_NONE;
 	cpu_present_map = cpumask_of_cpu(smp_processor_id());
-
 }

 static int __cpuinit voyager_cpu_up(unsigned int cpu)

--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
 endif

 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
-			time.o xen-asm_$(BITS).o grant-table.o suspend.o
+			time.o xen-asm.o xen-asm_$(BITS).o \
+			grant-table.o suspend.o

 obj-$(CONFIG_SMP)		+= smp.o spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);

-/*
- * Identity map, in addition to plain kernel map.  This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
-
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3.  This may not be the current effective cr3, because
- * its update may be being lazily deferred.  However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early).  If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
-
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);

 struct shared_info xen_dummy_shared_info;

+void *xen_initial_gdt;
+
 /*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 *
 * 0: not available, 1: available
 */
-static int have_vcpu_info_placement =
-#ifdef CONFIG_X86_32
-	1
-#else
-	0
-#endif
-	;
-
+static int have_vcpu_info_placement = 1;

 static void xen_vcpu_setup(int cpu)
 {
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }

-static void xen_leave_lazy(void)
+void xen_leave_lazy(void)
 {
 	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
@@ -598,76 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {

 #endif

-static void xen_flush_tlb(void)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
-	preempt_disable();
-
-	mcs = xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-	preempt_enable();
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
-	preempt_disable();
-
-	mcs = xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-	op->cmd = MMUEXT_INVLPG_LOCAL;
-	op->arg1.linear_addr = addr & PAGE_MASK;
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-	preempt_enable();
-}
-
-static void xen_flush_tlb_others(const struct cpumask *cpus,
-				 struct mm_struct *mm, unsigned long va)
-{
-	struct {
-		struct mmuext_op op;
-		DECLARE_BITMAP(mask, NR_CPUS);
-	} *args;
-	struct multicall_space mcs;
-
-	BUG_ON(cpumask_empty(cpus));
-	BUG_ON(!mm);
-
-	mcs = xen_mc_entry(sizeof(*args));
-	args = mcs.args;
-	args->op.arg2.vcpumask = to_cpumask(args->mask);
-
-	/* Remove us, and any offline CPUS. */
-	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
-	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
-	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
-		goto issue;
-
-	if (va == TLB_FLUSH_ALL) {
-		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	} else {
-		args->op.cmd = MMUEXT_INVLPG_MULTI;
-		args->op.arg1.linear_addr = va;
-	}
-
-	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
-issue:
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-}

 static void xen_clts(void)
 {
@@ -693,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }

-static void xen_write_cr2(unsigned long cr2)
-{
-	percpu_read(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
-	return percpu_read(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
-	return percpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static void xen_write_cr4(unsigned long cr4)
 {
 	cr4 &= ~X86_CR4_PGE;
@@ -716,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
 	native_write_cr4(cr4);
 }

-static unsigned long xen_read_cr3(void)
-{
-	return percpu_read(xen_cr3);
-}
-
-static void set_current_cr3(void *v)
-{
-	percpu_write(xen_current_cr3, (unsigned long)v);
-}
-
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-	unsigned long mfn;
-
-	if (cr3)
-		mfn = pfn_to_mfn(PFN_DOWN(cr3));
-	else
-		mfn = 0;
-
-	WARN_ON(mfn == 0 && kernel);
-
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-	op->arg1.mfn = mfn;
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	if (kernel) {
-		percpu_write(xen_cr3, cr3);
-
-		/* Update xen_current_cr3 once the batch has actually
-		   been submitted. */
-		xen_mc_callback(set_current_cr3, (void *)cr3);
-	}
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
-	BUG_ON(preemptible());
-
-	xen_mc_batch();  /* disables interrupts */
-
-	/* Update while interrupts are disabled, so its atomic with
-	   respect to ipis */
-	percpu_write(xen_cr3, cr3);
-
-	__xen_write_cr3(true, cr3);
-
-#ifdef CONFIG_X86_64
-	{
-		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
-		if (user_pgd)
-			__xen_write_cr3(false, __pa(user_pgd));
-		else
-			__xen_write_cr3(false, 0);
-	}
-#endif
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
-}
-
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
 	int ret;
@@ -822,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 	return ret;
 }

-/* Early in boot, while setting up the initial pagetable, assume
-   everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
-	BUG_ON(mem_map);	/* should only be used early */
-#endif
-	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* Early release_pte assumes that all pts are pinned, since there's
-   only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
-	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
-   attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	if (PagePinned(virt_to_page(mm->pgd))) {
-		SetPagePinned(page);
-
-		vm_unmap_aliases();
-		if (!PageHighMem(page)) {
-			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
-			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
-		} else {
-			/* make sure there are no stray mappings of
-			   this page */
-			kmap_flush_unused();
-		}
-	}
-}
-
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = mm->pgd;
-	int ret = 0;
-
-	BUG_ON(PagePinned(virt_to_page(pgd)));
-
-#ifdef CONFIG_X86_64
-	{
-		struct page *page = virt_to_page(pgd);
-		pgd_t *user_pgd;
-
-		BUG_ON(page->private != 0);
-
-		ret = -ENOMEM;
-
-		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		page->private = (unsigned long)user_pgd;
-
-		if (user_pgd != NULL) {
-			user_pgd[pgd_index(VSYSCALL_START)] =
-				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
-			ret = 0;
-		}
-
-		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
-	}
-#endif
-
-	return ret;
-}
-
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
-	pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
-	if (user_pgd)
-		free_page((unsigned long)user_pgd);
-#endif
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	if (PagePinned(page)) {
-		if (!PageHighMem(page)) {
-			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-		}
-		ClearPagePinned(page);
-	}
-}
-
-static void xen_release_pte(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PTE);
-}
-
-static void xen_release_pmd(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PMD);
-}
-
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-
-static void xen_release_pud(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-	pgprot_t prot = PAGE_KERNEL;
-
-	if (PagePinned(page))
-		prot = PAGE_KERNEL_RO;
-
-	if (0 && PageHighMem(page))
-		printk("mapping highpte %lx type %d prot %s\n",
-		       page_to_pfn(page), type,
-		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
-	return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
-	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-			       pte_val_ma(pte));
-
-	return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-	pte = mask_rw_pte(ptep, pte);
-
-	xen_set_pte(ptep, pte);
-}
-#endif
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
 void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1021,37 +658,6 @@ void xen_setup_shared_info(void)
 	xen_setup_mfn_list_list();
 }

-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
-	xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
-	pv_mmu_ops.set_pte = xen_set_pte;
-	pv_mmu_ops.set_pmd = xen_set_pmd;
-	pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
-	pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-
-	/* This will work as long as patching hasn't happened yet
-	   (which it hasn't) */
-	pv_mmu_ops.alloc_pte = xen_alloc_pte;
-	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-	pv_mmu_ops.release_pte = xen_release_pte;
-	pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
-	pv_mmu_ops.alloc_pud = xen_alloc_pud;
-	pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
-#ifdef CONFIG_X86_64
-	SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
-	xen_mark_init_mm_pinned();
-}
-
 /* This is called once we have the cpu_possible_map */
 void xen_setup_vcpu_info_placement(void)
 {
@@ -1065,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");

-		pv_irq_ops.save_fl = xen_save_fl_direct;
-		pv_irq_ops.restore_fl = xen_restore_fl_direct;
-		pv_irq_ops.irq_disable = xen_irq_disable_direct;
-		pv_irq_ops.irq_enable = xen_irq_enable_direct;
+		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
 		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
 	}
 }
@@ -1126,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }

-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
-	pte_t pte;
-
-	phys >>= PAGE_SHIFT;
-
-	switch (idx) {
-	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
-	case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
-	case FIX_WP_TEST:
-	case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
-	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
-	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	case FIX_APIC_BASE:	/* maps dummy local APIC */
-#endif
-		pte = pfn_pte(phys, prot);
-		break;
-
-	default:
-		pte = mfn_pte(phys, prot);
-		break;
-	}
-
-	__native_set_fixmap(idx, pte);
-
-#ifdef CONFIG_X86_64
-	/* Replicate changes to map the vsyscall page into the user
-	   pagetable vsyscall mapping. */
-	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
-		unsigned long vaddr = __fix_to_virt(idx);
-		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
-	}
-#endif
-}
-
 static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
@@ -1264,86 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
 #endif
 };

-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
-	.pagetable_setup_start = xen_pagetable_setup_start,
-	.pagetable_setup_done = xen_pagetable_setup_done,
-
-	.read_cr2 = xen_read_cr2,
-	.write_cr2 = xen_write_cr2,
-
-	.read_cr3 = xen_read_cr3,
-	.write_cr3 = xen_write_cr3,
-
-	.flush_tlb_user = xen_flush_tlb,
-	.flush_tlb_kernel = xen_flush_tlb,
-	.flush_tlb_single = xen_flush_tlb_single,
-	.flush_tlb_others = xen_flush_tlb_others,
-
-	.pte_update = paravirt_nop,
-	.pte_update_defer = paravirt_nop,
-
-	.pgd_alloc = xen_pgd_alloc,
-	.pgd_free = xen_pgd_free,
-
-	.alloc_pte = xen_alloc_pte_init,
-	.release_pte = xen_release_pte_init,
-	.alloc_pmd = xen_alloc_pte_init,
-	.alloc_pmd_clone = paravirt_nop,
-	.release_pmd = xen_release_pte_init,
-
-#ifdef CONFIG_HIGHPTE
-	.kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-#ifdef CONFIG_X86_64
-	.set_pte = xen_set_pte,
-#else
-	.set_pte = xen_set_pte_init,
-#endif
-	.set_pte_at = xen_set_pte_at,
-	.set_pmd = xen_set_pmd_hyper,
-
-	.ptep_modify_prot_start = __ptep_modify_prot_start,
-	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
-	.pte_val = xen_pte_val,
-	.pgd_val = xen_pgd_val,
-
-	.make_pte = xen_make_pte,
-	.make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
-	.set_pte_atomic = xen_set_pte_atomic,
-	.set_pte_present = xen_set_pte_at,
-	.pte_clear = xen_pte_clear,
-	.pmd_clear = xen_pmd_clear,
-#endif	/* CONFIG_X86_PAE */
-	.set_pud = xen_set_pud_hyper,
-
-	.make_pmd = xen_make_pmd,
-	.pmd_val = xen_pmd_val,
-
-#if PAGETABLE_LEVELS == 4
-	.pud_val = xen_pud_val,
-	.make_pud = xen_make_pud,
-	.set_pgd = xen_set_pgd_hyper,
-
-	.alloc_pud = xen_alloc_pte_init,
-	.release_pud = xen_release_pte_init,
-#endif	/* PAGETABLE_LEVELS == 4 */
-
-	.activate_mm = xen_activate_mm,
-	.dup_mmap = xen_dup_mmap,
-	.exit_mmap = xen_exit_mmap,
-
-	.lazy_mode = {
-		.enter = paravirt_enter_lazy_mmu,
-		.leave = xen_leave_lazy,
-	},
-
-	.set_fixmap = xen_set_fixmap,
-};
-
 static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
@@ -1386,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };


-static void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
-	unsigned long top = HYPERVISOR_VIRT_START;
-	struct xen_platform_parameters pp;
-
-	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
-		top = pp.virt_start;
-
-	reserve_top_address(-top);
-#endif	/* CONFIG_X86_32 */
-}
-
-/*
- * Like __va(), but returns address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up.
- */
-static void *__ka(phys_addr_t paddr)
-{
-#ifdef CONFIG_X86_64
-	return (void *)(paddr + __START_KERNEL_map);
-#else
-	return __va(paddr);
-#endif
-}
-
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
-	phys_addr_t paddr;
-
-	maddr &= PTE_PFN_MASK;
-	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-
-	return paddr;
-}
-
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
-	return __ka(m2p(maddr));
-}
-
-static void set_page_prot(void *addr, pgprot_t prot)
-{
-	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-	pte_t pte = pfn_pte(pfn, prot);
-
-	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
-		BUG();
-}
-
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
-	unsigned pmdidx, pteidx;
-	unsigned ident_pte;
-	unsigned long pfn;
-
-	ident_pte = 0;
-	pfn = 0;
-	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
-		pte_t *pte_page;
-
-		/* Reuse or allocate a page of ptes */
-		if (pmd_present(pmd[pmdidx]))
-			pte_page = m2v(pmd[pmdidx].pmd);
-		else {
-			/* Check for free pte pages */
-			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
-				break;
-
-			pte_page = &level1_ident_pgt[ident_pte];
-			ident_pte += PTRS_PER_PTE;
-
-			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
-		}
-
-		/* Install mappings */
-		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
-			pte_t pte;
-
-			if (pfn > max_pfn_mapped)
-				max_pfn_mapped = pfn;
-
-			if (!pte_none(pte_page[pteidx]))
-				continue;
-
-			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
-			pte_page[pteidx] = pte;
-		}
-	}
-
-	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
-		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
-	set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
-	pte_t *pte = v;
-	int i;
-
-	/* All levels are converted the same way, so just treat them
-	   as ptes. */
-	for (i = 0; i < PTRS_PER_PTE; i++)
-		pte[i] = xen_make_pte(pte[i].pte);
-}
-
-/*
- * Set up the inital kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working.  We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- */
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-						unsigned long max_pfn)
-{
-	pud_t *l3;
-	pmd_t *l2;
-
-	/* Zap identity mapping */
-	init_level4_pgt[0] = __pgd(0);
-
-	/* Pre-constructed entries are in pfn, so convert to mfn */
-	convert_pfn_mfn(init_level4_pgt);
-	convert_pfn_mfn(level3_ident_pgt);
-	convert_pfn_mfn(level3_kernel_pgt);
-
-	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
-	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-
-	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
-	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
-	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	/* Set up identity map */
-	xen_map_identity_early(level2_ident_pgt, max_pfn);
-
-	/* Make pagetable pieces RO */
-	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
-	/* Pin down new L4 */
-	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
-	/* Unpin Xen-provided one */
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-	/* Switch over */
-	pgd = init_level4_pgt;
-
-	/*
-	 * At this stage there can be no user pgd, and no page
-	 * structure to attach it to, so make sure we just set kernel
-	 * pgd.
-	 */
-	xen_mc_batch();
-	__xen_write_cr3(true, __pa(pgd));
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
-
-	reserve_early(__pa(xen_start_info->pt_base),
-		      __pa(xen_start_info->pt_base +
-			   xen_start_info->nr_pt_frames * PAGE_SIZE),
-		      "XEN PAGETABLES");
-
-	return pgd;
-}
-#else	/* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-						unsigned long max_pfn)
-{
-	pmd_t *kernel_pmd;
-
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-
-	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	xen_map_identity_early(level2_kernel_pgt, max_pfn);
-
-	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
-	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
-			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
-	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-	xen_write_cr3(__pa(swapper_pg_dir));
-
-	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-
-	return swapper_pg_dir;
-}
-#endif	/* CONFIG_X86_64 */
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1642,9 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
 	machine_ops = xen_machine_ops;

 #ifdef CONFIG_X86_64
-	/* Disable until direct per-cpu data access. */
-	have_vcpu_info_placement = 0;
+	/*
+	 * Setup percpu state.  We only need to do this for 64-bit
+	 * because 32-bit already has %fs set properly.
+	 */
+	load_percpu_segment(0);
 #endif
+	/*
+	 * The only reliable way to retain the initial address of the
+	 * percpu gdt_page is to remember it here, so we can go and
+	 * mark it RW later, when the initial percpu area is freed.
+	 */
+	xen_initial_gdt = &per_cpu(gdt_page, 0);

 	xen_smp_init();


--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
 	*/
 	return (-flags) & X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);

 static void xen_restore_fl(unsigned long flags)
 {
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
 			xen_force_evtchn_callback();
 	}
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);

 static void xen_irq_disable(void)
 {
@@ -86,6 +88,7 @@ static void xen_irq_disable(void)
 	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);

 static void xen_irq_enable(void)
 {
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
 	if (unlikely(vcpu->evtchn_upcall_pending))
 		xen_force_evtchn_callback();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);

 static void xen_safe_halt(void)
 {
@@ -124,10 +128,12 @@ static void xen_halt(void)

 static const struct pv_irq_ops xen_irq_ops __initdata = {
 	.init_IRQ = __xen_init_IRQ,
-	.save_fl = xen_save_fl,
-	.restore_fl = xen_restore_fl,
-	.irq_disable = xen_irq_disable,
-	.irq_enable = xen_irq_enable,
+
+	.save_fl = PV_CALLEE_SAVE(xen_save_fl),
+	.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+	.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+	.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
 	.safe_halt = xen_safe_halt,
 	.halt = xen_halt,
 #ifdef CONFIG_X86_64

--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/mmu_context.h>
+#include <asm/setup.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>

@@ -55,6 +56,8 @@

 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>

 #include "multicalls.h"
 #include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)

 #endif /* CONFIG_XEN_DEBUG_FS */

+
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
+
+
 /*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
 {
 	return pte_mfn_to_pfn(pte.pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

 pgdval_t xen_pgd_val(pgd_t pgd)
 {
 	return pte_mfn_to_pfn(pgd.pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

 pte_t xen_make_pte(pteval_t pte)
 {
 	pte = pte_pfn_to_mfn(pte);
 	return native_make_pte(pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

 pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
 	return native_make_pgd(pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

 pmdval_t xen_pmd_val(pmd_t pmd)
 {
 	return pte_mfn_to_pfn(pmd.pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
 	pmd = pte_pfn_to_mfn(pmd);
 	return native_make_pmd(pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

 #if PAGETABLE_LEVELS == 4
 pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

 pud_t xen_make_pud(pudval_t pud)
 {
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)

 	return native_make_pud(pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
@@ -1152,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }

+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+	xen_setup_shared_info();
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+	percpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+	return percpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+	return percpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_flush_tlb(void)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = MMUEXT_INVLPG_LOCAL;
+	op->arg1.linear_addr = addr & PAGE_MASK;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+				 struct mm_struct *mm, unsigned long va)
+{
+	struct {
+		struct mmuext_op op;
+		DECLARE_BITMAP(mask, NR_CPUS);
+	} *args;
+	struct multicall_space mcs;
+
+	BUG_ON(cpumask_empty(cpus));
+	BUG_ON(!mm);
+
+	mcs = xen_mc_entry(sizeof(*args));
+	args = mcs.args;
+	args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+	/* Remove us, and any offline CPUS. */
+	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+		goto issue;
+
+	if (va == TLB_FLUSH_ALL) {
+		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	} else {
+		args->op.cmd = MMUEXT_INVLPG_MULTI;
+		args->op.arg1.linear_addr = va;
+	}
+
+	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+issue:
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+	return percpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+	percpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+	unsigned long mfn;
+
+	if (cr3)
+		mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	else
+		mfn = 0;
+
+	WARN_ON(mfn == 0 && kernel);
+
+	mcs = __xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+	op->arg1.mfn = mfn;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (kernel) {
+		percpu_write(xen_cr3, cr3);
+
+		/* Update xen_current_cr3 once the batch has actually
+		   been submitted. */
+		xen_mc_callback(set_current_cr3, (void *)cr3);
+	}
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	BUG_ON(preemptible());
+
+	xen_mc_batch();  /* disables interrupts */
+
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
+	percpu_write(xen_cr3, cr3);
+
+	__xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+		if (user_pgd)
+			__xen_write_cr3(false, __pa(user_pgd));
+		else
+			__xen_write_cr3(false, 0);
+	}
+#endif
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = mm->pgd;
+	int ret = 0;
+
+	BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+	{
+		struct page *page = virt_to_page(pgd);
+		pgd_t *user_pgd;
+
+		BUG_ON(page->private != 0);
+
+		ret = -ENOMEM;
+
+		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		page->private = (unsigned long)user_pgd;
+
+		if (user_pgd != NULL) {
+			user_pgd[pgd_index(VSYSCALL_START)] =
+				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+			ret = 0;
+		}
+
+		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+	}
+#endif
+
+	return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+	if (user_pgd)
+		free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+	pgprot_t prot = PAGE_KERNEL;
+
+	if (PagePinned(page))
+		prot = PAGE_KERNEL_RO;
+
+	if (0 && PageHighMem(page))
+		printk("mapping highpte %lx type %d prot %s\n",
+		       page_to_pfn(page), type,
+		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+	return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
+	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+			       pte_val_ma(pte));
+
+	return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+	pte = mask_rw_pte(ptep, pte);
+
+	xen_set_pte(ptep, pte);
+}
+#endif
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+	BUG_ON(mem_map);	/* should only be used early */
+#endif
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+   only init_mm and anything attached to that is pinned. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+   attached to a pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		SetPagePinned(page);
+
+		vm_unmap_aliases();
+		if (!PageHighMem(page)) {
+			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else {
+			/* make sure there are no stray mappings of
+			   this page */
+			kmap_flush_unused();
+		}
+	}
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(page)) {
+		if (!PageHighMem(page)) {
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
+		ClearPagePinned(page);
+	}
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+	unsigned long top = HYPERVISOR_VIRT_START;
+	struct xen_platform_parameters pp;
+
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+		top = pp.virt_start;
+
+	reserve_top_address(-top);
+#endif	/* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(paddr + __START_KERNEL_map);
+#else
+	return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+	phys_addr_t paddr;
+
+	maddr &= PTE_PFN_MASK;
+	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+	return __ka(m2p(maddr));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+		BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+	unsigned pmdidx, pteidx;
+	unsigned ident_pte;
+	unsigned long pfn;
+
+	ident_pte = 0;
+	pfn = 0;
+	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+		pte_t *pte_page;
+
+		/* Reuse or allocate a page of ptes */
+		if (pmd_present(pmd[pmdidx]))
+			pte_page = m2v(pmd[pmdidx].pmd);
+		else {
+			/* Check for free pte pages */
+			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+				break;
+
+			pte_page = &level1_ident_pgt[ident_pte];
+			ident_pte += PTRS_PER_PTE;
+
+			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+		}
+
+		/* Install mappings */
+		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+			pte_t pte;
+
+			if (pfn > max_pfn_mapped)
+				max_pfn_mapped = pfn;
+
+			if (!pte_none(pte_page[pteidx]))
+				continue;
+
+			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+			pte_page[pteidx] = pte;
+		}
+	}
+
+	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+	set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+	pte_t *pte = v;
+	int i;
+
+	/* All levels are converted the same way, so just treat them
+	   as ptes. */
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the inital kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+					 unsigned long max_pfn)
+{
+	pud_t *l3;
+	pmd_t *l2;
+
+	/* Zap identity mapping */
+	init_level4_pgt[0] = __pgd(0);
+
+	/* Pre-constructed entries are in pfn, so convert to mfn */
+	convert_pfn_mfn(init_level4_pgt);
+	convert_pfn_mfn(level3_ident_pgt);
+	convert_pfn_mfn(level3_kernel_pgt);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	/* Set up identity map */
+	xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+	/* Make pagetable pieces RO */
+	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+	/* Pin down new L4 */
+	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+	/* Unpin Xen-provided one */
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	/* Switch over */
+	pgd = init_level4_pgt;
+
+	/*
+	 * At this stage there can be no user pgd, and no page
+	 * structure to attach it to, so make sure we just set kernel
+	 * pgd.
+	 */
+	xen_mc_batch();
+	__xen_write_cr3(true, __pa(pgd));
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+	reserve_early(__pa(xen_start_info->pt_base),
+		      __pa(xen_start_info->pt_base +
+			   xen_start_info->nr_pt_frames * PAGE_SIZE),
+		      "XEN PAGETABLES");
+
+	return pgd;
+}
+#else	/* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+					 unsigned long max_pfn)
+{
+	pmd_t *kernel_pmd;
+
+	init_pg_tables_start = __pa(pgd);
+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	xen_write_cr3(__pa(swapper_pg_dir));
+
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+	return swapper_pg_dir;
+}
+#endif	/* CONFIG_X86_64 */
+
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+	pte_t pte;
+
+	phys >>= PAGE_SHIFT;
+
+	switch (idx) {
+	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+	case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+	case FIX_WP_TEST:
+	case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	case FIX_APIC_BASE:	/* maps dummy local APIC */
+#endif
+		pte = pfn_pte(phys, prot);
+		break;
+
+	default:
+		pte = mfn_pte(phys, prot);
+		break;
+	}
+
+	__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+	/* Replicate changes to map the vsyscall page into the user
+	   pagetable vsyscall mapping. */
+	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+		unsigned long vaddr = __fix_to_virt(idx);
+		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+	}
+#endif
+}
+
+__init void xen_post_allocator_init(void)
+{
+	pv_mmu_ops.set_pte = xen_set_pte;
+	pv_mmu_ops.set_pmd = xen_set_pmd;
+	pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+	/* This will work as long as patching hasn't happened yet
+	   (which it hasn't) */
+	pv_mmu_ops.alloc_pte = xen_alloc_pte;
+	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+	pv_mmu_ops.release_pte = xen_release_pte;
+	pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.alloc_pud = xen_alloc_pud;
+	pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+	SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+	xen_mark_init_mm_pinned();
+}
+
+
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = xen_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,
+
+	.flush_tlb_user = xen_flush_tlb,
+	.flush_tlb_kernel = xen_flush_tlb,
+	.flush_tlb_single = xen_flush_tlb_single,
+	.flush_tlb_others = xen_flush_tlb_others,
+
+	.pte_update = paravirt_nop,
+	.pte_update_defer = paravirt_nop,
+
+	.pgd_alloc = xen_pgd_alloc,
+	.pgd_free = xen_pgd_free,
+
+	.alloc_pte = xen_alloc_pte_init,
+	.release_pte = xen_release_pte_init,
+	.alloc_pmd = xen_alloc_pte_init,
+	.alloc_pmd_clone = paravirt_nop,
+	.release_pmd = xen_release_pte_init,
+
+#ifdef CONFIG_HIGHPTE
+	.kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+#ifdef CONFIG_X86_64
+	.set_pte = xen_set_pte,
+#else
+	.set_pte = xen_set_pte_init,
+#endif
+	.set_pte_at = xen_set_pte_at,
+	.set_pmd = xen_set_pmd_hyper,
+
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
+	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
+	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = xen_set_pte_atomic,
+	.set_pte_present = xen_set_pte_at,
+	.pte_clear = xen_pte_clear,
+	.pmd_clear = xen_pmd_clear,
+#endif	/* CONFIG_X86_PAE */
+	.set_pud = xen_set_pud_hyper,
+
+	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
+	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
+	.set_pgd = xen_set_pgd_hyper,
+
+	.alloc_pud = xen_alloc_pte_init,
+	.release_pud = xen_release_pte_init,
+#endif	/* PAGETABLE_LEVELS == 4 */
+
+	.activate_mm = xen_activate_mm,
+	.dup_mmap = xen_dup_mmap,
+	.exit_mmap = xen_exit_mmap,
+
+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_mmu,
+		.leave = xen_leave_lazy,
+	},
+
+	.set_fixmap = xen_set_fixmap,
+};
+
+
 #ifdef CONFIG_XEN_DEBUG_FS

 static struct dentry *d_mmu_debug;

--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 				  pte_t *ptep, pte_t pte);

+unsigned long xen_read_cr2_direct(void);
+
+extern const struct pv_mmu_ops xen_mmu_ops;
 #endif	/* _XEN_MMU_H */
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)

 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
-	make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+	make_lowmem_page_readwrite(xen_initial_gdt);

 	xen_setup_vcpu_info_placement();
 }
@@ -235,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
+#else
+	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -284,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	irq_ctx_init(cpu);
 #else
 	clear_tsk_thread_flag(idle, TIF_FORK);
+	per_cpu(kernel_stack, cpu) =
+		(unsigned long)task_stack_page(idle) -
+		KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);

--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
+/*
+	Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+	The inline versions are the same as the direct-use versions, with the
+	pre- and post-amble chopped off.
+
+	This code is encoded for size rather than absolute efficiency,
+	with a view to being able to inline as much as possible.
+
+	We only bother with direct forms (ie, vcpu in percpu data) of
+	the operations here; the indirect forms are better handled in
+	C, since they're generally too large to inline anyway.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+#include "xen-asm.h"
+
+/*
+	Enable events.  This clears the event mask and tests the pending
+	event status with one and operation.  If there are pending
+	events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+	/* Unmask events */
+	movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* Test for pending */
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+	jz 1f
+
+2:	call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+	ret
+	ENDPROC(xen_irq_enable_direct)
+	RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+	Disabling events is simply a matter of making the event mask
+	non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+	movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+	ret
+	ENDPROC(xen_irq_disable_direct)
+	RELOC(xen_irq_disable_direct, 0)
+
+/*
+	(xen_)save_fl is used to get the current interrupt enable status.
+	Callers expect the status to be in X86_EFLAGS_IF, and other bits
+	may be set in the return value.  We take advantage of this by
+	making sure that X86_EFLAGS_IF has the right value (and other bits
+	in that byte are 0), but other bits in the return value are
+	undefined.  We need to toggle the state of the bit, because
+	Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+	setz %ah
+	addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+	ret
+	ENDPROC(xen_save_fl_direct)
+	RELOC(xen_save_fl_direct, 0)
+
+
+/*
+	In principle the caller should be passing us a value return
+	from xen_save_fl_direct, but for robustness sake we test only
+	the X86_EFLAGS_IF flag rather than the whole byte. After
+	setting the interrupt mask state, it checks for unmasked
+	pending events and enters the hypervisor to get them delivered
+	if so.
+ */
+ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+	testw $X86_EFLAGS_IF, %di
+#else
+	testb $X86_EFLAGS_IF>>8, %ah
+#endif
+	setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* check for unmasked and pending */
+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+	ret
+	ENDPROC(xen_restore_fl_direct)
+	RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+	Force an event check by making a hypercall,
+	but preserve regs before making the call.
+ */
+check_events:
+#ifdef CONFIG_X86_32
+	push %eax
+	push %ecx
+	push %edx
+	call xen_force_evtchn_callback
+	pop %edx
+	pop %ecx
+	pop %eax
+#else
+	push %rax
+	push %rcx
+	push %rdx
+	push %rsi
+	push %rdi
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+	call xen_force_evtchn_callback
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rdi
+	pop %rsi
+	pop %rdx
+	pop %rcx
+	pop %rax
+#endif
+	ret
+
--- a/arch/x86/xen/xen-asm.h
+++ b/arch/x86/xen/xen-asm.h
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+
+#include <linux/linkage.h>
+
+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)	.globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI	0x80000000
+
+#endif
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -11,101 +11,28 @@
 	generally too large to inline anyway.
 */

-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
+//#include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>

 #include <xen/interface/xen.h>

-#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)	.globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI	0x80000000
-
-/*
-	Enable events.  This clears the event mask and tests the pending
-	event status with one and operation.  If there are pending
-	events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-	/* Unmask events */
-	movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
-	/* Preempt here doesn't matter because that will deal with
-	   any pending interrupts.  The pending check may end up being
-	   run on the wrong CPU, but that doesn't hurt. */
-
-	/* Test for pending */
-	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-	jz 1f
-
-2:	call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-	ret
-	ENDPROC(xen_irq_enable_direct)
-	RELOC(xen_irq_enable_direct, 2b+1)
-
+#include "xen-asm.h"

 /*
-	Disabling events is simply a matter of making the event mask
-	non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-	movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-	ret
-	ENDPROC(xen_irq_disable_direct)
-	RELOC(xen_irq_disable_direct, 0)
-
-/*
-	(xen_)save_fl is used to get the current interrupt enable status.
-	Callers expect the status to be in X86_EFLAGS_IF, and other bits
-	may be set in the return value.  We take advantage of this by
-	making sure that X86_EFLAGS_IF has the right value (and other bits
-	in that byte are 0), but other bits in the return value are
-	undefined.  We need to toggle the state of the bit, because
-	Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-	setz %ah
-	addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-	ret
-	ENDPROC(xen_save_fl_direct)
-	RELOC(xen_save_fl_direct, 0)
-
-
-/*
-	In principle the caller should be passing us a value return
-	from xen_save_fl_direct, but for robustness sake we test only
-	the X86_EFLAGS_IF flag rather than the whole byte. After
-	setting the interrupt mask state, it checks for unmasked
-	pending events and enters the hypervisor to get them delivered
-	if so.
+	Force an event check by making a hypercall,
+	but preserve regs before making the call.
 */
-ENTRY(xen_restore_fl_direct)
-	testb $X86_EFLAGS_IF>>8, %ah
-	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-	/* Preempt here doesn't matter because that will deal with
-	   any pending interrupts.  The pending check may end up being
-	   run on the wrong CPU, but that doesn't hurt. */
-
-	/* check for unmasked and pending */
-	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-	jz 1f
-2:	call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
+check_events:
+	push %eax
+	push %ecx
+	push %edx
+	call xen_force_evtchn_callback
+	pop %edx
+	pop %ecx
+	pop %eax
 	ret
-	ENDPROC(xen_restore_fl_direct)
-	RELOC(xen_restore_fl_direct, 2b+1)

 /*
 	We can't use sysexit directly, because we're not running in ring0.
@@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup)
 	lea 4(%edi),%esp		/* point esp to new frame */
 2:	jmp xen_do_upcall

-
-/*
-	Force an event check by making a hypercall,
-	but preserve regs before making the call.
- */
-check_events:
-	push %eax
-	push %ecx
-	push %edx
-	call xen_force_evtchn_callback
-	pop %edx
-	pop %ecx
-	pop %eax
-	ret
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -11,142 +11,14 @@
 	generally too large to inline anyway.
 */

-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
 #include <asm/errno.h>
-#include <asm/segment.h>
 #include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>

 #include <xen/interface/xen.h>

-#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)	.globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI	0x80000000
-
-#if 1
-/*
-	FIXME: x86_64 now can support direct access to percpu variables
-	via a segment override.  Update xen accordingly.
- */
-#define BUG			ud2a
-#endif
-
-/*
-	Enable events.  This clears the event mask and tests the pending
-	event status with one and operation.  If there are pending
-	events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-	BUG
-
-	/* Unmask events */
-	movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-
-	/* Preempt here doesn't matter because that will deal with
-	   any pending interrupts.  The pending check may end up being
-	   run on the wrong CPU, but that doesn't hurt. */
-
-	/* Test for pending */
-	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
-	jz 1f
-
-2:	call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-	ret
-	ENDPROC(xen_irq_enable_direct)
-	RELOC(xen_irq_enable_direct, 2b+1)
-
-/*
-	Disabling events is simply a matter of making the event mask
-	non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-	BUG
-
-	movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-	ret
-	ENDPROC(xen_irq_disable_direct)
-	RELOC(xen_irq_disable_direct, 0)
-
-/*
-	(xen_)save_fl is used to get the current interrupt enable status.
-	Callers expect the status to be in X86_EFLAGS_IF, and other bits
-	may be set in the return value.  We take advantage of this by
-	making sure that X86_EFLAGS_IF has the right value (and other bits
-	in that byte are 0), but other bits in the return value are
-	undefined.  We need to toggle the state of the bit, because
-	Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-	BUG
-
-	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-	setz %ah
-	addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-	ret
-	ENDPROC(xen_save_fl_direct)
-	RELOC(xen_save_fl_direct, 0)
-
-/*
-	In principle the caller should be passing us a value return
-	from xen_save_fl_direct, but for robustness sake we test only
-	the X86_EFLAGS_IF flag rather than the whole byte. After
-	setting the interrupt mask state, it checks for unmasked
-	pending events and enters the hypervisor to get them delivered
-	if so.
- */
-ENTRY(xen_restore_fl_direct)
-	BUG
-
-	testb $X86_EFLAGS_IF>>8, %ah
-	setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-	/* Preempt here doesn't matter because that will deal with
-	   any pending interrupts.  The pending check may end up being
-	   run on the wrong CPU, but that doesn't hurt. */
-
-	/* check for unmasked and pending */
-	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
-	jz 1f
-2:	call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
-	ret
-	ENDPROC(xen_restore_fl_direct)
-	RELOC(xen_restore_fl_direct, 2b+1)
-
-
-/*
-	Force an event check by making a hypercall,
-	but preserve regs before making the call.
- */
-check_events:
-	push %rax
-	push %rcx
-	push %rdx
-	push %rsi
-	push %rdi
-	push %r8
-	push %r9
-	push %r10
-	push %r11
-	call xen_force_evtchn_callback
-	pop %r11
-	pop %r10
-	pop %r9
-	pop %r8
-	pop %rdi
-	pop %rsi
-	pop %rdx
-	pop %rcx
-	pop %rax
-	ret
+#include "xen-asm.h"

 ENTRY(xen_adjust_exception_frame)
 	mov 8+0(%rsp),%rcx

--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];

+extern void *xen_initial_gdt;
+
 struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);

+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);

@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;

 void xen_setup_mfn_list_list(void);
 void xen_setup_shared_info(void);
+void xen_setup_machphys_mapping(void);
+pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_ident_map_ISA(void);
+void xen_reserve_top(void);
+
+void xen_leave_lazy(void);
+void xen_post_allocator_init(void);

 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);

--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -445,24 +445,22 @@
 * section in the linker script will go there too.  @phdr should have
 * a leading colon.
 *
- * This macro defines three symbols, __per_cpu_load, __per_cpu_start
- * and __per_cpu_end.  The first one is the vaddr of loaded percpu
- * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
- * end offset.
+ * Note that this macros defines __per_cpu_load as an absolute symbol.
+ * If there is no need to put the percpu section at a predetermined
+ * address, use PERCPU().
 */
 #define PERCPU_VADDR(vaddr, phdr)					\
-	VMLINUX_SYMBOL(__per_cpu_load_abs) = .;				\
-	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load_abs)	\
+	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
+	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
 				- LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
-		VMLINUX_SYMBOL(__per_cpu_load) = LOADADDR(.data.percpu) + LOAD_OFFSET;\
 		*(.data.percpu.first)					\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
 	} phdr								\
-	. = VMLINUX_SYMBOL(__per_cpu_load_abs) + SIZEOF(.data.percpu);
+	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);

 /**
 * PERCPU - define output section for percpu area, simple version
@@ -471,7 +469,20 @@
 * Align to @align and outputs output section for percpu area.  This
 * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and
 * __per_cpu_start will be identical.
+ *
+ * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
+ * that __per_cpu_load is defined as a relative symbol against
+ * .data.percpu which is required for relocatable x86_32
+ * configuration.
 */
 #define PERCPU(align)							\
 	. = ALIGN(align);						\
-	PERCPU_VADDR( , )
+	.data.percpu	: AT(ADDR(.data.percpu) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__per_cpu_load) = .;			\
+		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
+		*(.data.percpu.first)					\
+		*(.data.percpu.page_aligned)				\
+		*(.data.percpu)						\
+		*(.data.percpu.shared_aligned)				\
+		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
+	}