#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/page.h>
#include <asm/asm.h>

/* Bitmask of what can be clobbered: usually at least eax. */
#define CLBR_NONE 0
#define CLBR_EAX  (1 << 0)
#define CLBR_ECX  (1 << 1)
#define CLBR_EDX  (1 << 2)

#ifdef CONFIG_X86_64
#define CLBR_RSI  (1 << 3)
#define CLBR_RDI  (1 << 4)
#define CLBR_R8   (1 << 5)
#define CLBR_R9   (1 << 6)
#define CLBR_R10  (1 << 7)
#define CLBR_R11  (1 << 8)
#define CLBR_ANY  ((1 << 9) - 1)
#include <asm/desc_defs.h>
#else
/* CLBR_ANY should match all regs platform has. For i386, that's just it */
#define CLBR_ANY  ((1 << 3) - 1)
#endif /* X86_64 */

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/kmap_types.h>
#include <asm/desc_defs.h>

struct page;
struct thread_struct;
struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;

/* general info */
struct pv_info {
	unsigned int kernel_rpl;
	int shared_kernel_pmd;
	int paravirt_enabled;
	const char *name;
};

struct pv_init_ops {
	/*
	 * Patch may replace one of the defined code sequences with
	 * arbitrary code, subject to the same register constraints.
	 * This generally means the code is not free to clobber any
	 * registers other than EAX.  The patch function should return
	 * the number of bytes of code generated, as we nop pad the
	 * rest in generic code.
	 */
	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
			  unsigned long addr, unsigned len);

	/* Basic arch-specific setup */
	void (*arch_setup)(void);
	char *(*memory_setup)(void);
	void (*post_allocator_init)(void);

	/* Print a banner to identify the environment */
	void (*banner)(void);
};


struct pv_lazy_ops {
	/* Set deferred update mode, used for batching operations. */
	void (*enter)(void);
	void (*leave)(void);
};

struct pv_time_ops {
	void (*time_init)(void);

	/* Get and set time of day */
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long);

	unsigned long long (*sched_clock)(void);
	unsigned long (*get_tsc_khz)(void);
};

struct pv_cpu_ops {
	/* hooks for various privileged instructions */
	unsigned long (*get_debugreg)(int regno);
	void (*set_debugreg)(int regno, unsigned long value);

	void (*clts)(void);

	unsigned long (*read_cr0)(void);
	void (*write_cr0)(unsigned long);

	unsigned long (*read_cr4_safe)(void);
	unsigned long (*read_cr4)(void);
	void (*write_cr4)(unsigned long);

#ifdef CONFIG_X86_64
	unsigned long (*read_cr8)(void);
	void (*write_cr8)(unsigned long);
#endif

	/* Segment descriptor handling */
	void (*load_tr_desc)(void);
	void (*load_gdt)(const struct desc_ptr *);
	void (*load_idt)(const struct desc_ptr *);
	void (*store_gdt)(struct desc_ptr *);
	void (*store_idt)(struct desc_ptr *);
	void (*set_ldt)(const void *desc, unsigned entries);
	unsigned long (*store_tr)(void);
	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
#ifdef CONFIG_X86_64
	void (*load_gs_index)(unsigned int idx);
#endif
	void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
				const void *desc);
	void (*write_gdt_entry)(struct desc_struct *,
				int entrynum, const void *desc, int size);
	void (*write_idt_entry)(gate_desc *,
				int entrynum, const gate_desc *gate);
	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);

	void (*set_iopl_mask)(unsigned mask);

	void (*wbinvd)(void);
	void (*io_delay)(void);

	/* cpuid emulation, mostly so that caps bits can be disabled */
	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx);

	/* MSR, PMC and TSC operations.
	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
	u64 (*read_msr)(unsigned int msr, int *err);
	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);

	u64 (*read_tsc)(void);
	u64 (*read_pmc)(int counter);
	unsigned long long (*read_tscp)(unsigned int *aux);

	/*
	 * Atomically enable interrupts and return to userspace.  This
	 * is only ever used to return to 32-bit processes; in a
	 * 64-bit kernel, it's used for 32-on-64 compat processes, but
	 * never native 64-bit processes.  (Jump, not call.)
	 */
	void (*irq_enable_sysexit)(void);

	/*
	 * Switch to usermode gs and return to 64-bit usermode using
	 * sysret.  Only used in 64-bit kernels to return to 64-bit
	 * processes.  Usermode register state, including %rsp, must
	 * already be restored.
	 */
	void (*usergs_sysret64)(void);

	/*
	 * Switch to usermode gs and return to 32-bit usermode using
	 * sysret.  Used to return to 32-on-64 compat processes.
	 * Other usermode register state, including %esp, must already
	 * be restored.
	 */
	void (*usergs_sysret32)(void);

	/* Normal iret.  Jump to this with the standard iret stack
	   frame set up. */
	void (*iret)(void);

	void (*swapgs)(void);

	struct pv_lazy_ops lazy_mode;
};

struct pv_irq_ops {
	void (*init_IRQ)(void);

	/*
	 * Get/set interrupt state.  save_fl and restore_fl are only
	 * expected to use X86_EFLAGS_IF; all other bits
	 * returned from save_fl are undefined, and may be ignored by
	 * restore_fl.
	 */
	unsigned long (*save_fl)(void);
	void (*restore_fl)(unsigned long);
	void (*irq_disable)(void);
	void (*irq_enable)(void);
	void (*safe_halt)(void);
	void (*halt)(void);

#ifdef CONFIG_X86_64
	void (*adjust_exception_frame)(void);
#endif
};

struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Direct APIC operations, principally for VMI.  Ideally
	 * these shouldn't be in this interface.
	 */
	void (*apic_write)(unsigned long reg, u32 v);
	void (*apic_write_atomic)(unsigned long reg, u32 v);
	u32 (*apic_read)(unsigned long reg);
	void (*setup_boot_clock)(void);
	void (*setup_secondary_clock)(void);

	void (*startup_ipi_hook)(int phys_apicid,
				 unsigned long start_eip,
				 unsigned long start_esp);
#endif
};

struct pv_mmu_ops {
	/*
	 * Called before/after init_mm pagetable setup. setup_start
	 * may reset %cr3, and may pre-install parts of the pagetable;
	 * pagetable setup is expected to preserve any existing
	 * mapping.
	 */
	void (*pagetable_setup_start)(pgd_t *pgd_base);
	void (*pagetable_setup_done)(pgd_t *pgd_base);

	unsigned long (*read_cr2)(void);
	void (*write_cr2)(unsigned long);

	unsigned long (*read_cr3)(void);
	void (*write_cr3)(unsigned long);

	/*
	 * Hooks for intercepting the creation/use/destruction of an
	 * mm_struct.
	 */
	void (*activate_mm)(struct mm_struct *prev,
			    struct mm_struct *next);
	void (*dup_mmap)(struct mm_struct *oldmm,
			 struct mm_struct *mm);
	void (*exit_mmap)(struct mm_struct *mm);


	/* TLB operations */
	void (*flush_tlb_user)(void);
	void (*flush_tlb_kernel)(void);
	void (*flush_tlb_single)(unsigned long addr);
	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va);

	/* Hooks for allocating and freeing a pagetable top-level */
	int  (*pgd_alloc)(struct mm_struct *mm);
	void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);

	/*
	 * Hooks for allocating/releasing pagetable pages when they're
	 * attached to a pagetable
	 */
	void (*alloc_pte)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pmd)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
	void (*alloc_pud)(struct mm_struct *mm, u32 pfn);
	void (*release_pte)(u32 pfn);
	void (*release_pmd)(u32 pfn);
	void (*release_pud)(u32 pfn);

	/* Pagetable manipulation functions */
	void (*set_pte)(pte_t *ptep, pte_t pteval);
	void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval);
	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
	void (*pte_update)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep);
	void (*pte_update_defer)(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep);

	pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
					pte_t *ptep);
	void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
					pte_t *ptep, pte_t pte);

	pteval_t (*pte_val)(pte_t);
	pteval_t (*pte_flags)(pte_t);
	pte_t (*make_pte)(pteval_t pte);

	pgdval_t (*pgd_val)(pgd_t);
	pgd_t (*make_pgd)(pgdval_t pgd);

#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
			  pte_t *ptep);
	void (*pmd_clear)(pmd_t *pmdp);

#endif	/* CONFIG_X86_PAE */

	void (*set_pud)(pud_t *pudp, pud_t pudval);

	pmdval_t (*pmd_val)(pmd_t);
	pmd_t (*make_pmd)(pmdval_t pmd);

#if PAGETABLE_LEVELS == 4
	pudval_t (*pud_val)(pud_t);
	pud_t (*make_pud)(pudval_t pud);

	void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
#endif	/* PAGETABLE_LEVELS == 4 */
#endif	/* PAGETABLE_LEVELS >= 3 */

#ifdef CONFIG_HIGHPTE
	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
#endif

	struct pv_lazy_ops lazy_mode;

	/* dom0 ops */

	/* Sometimes the physical address is a pfn, and sometimes it's
	   an mfn.  We can tell which is which from the index. */
	void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
			   unsigned long phys, pgprot_t flags);
};

/* This contains all the paravirt structures: we get a convenient
 * number for each function using the offset which we use to indicate
 * what to patch. */
struct paravirt_patch_template {
	struct pv_init_ops pv_init_ops;
	struct pv_time_ops pv_time_ops;
	struct pv_cpu_ops pv_cpu_ops;
	struct pv_irq_ops pv_irq_ops;
	struct pv_apic_ops pv_apic_ops;
	struct pv_mmu_ops pv_mmu_ops;
};

extern struct pv_info pv_info;
extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;

#define PARAVIRT_PATCH(x)					\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
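/*
 * Illustrative example (an assumption-free reading of the macro above,
 * not new functionality): PARAVIRT_PATCH(pv_irq_ops.irq_disable)
 * evaluates to the pointer-sized-word index of that hook within
 * struct paravirt_patch_template.  That index is the "type" number
 * recorded at each patch site and handed back to the patch functions,
 * so it can be converted back into a structure offset to find the op.
 */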

#define paravirt_type(op)				\
	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
	[paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber)		\
	[paravirt_clobber] "i" (clobber)

/*
 * Generate some code, and mark it as patchable by the
 * apply_paravirt() alternate instruction patcher.
 */
#define _paravirt_alt(insn_string, type, clobber)	\
	"771:\n\t" insn_string "\n" "772:\n"		\
	".pushsection .parainstructions,\"a\"\n"	\
	_ASM_ALIGN "\n"					\
	_ASM_PTR " 771b\n"				\
	"  .byte " type "\n"				\
	"  .byte 772b-771b\n"				\
	"  .short " clobber "\n"			\
	".popsection\n"

/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string)					\
	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")

/* Simple instruction patching code. */
#define DEF_NATIVE(ops, name, code) 					\
	extern const char start_##ops##_##name[], end_##ops##_##name[];	\
	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")

unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len);
unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
			    unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len);

unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end);

unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
		      unsigned long addr, unsigned len);

int paravirt_disable_iospace(void);

/*
 * This generates an indirect call based on the operation type number.
 * The type number, computed in PARAVIRT_PATCH, is derived from the
 * offset into the paravirt_patch_template structure, and can therefore be
 * freely converted back into a structure offset.
 */
#define PARAVIRT_CALL	"call *%[paravirt_opptr];"

/*
 * These macros are intended to wrap calls through one of the paravirt
 * ops structs, so that they can be later identified and patched at
 * runtime.
 *
 * Normally, a call to a pv_op function is a simple indirect call:
 * (pv_op_struct.operations)(args...).
 *
 * Unfortunately, this is a relatively slow operation for modern CPUs,
 * because it cannot necessarily determine what the destination
 * address is.  In this case, the address is a runtime constant, so at
 * the very least we can patch the call to be a simple direct call, or
 * ideally, patch an inline implementation into the callsite.  (Direct
 * calls are essentially free, because the call and return addresses
 * are completely predictable.)
 *
 * For i386, these macros rely on the standard gcc "regparm(3)" calling
 * convention, in which the first three arguments are placed in %eax,
 * %edx, %ecx (in that order), and the remaining arguments are placed
 * on the stack.  All caller-save registers (eax,edx,ecx) are expected
 * to be modified (either clobbered or used for return values).
 * X86_64, on the other hand, already specifies a register-based calling
 * convention, returning in %rax, with parameters going in %rdi, %rsi,
 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
 * special handling for dealing with 4 arguments, unlike i386.
 * However, x86_64 also has to clobber all caller-saved registers, which
 * unfortunately are quite a few (r8 - r11).
 *
 * The call instruction itself is marked by placing its start address
 * and size into the .parainstructions section, so that
 * apply_paravirt() in arch/x86/kernel/alternative.c can do the
 * appropriate patching under the control of the backend pv_init_ops
 * implementation.
 *
 * Unfortunately there's no way to get gcc to generate the args setup
 * for the call, and then allow the call itself to be generated by an
 * inline asm.  Because of this, we must do the complete arg setup and
 * return value handling from within these macros.  This is fairly
 * cumbersome.
 *
 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
 * It could be extended to more arguments, but there would be little
 * to be gained from that.  For each number of arguments, there are
 * the two VCALL and CALL variants for void and non-void functions.
 *
 * When there is a return value, the invoker of the macro must specify
 * the return type.  The macro then uses sizeof() on that type to
 * determine whether it's a 32 or 64 bit value, and places the return
 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
 * 64-bit).  For x86_64 machines, it just returns in %rax regardless of
 * the return value size.
 *
 * On i386, 64-bit arguments are passed as a pair of adjacent 32-bit
 * arguments, in low,high order.
 *
 * Small structures are passed and returned in registers.  The macro
 * calling convention can't directly deal with this, so the wrapper
 * functions must do this.
 *
 * These PVOP_* macros are only defined within this header.  This
 * means that all uses must be wrapped in inline functions.  This also
 * makes sure the incoming and outgoing types are always correct.
 */
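/*
 * As a concrete illustration, the non-void single-argument wrappers
 * defined later in this header follow this pattern:
 *
 *	static inline unsigned long paravirt_get_debugreg(int reg)
 *	{
 *		return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
 *	}
 *
 * which expands to an indirect call through pv_cpu_ops.get_debugreg,
 * with the call site recorded in .parainstructions for later patching.
 */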
#ifdef CONFIG_X86_32
#define PVOP_VCALL_ARGS			unsigned long __eax, __edx, __ecx
#define PVOP_CALL_ARGS			PVOP_VCALL_ARGS
#define PVOP_VCALL_CLOBBERS		"=a" (__eax), "=d" (__edx),	\
					"=c" (__ecx)
#define PVOP_CALL_CLOBBERS		PVOP_VCALL_CLOBBERS
#define EXTRA_CLOBBERS
#define VEXTRA_CLOBBERS
#else
#define PVOP_VCALL_ARGS		unsigned long __edi, __esi, __edx, __ecx
#define PVOP_CALL_ARGS		PVOP_VCALL_ARGS, __eax
#define PVOP_VCALL_CLOBBERS	"=D" (__edi),				\
				"=S" (__esi), "=d" (__edx),		\
				"=c" (__ecx)

#define PVOP_CALL_CLOBBERS	PVOP_VCALL_CLOBBERS, "=a" (__eax)

#define EXTRA_CLOBBERS	 , "r8", "r9", "r10", "r11"
#define VEXTRA_CLOBBERS	 , "rax", "r8", "r9", "r10", "r11"
#endif

#ifdef CONFIG_PARAVIRT_DEBUG
#define PVOP_TEST_NULL(op)	BUG_ON(op == NULL)
#else
#define PVOP_TEST_NULL(op)	((void)op)
#endif

#define __PVOP_CALL(rettype, op, pre, post, ...)			\
	({								\
		rettype __ret;						\
		PVOP_CALL_ARGS;					\
		PVOP_TEST_NULL(op);					\
		/* This is 32-bit specific, but is okay in 64-bit */	\
		/* since this condition will never hold */		\
		if (sizeof(rettype) > sizeof(unsigned long)) {		\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : PVOP_CALL_CLOBBERS		\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc" EXTRA_CLOBBERS);	\
			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
		} else {						\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : PVOP_CALL_CLOBBERS		\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc" EXTRA_CLOBBERS);	\
			__ret = (rettype)__eax;				\
		}							\
		__ret;							\
	})
#define __PVOP_VCALL(op, pre, post, ...)				\
	({								\
		PVOP_VCALL_ARGS;					\
		PVOP_TEST_NULL(op);					\
		asm volatile(pre					\
			     paravirt_alt(PARAVIRT_CALL)		\
			     post					\
			     : PVOP_VCALL_CLOBBERS			\
			     : paravirt_type(op),			\
			       paravirt_clobber(CLBR_ANY),		\
			       ##__VA_ARGS__				\
			     : "memory", "cc" VEXTRA_CLOBBERS);		\
	})

#define PVOP_CALL0(rettype, op)						\
	__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op)							\
	__PVOP_VCALL(op, "", "")

#define PVOP_CALL1(rettype, op, arg1)					\
	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
#define PVOP_VCALL1(op, arg1)						\
	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))

#define PVOP_CALL2(rettype, op, arg1, arg2)				\
	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
	"1" ((unsigned long)(arg2)))
#define PVOP_VCALL2(op, arg1, arg2)					\
	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
	"1" ((unsigned long)(arg2)))

#define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
#define PVOP_VCALL3(op, arg1, arg2, arg3)				\
	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))

/* This is the only difference in x86_64. We can make it much simpler */
#ifdef CONFIG_X86_32
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op,					\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op,						\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#else
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),		\
	"3"((unsigned long)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
	"1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),		\
	"3"((unsigned long)(arg4)))
#endif

static inline int paravirt_enabled(void)
{
	return pv_info.paravirt_enabled;
}

static inline void load_sp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
}

#define ARCH_SETUP			pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
	return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}

static inline int set_wallclock(unsigned long nowtime)
{
	return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}

static inline void (*choose_time_init(void))(void)
{
	return pv_time_ops.time_init;
}

/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
	PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}

static inline void clts(void)
{
	PVOP_VCALL0(pv_cpu_ops.clts);
}

static inline unsigned long read_cr0(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}

static inline void write_cr0(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}

static inline unsigned long read_cr2(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}

static inline void write_cr2(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}

static inline void write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

static inline unsigned long read_cr4(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

static inline void write_cr4(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}

#ifdef CONFIG_X86_64
static inline unsigned long read_cr8(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
}

static inline void write_cr8(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
}
#endif

static inline void raw_safe_halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void halt(void)
{
	PVOP_VCALL0(pv_irq_ops.halt);
}

static inline void wbinvd(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}

#define get_kernel_rpl()  (pv_info.kernel_rpl)

static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}

/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr, val1, val2)			\
do {						\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	val1 = (u32)_l;				\
	val2 = _l >> 32;			\
} while (0)

#define wrmsr(msr, val1, val2)			\
do {						\
	paravirt_write_msr(msr, val1, val2);	\
} while (0)

#define rdmsrl(msr, val)			\
do {						\
	int _err;				\
	val = paravirt_read_msr(msr, &_err);	\
} while (0)

#define wrmsrl(msr, val)	wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
#define wrmsr_safe(msr, a, b)	paravirt_write_msr(msr, a, b)

/* rdmsr with exception handling */
#define rdmsr_safe(msr, a, b)			\
({						\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	(*a) = (u32)_l;				\
	(*b) = _l >> 32;			\
	_err;					\
})

static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
{
	int err;

	*p = paravirt_read_msr(msr, &err);
	return err;
}

static inline u64 paravirt_read_tsc(void)
{
	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}

#define rdtscl(low)				\
do {						\
	u64 _l = paravirt_read_tsc();		\
	low = (int)_l;				\
} while (0)

#define rdtscll(val) (val = paravirt_read_tsc())

static inline unsigned long long paravirt_sched_clock(void)
{
	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
#define calibrate_tsc() (pv_time_ops.get_tsc_khz())

static inline unsigned long long paravirt_read_pmc(int counter)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}

#define rdpmc(counter, low, high)		\
do {						\
	u64 _l = paravirt_read_pmc(counter);	\
	low = (u32)_l;				\
	high = _l >> 32;			\
} while (0)

static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
}

#define rdtscp(low, high, aux)				\
do {							\
	int __aux;					\
	unsigned long __val = paravirt_rdtscp(&__aux);	\
	(low) = (u32)__val;				\
	(high) = (u32)(__val >> 32);			\
	(aux) = __aux;					\
} while (0)

#define rdtscpll(val, aux)				\
do {							\
	unsigned long __aux; 				\
	val = paravirt_rdtscp(&__aux);			\
	(aux) = __aux;					\
} while (0)

static inline void load_TR_desc(void)
{
	PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr)	((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}

#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int gs)
{
	PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
}
#endif
858 859
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
				   const void *desc)
{
	PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
}

static inline void write_gdt_entry(struct desc_struct *dt, int entry,
				   void *desc, int type)
{
	PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
}

static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
	PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
}
static inline void set_iopl_mask(unsigned mask)
{
	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}

/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
	pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
#endif
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Basic functions accessing APICs.
 */
static inline void apic_write(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}

static inline void apic_write_atomic(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}

static inline u32 apic_read(unsigned long reg)
{
	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}

static inline void setup_boot_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}

static inline void setup_secondary_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif

static inline void paravirt_post_allocator_init(void)
{
	if (pv_init_ops.post_allocator_init)
		(*pv_init_ops.post_allocator_init)();
}

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_start)(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_done)(base);
}

#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
				    unsigned long start_esp)
{
	PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
		    phys_apicid, start_eip, start_esp);
}
#endif

static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
	PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}

static inline void __flush_tlb(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				    unsigned long va)
{
	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}

static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
	return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
}

static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
}

static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
}
static inline void paravirt_release_pte(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
}

static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
}

static inline void paravirt_alloc_pmd_clone(unsigned pfn, unsigned clonepfn,
					    unsigned start, unsigned count)
{
	PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pmd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
}

static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
}
static inline void paravirt_release_pud(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}

#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
	unsigned long ret;
	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
	return (void *)ret;
}
#endif

static inline void pte_update(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}

static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}

static inline pte_t __pte(pteval_t val)
{
	pteval_t ret;

	if (sizeof(pteval_t) > sizeof(long))
		ret = PVOP_CALL2(pteval_t,
				 pv_mmu_ops.make_pte,
				 val, (u64)val >> 32);
	else
		ret = PVOP_CALL1(pteval_t,
				 pv_mmu_ops.make_pte,
				 val);

	return (pte_t) { .pte = ret };
}

static inline pteval_t pte_val(pte_t pte)
{
	pteval_t ret;

	if (sizeof(pteval_t) > sizeof(long))
		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
				 pte.pte, (u64)pte.pte >> 32);
	else
		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
				 pte.pte);

	return ret;
}

static inline pteval_t pte_flags(pte_t pte)
{
	pteval_t ret;

	if (sizeof(pteval_t) > sizeof(long))
		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
				 pte.pte, (u64)pte.pte >> 32);
	else
		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
				 pte.pte);

	return ret;
}

static inline pgd_t __pgd(pgdval_t val)
{
	pgdval_t ret;

	if (sizeof(pgdval_t) > sizeof(long))
		ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
				 val, (u64)val >> 32);
	else
		ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
				 val);

	return (pgd_t) { ret };
}

static inline pgdval_t pgd_val(pgd_t pgd)
{
	pgdval_t ret;

	if (sizeof(pgdval_t) > sizeof(long))
		ret =  PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
				  pgd.pgd, (u64)pgd.pgd >> 32);
	else
		ret =  PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
				  pgd.pgd);

	return ret;
}

#define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
					   pte_t *ptep)
{
	pteval_t ret;

	ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
			 mm, addr, ptep);

	return (pte_t) { .pte = ret };
}

static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
					   pte_t *ptep, pte_t pte)
{
	if (sizeof(pteval_t) > sizeof(long))
		/* 5 arg words */
		pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
	else
		PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
			    mm, addr, ptep, pte.pte);
}

static inline void set_pte(pte_t *ptep, pte_t pte)
{
	if (sizeof(pteval_t) > sizeof(long))
		PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
			    pte.pte, (u64)pte.pte >> 32);
	else
		PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
			    pte.pte);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pte)
{
	if (sizeof(pteval_t) > sizeof(long))
		/* 5 arg words */
		pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
	else
		PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	pmdval_t val = native_pmd_val(pmd);

	if (sizeof(pmdval_t) > sizeof(long))
		PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
	else
		PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
}

#if PAGETABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
	pmdval_t ret;

	if (sizeof(pmdval_t) > sizeof(long))
		ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
				 val, (u64)val >> 32);
	else
		ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
				 val);

	return (pmd_t) { ret };
}

static inline pmdval_t pmd_val(pmd_t pmd)
{
	pmdval_t ret;

	if (sizeof(pmdval_t) > sizeof(long))
		ret =  PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
				  pmd.pmd, (u64)pmd.pmd >> 32);
	else
		ret =  PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
				  pmd.pmd);

	return ret;
}

static inline void set_pud(pud_t *pudp, pud_t pud)
{
	pudval_t val = native_pud_val(pud);

	if (sizeof(pudval_t) > sizeof(long))
		PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
			    val, (u64)val >> 32);
	else
		PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
			    val);
}
#if PAGETABLE_LEVELS == 4
static inline pud_t __pud(pudval_t val)
{
	pudval_t ret;

	if (sizeof(pudval_t) > sizeof(long))
		ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
				 val, (u64)val >> 32);
	else
		ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
				 val);

	return (pud_t) { ret };
}

static inline pudval_t pud_val(pud_t pud)
{
	pudval_t ret;

	if (sizeof(pudval_t) > sizeof(long))
		ret =  PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
				  pud.pud, (u64)pud.pud >> 32);
	else
		ret =  PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
				  pud.pud);

	return ret;
}

static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgdval_t val = native_pgd_val(pgd);

	if (sizeof(pgdval_t) > sizeof(long))
		PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
			    val, (u64)val >> 32);
	else
		PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
			    val);
}

static inline void pgd_clear(pgd_t *pgdp)
{
	set_pgd(pgdp, __pgd(0));
}

static inline void pud_clear(pud_t *pudp)
{
	set_pud(pudp, __pud(0));
}

#endif	/* PAGETABLE_LEVELS == 4 */

#endif	/* PAGETABLE_LEVELS >= 3 */

#ifdef CONFIG_X86_PAE
/* Special-case pte-setting operations for PAE, which can't update a
   64-bit pte atomically */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
		    pte.pte, pte.pte >> 32);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
			     pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}
#else  /* !CONFIG_X86_PAE */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_pte(ptep, pte);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	set_pte(ptep, pte);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
			     pte_t *ptep)
{
	set_pte_at(mm, addr, ptep, __pte(0));
}

static inline void pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
	PARAVIRT_LAZY_NONE,
	PARAVIRT_LAZY_MMU,
	PARAVIRT_LAZY_CPU,
};

enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_enter_lazy_cpu(void);
void paravirt_leave_lazy_cpu(void);
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
void paravirt_leave_lazy(enum paravirt_lazy_mode mode);

#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_cpu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
		arch_leave_lazy_cpu_mode();
		arch_enter_lazy_cpu_mode();
	}
}


#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}
}

static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
				unsigned long phys, pgprot_t flags)
{
	pv_mmu_ops.set_fixmap(idx, phys, flags);
}

void _paravirt_nop(void);
#define paravirt_nop	((void *)_paravirt_nop)

/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
	u8 *instr; 		/* original instructions */
	u8 instrtype;		/* type of this instruction */
	u8 len;			/* length of original instruction */
	u16 clobbers;		/* what registers you may clobber */
};

extern struct paravirt_patch_site __parainstructions[],
	__parainstructions_end[];

#ifdef CONFIG_X86_32
#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
#define PV_FLAGS_ARG "0"
#define PV_EXTRA_CLOBBERS
#define PV_VEXTRA_CLOBBERS
#else
/* We save some registers, but all of them, that's too much. We clobber all
 * caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
#define PV_RESTORE_REGS "popq %%rdi;"
#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
#define PV_FLAGS_ARG "D"
#endif

static inline unsigned long __raw_local_save_flags(void)
{
	unsigned long f;

	asm volatile(paravirt_alt(PV_SAVE_REGS
				  PARAVIRT_CALL
				  PV_RESTORE_REGS)
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc" PV_VEXTRA_CLOBBERS);
	return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
	asm volatile(paravirt_alt(PV_SAVE_REGS
				  PARAVIRT_CALL
				  PV_RESTORE_REGS)
		     : "=a"(f)
		     : PV_FLAGS_ARG(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc" PV_EXTRA_CLOBBERS);
}

static inline void raw_local_irq_disable(void)
{
	asm volatile(paravirt_alt(PV_SAVE_REGS
				  PARAVIRT_CALL
				  PV_RESTORE_REGS)
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
}

static inline void raw_local_irq_enable(void)
{
	asm volatile(paravirt_alt(PV_SAVE_REGS
				  PARAVIRT_CALL
				  PV_RESTORE_REGS)
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
}

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long f;

	f = __raw_local_save_flags();
	raw_local_irq_disable();
	return f;
}

/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

#else  /* __ASSEMBLY__ */

#define _PVSITE(ptype, clobbers, ops, word, algn)	\
771:;						\
	ops;					\
772:;						\
	.pushsection .parainstructions,"a";	\
	 .align	algn;				\
	 word 771b;				\
	 .byte ptype;				\
	 .byte 772b-771b;			\
	 .short clobbers;			\
	.popsection
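/* Descriptive note: this is the assembler-side twin of _paravirt_alt()
   above; it emits the ops along with a matching .parainstructions
   record (address, type, length, clobbers) so that assembly call sites
   can be patched by apply_paravirt() as well. */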


#ifdef CONFIG_X86_64
#define PV_SAVE_REGS   pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
#define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
#define PARA_INDIRECT(addr)	*addr(%rip)
#else
#define PV_SAVE_REGS   pushl %eax; pushl %edi; pushl %ecx; pushl %edx
#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
#define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
#define PARA_INDIRECT(addr)	*%cs:addr
#endif

#define INTERRUPT_RETURN						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))

#define DISABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
		  PV_SAVE_REGS;						\
		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
		  PV_RESTORE_REGS;)

#define ENABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
		  PV_SAVE_REGS;						\
		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
		  PV_RESTORE_REGS;)

#define USERGS_SYSRET32							\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),	\
		  CLBR_NONE,						\
		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))

#ifdef CONFIG_X86_32
#define GET_CR0_INTO_EAX				\
	push %ecx; push %edx;				\
	call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0);	\
	pop %edx; pop %ecx

#define ENABLE_INTERRUPTS_SYSEXIT					\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
		  CLBR_NONE,						\
		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))


#else	/* !CONFIG_X86_32 */

/*
 * If swapgs is used while the userspace stack is still current,
 * there's no way to call a pvop.  The PV replacement *must* be
 * inlined, or the swapgs instruction must be trapped and emulated.
 */
#define SWAPGS_UNSAFE_STACK						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
		  swapgs)

#define SWAPGS								\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
		  PV_SAVE_REGS;						\
		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);		\
		  PV_RESTORE_REGS					\
		 )

#define GET_CR2_INTO_RCX				\
	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);	\
	movq %rax, %rcx;				\
	xorq %rax, %rax;

#define PARAVIRT_ADJUST_EXCEPTION_FRAME					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
		  CLBR_NONE,						\
		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))

#define USERGS_SYSRET64							\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
		  CLBR_NONE,						\
		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))

#define ENABLE_INTERRUPTS_SYSEXIT32					\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
		  CLBR_NONE,						\
		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
#endif	/* CONFIG_X86_32 */

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif	/* __ASM_PARAVIRT_H */