#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/page.h>

/* Bitmask of what can be clobbered: usually at least eax. */
#define CLBR_NONE 0x0
#define CLBR_EAX 0x1
#define CLBR_ECX 0x2
#define CLBR_EDX 0x4
#define CLBR_ANY 0x7

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/kmap_types.h>

struct page;
struct thread_struct;
struct Xgt_desc_struct;
struct tss_struct;
struct mm_struct;
struct desc_struct;

/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
	PARAVIRT_LAZY_NONE = 0,
	PARAVIRT_LAZY_MMU = 1,
	PARAVIRT_LAZY_CPU = 2,
	PARAVIRT_LAZY_FLUSH = 3,
};

/* general info */
struct pv_info {
	unsigned int kernel_rpl;
	int shared_kernel_pmd;
	int paravirt_enabled;
	const char *name;
};

struct pv_init_ops {
	/*
	 * Patch may replace one of the defined code sequences with
	 * arbitrary code, subject to the same register constraints.
	 * This generally means the code is not free to clobber any
	 * registers other than EAX.  The patch function should return
	 * the number of bytes of code generated, as we nop pad the
	 * rest in generic code.
	 */
	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
			  unsigned long addr, unsigned len);

	/* Basic arch-specific setup */
	void (*arch_setup)(void);
	char *(*memory_setup)(void);
	void (*post_allocator_init)(void);

	/* Print a banner to identify the environment */
	void (*banner)(void);
};
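
/*
 * Illustrative sketch (not part of this interface): a backend's patch
 * hook can simply defer to the generic patcher declared later in this
 * file.  "example_patch" and "example_init_ops" are hypothetical names
 * used only for illustration:
 *
 *	static unsigned example_patch(u8 type, u16 clobbers, void *insnbuf,
 *				      unsigned long addr, unsigned len)
 *	{
 *		return paravirt_patch_default(type, clobbers, insnbuf,
 *					      addr, len);
 *	}
 *
 *	struct pv_init_ops example_init_ops = {
 *		.patch = example_patch,
 *	};
 */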


struct pv_misc_ops {
	/* Set deferred update mode, used for batching operations. */
	void (*set_lazy_mode)(enum paravirt_lazy_mode mode);
};

struct pv_time_ops {
	void (*time_init)(void);

	/* Get and set time of day */
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long);

	unsigned long long (*sched_clock)(void);
	unsigned long (*get_cpu_khz)(void);
};

struct pv_cpu_ops {
	/* hooks for various privileged instructions */
	unsigned long (*get_debugreg)(int regno);
	void (*set_debugreg)(int regno, unsigned long value);

	void (*clts)(void);

	unsigned long (*read_cr0)(void);
	void (*write_cr0)(unsigned long);

	unsigned long (*read_cr4_safe)(void);
	unsigned long (*read_cr4)(void);
	void (*write_cr4)(unsigned long);

	/* Segment descriptor handling */
	void (*load_tr_desc)(void);
	void (*load_gdt)(const struct Xgt_desc_struct *);
	void (*load_idt)(const struct Xgt_desc_struct *);
	void (*store_gdt)(struct Xgt_desc_struct *);
	void (*store_idt)(struct Xgt_desc_struct *);
	void (*set_ldt)(const void *desc, unsigned entries);
	unsigned long (*store_tr)(void);
	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
	void (*write_ldt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*write_gdt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*write_idt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t);

	void (*set_iopl_mask)(unsigned mask);

	void (*wbinvd)(void);
	void (*io_delay)(void);

	/* cpuid emulation, mostly so that caps bits can be disabled */
	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx);

	/* MSR, PMC and TSC operations.
	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
	u64 (*read_msr)(unsigned int msr, int *err);
	int (*write_msr)(unsigned int msr, u64 val);

	u64 (*read_tsc)(void);
	u64 (*read_pmc)(void);

	/* These two are jumped to, not actually called. */
	void (*irq_enable_sysexit)(void);
	void (*iret)(void);
};

struct pv_irq_ops {
	void (*init_IRQ)(void);

	/*
	 * Get/set interrupt state.  save_fl and restore_fl are only
	 * expected to use X86_EFLAGS_IF; all other bits
	 * returned from save_fl are undefined, and may be ignored by
	 * restore_fl.
	 */
	unsigned long (*save_fl)(void);
	void (*restore_fl)(unsigned long);
	void (*irq_disable)(void);
	void (*irq_enable)(void);
	void (*safe_halt)(void);
	void (*halt)(void);
};

struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Direct APIC operations, principally for VMI.  Ideally
	 * these shouldn't be in this interface.
	 */
	void (*apic_write)(unsigned long reg, unsigned long v);
	void (*apic_write_atomic)(unsigned long reg, unsigned long v);
	unsigned long (*apic_read)(unsigned long reg);
	void (*setup_boot_clock)(void);
	void (*setup_secondary_clock)(void);

	void (*startup_ipi_hook)(int phys_apicid,
				 unsigned long start_eip,
				 unsigned long start_esp);
#endif
};

struct pv_mmu_ops {
	/*
	 * Called before/after init_mm pagetable setup. setup_start
	 * may reset %cr3, and may pre-install parts of the pagetable;
	 * pagetable setup is expected to preserve any existing
	 * mapping.
	 */
	void (*pagetable_setup_start)(pgd_t *pgd_base);
	void (*pagetable_setup_done)(pgd_t *pgd_base);

	unsigned long (*read_cr2)(void);
	void (*write_cr2)(unsigned long);

	unsigned long (*read_cr3)(void);
	void (*write_cr3)(unsigned long);

	/*
	 * Hooks for intercepting the creation/use/destruction of an
	 * mm_struct.
	 */
	void (*activate_mm)(struct mm_struct *prev,
			    struct mm_struct *next);
	void (*dup_mmap)(struct mm_struct *oldmm,
			 struct mm_struct *mm);
	void (*exit_mmap)(struct mm_struct *mm);

	/* TLB operations */
	void (*flush_tlb_user)(void);
	void (*flush_tlb_kernel)(void);
	void (*flush_tlb_single)(unsigned long addr);
	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va);

	/* Hooks for allocating/releasing pagetable pages */
	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pd)(u32 pfn);
	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
	void (*release_pt)(u32 pfn);
	void (*release_pd)(u32 pfn);

	/* Pagetable manipulation functions */
	void (*set_pte)(pte_t *ptep, pte_t pteval);
	void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval);
	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
	void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pte_update_defer)(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep);

#ifdef CONFIG_X86_PAE
	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte);
	void (*set_pud)(pud_t *pudp, pud_t pudval);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pmd_clear)(pmd_t *pmdp);

	unsigned long long (*pte_val)(pte_t);
	unsigned long long (*pmd_val)(pmd_t);
	unsigned long long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long long pte);
	pmd_t (*make_pmd)(unsigned long long pmd);
	pgd_t (*make_pgd)(unsigned long long pgd);
#else
	unsigned long (*pte_val)(pte_t);
	unsigned long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long pte);
	pgd_t (*make_pgd)(unsigned long pgd);
#endif

#ifdef CONFIG_HIGHPTE
	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
#endif
};

/* This contains all the paravirt structures: we get a convenient
 * number for each function using the offset which we use to indicate
 * what to patch. */
struct paravirt_patch_template
{
	struct pv_init_ops pv_init_ops;
	struct pv_misc_ops pv_misc_ops;
	struct pv_time_ops pv_time_ops;
	struct pv_cpu_ops pv_cpu_ops;
	struct pv_irq_ops pv_irq_ops;
	struct pv_apic_ops pv_apic_ops;
	struct pv_mmu_ops pv_mmu_ops;
};

extern struct pv_info pv_info;
extern struct pv_init_ops pv_init_ops;
extern struct pv_misc_ops pv_misc_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;
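
/*
 * Illustrative sketch (not part of this interface): a backend selects
 * itself at early boot by overwriting these structures before any of
 * the wrappers below run.  The "example_*" names are hypothetical:
 *
 *	pv_info = example_info;
 *	pv_init_ops = example_init_ops;
 *	pv_cpu_ops = example_cpu_ops;
 *	pv_irq_ops = example_irq_ops;
 *	pv_mmu_ops = example_mmu_ops;
 */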

#define PARAVIRT_PATCH(x)					\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))

#define paravirt_type(op)				\
	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
	[paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber)		\
	[paravirt_clobber] "i" (clobber)

/*
 * Generate some code, and mark it as patchable by the
 * apply_paravirt() alternate instruction patcher.
 */
#define _paravirt_alt(insn_string, type, clobber)	\
	"771:\n\t" insn_string "\n" "772:\n"		\
	".pushsection .parainstructions,\"a\"\n"	\
	"  .long 771b\n"				\
	"  .byte " type "\n"				\
	"  .byte 772b-771b\n"				\
	"  .short " clobber "\n"			\
	".popsection\n"

/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string)					\
	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")

unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len);
unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
			    unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len);

unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end);

int paravirt_disable_iospace(void);

/*
 * This generates an indirect call based on the operation type number.
 * The type number, computed in PARAVIRT_PATCH, is derived from the
 * offset into the paravirt_patch_template structure, and can therefore be
 * freely converted back into a structure offset.
 */
#define PARAVIRT_CALL	"call *%[paravirt_opptr];"

/*
 * These macros are intended to wrap calls through one of the paravirt
 * ops structs, so that they can be later identified and patched at
 * runtime.
 *
 * Normally, a call to a pv_op function is a simple indirect call:
 * (paravirt_ops.operations)(args...).
 *
 * Unfortunately, this is a relatively slow operation for modern CPUs,
 * because it cannot necessarily determine what the destination
 * address is.  In this case, the address is a runtime constant, so at
 * the very least we can patch the call to be a simple direct call, or
 * ideally, patch an inline implementation into the callsite.  (Direct
 * calls are essentially free, because the call and return addresses
 * are completely predictable.)
 *
 * These macros rely on the standard gcc "regparm(3)" calling
 * convention, in which the first three arguments are placed in %eax,
 * %edx, %ecx (in that order), and the remaining arguments are placed
 * on the stack.  All caller-save registers (eax,edx,ecx) are expected
 * to be modified (either clobbered or used for return values).
 *
 * The call instruction itself is marked by placing its start address
 * and size into the .parainstructions section, so that
 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
 * appropriate patching under the control of the backend pv_init_ops
 * implementation.
 *
 * Unfortunately there's no way to get gcc to generate the args setup
 * for the call, and then allow the call itself to be generated by an
 * inline asm.  Because of this, we must do the complete arg setup and
 * return value handling from within these macros.  This is fairly
 * cumbersome.
 *
 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
 * It could be extended to more arguments, but there would be little
 * to be gained from that.  For each number of arguments, there are
 * the two VCALL and CALL variants for void and non-void functions.
 *
 * When there is a return value, the invoker of the macro must specify
 * the return type.  The macro then uses sizeof() on that type to
 * determine whether it's a 32 or 64 bit value, and places the return
 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
 * 64-bit).
 *
 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
 * in low,high order.
 *
 * Small structures are passed and returned in registers.  The macro
 * calling convention can't directly deal with this, so the wrapper
 * functions must do this.
 *
 * These PVOP_* macros are only defined within this header.  This
 * means that all uses must be wrapped in inline functions.  This also
 * makes sure the incoming and outgoing types are always correct.
 */
#define __PVOP_CALL(rettype, op, pre, post, ...)			\
	({								\
		rettype __ret;						\
		unsigned long __eax, __edx, __ecx;			\
		if (sizeof(rettype) > sizeof(unsigned long)) {		\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
		} else {						\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)__eax;				\
		}							\
		__ret;							\
	})
#define __PVOP_VCALL(op, pre, post, ...)				\
	({								\
		unsigned long __eax, __edx, __ecx;			\
		asm volatile(pre					\
			     paravirt_alt(PARAVIRT_CALL)		\
			     post					\
			     : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \
			     : paravirt_type(op),			\
			       paravirt_clobber(CLBR_ANY),		\
			       ##__VA_ARGS__				\
			     : "memory", "cc");				\
	})

#define PVOP_CALL0(rettype, op)						\
	__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op)							\
	__PVOP_VCALL(op, "", "")

#define PVOP_CALL1(rettype, op, arg1)					\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)))
#define PVOP_VCALL1(op, arg1)						\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)))

#define PVOP_CALL2(rettype, op, arg1, arg2)				\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
#define PVOP_VCALL2(op, arg1, arg2)					\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))

#define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)),		\
		    "1"((u32)(arg2)), "2"((u32)(arg3)))
#define PVOP_VCALL3(op, arg1, arg2, arg3)				\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)),	\
		     "2"((u32)(arg3)))

#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op,					\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op,						\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))

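/*
 * Illustrative sketch (nothing new is defined here): a typical wrapper
 * built from these macros is one of the inline functions found later
 * in this file, e.g.
 *
 *	static inline unsigned long read_cr2(void)
 *	{
 *		return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
 *	}
 *
 * which roughly emits a single "call *pv_mmu_ops.read_cr2" recorded in
 * .parainstructions, with %eax holding the return value and %ecx/%edx
 * treated as clobbered.
 */
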
static inline int paravirt_enabled(void)
{
	return pv_info.paravirt_enabled;
}

static inline void load_esp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread);
}

#define ARCH_SETUP			pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
	return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}

static inline int set_wallclock(unsigned long nowtime)
{
	return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}

static inline void (*choose_time_init(void))(void)
{
	return pv_time_ops.time_init;
}

/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
	PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}

static inline void clts(void)
{
	PVOP_VCALL0(pv_cpu_ops.clts);
}

static inline unsigned long read_cr0(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}

static inline void write_cr0(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}

static inline unsigned long read_cr2(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}

static inline void write_cr2(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}

static inline void write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

static inline unsigned long read_cr4(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

static inline void write_cr4(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}

static inline void raw_safe_halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void halt(void)
{
	PVOP_VCALL0(pv_irq_ops.halt);
}

static inline void wbinvd(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}

#define get_kernel_rpl()  (pv_info.kernel_rpl)

static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}

/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr,val1,val2) do {		\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	val1 = (u32)_l;				\
	val2 = _l >> 32;			\
} while(0)

#define wrmsr(msr,val1,val2) do {		\
	paravirt_write_msr(msr, val1, val2);	\
} while(0)

#define rdmsrl(msr,val) do {			\
	int _err;				\
	val = paravirt_read_msr(msr, &_err);	\
} while(0)

#define wrmsrl(msr,val)		wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
#define wrmsr_safe(msr,a,b)	paravirt_write_msr(msr, a, b)

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({			\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	(*a) = (u32)_l;				\
	(*b) = _l >> 32;			\
	_err; })

static inline u64 paravirt_read_tsc(void)
{
	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}

#define rdtscl(low) do {			\
	u64 _l = paravirt_read_tsc();		\
	low = (int)_l;				\
} while(0)

#define rdtscll(val) (val = paravirt_read_tsc())

static inline unsigned long long paravirt_sched_clock(void)
{
	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

static inline unsigned long long paravirt_read_pmc(int counter)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}

#define rdpmc(counter,low,high) do {		\
	u64 _l = paravirt_read_pmc(counter);	\
	low = (u32)_l;				\
	high = _l >> 32;			\
} while(0)

static inline void load_TR_desc(void)
{
	PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct Xgt_desc_struct *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct Xgt_desc_struct *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct Xgt_desc_struct *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct Xgt_desc_struct *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr)	((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}
static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
}
static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high);
}
static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high);
}
static inline void set_iopl_mask(unsigned mask)
{
	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
	pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
#endif
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Basic functions accessing APICs.
 */
static inline void apic_write(unsigned long reg, unsigned long v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}

static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}

static inline unsigned long apic_read(unsigned long reg)
{
	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}

static inline void setup_boot_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}

static inline void setup_secondary_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif

static inline void paravirt_post_allocator_init(void)
{
	if (pv_init_ops.post_allocator_init)
		(*pv_init_ops.post_allocator_init)();
}

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_start)(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_done)(base);
}

#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
				    unsigned long start_esp)
{
	PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
		    phys_apicid, start_eip, start_esp);
}
#endif

static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
	PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}

static inline void __flush_tlb(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				    unsigned long va)
{
	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}

static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
}

static inline void paravirt_alloc_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
}

static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
					   unsigned start, unsigned count)
{
	PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
}

#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
	unsigned long ret;
	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
	return (void *)ret;
}
#endif

static inline void pte_update(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}

static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}

#ifdef CONFIG_X86_PAE
static inline pte_t __pte(unsigned long long val)
{
	unsigned long long ret = PVOP_CALL2(unsigned long long,
					    pv_mmu_ops.make_pte,
					    val, val >> 32);
	return (pte_t) { ret, ret >> 32 };
}

static inline pmd_t __pmd(unsigned long long val)
{
	return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
				    val, val >> 32) };
}

static inline pgd_t __pgd(unsigned long long val)
{
	return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
				    val, val >> 32) };
}

static inline unsigned long long pte_val(pte_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
			  x.pte_low, x.pte_high);
}

static inline unsigned long long pmd_val(pmd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
			  x.pmd, x.pmd >> 32);
}

static inline unsigned long long pgd_val(pgd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
			  x.pgd, x.pgd >> 32);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
}

static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
		    pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
		    pmdval.pmd, pmdval.pmd >> 32);
}

static inline void set_pud(pud_t *pudp, pud_t pudval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
		    pudval.pgd.pgd, pudval.pgd.pgd >> 32);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}

#else  /* !CONFIG_X86_PAE */

static inline pte_t __pte(unsigned long val)
{
	return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
}

static inline pgd_t __pgd(unsigned long val)
{
	return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
}

static inline unsigned long pte_val(pte_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
}

static inline unsigned long pgd_val(pgd_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
}
#endif	/* CONFIG_X86_PAE */

#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_CPU);
}

static inline void arch_leave_lazy_cpu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_NONE);
}

static inline void arch_flush_lazy_cpu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_FLUSH);
}

#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_MMU);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_NONE);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	PVOP_VCALL1(pv_misc_ops.set_lazy_mode, PARAVIRT_LAZY_FLUSH);
}
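
/*
 * Illustrative sketch (not part of this interface): callers batch a
 * run of pagetable updates between enter/leave so a hypervisor
 * backend can coalesce them into fewer hypercalls, roughly:
 *
 *	arch_enter_lazy_mmu_mode();
 *	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++)
 *		set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep));
 *	arch_leave_lazy_mmu_mode();
 */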

void _paravirt_nop(void);
#define paravirt_nop	((void *)_paravirt_nop)

/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
	u8 *instr; 		/* original instructions */
	u8 instrtype;		/* type of this instruction */
	u8 len;			/* length of original instruction */
	u16 clobbers;		/* what registers you may clobber */
};

extern struct paravirt_patch_site __parainstructions[],
	__parainstructions_end[];
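
/*
 * Illustrative sketch (an assumption about the generic patcher in
 * arch/i386/kernel/alternative.c, not a definition made here):
 * apply_paravirt() walks these records roughly like
 *
 *	struct paravirt_patch_site *p;
 *	for (p = __parainstructions; p < __parainstructions_end; p++)
 *		used = pv_init_ops.patch(p->instrtype, p->clobbers,
 *					 p->instr, (unsigned long)p->instr,
 *					 p->len);
 *
 * nop-padding whatever part of the p->len bytes the patch hook did
 * not fill in.
 */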

static inline unsigned long __raw_local_save_flags(void)
{
	unsigned long f;

	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
	return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : "0"(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long f;

	f = __raw_local_save_flags();
	raw_local_irq_disable();
	return f;
}

#define CLI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_cli_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_cli_type]", "%c[paravirt_clobber]")

#define STI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_sti_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_sti_type]", "%c[paravirt_clobber]")

#define CLI_STI_CLOBBERS , "%eax"
#define CLI_STI_INPUT_ARGS						\
	,								\
	[paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)),		\
	[paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable),		\
	[paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)),		\
	[paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable),		\
	paravirt_clobber(CLBR_EAX)

/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

#else  /* __ASSEMBLY__ */

#define PARA_PATCH(struct, off)	((PARAVIRT_PATCH_##struct + (off)) / 4)

#define PARA_SITE(ptype, clobbers, ops)		\
771:;						\
	ops;					\
772:;						\
	.pushsection .parainstructions,"a";	\
	 .long 771b;				\
	 .byte ptype;				\
	 .byte 772b-771b;			\
	 .short clobbers;			\
	.popsection

#define INTERRUPT_RETURN						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)

#define DISABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_disable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_enable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS_SYSEXIT					       \
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
		  jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)

#define GET_CR0_INTO_EAX			\
	push %ecx; push %edx;			\
	call *pv_cpu_ops+PV_CPU_read_cr0;	\
	pop %edx; pop %ecx

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif	/* __ASM_PARAVIRT_H */