#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/page.h>

/* Bitmask of what can be clobbered: usually at least eax. */
#define CLBR_NONE 0x0
#define CLBR_EAX 0x1
#define CLBR_ECX 0x2
#define CLBR_EDX 0x4
#define CLBR_ANY 0x7

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/kmap_types.h>
#include <asm/desc_defs.h>

struct page;
struct thread_struct;
struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;

/* general info */
struct pv_info {
	unsigned int kernel_rpl;
	int shared_kernel_pmd;
	int paravirt_enabled;
	const char *name;
};

struct pv_init_ops {
	/*
	 * Patch may replace one of the defined code sequences with
	 * arbitrary code, subject to the same register constraints.
	 * This generally means the code is not free to clobber any
	 * registers other than EAX.  The patch function should return
	 * the number of bytes of code generated, as we nop pad the
	 * rest in generic code.
	 */
	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
			  unsigned long addr, unsigned len);

	/* Basic arch-specific setup */
	void (*arch_setup)(void);
	char *(*memory_setup)(void);
	void (*post_allocator_init)(void);

	/* Print a banner to identify the environment */
	void (*banner)(void);
};
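
/*
 * Illustrative sketch (not part of this header): a backend's patch hook
 * might inline the few operations it cares about and fall back to
 * paravirt_patch_default() for the rest; returning 0 from
 * paravirt_patch_nop() makes the generic code nop out the whole site.
 * example_patch is a hypothetical name.
 */
#if 0
static unsigned example_patch(u8 type, u16 clobbers, void *insnbuf,
			      unsigned long addr, unsigned len)
{
	switch (type) {
	case PARAVIRT_PATCH(pv_cpu_ops.io_delay):
		/* this backend needs no port 0x80 delay: nop the call out */
		return paravirt_patch_nop();
	default:
		return paravirt_patch_default(type, clobbers, insnbuf,
					      addr, len);
	}
}
#endif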


struct pv_lazy_ops {
	/* Set deferred update mode, used for batching operations. */
	void (*enter)(void);
	void (*leave)(void);
};

struct pv_time_ops {
	void (*time_init)(void);

	/* Get and set time of day */
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long);

	unsigned long long (*sched_clock)(void);
	unsigned long (*get_cpu_khz)(void);
};

struct pv_cpu_ops {
	/* hooks for various privileged instructions */
	unsigned long (*get_debugreg)(int regno);
	void (*set_debugreg)(int regno, unsigned long value);

	void (*clts)(void);

	unsigned long (*read_cr0)(void);
	void (*write_cr0)(unsigned long);

	unsigned long (*read_cr4_safe)(void);
	unsigned long (*read_cr4)(void);
	void (*write_cr4)(unsigned long);

	/* Segment descriptor handling */
	void (*load_tr_desc)(void);
	void (*load_gdt)(const struct desc_ptr *);
	void (*load_idt)(const struct desc_ptr *);
	void (*store_gdt)(struct desc_ptr *);
	void (*store_idt)(struct desc_ptr *);
	void (*set_ldt)(const void *desc, unsigned entries);
	unsigned long (*store_tr)(void);
	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
	void (*write_ldt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*write_gdt_entry)(struct desc_struct *,
				int entrynum, const void *desc, int size);
	void (*write_idt_entry)(gate_desc *,
				int entrynum, const gate_desc *gate);
	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);

	void (*set_iopl_mask)(unsigned mask);

	void (*wbinvd)(void);
	void (*io_delay)(void);

	/* cpuid emulation, mostly so that caps bits can be disabled */
	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx);

	/* MSR, PMC and TSC operations.
	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
	u64 (*read_msr)(unsigned int msr, int *err);
	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);

	u64 (*read_tsc)(void);
	u64 (*read_pmc)(int counter);

	/* These two are jumped to, not actually called. */
	void (*irq_enable_syscall_ret)(void);
	void (*iret)(void);

	struct pv_lazy_ops lazy_mode;
};

struct pv_irq_ops {
	void (*init_IRQ)(void);

	/*
	 * Get/set interrupt state.  save_fl and restore_fl are only
	 * expected to use X86_EFLAGS_IF; all other bits
	 * returned from save_fl are undefined, and may be ignored by
	 * restore_fl.
	 */
	unsigned long (*save_fl)(void);
	void (*restore_fl)(unsigned long);
	void (*irq_disable)(void);
	void (*irq_enable)(void);
	void (*safe_halt)(void);
	void (*halt)(void);
};
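
/*
 * Illustrative sketch: the native versions of these hooks are just
 * pushf/popf (see irqflags.h), which trivially meets the contract above
 * because the whole flags word, including X86_EFLAGS_IF, round-trips.
 * The example_* names below are hypothetical stand-ins shown only to
 * make the contract concrete.
 */
#if 0
static unsigned long example_native_save_fl(void)
{
	unsigned long flags;

	asm volatile("pushfl ; popl %0" : "=g" (flags));
	return flags;
}

static void example_native_restore_fl(unsigned long flags)
{
	asm volatile("pushl %0 ; popfl" : : "g" (flags) : "memory", "cc");
}
#endif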

struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Direct APIC operations, principally for VMI.  Ideally
	 * these shouldn't be in this interface.
	 */
	void (*apic_write)(unsigned long reg, u32 v);
	void (*apic_write_atomic)(unsigned long reg, u32 v);
	u32 (*apic_read)(unsigned long reg);
	void (*setup_boot_clock)(void);
	void (*setup_secondary_clock)(void);

	void (*startup_ipi_hook)(int phys_apicid,
				 unsigned long start_eip,
				 unsigned long start_esp);
#endif
};

struct pv_mmu_ops {
	/*
	 * Called before/after init_mm pagetable setup. setup_start
	 * may reset %cr3, and may pre-install parts of the pagetable;
	 * pagetable setup is expected to preserve any existing
	 * mapping.
	 */
	void (*pagetable_setup_start)(pgd_t *pgd_base);
	void (*pagetable_setup_done)(pgd_t *pgd_base);

	unsigned long (*read_cr2)(void);
	void (*write_cr2)(unsigned long);

	unsigned long (*read_cr3)(void);
	void (*write_cr3)(unsigned long);

	/*
	 * Hooks for intercepting the creation/use/destruction of an
	 * mm_struct.
	 */
	void (*activate_mm)(struct mm_struct *prev,
			    struct mm_struct *next);
	void (*dup_mmap)(struct mm_struct *oldmm,
			 struct mm_struct *mm);
	void (*exit_mmap)(struct mm_struct *mm);


	/* TLB operations */
	void (*flush_tlb_user)(void);
	void (*flush_tlb_kernel)(void);
	void (*flush_tlb_single)(unsigned long addr);
	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va);

	/* Hooks for allocating/releasing pagetable pages */
	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pd)(u32 pfn);
	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
	void (*release_pt)(u32 pfn);
	void (*release_pd)(u32 pfn);

	/* Pagetable manipulation functions */
	void (*set_pte)(pte_t *ptep, pte_t pteval);
	void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval);
	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
	void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pte_update_defer)(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep);

#ifdef CONFIG_X86_PAE
	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte);
	void (*set_pud)(pud_t *pudp, pud_t pudval);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pmd_clear)(pmd_t *pmdp);

	unsigned long long (*pte_val)(pte_t);
	unsigned long long (*pmd_val)(pmd_t);
	unsigned long long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long long pte);
	pmd_t (*make_pmd)(unsigned long long pmd);
	pgd_t (*make_pgd)(unsigned long long pgd);
#else
	unsigned long (*pte_val)(pte_t);
	unsigned long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long pte);
	pgd_t (*make_pgd)(unsigned long pgd);
#endif

#ifdef CONFIG_HIGHPTE
	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
#endif

	struct pv_lazy_ops lazy_mode;
};

/* This contains all the paravirt structures: we get a convenient
 * number for each function using the offset which we use to indicate
 * what to patch. */
struct paravirt_patch_template
{
	struct pv_init_ops pv_init_ops;
	struct pv_time_ops pv_time_ops;
	struct pv_cpu_ops pv_cpu_ops;
	struct pv_irq_ops pv_irq_ops;
	struct pv_apic_ops pv_apic_ops;
	struct pv_mmu_ops pv_mmu_ops;
};

extern struct pv_info pv_info;
extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;

#define PARAVIRT_PATCH(x)					\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
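
/*
 * Illustrative sketch: the type number can be converted back into the
 * currently installed function pointer by building a template on the
 * stack and indexing it as an array of words, which mirrors what the
 * generic patching code does.  example_get_call_destination is a
 * hypothetical name.
 */
#if 0
static void *example_get_call_destination(u8 type)
{
	struct paravirt_patch_template tmpl = {
		.pv_init_ops = pv_init_ops,
		.pv_time_ops = pv_time_ops,
		.pv_cpu_ops = pv_cpu_ops,
		.pv_irq_ops = pv_irq_ops,
		.pv_apic_ops = pv_apic_ops,
		.pv_mmu_ops = pv_mmu_ops,
	};
	return *((void **)&tmpl + type);
}
#endif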

#define paravirt_type(op)				\
	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
	[paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber)		\
	[paravirt_clobber] "i" (clobber)

/*
 * Generate some code, and mark it as patchable by the
 * apply_paravirt() alternate instruction patcher.
 */
#define _paravirt_alt(insn_string, type, clobber)	\
	"771:\n\t" insn_string "\n" "772:\n"		\
	".pushsection .parainstructions,\"a\"\n"	\
	"  .long 771b\n"				\
	"  .byte " type "\n"				\
	"  .byte 772b-771b\n"				\
	"  .short " clobber "\n"			\
	".popsection\n"

/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string)					\
	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
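
/*
 * For reference: each expansion of paravirt_alt() emits the instruction
 * sequence between the local labels 771 and 772, plus one record in the
 * .parainstructions section containing the site's address, the operation
 * type number, the site's length in bytes and the clobber mask; this is
 * the same layout as struct paravirt_patch_site below.
 */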

unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len);
unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
			    unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len);

unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end);

int paravirt_disable_iospace(void);

/*
 * This generates an indirect call based on the operation type number.
 * The type number, computed in PARAVIRT_PATCH, is derived from the
 * offset into the paravirt_patch_template structure, and can therefore be
 * freely converted back into a structure offset.
 */
#define PARAVIRT_CALL	"call *%[paravirt_opptr];"

/*
 * These macros are intended to wrap calls through one of the paravirt
 * ops structs, so that they can be later identified and patched at
 * runtime.
 *
 * Normally, a call to a pv_op function is a simple indirect call:
 * (paravirt_ops.operations)(args...).
 *
 * Unfortunately, this is a relatively slow operation for modern CPUs,
 * because it cannot necessarily determine what the destination
 * address is.  In this case, the address is a runtime constant, so at
 * the very least we can patch the call to be a simple direct call, or
 * ideally, patch an inline implementation into the callsite.  (Direct
 * calls are essentially free, because the call and return addresses
 * are completely predictable.)
 *
 * These macros rely on the standard gcc "regparm(3)" calling
 * convention, in which the first three arguments are placed in %eax,
 * %edx, %ecx (in that order), and the remaining arguments are placed
 * on the stack.  All caller-save registers (eax,edx,ecx) are expected
 * to be modified (either clobbered or used for return values).
 *
 * The call instruction itself is marked by placing its start address
 * and size into the .parainstructions section, so that
 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
 * appropriate patching under the control of the backend pv_init_ops
 * implementation.
 *
 * Unfortunately there's no way to get gcc to generate the args setup
 * for the call, and then allow the call itself to be generated by an
 * inline asm.  Because of this, we must do the complete arg setup and
 * return value handling from within these macros.  This is fairly
 * cumbersome.
 *
 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
 * It could be extended to more arguments, but there would be little
 * to be gained from that.  For each number of arguments, there are
 * the two VCALL and CALL variants for void and non-void functions.
 *
 * When there is a return value, the invoker of the macro must specify
 * the return type.  The macro then uses sizeof() on that type to
 * determine whether it's a 32- or 64-bit value, and places the return
 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
 * 64-bit).
 *
 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
 * in low,high order.
 *
 * Small structures are passed and returned in registers.  The macro
 * calling convention can't directly deal with this, so the wrapper
 * functions must do this.
 *
 * These PVOP_* macros are only defined within this header.  This
 * means that all uses must be wrapped in inline functions.  This also
 * makes sure the incoming and outgoing types are always correct.
 */
#define __PVOP_CALL(rettype, op, pre, post, ...)			\
	({								\
		rettype __ret;						\
		unsigned long __eax, __edx, __ecx;			\
		if (sizeof(rettype) > sizeof(unsigned long)) {		\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
		} else {						\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)__eax;				\
		}							\
		__ret;							\
	})
#define __PVOP_VCALL(op, pre, post, ...)				\
	({								\
		unsigned long __eax, __edx, __ecx;			\
		asm volatile(pre					\
			     paravirt_alt(PARAVIRT_CALL)		\
			     post					\
			     : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \
			     : paravirt_type(op),			\
			       paravirt_clobber(CLBR_ANY),		\
			       ##__VA_ARGS__				\
			     : "memory", "cc");				\
	})

#define PVOP_CALL0(rettype, op)						\
	__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op)							\
	__PVOP_VCALL(op, "", "")

#define PVOP_CALL1(rettype, op, arg1)					\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)))
#define PVOP_VCALL1(op, arg1)						\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)))

#define PVOP_CALL2(rettype, op, arg1, arg2)				\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
#define PVOP_VCALL2(op, arg1, arg2)					\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))

#define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)),		\
		    "1"((u32)(arg2)), "2"((u32)(arg3)))
#define PVOP_VCALL3(op, arg1, arg2, arg3)				\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)),	\
		     "2"((u32)(arg3)))

#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op,					\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op,						\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))

static inline int paravirt_enabled(void)
{
	return pv_info.paravirt_enabled;
}

static inline void load_sp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
}

#define ARCH_SETUP			pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
	return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}

static inline int set_wallclock(unsigned long nowtime)
{
	return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}

static inline void (*choose_time_init(void))(void)
{
	return pv_time_ops.time_init;
}

/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
	PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}

static inline void clts(void)
{
	PVOP_VCALL0(pv_cpu_ops.clts);
}

static inline unsigned long read_cr0(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}

static inline void write_cr0(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}

static inline unsigned long read_cr2(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}

static inline void write_cr2(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}

static inline void write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

static inline unsigned long read_cr4(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

static inline void write_cr4(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}

static inline void raw_safe_halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void halt(void)
{
	PVOP_VCALL0(pv_irq_ops.halt);
}

static inline void wbinvd(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}

#define get_kernel_rpl()  (pv_info.kernel_rpl)

static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}

/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr,val1,val2) do {		\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	val1 = (u32)_l;				\
	val2 = _l >> 32;			\
} while(0)

#define wrmsr(msr,val1,val2) do {		\
	paravirt_write_msr(msr, val1, val2);	\
} while(0)

#define rdmsrl(msr,val) do {			\
	int _err;				\
	val = paravirt_read_msr(msr, &_err);	\
} while(0)

#define wrmsrl(msr,val)		wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
#define wrmsr_safe(msr,a,b)	paravirt_write_msr(msr, a, b)

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({			\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	(*a) = (u32)_l;				\
	(*b) = _l >> 32;			\
	_err; })


static inline u64 paravirt_read_tsc(void)
{
	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}

#define rdtscl(low) do {			\
	u64 _l = paravirt_read_tsc();		\
	low = (int)_l;				\
} while(0)

#define rdtscll(val) (val = paravirt_read_tsc())

static inline unsigned long long paravirt_sched_clock(void)
{
	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

static inline unsigned long long paravirt_read_pmc(int counter)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}

#define rdpmc(counter,low,high) do {		\
	u64 _l = paravirt_read_pmc(counter);	\
	low = (u32)_l;				\
	high = _l >> 32;			\
} while(0)

static inline void load_TR_desc(void)
{
	PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr)	((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}
static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
}

static inline void write_gdt_entry(struct desc_struct *dt, int entry,
				   void *desc, int type)
{
	PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
}

static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
	PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
}
static inline void set_iopl_mask(unsigned mask)
{
	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
	pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
#endif
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Basic functions accessing APICs.
 */
static inline void apic_write(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}

static inline void apic_write_atomic(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}

static inline u32 apic_read(unsigned long reg)
{
	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}

static inline void setup_boot_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}

static inline void setup_secondary_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif

static inline void paravirt_post_allocator_init(void)
{
	if (pv_init_ops.post_allocator_init)
		(*pv_init_ops.post_allocator_init)();
}

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_start)(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_done)(base);
}

#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
				    unsigned long start_esp)
{
	PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
		    phys_apicid, start_eip, start_esp);
}
#endif

static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
	PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}

static inline void __flush_tlb(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				    unsigned long va)
{
	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}

static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
}

static inline void paravirt_alloc_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
}

static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
					   unsigned start, unsigned count)
{
	PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
}

#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
	unsigned long ret;
	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
	return (void *)ret;
}
#endif

static inline void pte_update(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}

static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}

#ifdef CONFIG_X86_PAE
static inline pte_t __pte(unsigned long long val)
{
	unsigned long long ret = PVOP_CALL2(unsigned long long,
					    pv_mmu_ops.make_pte,
					    val, val >> 32);
	return (pte_t) { ret, ret >> 32 };
}

static inline pmd_t __pmd(unsigned long long val)
{
	return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
				    val, val >> 32) };
}

static inline pgd_t __pgd(unsigned long long val)
{
	return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
				    val, val >> 32) };
}

static inline unsigned long long pte_val(pte_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
			  x.pte_low, x.pte_high);
}

static inline unsigned long long pmd_val(pmd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
			  x.pmd, x.pmd >> 32);
}

static inline unsigned long long pgd_val(pgd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
			  x.pgd, x.pgd >> 32);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
}

static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
		    pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
		    pmdval.pmd, pmdval.pmd >> 32);
}

static inline void set_pud(pud_t *pudp, pud_t pudval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
		    pudval.pgd.pgd, pudval.pgd.pgd >> 32);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}

#else  /* !CONFIG_X86_PAE */

static inline pte_t __pte(unsigned long val)
{
	return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
}

static inline pgd_t __pgd(unsigned long val)
{
	return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
}

static inline unsigned long pte_val(pte_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
}

static inline unsigned long pgd_val(pgd_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
}
#endif	/* CONFIG_X86_PAE */

/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
	PARAVIRT_LAZY_NONE,
	PARAVIRT_LAZY_MMU,
	PARAVIRT_LAZY_CPU,
};

enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_enter_lazy_cpu(void);
void paravirt_leave_lazy_cpu(void);
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
void paravirt_leave_lazy(enum paravirt_lazy_mode mode);

#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_cpu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
		arch_leave_lazy_cpu_mode();
		arch_enter_lazy_cpu_mode();
	}
}


#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}
}
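
/*
 * Illustrative sketch: the lazy hooks bracket a batch of pagetable
 * updates so a hypervisor backend can queue them and apply them in one
 * go when the batch is left.  example_set_range is a hypothetical
 * caller; the real callers are the generic mm paths that already use
 * arch_enter/leave_lazy_mmu_mode().
 */
#if 0
static void example_set_range(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t *end, pte_t val)
{
	arch_enter_lazy_mmu_mode();
	for (; ptep < end; ptep++, addr += PAGE_SIZE)
		set_pte_at(mm, addr, ptep, val);
	arch_leave_lazy_mmu_mode();	/* backend flushes the queued updates */
}
#endif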

void _paravirt_nop(void);
#define paravirt_nop	((void *)_paravirt_nop)

/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
	u8 *instr; 		/* original instructions */
	u8 instrtype;		/* type of this instruction */
	u8 len;			/* length of original instruction */
	u16 clobbers;		/* what registers you may clobber */
};

extern struct paravirt_patch_site __parainstructions[],
	__parainstructions_end[];
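
/*
 * Illustrative sketch of how this table is consumed: for each recorded
 * site the patcher hands the original instructions to pv_init_ops.patch
 * and nop-pads whatever the backend did not fill in, which mirrors
 * apply_paravirt() in alternative.c.  example_apply, example_add_nops
 * and example_text_poke are hypothetical stand-ins for the helpers there.
 */
#if 0
static void example_apply(struct paravirt_patch_site *start,
			  struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insnbuf[32];

	for (p = start; p < end; p++) {
		unsigned used;

		memcpy(insnbuf, p->instr, p->len);
		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
					 (unsigned long)p->instr, p->len);
		example_add_nops(insnbuf + used, p->len - used);
		example_text_poke(p->instr, insnbuf, p->len);
	}
}
#endif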

static inline unsigned long __raw_local_save_flags(void)
{
	unsigned long f;

	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
	return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : "0"(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long f;

	f = __raw_local_save_flags();
	raw_local_irq_disable();
	return f;
}

#define CLI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_cli_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_cli_type]", "%c[paravirt_clobber]")

#define STI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_sti_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_sti_type]", "%c[paravirt_clobber]")

#define CLI_STI_CLOBBERS , "%eax"
#define CLI_STI_INPUT_ARGS						\
	,								\
	[paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)),		\
	[paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable),		\
	[paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)),		\
	[paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable),		\
	paravirt_clobber(CLBR_EAX)

/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

#else  /* __ASSEMBLY__ */

#define PARA_PATCH(struct, off)	((PARAVIRT_PATCH_##struct + (off)) / 4)

#define PARA_SITE(ptype, clobbers, ops)		\
771:;						\
	ops;					\
772:;						\
	.pushsection .parainstructions,"a";	\
	 .long 771b;				\
	 .byte ptype;				\
	 .byte 772b-771b;			\
	 .short clobbers;			\
	.popsection

#define INTERRUPT_RETURN						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)

#define DISABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_disable;		\
		  popl %edx; popl %ecx; popl %eax)			\

#define ENABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_enable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS_SYSCALL_RET					\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
		  CLBR_NONE,						\
		  jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)

#define GET_CR0_INTO_EAX			\
	push %ecx; push %edx;			\
	call *pv_cpu_ops+PV_CPU_read_cr0;	\
	pop %edx; pop %ecx

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif	/* __ASM_PARAVIRT_H */