#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/page.h>

/* Bitmask of what can be clobbered: usually at least eax. */
#define CLBR_NONE 0x0
#define CLBR_EAX 0x1
#define CLBR_ECX 0x2
#define CLBR_EDX 0x4
#define CLBR_ANY 0x7
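
/*
 * Illustrative note (added, not in the original header): the CLBR_*
 * values are single bits, so a call site that may clobber both %eax
 * and %edx would pass (CLBR_EAX | CLBR_EDX), i.e. 0x5, as its clobber
 * mask; CLBR_ANY is simply all three bits set.
 */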

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/kmap_types.h>
#include <asm/desc_defs.h>

struct page;
struct thread_struct;
struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;

/* general info */
struct pv_info {
	unsigned int kernel_rpl;
	int shared_kernel_pmd;
	int paravirt_enabled;
	const char *name;
};

struct pv_init_ops {
	/*
	 * Patch may replace one of the defined code sequences with
	 * arbitrary code, subject to the same register constraints.
	 * This generally means the code is not free to clobber any
	 * registers other than EAX.  The patch function should return
	 * the number of bytes of code generated, as we nop pad the
	 * rest in generic code.
	 */
	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
			  unsigned long addr, unsigned len);

	/* Basic arch-specific setup */
	void (*arch_setup)(void);
	char *(*memory_setup)(void);
	void (*post_allocator_init)(void);

	/* Print a banner to identify the environment */
	void (*banner)(void);
};
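
/*
 * Illustrative sketch (not part of the original interface): a backend's
 * .patch hook usually falls back to paravirt_patch_default() and only
 * open-codes the handful of sites it can inline.  The "myhv_*" names
 * below are hypothetical, used purely for illustration:
 *
 *	static unsigned myhv_patch(u8 type, u16 clobbers, void *insnbuf,
 *				   unsigned long addr, unsigned len)
 *	{
 *		switch (type) {
 *		case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
 *			return paravirt_patch_insns(insnbuf, len,
 *						    myhv_cli_start,
 *						    myhv_cli_end);
 *		default:
 *			return paravirt_patch_default(type, clobbers,
 *						      insnbuf, addr, len);
 *		}
 *	}
 *
 * The return value is the number of bytes actually emitted; the caller
 * nop-pads the remainder of the site, as described above.
 */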


struct pv_lazy_ops {
	/* Set deferred update mode, used for batching operations. */
	void (*enter)(void);
	void (*leave)(void);
};

struct pv_time_ops {
	void (*time_init)(void);

	/* Get and set time of day */
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long);

	unsigned long long (*sched_clock)(void);
	unsigned long (*get_cpu_khz)(void);
};

struct pv_cpu_ops {
	/* hooks for various privileged instructions */
	unsigned long (*get_debugreg)(int regno);
	void (*set_debugreg)(int regno, unsigned long value);

	void (*clts)(void);

	unsigned long (*read_cr0)(void);
	void (*write_cr0)(unsigned long);

	unsigned long (*read_cr4_safe)(void);
	unsigned long (*read_cr4)(void);
	void (*write_cr4)(unsigned long);

	/* Segment descriptor handling */
	void (*load_tr_desc)(void);
	void (*load_gdt)(const struct desc_ptr *);
	void (*load_idt)(const struct desc_ptr *);
	void (*store_gdt)(struct desc_ptr *);
	void (*store_idt)(struct desc_ptr *);
	void (*set_ldt)(const void *desc, unsigned entries);
	unsigned long (*store_tr)(void);
	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
	void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
				const void *desc);
	void (*write_gdt_entry)(struct desc_struct *,
				int entrynum, const void *desc, int size);
	void (*write_idt_entry)(gate_desc *,
				int entrynum, const gate_desc *gate);
	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);

	void (*set_iopl_mask)(unsigned mask);

	void (*wbinvd)(void);
	void (*io_delay)(void);

	/* cpuid emulation, mostly so that caps bits can be disabled */
	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx);

	/* MSR, PMC and TSC operations.
	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
	u64 (*read_msr)(unsigned int msr, int *err);
	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);

	u64 (*read_tsc)(void);
	u64 (*read_pmc)(int counter);

	/* These two are jumped to, not actually called. */
	void (*irq_enable_syscall_ret)(void);
	void (*iret)(void);

	struct pv_lazy_ops lazy_mode;
};

struct pv_irq_ops {
	void (*init_IRQ)(void);

	/*
	 * Get/set interrupt state.  save_fl and restore_fl are only
	 * expected to use X86_EFLAGS_IF; all other bits
	 * returned from save_fl are undefined, and may be ignored by
	 * restore_fl.
	 */
	unsigned long (*save_fl)(void);
	void (*restore_fl)(unsigned long);
	void (*irq_disable)(void);
	void (*irq_enable)(void);
	void (*safe_halt)(void);
	void (*halt)(void);
};
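
/*
 * Illustrative sketch (not from the original header): per the comment
 * above, a backend's save_fl/restore_fl pair only has to preserve
 * X86_EFLAGS_IF.  A native-style implementation would look roughly
 * like this (the "myhv_" names are hypothetical):
 *
 *	static unsigned long myhv_save_fl(void)
 *	{
 *		unsigned long flags;
 *
 *		asm volatile("pushf ; pop %0"
 *			     : "=rm" (flags) : : "memory");
 *		return flags;
 *	}
 *
 *	static void myhv_restore_fl(unsigned long flags)
 *	{
 *		asm volatile("push %0 ; popf"
 *			     : : "g" (flags) : "memory", "cc");
 *	}
 */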

struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Direct APIC operations, principally for VMI.  Ideally
	 * these shouldn't be in this interface.
	 */
	void (*apic_write)(unsigned long reg, u32 v);
	void (*apic_write_atomic)(unsigned long reg, u32 v);
	u32 (*apic_read)(unsigned long reg);
	void (*setup_boot_clock)(void);
	void (*setup_secondary_clock)(void);

	void (*startup_ipi_hook)(int phys_apicid,
				 unsigned long start_eip,
				 unsigned long start_esp);
#endif
};

struct pv_mmu_ops {
	/*
	 * Called before/after init_mm pagetable setup. setup_start
	 * may reset %cr3, and may pre-install parts of the pagetable;
	 * pagetable setup is expected to preserve any existing
	 * mapping.
	 */
	void (*pagetable_setup_start)(pgd_t *pgd_base);
	void (*pagetable_setup_done)(pgd_t *pgd_base);

	unsigned long (*read_cr2)(void);
	void (*write_cr2)(unsigned long);

	unsigned long (*read_cr3)(void);
	void (*write_cr3)(unsigned long);

	/*
	 * Hooks for intercepting the creation/use/destruction of an
	 * mm_struct.
	 */
	void (*activate_mm)(struct mm_struct *prev,
			    struct mm_struct *next);
	void (*dup_mmap)(struct mm_struct *oldmm,
			 struct mm_struct *mm);
	void (*exit_mmap)(struct mm_struct *mm);


	/* TLB operations */
	void (*flush_tlb_user)(void);
	void (*flush_tlb_kernel)(void);
	void (*flush_tlb_single)(unsigned long addr);
	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va);

	/* Hooks for allocating/releasing pagetable pages */
	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pd)(u32 pfn);
	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
	void (*release_pt)(u32 pfn);
	void (*release_pd)(u32 pfn);

	/* Pagetable manipulation functions */
	void (*set_pte)(pte_t *ptep, pte_t pteval);
	void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval);
	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
	void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pte_update_defer)(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep);

#ifdef CONFIG_X86_PAE
	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte);
	void (*set_pud)(pud_t *pudp, pud_t pudval);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pmd_clear)(pmd_t *pmdp);

	unsigned long long (*pte_val)(pte_t);
	unsigned long long (*pmd_val)(pmd_t);
	unsigned long long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long long pte);
	pmd_t (*make_pmd)(unsigned long long pmd);
	pgd_t (*make_pgd)(unsigned long long pgd);
#else
	unsigned long (*pte_val)(pte_t);
	unsigned long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long pte);
	pgd_t (*make_pgd)(unsigned long pgd);
#endif

#ifdef CONFIG_HIGHPTE
	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
#endif

	struct pv_lazy_ops lazy_mode;
};

/* This contains all the paravirt structures: we get a convenient
 * number for each function from its offset, which we then use to
 * indicate what to patch. */
struct paravirt_patch_template
{
	struct pv_init_ops pv_init_ops;
	struct pv_time_ops pv_time_ops;
	struct pv_cpu_ops pv_cpu_ops;
	struct pv_irq_ops pv_irq_ops;
	struct pv_apic_ops pv_apic_ops;
	struct pv_mmu_ops pv_mmu_ops;
};

extern struct pv_info pv_info;
extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;

#define PARAVIRT_PATCH(x)					\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
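
/*
 * Worked example (added for clarity, not in the original header):
 * PARAVIRT_PATCH(pv_irq_ops.irq_disable) is just the index of that
 * function pointer counted in void*-sized slots from the start of
 * paravirt_patch_template.  That small integer is what gets recorded
 * in .parainstructions and later switch()ed on by the backend's patch
 * hook.
 */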

#define paravirt_type(op)				\
	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
	[paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber)		\
	[paravirt_clobber] "i" (clobber)

/*
 * Generate some code, and mark it as patchable by the
 * apply_paravirt() alternate instruction patcher.
 */
#define _paravirt_alt(insn_string, type, clobber)	\
	"771:\n\t" insn_string "\n" "772:\n"		\
	".pushsection .parainstructions,\"a\"\n"	\
	"  .long 771b\n"				\
	"  .byte " type "\n"				\
	"  .byte 772b-771b\n"				\
	"  .short " clobber "\n"			\
	".popsection\n"
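
/*
 * Illustrative expansion (assumed layout, for reading convenience): for
 * one patchable call site the macro above emits roughly
 *
 *	771:	<insn_string>
 *	772:
 *	.pushsection .parainstructions, "a"
 *	  .long 771b		(address of the site)
 *	  .byte <type>		(PARAVIRT_PATCH() slot number)
 *	  .byte 772b-771b	(bytes available for patching)
 *	  .short <clobber>	(CLBR_* mask the site tolerates)
 *	.popsection
 *
 * which matches the struct paravirt_patch_site layout defined near the
 * end of this header.
 */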

/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string)					\
	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")

unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len);
unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
			    unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len);

unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end);

int paravirt_disable_iospace(void);

/*
 * This generates an indirect call based on the operation type number.
 * The type number, computed in PARAVIRT_PATCH, is derived from the
 * offset into the paravirt_patch_template structure, and can therefore be
 * freely converted back into a structure offset.
 */
#define PARAVIRT_CALL	"call *%[paravirt_opptr];"

/*
 * These macros are intended to wrap calls through one of the paravirt
 * ops structs, so that they can be later identified and patched at
 * runtime.
 *
 * Normally, a call to a pv_op function is a simple indirect call:
 * (paravirt_ops.operations)(args...).
 *
 * Unfortunately, this is a relatively slow operation for modern CPUs,
 * because it cannot necessarily determine what the destination
 * address is.  In this case, the address is a runtime constant, so at
 * the very least we can patch the call to be a simple direct call, or
 * ideally, patch an inline implementation into the callsite.  (Direct
 * calls are essentially free, because the call and return addresses
 * are completely predictable.)
 *
 * These macros rely on the standard gcc "regparm(3)" calling
 * convention, in which the first three arguments are placed in %eax,
 * %edx, %ecx (in that order), and the remaining arguments are placed
 * on the stack.  All caller-save registers (eax,edx,ecx) are expected
 * to be modified (either clobbered or used for return values).
 *
 * The call instruction itself is marked by placing its start address
 * and size into the .parainstructions section, so that
 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
 * appropriate patching under the control of the backend pv_init_ops
 * implementation.
 *
 * Unfortunately there's no way to get gcc to generate the args setup
 * for the call, and then allow the call itself to be generated by an
 * inline asm.  Because of this, we must do the complete arg setup and
 * return value handling from within these macros.  This is fairly
 * cumbersome.
 *
 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
 * It could be extended to more arguments, but there would be little
 * to be gained from that.  For each number of arguments, there are
 * the two VCALL and CALL variants for void and non-void functions.
 *
 * When there is a return value, the invoker of the macro must specify
 * the return type.  The macro then uses sizeof() on that type to
 * determine whether it's a 32 or 64 bit value, and places the return
 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
 * 64-bit).
 *
 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
 * in low,high order.
 *
 * Small structures are passed and returned in registers.  The macro
 * calling convention can't directly deal with this, so the wrapper
 * functions must handle it themselves.
 *
 * These PVOP_* macros are only defined within this header.  This
 * means that all uses must be wrapped in inline functions.  This also
 * makes sure the incoming and outgoing types are always correct.
 */
#define __PVOP_CALL(rettype, op, pre, post, ...)			\
	({								\
		rettype __ret;						\
		unsigned long __eax, __edx, __ecx;			\
		if (sizeof(rettype) > sizeof(unsigned long)) {		\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
		} else {						\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)__eax;				\
		}							\
		__ret;							\
	})
#define __PVOP_VCALL(op, pre, post, ...)				\
	({								\
		unsigned long __eax, __edx, __ecx;			\
		asm volatile(pre					\
			     paravirt_alt(PARAVIRT_CALL)		\
			     post					\
			     : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \
			     : paravirt_type(op),			\
			       paravirt_clobber(CLBR_ANY),		\
			       ##__VA_ARGS__				\
			     : "memory", "cc");				\
	})

#define PVOP_CALL0(rettype, op)						\
	__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op)							\
	__PVOP_VCALL(op, "", "")

#define PVOP_CALL1(rettype, op, arg1)					\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)))
#define PVOP_VCALL1(op, arg1)						\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)))

#define PVOP_CALL2(rettype, op, arg1, arg2)				\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
#define PVOP_VCALL2(op, arg1, arg2)					\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))

#define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)),		\
		    "1"((u32)(arg2)), "2"((u32)(arg3)))
#define PVOP_VCALL3(op, arg1, arg2, arg3)				\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)),	\
		     "2"((u32)(arg3)))

#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op,					\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op,						\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))

static inline int paravirt_enabled(void)
{
	return pv_info.paravirt_enabled;
}

static inline void load_sp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
}

#define ARCH_SETUP			pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
	return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}

static inline int set_wallclock(unsigned long nowtime)
{
	return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}

static inline void (*choose_time_init(void))(void)
{
	return pv_time_ops.time_init;
}

/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
	PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}

static inline void clts(void)
{
	PVOP_VCALL0(pv_cpu_ops.clts);
}

static inline unsigned long read_cr0(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}

static inline void write_cr0(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}

static inline unsigned long read_cr2(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}

static inline void write_cr2(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}

static inline void write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

static inline unsigned long read_cr4(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

static inline void write_cr4(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}

static inline void raw_safe_halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void wbinvd(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}

#define get_kernel_rpl()  (pv_info.kernel_rpl)

static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}

/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr,val1,val2) do {		\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	val1 = (u32)_l;				\
	val2 = _l >> 32;			\
} while(0)

#define wrmsr(msr,val1,val2) do {		\
	paravirt_write_msr(msr, val1, val2);	\
} while(0)

#define rdmsrl(msr,val) do {			\
	int _err;				\
	val = paravirt_read_msr(msr, &_err);	\
} while(0)

#define wrmsrl(msr,val)		wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
#define wrmsr_safe(msr,a,b)	paravirt_write_msr(msr, a, b)

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({			\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	(*a) = (u32)_l;				\
	(*b) = _l >> 32;			\
	_err; })
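
/*
 * Usage sketch (illustrative, not from the original header): these
 * macros are drop-in replacements for the native MSR accessors, e.g.
 * for a hypothetical MSR_EXAMPLE constant:
 *
 *	u32 lo, hi;
 *	int err;
 *
 *	rdmsr(MSR_EXAMPLE, lo, hi);
 *	wrmsr(MSR_EXAMPLE, lo, hi);
 *	err = rdmsr_safe(MSR_EXAMPLE, &lo, &hi);
 */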


static inline u64 paravirt_read_tsc(void)
{
	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}

#define rdtscl(low) do {			\
	u64 _l = paravirt_read_tsc();		\
	low = (int)_l;				\
} while(0)

#define rdtscll(val) (val = paravirt_read_tsc())

static inline unsigned long long paravirt_sched_clock(void)
{
	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

static inline unsigned long long paravirt_read_pmc(int counter)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}

#define rdpmc(counter,low,high) do {		\
	u64 _l = paravirt_read_pmc(counter);	\
	low = (u32)_l;				\
	high = _l >> 32;			\
} while(0)

static inline void load_TR_desc(void)
{
	PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr)	((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}

static inline void write_ldt_entry(struct desc_struct *dt, int entry,
				   const void *desc)
{
	PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
}

static inline void write_gdt_entry(struct desc_struct *dt, int entry,
				   void *desc, int type)
{
	PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
}

static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
	PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
}
static inline void set_iopl_mask(unsigned mask)
{
	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
	pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
#endif
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Basic functions accessing APICs.
 */
static inline void apic_write(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}

static inline void apic_write_atomic(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}

static inline u32 apic_read(unsigned long reg)
{
	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}

static inline void setup_boot_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}

static inline void setup_secondary_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif

static inline void paravirt_post_allocator_init(void)
{
	if (pv_init_ops.post_allocator_init)
		(*pv_init_ops.post_allocator_init)();
}

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_start)(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_done)(base);
}

#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
				    unsigned long start_esp)
{
	PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
		    phys_apicid, start_eip, start_esp);
}
#endif

static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
	PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}

static inline void __flush_tlb(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				    unsigned long va)
{
	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}

static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
}

static inline void paravirt_alloc_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
}

static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
					   unsigned start, unsigned count)
{
	PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
}

#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
	unsigned long ret;
	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
	return (void *)ret;
}
#endif

static inline void pte_update(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}

static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}

#ifdef CONFIG_X86_PAE
static inline pte_t __pte(unsigned long long val)
{
	unsigned long long ret = PVOP_CALL2(unsigned long long,
					    pv_mmu_ops.make_pte,
					    val, val >> 32);
	return (pte_t) { ret, ret >> 32 };
}

static inline pmd_t __pmd(unsigned long long val)
{
	return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
				    val, val >> 32) };
}

static inline pgd_t __pgd(unsigned long long val)
{
	return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
				    val, val >> 32) };
}

static inline unsigned long long pte_val(pte_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
			  x.pte_low, x.pte_high);
}

static inline unsigned long long pmd_val(pmd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
			  x.pmd, x.pmd >> 32);
}

static inline unsigned long long pgd_val(pgd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
			  x.pgd, x.pgd >> 32);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
}

static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
		    pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
		    pmdval.pmd, pmdval.pmd >> 32);
}

static inline void set_pud(pud_t *pudp, pud_t pudval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
		    pudval.pgd.pgd, pudval.pgd.pgd >> 32);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}

#else  /* !CONFIG_X86_PAE */

static inline pte_t __pte(unsigned long val)
{
	return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
}

static inline pgd_t __pgd(unsigned long val)
{
	return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
}

static inline unsigned long pte_val(pte_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
}

static inline unsigned long pgd_val(pgd_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
}
#endif	/* CONFIG_X86_PAE */

/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
	PARAVIRT_LAZY_NONE,
	PARAVIRT_LAZY_MMU,
	PARAVIRT_LAZY_CPU,
};

enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_enter_lazy_cpu(void);
void paravirt_leave_lazy_cpu(void);
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
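
/*
 * Usage sketch (illustrative, not from the original header): lazy MMU
 * mode lets a hypervisor batch a run of page-table updates and flush
 * them in one go.  Core code brackets the updates with the helpers
 * defined below, roughly:
 *
 *	arch_enter_lazy_mmu_mode();
 *	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++)
 *		set_pte_at(mm, addr, ptep, pte);
 *	arch_leave_lazy_mmu_mode();
 */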

#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_cpu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
		arch_leave_lazy_cpu_mode();
		arch_enter_lazy_cpu_mode();
	}
}


#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}
}

void _paravirt_nop(void);
#define paravirt_nop	((void *)_paravirt_nop)

/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
	u8 *instr; 		/* original instructions */
	u8 instrtype;		/* type of this instruction */
	u8 len;			/* length of original instruction */
	u16 clobbers;		/* what registers you may clobber */
};

extern struct paravirt_patch_site __parainstructions[],
	__parainstructions_end[];
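
/*
 * Illustrative sketch (simplified; not a copy of the real code): the
 * alternative-instruction patcher walks this table at boot, roughly
 * along these lines:
 *
 *	struct paravirt_patch_site *p;
 *	char insnbuf[16];
 *
 *	for (p = __parainstructions; p < __parainstructions_end; p++) {
 *		unsigned used;
 *
 *		used = pv_init_ops.patch(p->instrtype, p->clobbers,
 *					 insnbuf, (unsigned long)p->instr,
 *					 p->len);
 *		add_nops(insnbuf + used, p->len - used);
 *		text_poke(p->instr, insnbuf, p->len);
 *	}
 *
 * add_nops()/text_poke() here stand in for "pad the rest with nops and
 * copy the result over the original instructions".
 */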

static inline unsigned long __raw_local_save_flags(void)
{
	unsigned long f;

	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
	return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : "0"(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long f;

	f = __raw_local_save_flags();
	raw_local_irq_disable();
	return f;
}

/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

#else  /* __ASSEMBLY__ */

#define PARA_PATCH(struct, off)	((PARAVIRT_PATCH_##struct + (off)) / 4)

#define PARA_SITE(ptype, clobbers, ops)		\
771:;						\
	ops;					\
772:;						\
	.pushsection .parainstructions,"a";	\
	 .long 771b;				\
	 .byte ptype;				\
	 .byte 772b-771b;			\
	 .short clobbers;			\
	.popsection

#define INTERRUPT_RETURN						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)

#define DISABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_disable;		\
		  popl %edx; popl %ecx; popl %eax)			\

#define ENABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_enable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS_SYSCALL_RET					\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
		  CLBR_NONE,						\
		  jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)

#define GET_CR0_INTO_EAX			\
	push %ecx; push %edx;			\
	call *pv_cpu_ops+PV_CPU_read_cr0;	\
	pop %edx; pop %ecx

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif	/* __ASM_PARAVIRT_H */