#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#ifdef CONFIG_PARAVIRT
#include <asm/page.h>

/* Bitmask of what can be clobbered: usually at least eax. */
#define CLBR_NONE 0x0
#define CLBR_EAX 0x1
#define CLBR_ECX 0x2
#define CLBR_EDX 0x4
#define CLBR_ANY 0x7

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
#include <asm/kmap_types.h>

struct page;
struct thread_struct;
struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;

/* general info */
struct pv_info {
	unsigned int kernel_rpl;
	int shared_kernel_pmd;
	int paravirt_enabled;
	const char *name;
};
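
/*
 * Illustrative only, kept under #if 0: a minimal sketch of how a backend
 * might describe itself through pv_info.  The field values and the
 * "example-hypervisor" name are assumptions, not taken from any real
 * backend.
 */
#if 0
static const struct pv_info example_pv_info = {
	.kernel_rpl		= 1,	/* guest kernel runs in ring 1 */
	.shared_kernel_pmd	= 1,
	.paravirt_enabled	= 1,
	.name			= "example-hypervisor",
};
#endif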

struct pv_init_ops {
	/*
	 * Patch may replace one of the defined code sequences with
	 * arbitrary code, subject to the same register constraints.
	 * This generally means the code is not free to clobber any
	 * registers other than EAX.  The patch function should return
	 * the number of bytes of code generated, as we nop pad the
	 * rest in generic code.
	 */
	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
			  unsigned long addr, unsigned len);

	/* Basic arch-specific setup */
	void (*arch_setup)(void);
	char *(*memory_setup)(void);
	void (*post_allocator_init)(void);

	/* Print a banner to identify the environment */
	void (*banner)(void);
};


struct pv_lazy_ops {
	/* Set deferred update mode, used for batching operations. */
	void (*enter)(void);
	void (*leave)(void);
};

struct pv_time_ops {
	void (*time_init)(void);

	/* Get and set time of day */
	unsigned long (*get_wallclock)(void);
	int (*set_wallclock)(unsigned long);

	unsigned long long (*sched_clock)(void);
	unsigned long (*get_cpu_khz)(void);
};

struct pv_cpu_ops {
	/* hooks for various privileged instructions */
	unsigned long (*get_debugreg)(int regno);
	void (*set_debugreg)(int regno, unsigned long value);

	void (*clts)(void);

	unsigned long (*read_cr0)(void);
	void (*write_cr0)(unsigned long);

	unsigned long (*read_cr4_safe)(void);
	unsigned long (*read_cr4)(void);
	void (*write_cr4)(unsigned long);

	/* Segment descriptor handling */
	void (*load_tr_desc)(void);
	void (*load_gdt)(const struct desc_ptr *);
	void (*load_idt)(const struct desc_ptr *);
	void (*store_gdt)(struct desc_ptr *);
	void (*store_idt)(struct desc_ptr *);
	void (*set_ldt)(const void *desc, unsigned entries);
	unsigned long (*store_tr)(void);
	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
	void (*write_ldt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*write_gdt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*write_idt_entry)(struct desc_struct *,
				int entrynum, u32 low, u32 high);
	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);

	void (*set_iopl_mask)(unsigned mask);

	void (*wbinvd)(void);
	void (*io_delay)(void);

	/* cpuid emulation, mostly so that caps bits can be disabled */
	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx);

	/* MSR, PMC and TSC operations.
	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
	u64 (*read_msr)(unsigned int msr, int *err);
	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);

	u64 (*read_tsc)(void);
	u64 (*read_pmc)(int counter);

	/* These two are jumped to, not actually called. */
	void (*irq_enable_syscall_ret)(void);
	void (*iret)(void);

	struct pv_lazy_ops lazy_mode;
};

struct pv_irq_ops {
	void (*init_IRQ)(void);

	/*
	 * Get/set interrupt state.  save_fl and restore_fl are only
	 * expected to use X86_EFLAGS_IF; all other bits
	 * returned from save_fl are undefined, and may be ignored by
	 * restore_fl.
	 */
	unsigned long (*save_fl)(void);
	void (*restore_fl)(unsigned long);
	void (*irq_disable)(void);
	void (*irq_enable)(void);
	void (*safe_halt)(void);
	void (*halt)(void);
};

struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Direct APIC operations, principally for VMI.  Ideally
	 * these shouldn't be in this interface.
	 */
	void (*apic_write)(unsigned long reg, u32 v);
	void (*apic_write_atomic)(unsigned long reg, u32 v);
	u32 (*apic_read)(unsigned long reg);
	void (*setup_boot_clock)(void);
	void (*setup_secondary_clock)(void);

	void (*startup_ipi_hook)(int phys_apicid,
				 unsigned long start_eip,
				 unsigned long start_esp);
#endif
};

struct pv_mmu_ops {
	/*
	 * Called before/after init_mm pagetable setup. setup_start
	 * may reset %cr3, and may pre-install parts of the pagetable;
	 * pagetable setup is expected to preserve any existing
	 * mapping.
	 */
	void (*pagetable_setup_start)(pgd_t *pgd_base);
	void (*pagetable_setup_done)(pgd_t *pgd_base);

	unsigned long (*read_cr2)(void);
	void (*write_cr2)(unsigned long);

	unsigned long (*read_cr3)(void);
	void (*write_cr3)(unsigned long);

	/*
	 * Hooks for intercepting the creation/use/destruction of an
	 * mm_struct.
	 */
	void (*activate_mm)(struct mm_struct *prev,
			    struct mm_struct *next);
	void (*dup_mmap)(struct mm_struct *oldmm,
			 struct mm_struct *mm);
	void (*exit_mmap)(struct mm_struct *mm);


	/* TLB operations */
	void (*flush_tlb_user)(void);
	void (*flush_tlb_kernel)(void);
	void (*flush_tlb_single)(unsigned long addr);
	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va);

	/* Hooks for allocating/releasing pagetable pages */
	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
	void (*alloc_pd)(u32 pfn);
	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
	void (*release_pt)(u32 pfn);
	void (*release_pd)(u32 pfn);

	/* Pagetable manipulation functions */
	void (*set_pte)(pte_t *ptep, pte_t pteval);
	void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval);
	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
	void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pte_update_defer)(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep);

#ifdef CONFIG_X86_PAE
	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte);
	void (*set_pud)(pud_t *pudp, pud_t pudval);
	void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
	void (*pmd_clear)(pmd_t *pmdp);

	unsigned long long (*pte_val)(pte_t);
	unsigned long long (*pmd_val)(pmd_t);
	unsigned long long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long long pte);
	pmd_t (*make_pmd)(unsigned long long pmd);
	pgd_t (*make_pgd)(unsigned long long pgd);
#else
	unsigned long (*pte_val)(pte_t);
	unsigned long (*pgd_val)(pgd_t);

	pte_t (*make_pte)(unsigned long pte);
	pgd_t (*make_pgd)(unsigned long pgd);
#endif

#ifdef CONFIG_HIGHPTE
	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
#endif

	struct pv_lazy_ops lazy_mode;
};

/* This contains all the paravirt structures: we get a convenient
 * number for each function using the offset which we use to indicate
 * what to patch. */
struct paravirt_patch_template
{
	struct pv_init_ops pv_init_ops;
	struct pv_time_ops pv_time_ops;
	struct pv_cpu_ops pv_cpu_ops;
	struct pv_irq_ops pv_irq_ops;
	struct pv_apic_ops pv_apic_ops;
	struct pv_mmu_ops pv_mmu_ops;
};

extern struct pv_info pv_info;
extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;

#define PARAVIRT_PATCH(x)					\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))

#define paravirt_type(op)				\
	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
	[paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber)		\
	[paravirt_clobber] "i" (clobber)

/*
 * Generate some code, and mark it as patchable by the
 * apply_paravirt() alternate instruction patcher.
 */
#define _paravirt_alt(insn_string, type, clobber)	\
	"771:\n\t" insn_string "\n" "772:\n"		\
	".pushsection .parainstructions,\"a\"\n"	\
	"  .long 771b\n"				\
	"  .byte " type "\n"				\
	"  .byte 772b-771b\n"				\
	"  .short " clobber "\n"			\
	".popsection\n"

/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string)					\
	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")

unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len);
unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
			    unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len);

unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end);

int paravirt_disable_iospace(void);
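
/*
 * Illustrative only, kept under #if 0: a sketch of a backend's
 * pv_init_ops.patch hook.  Sites the backend knows how to inline are
 * rewritten with paravirt_patch_insns(); everything else falls back to
 * paravirt_patch_default().  The function name and the choice of the
 * irq_disable site are assumptions.
 */
#if 0
static unsigned example_patch(u8 type, u16 clobbers, void *insnbuf,
			      unsigned long addr, unsigned len)
{
	static const char cli_insn[] = "\xfa";	/* one-byte "cli" */

	switch (type) {
	case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
		/* a bare "cli" is enough at irq_disable call sites */
		return paravirt_patch_insns(insnbuf, len,
					    cli_insn, cli_insn + 1);
	default:
		return paravirt_patch_default(type, clobbers, insnbuf,
					      addr, len);
	}
}
#endif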

/*
 * This generates an indirect call based on the operation type number.
 * The type number, computed in PARAVIRT_PATCH, is derived from the
 * offset into the paravirt_patch_template structure, and can therefore be
 * freely converted back into a structure offset.
 */
#define PARAVIRT_CALL	"call *%[paravirt_opptr];"

/*
 * These macros are intended to wrap calls through one of the paravirt
 * ops structs, so that they can be later identified and patched at
 * runtime.
 *
 * Normally, a call to a pv_op function is a simple indirect call:
 * (paravirt_ops.operations)(args...).
 *
 * Unfortunately, this is a relatively slow operation for modern CPUs,
 * because it cannot necessarily determine what the destination
 * address is.  In this case, the address is a runtime constant, so at
 * the very least we can patch the call to be a simple direct call, or
 * ideally, patch an inline implementation into the callsite.  (Direct
 * calls are essentially free, because the call and return addresses
 * are completely predictable.)
 *
 * These macros rely on the standard gcc "regparm(3)" calling
 * convention, in which the first three arguments are placed in %eax,
 * %edx, %ecx (in that order), and the remaining arguments are placed
 * on the stack.  All caller-save registers (eax,edx,ecx) are expected
 * to be modified (either clobbered or used for return values).
 *
 * The call instruction itself is marked by placing its start address
 * and size into the .parainstructions section, so that
 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
 * appropriate patching under the control of the backend pv_init_ops
 * implementation.
 *
 * Unfortunately there's no way to get gcc to generate the args setup
 * for the call, and then allow the call itself to be generated by an
 * inline asm.  Because of this, we must do the complete arg setup and
 * return value handling from within these macros.  This is fairly
 * cumbersome.
 *
 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
 * It could be extended to more arguments, but there would be little
 * to be gained from that.  For each number of arguments, there are
 * the two VCALL and CALL variants for void and non-void functions.
 *
 * When there is a return value, the invoker of the macro must specify
 * the return type.  The macro then uses sizeof() on that type to
 * determine whether it's a 32 or 64 bit value, and places the return
 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
 * 64-bit).
 *
 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
 * in low,high order.
 *
 * Small structures are passed and returned in registers.  The macro
 * calling convention can't directly deal with this, so the wrapper
 * functions must do this.
 *
 * These PVOP_* macros are only defined within this header.  This
 * means that all uses must be wrapped in inline functions.  This also
 * makes sure the incoming and outgoing types are always correct.
 */
#define __PVOP_CALL(rettype, op, pre, post, ...)			\
	({								\
		rettype __ret;						\
		unsigned long __eax, __edx, __ecx;			\
		if (sizeof(rettype) > sizeof(unsigned long)) {		\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
		} else {						\
			asm volatile(pre				\
				     paravirt_alt(PARAVIRT_CALL)	\
				     post				\
				     : "=a" (__eax), "=d" (__edx),	\
				       "=c" (__ecx)			\
				     : paravirt_type(op),		\
				       paravirt_clobber(CLBR_ANY),	\
				       ##__VA_ARGS__			\
				     : "memory", "cc");			\
			__ret = (rettype)__eax;				\
		}							\
		__ret;							\
	})
#define __PVOP_VCALL(op, pre, post, ...)				\
	({								\
		unsigned long __eax, __edx, __ecx;			\
		asm volatile(pre					\
			     paravirt_alt(PARAVIRT_CALL)		\
			     post					\
			     : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \
			     : paravirt_type(op),			\
			       paravirt_clobber(CLBR_ANY),		\
			       ##__VA_ARGS__				\
			     : "memory", "cc");				\
	})

#define PVOP_CALL0(rettype, op)						\
	__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op)							\
	__PVOP_VCALL(op, "", "")

#define PVOP_CALL1(rettype, op, arg1)					\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)))
#define PVOP_VCALL1(op, arg1)						\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)))

#define PVOP_CALL2(rettype, op, arg1, arg2)				\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
#define PVOP_VCALL2(op, arg1, arg2)					\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))

#define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
	__PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)),		\
		    "1"((u32)(arg2)), "2"((u32)(arg3)))
#define PVOP_VCALL3(op, arg1, arg2, arg3)				\
	__PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)),	\
		     "2"((u32)(arg3)))

#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
	__PVOP_CALL(rettype, op,					\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
	__PVOP_VCALL(op,						\
		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))

static inline int paravirt_enabled(void)
{
	return pv_info.paravirt_enabled;
}

static inline void load_sp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
}

#define ARCH_SETUP			pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
	return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}

static inline int set_wallclock(unsigned long nowtime)
{
	return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}

static inline void (*choose_time_init(void))(void)
{
	return pv_time_ops.time_init;
}

/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
static inline unsigned long paravirt_get_debugreg(int reg)
{
	return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
	PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}

static inline void clts(void)
{
	PVOP_VCALL0(pv_cpu_ops.clts);
}

static inline unsigned long read_cr0(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}

static inline void write_cr0(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}

static inline unsigned long read_cr2(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}

static inline void write_cr2(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
{
	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}

static inline void write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}

static inline unsigned long read_cr4(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}

static inline void write_cr4(unsigned long x)
{
	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}

static inline void raw_safe_halt(void)
{
	PVOP_VCALL0(pv_irq_ops.safe_halt);
}

static inline void halt(void)
{
	PVOP_VCALL0(pv_irq_ops.halt);
}

static inline void wbinvd(void)
{
	PVOP_VCALL0(pv_cpu_ops.wbinvd);
}

#define get_kernel_rpl()  (pv_info.kernel_rpl)

static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}

/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr,val1,val2) do {		\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	val1 = (u32)_l;				\
	val2 = _l >> 32;			\
} while(0)

#define wrmsr(msr,val1,val2) do {		\
	paravirt_write_msr(msr, val1, val2);	\
} while(0)

#define rdmsrl(msr,val) do {			\
	int _err;				\
	val = paravirt_read_msr(msr, &_err);	\
} while(0)

#define wrmsrl(msr,val)		wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
#define wrmsr_safe(msr,a,b)	paravirt_write_msr(msr, a, b)

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({			\
	int _err;				\
	u64 _l = paravirt_read_msr(msr, &_err);	\
	(*a) = (u32)_l;				\
	(*b) = _l >> 32;			\
	_err; })
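
/*
 * Usage sketch, kept under #if 0 and illustrative only: read an MSR with
 * fault handling and write the same value back.  The MSR number, variable
 * names and function name are assumptions.
 */
#if 0
static inline void example_msr_roundtrip(void)
{
	u32 lo, hi;

	if (rdmsr_safe(0x1b, &lo, &hi) == 0)	/* 0x1b is only an example */
		wrmsr(0x1b, lo, hi);
}
#endif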


static inline u64 paravirt_read_tsc(void)
{
	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}

#define rdtscl(low) do {			\
	u64 _l = paravirt_read_tsc();		\
	low = (int)_l;				\
} while(0)

#define rdtscll(val) (val = paravirt_read_tsc())

static inline unsigned long long paravirt_sched_clock(void)
{
	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

static inline unsigned long long paravirt_read_pmc(int counter)
{
	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}

#define rdpmc(counter,low,high) do {		\
	u64 _l = paravirt_read_pmc(counter);	\
	low = (u32)_l;				\
	high = _l >> 32;			\
} while(0)

static inline void load_TR_desc(void)
{
	PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct desc_ptr *dtr)
{
	PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
	return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr)	((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}
static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
}
static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high);
}
static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high)
{
	PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high);
}
static inline void set_iopl_mask(unsigned mask)
{
	PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
	pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
	pv_cpu_ops.io_delay();
#endif
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Basic functions accessing APICs.
 */
static inline void apic_write(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}

static inline void apic_write_atomic(unsigned long reg, u32 v)
{
	PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}

static inline u32 apic_read(unsigned long reg)
{
	return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}

static inline void setup_boot_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}

static inline void setup_secondary_clock(void)
{
	PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif

static inline void paravirt_post_allocator_init(void)
{
	if (pv_init_ops.post_allocator_init)
		(*pv_init_ops.post_allocator_init)();
}

static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_start)(base);
}

static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	(*pv_mmu_ops.pagetable_setup_done)(base);
}

#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
				    unsigned long start_esp)
{
	PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
		    phys_apicid, start_eip, start_esp);
}
#endif

static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
	PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}

static inline void __flush_tlb(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
				    unsigned long va)
{
	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}

static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
	PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
}

static inline void paravirt_alloc_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
}

static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
					   unsigned start, unsigned count)
{
	PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pd(unsigned pfn)
{
	PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
}

#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
	unsigned long ret;
	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
	return (void *)ret;
}
#endif

static inline void pte_update(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}

static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}

#ifdef CONFIG_X86_PAE
static inline pte_t __pte(unsigned long long val)
{
	unsigned long long ret = PVOP_CALL2(unsigned long long,
					    pv_mmu_ops.make_pte,
					    val, val >> 32);
	return (pte_t) { ret, ret >> 32 };
}

static inline pmd_t __pmd(unsigned long long val)
{
	return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
				    val, val >> 32) };
}

static inline pgd_t __pgd(unsigned long long val)
{
	return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
				    val, val >> 32) };
}

static inline unsigned long long pte_val(pte_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
			  x.pte_low, x.pte_high);
}

static inline unsigned long long pmd_val(pmd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
			  x.pmd, x.pmd >> 32);
}

static inline unsigned long long pgd_val(pgd_t x)
{
	return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
			  x.pgd, x.pgd >> 32);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
}

static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
		    pteval.pte_low, pteval.pte_high);
}

static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
	/* 5 arg words */
	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
		    pmdval.pmd, pmdval.pmd >> 32);
}

static inline void set_pud(pud_t *pudp, pud_t pudval)
{
	PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
		    pudval.pgd.pgd, pudval.pgd.pgd >> 32);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}

#else  /* !CONFIG_X86_PAE */

static inline pte_t __pte(unsigned long val)
{
	return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
}

static inline pgd_t __pgd(unsigned long val)
{
	return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
}

static inline unsigned long pte_val(pte_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
}

static inline unsigned long pgd_val(pgd_t x)
{
	return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
}

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
}
#endif	/* CONFIG_X86_PAE */

/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
	PARAVIRT_LAZY_NONE,
	PARAVIRT_LAZY_MMU,
	PARAVIRT_LAZY_CPU,
};

enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_enter_lazy_cpu(void);
void paravirt_leave_lazy_cpu(void);
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
void paravirt_leave_lazy(enum paravirt_lazy_mode mode);

#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_cpu_mode(void)
{
	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_cpu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
		arch_leave_lazy_cpu_mode();
		arch_enter_lazy_cpu_mode();
	}
}


#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}
}
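
/*
 * Illustrative only, kept under #if 0: how a caller might batch a run of
 * page-table updates under MMU lazy mode so a backend can coalesce them.
 * The function and variable names are assumptions.
 */
#if 0
static inline void example_set_pte_range(struct mm_struct *mm,
					 unsigned long addr, pte_t *ptep,
					 pte_t pteval, int count)
{
	int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < count; i++)
		set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, pteval);
	arch_leave_lazy_mmu_mode();
}
#endif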

void _paravirt_nop(void);
#define paravirt_nop	((void *)_paravirt_nop)

/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
	u8 *instr; 		/* original instructions */
	u8 instrtype;		/* type of this instruction */
	u8 len;			/* length of original instruction */
	u16 clobbers;		/* what registers you may clobber */
};

extern struct paravirt_patch_site __parainstructions[],
	__parainstructions_end[];
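
/*
 * Simplified sketch, kept under #if 0 and illustrative only: walking the
 * .parainstructions records and handing each site to the backend's patch
 * hook.  The real apply_paravirt() in alternative.c first copies each site
 * into a local buffer, nop-pads whatever the hook did not fill in, and only
 * then writes the result back; the function name below is an assumption.
 * It would be invoked as
 * example_apply_paravirt(__parainstructions, __parainstructions_end).
 */
#if 0
static void example_apply_paravirt(struct paravirt_patch_site *start,
				   struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;

	for (p = start; p < end; p++)
		pv_init_ops.patch(p->instrtype, p->clobbers, p->instr,
				  (unsigned long)p->instr, p->len);
}
#endif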

static inline unsigned long __raw_local_save_flags(void)
{
	unsigned long f;

	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
	return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     : "=a"(f)
		     : "0"(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
	asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
				  PARAVIRT_CALL
				  "popl %%edx; popl %%ecx")
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
		     : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long f;

	f = __raw_local_save_flags();
	raw_local_irq_disable();
	return f;
}
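
/*
 * Typical save/disable/restore pattern built from the wrappers above,
 * kept under #if 0 and illustrative only; the function name is an
 * assumption.
 */
#if 0
static inline void example_irq_protected_section(void)
{
	unsigned long flags;

	flags = __raw_local_irq_save();
	/* ... work that must run with interrupts off ... */
	raw_local_irq_restore(flags);
}
#endif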

#define CLI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_cli_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_cli_type]", "%c[paravirt_clobber]")

#define STI_STRING							\
	_paravirt_alt("pushl %%ecx; pushl %%edx;"			\
		      "call *%[paravirt_sti_opptr];"			\
		      "popl %%edx; popl %%ecx",				\
		      "%c[paravirt_sti_type]", "%c[paravirt_clobber]")

#define CLI_STI_CLOBBERS , "%eax"
#define CLI_STI_INPUT_ARGS						\
	,								\
	[paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)),		\
	[paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable),		\
	[paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)),		\
	[paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable),		\
	paravirt_clobber(CLBR_EAX)

/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

#else  /* __ASSEMBLY__ */

#define PARA_PATCH(struct, off)	((PARAVIRT_PATCH_##struct + (off)) / 4)

#define PARA_SITE(ptype, clobbers, ops)		\
771:;						\
	ops;					\
772:;						\
	.pushsection .parainstructions,"a";	\
	 .long 771b;				\
	 .byte ptype;				\
	 .byte 772b-771b;			\
	 .short clobbers;			\
	.popsection

#define INTERRUPT_RETURN						\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)

#define DISABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_disable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS(clobbers)					\
	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
		  pushl %eax; pushl %ecx; pushl %edx;			\
		  call *%cs:pv_irq_ops+PV_IRQ_irq_enable;		\
		  popl %edx; popl %ecx; popl %eax)

#define ENABLE_INTERRUPTS_SYSCALL_RET					\
	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
		  CLBR_NONE,						\
		  jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)

#define GET_CR0_INTO_EAX			\
	push %ecx; push %edx;			\
	call *pv_cpu_ops+PV_CPU_read_cr0;	\
	pop %edx; pop %ecx

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif	/* __ASM_PARAVIRT_H */