/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/context_tracking.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/kvm_guest.h>

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

static int kvmclock_vsyscall = 1;
static int parse_no_kvmclock_vsyscall(char *arg)
{
        kvmclock_vsyscall = 0;
        return 0;
}

early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	struct swait_queue_head wq;
	u32 token;
	int cpu;
	bool halted;
};

static struct kvm_task_sleep_head {
	raw_spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

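/*
 * Called from the async page fault handler when the host reports that the
 * faulting page is not yet present.  Sleep until kvm_async_pf_task_wake()
 * is called for the same token; if the wakeup was already delivered, a
 * dummy node is found in the hash and we return immediately.  When we
 * cannot schedule (idle task or preemption disabled) we halt instead.
 */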
void kvm_async_pf_task_wait(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DECLARE_SWAITQUEUE(wait);

	rcu_irq_enter();

	raw_spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wakeup was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		raw_spin_unlock(&b->lock);

		rcu_irq_exit();
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.halted = is_idle_task(current) || preempt_count() > 1;
	init_swait_queue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	raw_spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			rcu_irq_exit();
			native_safe_halt();
			rcu_irq_enter();
			local_irq_disable();
		}
	}
	if (!n.halted)
		finish_swait(&n.wq, &wait);

	rcu_irq_exit();
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (swait_active(&n->wq))
		swake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		raw_spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		raw_spin_unlock(&b->lock);
	}
}

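/*
 * Called when the host signals that the page for @token is ready.  Wake the
 * task sleeping on that token, or leave a dummy node behind if the waiter
 * has not gone to sleep yet.  A token of ~0 wakes every waiter on this CPU.
 */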
void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	raw_spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * async PF was not yet handled.
		 * Add dummy entry for the token.
		 */
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while other cpu
			 * handles async PF.
			 */
			raw_spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_swait_queue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	raw_spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__this_cpu_read(apf_reason.enabled)) {
		reason = __this_cpu_read(apf_reason.reason);
		__this_cpu_write(apf_reason.reason, 0);
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);

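/*
 * Paravirtual #PF entry point: read and reset the per-CPU apf_reason and
 * dispatch.  A zero reason is handled as an ordinary page fault,
 * PAGE_NOT_PRESENT puts the current task to sleep and PAGE_READY wakes the
 * waiter for the token passed in CR2.
 */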
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	enum ctx_state prev_state;

	switch (kvm_read_and_reset_pf_reason()) {
	default:
		trace_do_page_fault(regs, error_code);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		prev_state = exception_enter();
		exit_idle();
		kvm_async_pf_task_wait((u32)read_cr2());
		exception_exit(prev_state);
		break;
	case KVM_PV_REASON_PAGE_READY:
		rcu_irq_enter();
		exit_idle();
		kvm_async_pf_task_wake((u32)read_cr2());
		rcu_irq_exit();
		break;
	}
}
NOKPROBE_SYMBOL(do_async_page_fault);

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_cpu_ops.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

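/*
 * Tell the hypervisor where to account stolen time for this CPU by writing
 * the physical address of the per-CPU kvm_steal_time area to
 * MSR_KVM_STEAL_TIME.
 */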
static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
		cpu, (unsigned long long) slow_virt_to_phys(st));
}

static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/*
	 * This relies on __test_and_clear_bit to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU so
	 * there's no need for lock or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
		return;
	apic_write(APIC_EOI, APIC_EOI_ACK);
}

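/*
 * Per-CPU enabling of the paravirtual features: async page faults, PV EOI
 * and steal time accounting, each gated on the corresponding feature bit.
 */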
static void kvm_guest_cpu_init(void)
{
	if (!kvm_para_available())
		return;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));

#ifdef CONFIG_PREEMPT
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
		__this_cpu_write(apf_reason.enabled, 1);
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;
		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__this_cpu_write(kvm_apic_eoi, 0);
		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
			| KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

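/* Disable async page fault delivery for this CPU. */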
static void kvm_pv_disable_apf(void)
{
	if (!__this_cpu_read(apf_reason.enabled))
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__this_cpu_write(apf_reason.enabled, 0);

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	/*
	 * We disable PV EOI before we load a new kernel by kexec,
	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
	 * New kernel can re-enable when it boots.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

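/*
 * pv_time_ops.steal_clock implementation: read the steal time published by
 * the host for @cpu, retrying while the version is odd or changes under us.
 */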
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		rmb();
		steal = src->steal;
		rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
	kvm_spinlock_init();
}

static void kvm_guest_cpu_online(void *dummy)
{
	kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	apf_task_wake_all();
}

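/*
 * CPU hotplug notifier: re-enable the per-CPU paravirtual features when a
 * CPU comes online and tear them down before it goes offline.
 */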
static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
			  void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_cpu_notifier = {
        .notifier_call  = kvm_cpu_notify,
};
#endif

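/* Route #PF (vector 14) through the async_page_fault entry stub. */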
static void __init kvm_apf_trap_init(void)
{
	set_intr_gate(14, async_page_fault);
}

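/*
 * Guest-side setup: register the reboot notifier, initialize the async PF
 * sleeper hash and hook the paravirt, APIC and SMP callbacks according to
 * the features advertised by the hypervisor.
 */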
void __init kvm_guest_init(void)
{
	int i;

	if (!kvm_para_available())
		return;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		raw_spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_time_ops.steal_clock = kvm_steal_clock;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

	if (kvmclock_vsyscall)
		kvm_setup_vsyscall_timeinfo();

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	register_cpu_notifier(&kvm_cpu_notifier);
#else
	kvm_guest_cpu_init();
#endif

	/*
	 * Hard lockup detection is enabled by default. Disable it, as guests
	 * can get false positives too easily, for example if the host is
	 * overcommitted.
	 */
	hardlockup_detector_disable();
}

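/*
 * Locate KVM's CPUID leaf range by scanning for the "KVMKVMKVM" hypervisor
 * signature; returns 0 when not running on KVM.
 */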
static noinline uint32_t __kvm_cpuid_base(void)
{
	if (boot_cpu_data.cpuid_level < 0)
		return 0;	/* So we don't blow up on old processors */

	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);

	return 0;
}

static inline uint32_t kvm_cpuid_base(void)
{
	static int kvm_cpuid_base = -1;

	if (kvm_cpuid_base == -1)
		kvm_cpuid_base = __kvm_cpuid_base();

	return kvm_cpuid_base;
}

bool kvm_para_available(void)
{
	return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

static uint32_t __init kvm_detect(void)
{
	return kvm_cpuid_base();
}

const struct hypervisor_x86 x86_hyper_kvm __refconst = {
	.name			= "KVM",
	.detect			= kvm_detect,
	.x2apic_available	= kvm_para_available,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);

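/*
 * Enable the paravirt steal time static keys once steal clock support has
 * been detected, so the scheduler only does the accounting when needed.
 */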
static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
arch_initcall(activate_jump_labels);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
	int apicid;
	unsigned long flags = 0;

	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}


#ifdef CONFIG_QUEUED_SPINLOCKS

#include <asm/qspinlock.h>

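/*
 * pv wait callback for queued spinlocks: if the lock byte still holds @val,
 * halt until an interrupt or a kick from the lock holder arrives.
 */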
static void kvm_wait(u8 *ptr, u8 val)
{
	unsigned long flags;

	if (in_nmi())
		return;

	local_irq_save(flags);

	if (READ_ONCE(*ptr) != val)
		goto out;

	/*
	 * Halt until it's our turn and we are kicked. Note that we do a safe
	 * halt for the irq-enabled case, to avoid hanging if the lock info is
	 * overwritten in the irq spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
	if (arch_irqs_disabled_flags(flags))
		halt();
	else
		safe_halt();

out:
	local_irq_restore(flags);
}

#else /* !CONFIG_QUEUED_SPINLOCKS */

enum kvm_contention_stat {
	TAKEN_SLOW,
	TAKEN_SLOW_PICKUP,
	RELEASED_SLOW,
	RELEASED_SLOW_KICKED,
	NR_CONTENTION_STATS
};

#ifdef CONFIG_KVM_DEBUG_FS
#define HISTO_BUCKETS	30

static struct kvm_spinlock_stats
{
	u32 contention_stats[NR_CONTENTION_STATS];
	u32 histo_spin_blocked[HISTO_BUCKETS+1];
	u64 time_blocked;
} spinlock_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
	u8 ret;
	u8 old;

	old = READ_ONCE(zero_stats);
	if (unlikely(old)) {
		ret = cmpxchg(&zero_stats, old, 0);
		/* This ensures only one fellow resets the stat */
		if (ret == old)
			memset(&spinlock_stats, 0, sizeof(spinlock_stats));
	}
}

static inline void add_stats(enum kvm_contention_stat var, u32 val)
{
	check_zero();
	spinlock_stats.contention_stats[var] += val;
}


static inline u64 spin_time_start(void)
{
	return sched_clock();
}

static void __spin_time_accum(u64 delta, u32 *array)
{
	unsigned index;

	index = ilog2(delta);
	check_zero();

	if (index < HISTO_BUCKETS)
		array[index]++;
	else
		array[HISTO_BUCKETS]++;
}

static inline void spin_time_accum_blocked(u64 start)
{
	u32 delta;

	delta = sched_clock() - start;
	__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
	spinlock_stats.time_blocked += delta;
}

static struct dentry *d_spin_debug;
static struct dentry *d_kvm_debug;

static struct dentry *kvm_init_debugfs(void)
{
	d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
	if (!d_kvm_debug)
		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");

	return d_kvm_debug;
}

static int __init kvm_spinlock_debugfs(void)
{
	struct dentry *d_kvm;

	d_kvm = kvm_init_debugfs();
	if (d_kvm == NULL)
		return -ENOMEM;

	d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);

	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);

	debugfs_create_u32("taken_slow", 0444, d_spin_debug,
		   &spinlock_stats.contention_stats[TAKEN_SLOW]);
	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
		   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);

	debugfs_create_u32("released_slow", 0444, d_spin_debug,
		   &spinlock_stats.contention_stats[RELEASED_SLOW]);
	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
		   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);

	debugfs_create_u64("time_blocked", 0444, d_spin_debug,
			   &spinlock_stats.time_blocked);

	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
		     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);

	return 0;
}
fs_initcall(kvm_spinlock_debugfs);
#else  /* !CONFIG_KVM_DEBUG_FS */
static inline void add_stats(enum kvm_contention_stat var, u32 val)
{
}

static inline u64 spin_time_start(void)
{
	return 0;
}

static inline void spin_time_accum_blocked(u64 start)
{
}
#endif  /* CONFIG_KVM_DEBUG_FS */

struct kvm_lock_waiting {
	struct arch_spinlock *lock;
	__ticket_t want;
};

/* cpus 'waiting' on a spinlock to become available */
static cpumask_t waiting_cpus;

/* Track spinlock on which a cpu is waiting */
static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);

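/*
 * Ticket spinlock slowpath: record which lock and ticket this CPU is
 * waiting for, mark the lock as contended and halt until kvm_unlock_kick()
 * (or any interrupt) wakes us.
 */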
__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
	struct kvm_lock_waiting *w;
	int cpu;
	u64 start;
	unsigned long flags;
	__ticket_t head;

	if (in_nmi())
		return;

	w = this_cpu_ptr(&klock_waiting);
	cpu = smp_processor_id();
	start = spin_time_start();

	/*
	 * Make sure an interrupt handler can't upset things in a
	 * partially setup state.
	 */
	local_irq_save(flags);

	/*
	 * The ordering protocol on this is that the "lock" pointer
	 * may only be set non-NULL if the "want" ticket is correct.
	 * If we're updating "want", we must first clear "lock".
	 */
	w->lock = NULL;
	smp_wmb();
	w->want = want;
	smp_wmb();
	w->lock = lock;

	add_stats(TAKEN_SLOW, 1);

	/*
	 * This uses set_bit, which is atomic but we should not rely on its
	 * reordering guarantees. So a barrier is needed after this call.
	 */
	cpumask_set_cpu(cpu, &waiting_cpus);

	barrier();

	/*
	 * Mark entry to slowpath before doing the pickup test to make
	 * sure we don't deadlock with an unlocker.
	 */
	__ticket_enter_slowpath(lock);

	/* make sure enter_slowpath, which is atomic, does not cross the read */
	smp_mb__after_atomic();

	/*
	 * Check again to make sure it didn't become free while
	 * we weren't looking.
	 */
	head = READ_ONCE(lock->tickets.head);
	if (__tickets_equal(head, want)) {
		add_stats(TAKEN_SLOW_PICKUP, 1);
		goto out;
	}

	/*
	 * Halt until it's our turn and we are kicked. Note that we do a safe
	 * halt for the irq-enabled case, to avoid hanging if the lock info is
	 * overwritten in the irq spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
	if (arch_irqs_disabled_flags(flags))
		halt();
	else
		safe_halt();

out:
	cpumask_clear_cpu(cpu, &waiting_cpus);
	w->lock = NULL;
	local_irq_restore(flags);
	spin_time_accum_blocked(start);
}
PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);

/* Kick vcpu waiting on @lock->head to reach value @ticket */
static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
{
	int cpu;

	add_stats(RELEASED_SLOW, 1);
	for_each_cpu(cpu, &waiting_cpus) {
		const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
		if (READ_ONCE(w->lock) == lock &&
		    READ_ONCE(w->want) == ticket) {
			add_stats(RELEASED_SLOW_KICKED, 1);
			kvm_kick_cpu(cpu);
			break;
		}
	}
}

#endif /* !CONFIG_QUEUED_SPINLOCKS */

/*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 */
void __init kvm_spinlock_init(void)
{
	if (!kvm_para_available())
		return;
	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;

#ifdef CONFIG_QUEUED_SPINLOCKS
	__pv_init_lock_hash();
	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	pv_lock_ops.wait = kvm_wait;
	pv_lock_ops.kick = kvm_kick_cpu;
#else /* !CONFIG_QUEUED_SPINLOCKS */
	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
	pv_lock_ops.unlock_kick = kvm_unlock_kick;
#endif
}

static __init int kvm_spinlock_init_jump(void)
{
	if (!kvm_para_available())
		return 0;
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return 0;

	static_key_slow_inc(&paravirt_ticketlocks_enabled);
	printk(KERN_INFO "KVM setup paravirtual spinlock\n");

	return 0;
}
early_initcall(kvm_spinlock_init_jump);

#endif	/* CONFIG_PARAVIRT_SPINLOCKS */