Commit 486d35e2 authored by Linus Torvalds

Merge branch 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Fix leak of free lapic date in kvm_arch_vcpu_init()
  KVM: x86: Fix probable memory leak of vcpu->arch.mce_banks
  KVM: S390: fix potential array overrun in intercept handling
  KVM: fix spurious interrupt with irqfd
  eventfd - allow atomic read and waitqueue remove
  KVM: MMU: bail out pagewalk on kvm_read_guest error
  KVM: properly check max PIC pin in irq route setup
  KVM: only allow one gsi per fd
  KVM: x86: Fix host_mapping_level()
  KVM: powerpc: Show timing option only on embedded
  KVM: Fix race between APIC TMR and IRR
...@@ -53,7 +53,7 @@ config KVM_440 ...@@ -53,7 +53,7 @@ config KVM_440
config KVM_EXIT_TIMING config KVM_EXIT_TIMING
bool "Detailed exit timing" bool "Detailed exit timing"
depends on KVM depends on KVM_440 || KVM_E500
---help--- ---help---
Calculate elapsed time for every exit/enter cycle. A per-vcpu Calculate elapsed time for every exit/enter cycle. A per-vcpu
report is available in debugfs kvm/vm#_vcpu#_timing. report is available in debugfs kvm/vm#_vcpu#_timing.
......
...@@ -213,7 +213,7 @@ static int handle_instruction_and_prog(struct kvm_vcpu *vcpu) ...@@ -213,7 +213,7 @@ static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
return rc2; return rc2;
} }
static const intercept_handler_t intercept_funcs[0x48 >> 2] = { static const intercept_handler_t intercept_funcs[] = {
[0x00 >> 2] = handle_noop, [0x00 >> 2] = handle_noop,
[0x04 >> 2] = handle_instruction, [0x04 >> 2] = handle_instruction,
[0x08 >> 2] = handle_prog, [0x08 >> 2] = handle_prog,
...@@ -230,7 +230,7 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) ...@@ -230,7 +230,7 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
intercept_handler_t func; intercept_handler_t func;
u8 code = vcpu->arch.sie_block->icptcode; u8 code = vcpu->arch.sie_block->icptcode;
if (code & 3 || code > 0x48) if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
return -ENOTSUPP; return -ENOTSUPP;
func = intercept_funcs[code >> 2]; func = intercept_funcs[code >> 2];
if (func) if (func)
......
...@@ -373,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, ...@@ -373,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (unlikely(!apic_enabled(apic))) if (unlikely(!apic_enabled(apic)))
break; break;
if (trig_mode) {
apic_debug("level trig mode for vector %d", vector);
apic_set_vector(vector, apic->regs + APIC_TMR);
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
result = !apic_test_and_set_irr(vector, apic); result = !apic_test_and_set_irr(vector, apic);
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector, !result); trig_mode, vector, !result);
...@@ -383,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, ...@@ -383,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break; break;
} }
if (trig_mode) {
apic_debug("level trig mode for vector %d", vector);
apic_set_vector(vector, apic->regs + APIC_TMR);
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
kvm_vcpu_kick(vcpu); kvm_vcpu_kick(vcpu);
break; break;
......
...@@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) ...@@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
addr = gfn_to_hva(kvm, gfn); addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) if (kvm_is_error_hva(addr))
return page_size; return PT_PAGE_TABLE_LEVEL;
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr); vma = find_vma(current->mm, addr);
...@@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) ...@@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL) if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level; return host_level;
for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
break; break;
}
return level - 1; return level - 1;
} }
......
...@@ -150,7 +150,9 @@ static int FNAME(walk_addr)(struct guest_walker *walker, ...@@ -150,7 +150,9 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
walker->table_gfn[walker->level - 1] = table_gfn; walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa; walker->pte_gpa[walker->level - 1] = pte_gpa;
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
goto not_present;
trace_kvm_mmu_paging_element(pte, walker->level); trace_kvm_mmu_paging_element(pte, walker->level);
if (!is_present_gpte(pte)) if (!is_present_gpte(pte))
......
...@@ -5072,12 +5072,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) ...@@ -5072,12 +5072,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
GFP_KERNEL); GFP_KERNEL);
if (!vcpu->arch.mce_banks) { if (!vcpu->arch.mce_banks) {
r = -ENOMEM; r = -ENOMEM;
goto fail_mmu_destroy; goto fail_free_lapic;
} }
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
return 0; return 0;
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy: fail_mmu_destroy:
kvm_mmu_destroy(vcpu); kvm_mmu_destroy(vcpu);
fail_free_pio_data: fail_free_pio_data:
...@@ -5088,6 +5089,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) ...@@ -5088,6 +5089,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{ {
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu); kvm_free_lapic(vcpu);
down_read(&vcpu->kvm->slots_lock); down_read(&vcpu->kvm->slots_lock);
kvm_mmu_destroy(vcpu); kvm_mmu_destroy(vcpu);
......
...@@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) ...@@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
return events; return events;
} }
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
loff_t *ppos) {
*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
ctx->count -= *cnt;
}
/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue entry to be removed.
 * @cnt: [out] Pointer to the 64bit counter value.
 *
 * Returns zero if successful, or the following error codes:
 *
 * -EAGAIN : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
__u64 *cnt)
{
unsigned long flags;
/* Both the counter and the wait queue are protected by wqh.lock, so
 * taking it once makes the read + dequeue atomic w.r.t. other waiters. */
spin_lock_irqsave(&ctx->wqh.lock, flags);
eventfd_ctx_do_read(ctx, cnt);
__remove_wait_queue(&ctx->wqh, wait);
/* If we consumed a nonzero count, writers blocked on a full counter
 * may now make progress — wake them for POLLOUT. */
if (*cnt != 0 && waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, POLLOUT);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
/* A zero counter means there was nothing to read: report -EAGAIN. */
return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
/**
* eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
* @ctx: [in] Pointer to eventfd context.
* @no_wait: [in] Different from zero if the operation should not block.
* @cnt: [out] Pointer to the 64bit counter value.
*
* Returns zero if successful, or the following error codes:
*
* -EAGAIN : The operation would have blocked but @no_wait was nonzero.
* -ERESTARTSYS : A signal interrupted the wait operation.
*
* If @no_wait is zero, the function might sleep until the eventfd internal
* counter becomes greater than zero.
*/
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{ {
struct eventfd_ctx *ctx = file->private_data;
ssize_t res; ssize_t res;
__u64 ucnt = 0;
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
if (count < sizeof(ucnt))
return -EINVAL;
spin_lock_irq(&ctx->wqh.lock); spin_lock_irq(&ctx->wqh.lock);
*cnt = 0;
res = -EAGAIN; res = -EAGAIN;
if (ctx->count > 0) if (ctx->count > 0)
res = sizeof(ucnt); res = 0;
else if (!(file->f_flags & O_NONBLOCK)) { else if (!no_wait) {
__add_wait_queue(&ctx->wqh, &wait); __add_wait_queue(&ctx->wqh, &wait);
for (res = 0;;) { for (;;) {
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
if (ctx->count > 0) { if (ctx->count > 0) {
res = sizeof(ucnt); res = 0;
break; break;
} }
if (signal_pending(current)) { if (signal_pending(current)) {
...@@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, ...@@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
__remove_wait_queue(&ctx->wqh, &wait); __remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
} }
if (likely(res > 0)) { if (likely(res == 0)) {
ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; eventfd_ctx_do_read(ctx, cnt);
ctx->count -= ucnt;
if (waitqueue_active(&ctx->wqh)) if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, POLLOUT); wake_up_locked_poll(&ctx->wqh, POLLOUT);
} }
spin_unlock_irq(&ctx->wqh.lock); spin_unlock_irq(&ctx->wqh.lock);
if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
return -EFAULT;
return res; return res;
} }
EXPORT_SYMBOL_GPL(eventfd_ctx_read);
/*
 * VFS read() handler for an eventfd: obtain the counter value through
 * eventfd_ctx_read() and copy it out to userspace as a 64bit integer.
 */
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	__u64 value;
	ssize_t err;

	/* Reads shorter than the 64bit counter are invalid. */
	if (count < sizeof(value))
		return -EINVAL;

	err = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &value);
	if (err < 0)
		return err;

	if (put_user(value, (__u64 __user *) buf))
		return -EFAULT;

	return sizeof(value);
}
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos) loff_t *ppos)
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <linux/fcntl.h> #include <linux/fcntl.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/wait.h>
/* /*
* CAREFUL: Check include/asm-generic/fcntl.h when defining * CAREFUL: Check include/asm-generic/fcntl.h when defining
...@@ -34,6 +35,9 @@ struct file *eventfd_fget(int fd); ...@@ -34,6 +35,9 @@ struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
int eventfd_signal(struct eventfd_ctx *ctx, int n); int eventfd_signal(struct eventfd_ctx *ctx, int n);
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
__u64 *cnt);
#else /* CONFIG_EVENTFD */ #else /* CONFIG_EVENTFD */
...@@ -61,6 +65,18 @@ static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) ...@@ -61,6 +65,18 @@ static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
} }
/* Stub used when CONFIG_EVENTFD is disabled: reading is unsupported. */
static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
__u64 *cnt)
{
return -ENOSYS;
}

/* Stub used when CONFIG_EVENTFD is disabled: no wait queue exists to remove. */
static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
wait_queue_t *wait, __u64 *cnt)
{
return -ENOSYS;
}
#endif #endif
#endif /* _LINUX_EVENTFD_H */ #endif /* _LINUX_EVENTFD_H */
......
...@@ -72,12 +72,13 @@ static void ...@@ -72,12 +72,13 @@ static void
irqfd_shutdown(struct work_struct *work) irqfd_shutdown(struct work_struct *work)
{ {
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
u64 cnt;
/* /*
* Synchronize with the wait-queue and unhook ourselves to prevent * Synchronize with the wait-queue and unhook ourselves to prevent
* further events. * further events.
*/ */
remove_wait_queue(irqfd->wqh, &irqfd->wait); eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
/* /*
* We know no new events will be scheduled at this point, so block * We know no new events will be scheduled at this point, so block
...@@ -166,7 +167,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, ...@@ -166,7 +167,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
static int static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{ {
struct _irqfd *irqfd; struct _irqfd *irqfd, *tmp;
struct file *file = NULL; struct file *file = NULL;
struct eventfd_ctx *eventfd = NULL; struct eventfd_ctx *eventfd = NULL;
int ret; int ret;
...@@ -203,9 +204,20 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) ...@@ -203,9 +204,20 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
spin_lock_irq(&kvm->irqfds.lock);
ret = 0;
list_for_each_entry(tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd != tmp->eventfd)
continue;
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&kvm->irqfds.lock);
goto fail;
}
events = file->f_op->poll(file, &irqfd->pt); events = file->f_op->poll(file, &irqfd->pt);
spin_lock_irq(&kvm->irqfds.lock);
list_add_tail(&irqfd->list, &kvm->irqfds.items); list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock); spin_unlock_irq(&kvm->irqfds.lock);
......
...@@ -302,6 +302,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, ...@@ -302,6 +302,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
{ {
int r = -EINVAL; int r = -EINVAL;
int delta; int delta;
unsigned max_pin;
struct kvm_kernel_irq_routing_entry *ei; struct kvm_kernel_irq_routing_entry *ei;
struct hlist_node *n; struct hlist_node *n;
...@@ -322,12 +323,15 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, ...@@ -322,12 +323,15 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) { switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER: case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq; e->set = kvm_set_pic_irq;
max_pin = 16;
break; break;
case KVM_IRQCHIP_PIC_SLAVE: case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq; e->set = kvm_set_pic_irq;
max_pin = 16;
delta = 8; delta = 8;
break; break;
case KVM_IRQCHIP_IOAPIC: case KVM_IRQCHIP_IOAPIC:
max_pin = KVM_IOAPIC_NUM_PINS;
e->set = kvm_set_ioapic_irq; e->set = kvm_set_ioapic_irq;
break; break;
default: default:
...@@ -335,7 +339,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, ...@@ -335,7 +339,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
} }
e->irqchip.irqchip = ue->u.irqchip.irqchip; e->irqchip.irqchip = ue->u.irqchip.irqchip;
e->irqchip.pin = ue->u.irqchip.pin + delta; e->irqchip.pin = ue->u.irqchip.pin + delta;
if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS) if (e->irqchip.pin >= max_pin)
goto out; goto out;
rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
break; break;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册