/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/hyperv.h>

#include "hyperv_vmbus.h"

static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);

static const struct vmbus_device vmbus_devs[] = {
	/* IDE */
	{ .dev_type = HV_IDE,
	  HV_IDE_GUID,
	  .perf_device = true,
	},

	/* SCSI */
	{ .dev_type = HV_SCSI,
	  HV_SCSI_GUID,
	  .perf_device = true,
	},

	/* Fibre Channel */
	{ .dev_type = HV_FC,
	  HV_SYNTHFC_GUID,
	  .perf_device = true,
	},

	/* Synthetic NIC */
	{ .dev_type = HV_NIC,
	  HV_NIC_GUID,
	  .perf_device = true,
	},

	/* Network Direct */
	{ .dev_type = HV_ND,
	  HV_ND_GUID,
	  .perf_device = true,
	},

	/* PCIE */
	{ .dev_type = HV_PCIE,
	  HV_PCIE_GUID,
	  .perf_device = true,
	},

	/* Synthetic Frame Buffer */
	{ .dev_type = HV_FB,
	  HV_SYNTHVID_GUID,
	  .perf_device = false,
	},

	/* Synthetic Keyboard */
	{ .dev_type = HV_KBD,
	  HV_KBD_GUID,
	  .perf_device = false,
	},

	/* Synthetic MOUSE */
	{ .dev_type = HV_MOUSE,
	  HV_MOUSE_GUID,
	  .perf_device = false,
	},

	/* KVP */
	{ .dev_type = HV_KVP,
	  HV_KVP_GUID,
	  .perf_device = false,
	},

	/* Time Synch */
	{ .dev_type = HV_TS,
	  HV_TS_GUID,
	  .perf_device = false,
	},

	/* Heartbeat */
	{ .dev_type = HV_HB,
	  HV_HEART_BEAT_GUID,
	  .perf_device = false,
	},

	/* Shutdown */
	{ .dev_type = HV_SHUTDOWN,
	  HV_SHUTDOWN_GUID,
	  .perf_device = false,
	},

	/* File copy */
	{ .dev_type = HV_FCOPY,
	  HV_FCOPY_GUID,
	  .perf_device = false,
	},

	/* Backup */
	{ .dev_type = HV_BACKUP,
	  HV_VSS_GUID,
	  .perf_device = false,
	},

	/* Dynamic Memory */
	{ .dev_type = HV_DM,
	  HV_DM_GUID,
	  .perf_device = false,
	},

	/* Unknown GUID */
	{ .dev_type = HV_UNKOWN,
	  .perf_device = false,
	},
};

static u16 hv_get_dev_type(const uuid_le *guid)
{
	u16 i;

	for (i = HV_IDE; i < HV_UNKOWN; i++) {
		if (!uuid_le_cmp(*guid, vmbus_devs[i].guid))
			return i;
	}
	pr_info("Unknown GUID: %pUl\n", guid);
	return i;
}

/**
 * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
 * @icmsghdrp: Pointer to msg header structure
 * @negop: Pointer to negotiate message structure
 * @buf: Raw buffer channel data
 *
 * @icmsghdrp is of type &struct icmsg_hdr.
 * @negop is of type &struct icmsg_negotiate.
 * Set up and fill in default negotiate response message.
 *
 * The fw_version specifies the framework version that
 * we can support and srv_version specifies the service
 * version we can support.
 *
 * Mainly used by Hyper-V drivers.
 */
bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
				struct icmsg_negotiate *negop, u8 *buf,
				int fw_version, int srv_version)
{
	int icframe_major, icframe_minor;
	int icmsg_major, icmsg_minor;
	int fw_major, fw_minor;
	int srv_major, srv_minor;
	int i;
	bool found_match = false;

	icmsghdrp->icmsgsize = 0x10;
	fw_major = (fw_version >> 16);
	fw_minor = (fw_version & 0xFFFF);

	srv_major = (srv_version >> 16);
	srv_minor = (srv_version & 0xFFFF);

	negop = (struct icmsg_negotiate *)&buf[
		sizeof(struct vmbuspipe_hdr) +
		sizeof(struct icmsg_hdr)];

	icframe_major = negop->icframe_vercnt;
	icframe_minor = 0;

	icmsg_major = negop->icmsg_vercnt;
	icmsg_minor = 0;

	/*
	 * Select the framework version number we will
	 * support.
	 */

	for (i = 0; i < negop->icframe_vercnt; i++) {
		if ((negop->icversion_data[i].major == fw_major) &&
		   (negop->icversion_data[i].minor == fw_minor)) {
			icframe_major = negop->icversion_data[i].major;
			icframe_minor = negop->icversion_data[i].minor;
			found_match = true;
		}
	}

	if (!found_match)
		goto fw_error;

	found_match = false;

	for (i = negop->icframe_vercnt;
		 (i < negop->icframe_vercnt + negop->icmsg_vercnt); i++) {
		if ((negop->icversion_data[i].major == srv_major) &&
		   (negop->icversion_data[i].minor == srv_minor)) {
			icmsg_major = negop->icversion_data[i].major;
			icmsg_minor = negop->icversion_data[i].minor;
			found_match = true;
		}
	}

	/*
	 * Respond with the framework and service
	 * version numbers we can support.
	 */


fw_error:
	if (!found_match) {
		negop->icframe_vercnt = 0;
		negop->icmsg_vercnt = 0;
	} else {
		negop->icframe_vercnt = 1;
		negop->icmsg_vercnt = 1;
	}

	negop->icversion_data[0].major = icframe_major;
	negop->icversion_data[0].minor = icframe_minor;
	negop->icversion_data[1].major = icmsg_major;
	negop->icversion_data[1].minor = icmsg_minor;
	return found_match;
}

EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
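
/*
 * Usage sketch (illustrative, not taken from this file): an IC utility
 * driver's channel callback typically calls this when it sees a negotiate
 * request.  recv_buf, UTIL_FW_VERSION and SRV_VERSION below stand in for
 * the driver's receive buffer and its framework/service version macros
 * (e.g. SD_VERSION in hv_util.c); negop may be passed as NULL since it is
 * recomputed from the buffer inside vmbus_prep_negotiate_resp().
 *
 *	struct icmsg_hdr *icmsghdrp;
 *
 *	icmsghdrp = (struct icmsg_hdr *)&recv_buf[sizeof(struct vmbuspipe_hdr)];
 *	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE)
 *		vmbus_prep_negotiate_resp(icmsghdrp, NULL, recv_buf,
 *					  UTIL_FW_VERSION, SRV_VERSION);
 */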

/*
 * alloc_channel - Allocate and initialize a vmbus channel object
 */
static struct vmbus_channel *alloc_channel(void)
{
	static atomic_t chan_num = ATOMIC_INIT(0);
	struct vmbus_channel *channel;

	channel = kzalloc(sizeof(*channel), GFP_ATOMIC);
	if (!channel)
		return NULL;

	channel->id = atomic_inc_return(&chan_num);
	channel->acquire_ring_lock = true;
	spin_lock_init(&channel->inbound_lock);
	spin_lock_init(&channel->lock);

	INIT_LIST_HEAD(&channel->sc_list);
	INIT_LIST_HEAD(&channel->percpu_list);

	return channel;
}

/*
 * free_channel - Release the resources used by the vmbus channel object
 */
static void free_channel(struct vmbus_channel *channel)
{
	kfree(channel);
}

static void percpu_channel_enq(void *arg)
{
	struct vmbus_channel *channel = arg;
	int cpu = smp_processor_id();

	list_add_tail(&channel->percpu_list, &hv_context.percpu_list[cpu]);
}
static void percpu_channel_deq(void *arg)
{
	struct vmbus_channel *channel = arg;

	list_del(&channel->percpu_list);
}


static void vmbus_release_relid(u32 relid)
{
	struct vmbus_channel_relid_released msg;

	memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
	msg.child_relid = relid;
	msg.header.msgtype = CHANNELMSG_RELID_RELEASED;
	vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released));
}

void hv_event_tasklet_disable(struct vmbus_channel *channel)
{
	struct tasklet_struct *tasklet;
	tasklet = hv_context.event_dpc[channel->target_cpu];
	tasklet_disable(tasklet);
}

void hv_event_tasklet_enable(struct vmbus_channel *channel)
{
	struct tasklet_struct *tasklet;
	tasklet = hv_context.event_dpc[channel->target_cpu];
	tasklet_enable(tasklet);

	/* In case there is any pending event */
	tasklet_schedule(tasklet);
}

void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
{
	unsigned long flags;
	struct vmbus_channel *primary_channel;

	BUG_ON(!channel->rescind);
	BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));

	hv_event_tasklet_disable(channel);
	if (channel->target_cpu != get_cpu()) {
		put_cpu();
		smp_call_function_single(channel->target_cpu,
					 percpu_channel_deq, channel, true);
	} else {
		percpu_channel_deq(channel);
		put_cpu();
	}
	hv_event_tasklet_enable(channel);

	if (channel->primary_channel == NULL) {
		list_del(&channel->listentry);

		primary_channel = channel;
	} else {
		primary_channel = channel->primary_channel;
		spin_lock_irqsave(&primary_channel->lock, flags);
		list_del(&channel->sc_list);
		primary_channel->num_sc--;
		spin_unlock_irqrestore(&primary_channel->lock, flags);
	}

	/*
	 * We need to free the bit for init_vp_index() to work in the case
	 * of sub-channel, when we reload drivers like hv_netvsc.
	 */
	cpumask_clear_cpu(channel->target_cpu,
			  &primary_channel->alloced_cpus_in_node);

	vmbus_release_relid(relid);

	free_channel(channel);
}

void vmbus_free_channels(void)
{
	struct vmbus_channel *channel, *tmp;

	list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
		listentry) {
		/* hv_process_channel_removal() needs this */
		channel->rescind = true;

		vmbus_device_unregister(channel->device_obj);
	}
}

/*
 * vmbus_process_offer - Process the offer by creating a channel/device
 * associated with this offer
 */
static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
	struct vmbus_channel *channel;
	bool fnew = true;
	unsigned long flags;
	u16 dev_type;
	int ret;

	/* Make sure this is a new offer */
	mutex_lock(&vmbus_connection.channel_mutex);

	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (!uuid_le_cmp(channel->offermsg.offer.if_type,
			newchannel->offermsg.offer.if_type) &&
			!uuid_le_cmp(channel->offermsg.offer.if_instance,
				newchannel->offermsg.offer.if_instance)) {
			fnew = false;
			break;
		}
	}

	if (fnew)
		list_add_tail(&newchannel->listentry,
			      &vmbus_connection.chn_list);

	mutex_unlock(&vmbus_connection.channel_mutex);

	if (!fnew) {
		/*
		 * Check to see if this is a sub-channel.
		 */
		if (newchannel->offermsg.offer.sub_channel_index != 0) {
			/*
			 * Process the sub-channel.
			 */
			newchannel->primary_channel = channel;
			spin_lock_irqsave(&channel->lock, flags);
			list_add_tail(&newchannel->sc_list, &channel->sc_list);
			channel->num_sc++;
			spin_unlock_irqrestore(&channel->lock, flags);
		} else
			goto err_free_chan;
	}

	dev_type = hv_get_dev_type(&newchannel->offermsg.offer.if_type);
	if (dev_type == HV_NIC)
		set_channel_signal_state(newchannel, HV_SIGNAL_POLICY_EXPLICIT);

	init_vp_index(newchannel, dev_type);

	hv_event_tasklet_disable(newchannel);
	if (newchannel->target_cpu != get_cpu()) {
		put_cpu();
		smp_call_function_single(newchannel->target_cpu,
					 percpu_channel_enq,
					 newchannel, true);
	} else {
		percpu_channel_enq(newchannel);
		put_cpu();
	}
	hv_event_tasklet_enable(newchannel);

	/*
	 * This state is used to indicate a successful open
	 * so that when we do close the channel normally, we
	 * can cleanup properly
	 */
	newchannel->state = CHANNEL_OPEN_STATE;

	if (!fnew) {
		if (channel->sc_creation_callback != NULL)
			channel->sc_creation_callback(newchannel);
		return;
	}

	/*
	 * Start the process of binding this offer to the driver
	 * We need to set the DeviceObject field before calling
	 * vmbus_child_dev_add()
	 */
	newchannel->device_obj = vmbus_device_create(
		&newchannel->offermsg.offer.if_type,
		&newchannel->offermsg.offer.if_instance,
		newchannel);
	if (!newchannel->device_obj)
		goto err_deq_chan;

	newchannel->device_obj->device_id = dev_type;
	/*
	 * Add the new device to the bus. This will kick off device-driver
	 * binding which eventually invokes the device driver's AddDevice()
	 * method.
	 */
	mutex_lock(&vmbus_connection.channel_mutex);
	ret = vmbus_device_register(newchannel->device_obj);
	mutex_unlock(&vmbus_connection.channel_mutex);

	if (ret != 0) {
		pr_err("unable to add child device object (relid %d)\n",
			newchannel->offermsg.child_relid);
		kfree(newchannel->device_obj);
		goto err_deq_chan;
	}
	return;

err_deq_chan:
	mutex_lock(&vmbus_connection.channel_mutex);
	list_del(&newchannel->listentry);
	mutex_unlock(&vmbus_connection.channel_mutex);

	hv_event_tasklet_disable(newchannel);
	if (newchannel->target_cpu != get_cpu()) {
		put_cpu();
		smp_call_function_single(newchannel->target_cpu,
					 percpu_channel_deq, newchannel, true);
	} else {
		percpu_channel_deq(newchannel);
		put_cpu();
	}
	hv_event_tasklet_enable(newchannel);

	vmbus_release_relid(newchannel->offermsg.child_relid);

err_free_chan:
	free_channel(newchannel);
}

/*
 * We use this state to statically distribute the channel interrupt load.
 */
static int next_numa_node_id;

/*
 * Starting with Win8, we can statically distribute the incoming
 * channel interrupt load by binding a channel to VCPU.
 * We do this in a hierarchical fashion:
 * First distribute the primary channels across available NUMA nodes
 * and then distribute the subchannels amongst the CPUs in the NUMA
 * node assigned to the primary channel.
 *
 * For pre-win8 hosts or non-performance critical channels we assign the
 * first CPU in the first NUMA node.
 */
static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
{
	u32 cur_cpu;
	bool perf_chn = vmbus_devs[dev_type].perf_device;
	struct vmbus_channel *primary = channel->primary_channel;
	int next_node;
	struct cpumask available_mask;
	struct cpumask *alloced_mask;

	if ((vmbus_proto_version == VERSION_WS2008) ||
	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn)) {
		/*
		 * Prior to win8, all channel interrupts are
		 * delivered on cpu 0.
		 * Also if the channel is not a performance critical
		 * channel, bind it to cpu 0.
		 */
		channel->numa_node = 0;
		channel->target_cpu = 0;
		channel->target_vp = hv_context.vp_index[0];
		return;
	}

	/*
	 * We distribute primary channels evenly across all the available
	 * NUMA nodes and within the assigned NUMA node we will assign the
	 * first available CPU to the primary channel.
	 * The sub-channels will be assigned to the CPUs available in the
	 * NUMA node evenly.
	 */
	if (!primary) {
		while (true) {
			next_node = next_numa_node_id++;
			if (next_node == nr_node_ids)
				next_node = next_numa_node_id = 0;
			if (cpumask_empty(cpumask_of_node(next_node)))
				continue;
			break;
		}
		channel->numa_node = next_node;
		primary = channel;
	}
	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];

	if (cpumask_weight(alloced_mask) ==
	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
		/*
		 * We have cycled through all the CPUs in the node;
		 * reset the alloced map.
		 */
		cpumask_clear(alloced_mask);
	}

	cpumask_xor(&available_mask, alloced_mask,
		    cpumask_of_node(primary->numa_node));

	cur_cpu = -1;

	/*
	 * Normally Hyper-V host doesn't create more subchannels than there
	 * are VCPUs on the node but it is possible when not all present VCPUs
	 * on the node are initialized by guest. Clear the alloced_cpus_in_node
	 * to start over.
	 */
	if (cpumask_equal(&primary->alloced_cpus_in_node,
			  cpumask_of_node(primary->numa_node)))
		cpumask_clear(&primary->alloced_cpus_in_node);

	while (true) {
		cur_cpu = cpumask_next(cur_cpu, &available_mask);
		if (cur_cpu >= nr_cpu_ids) {
			cur_cpu = -1;
			cpumask_copy(&available_mask,
				     cpumask_of_node(primary->numa_node));
			continue;
		}

		/*
		 * NOTE: in the case of sub-channel, we clear the sub-channel
		 * related bit(s) in primary->alloced_cpus_in_node in
		 * hv_process_channel_removal(), so when we reload drivers
		 * like hv_netvsc in SMP guest, here we're able to re-allocate
		 * bit from primary->alloced_cpus_in_node.
		 */
		if (!cpumask_test_cpu(cur_cpu,
				&primary->alloced_cpus_in_node)) {
			cpumask_set_cpu(cur_cpu,
					&primary->alloced_cpus_in_node);
			cpumask_set_cpu(cur_cpu, alloced_mask);
			break;
		}
	}

	channel->target_cpu = cur_cpu;
	channel->target_vp = hv_context.vp_index[cur_cpu];
}
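
/*
 * Worked example (hypothetical topology, for illustration only): on a guest
 * with two NUMA nodes of four VCPUs each (node 0 = CPUs 0-3, node 1 = CPUs
 * 4-7) and a post-Win8 host, the first primary perf channel is placed on
 * node 0/CPU 0, the second primary on node 1/CPU 4, and a sub-channel of
 * the first primary lands on CPU 1, the next CPU of node 0 that is still
 * clear in alloced_cpus_in_node.  Non-perf channels always stay on CPU 0.
 */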

static void vmbus_wait_for_unload(void)
{
	int cpu;
	void *page_addr;
	struct hv_message *msg;
	struct vmbus_channel_message_header *hdr;
	u32 message_type;

	/*
	 * CHANNELMSG_UNLOAD_RESPONSE is always delivered to the CPU which was
	 * used for initial contact or to CPU0 depending on host version. When
	 * we're crashing on a different CPU let's hope that IRQ handler on
	 * the cpu which receives CHANNELMSG_UNLOAD_RESPONSE is still
	 * functional and vmbus_unload_response() will complete
	 * vmbus_connection.unload_event. If not, the last thing we can do is
	 * read message pages for all CPUs directly.
	 */
	while (1) {
		if (completion_done(&vmbus_connection.unload_event))
			break;

		for_each_online_cpu(cpu) {
			page_addr = hv_context.synic_message_page[cpu];
			msg = (struct hv_message *)page_addr +
				VMBUS_MESSAGE_SINT;
			message_type = READ_ONCE(msg->header.message_type);
			if (message_type == HVMSG_NONE)
				continue;
			hdr = (struct vmbus_channel_message_header *)
				msg->u.payload;

			if (hdr->msgtype == CHANNELMSG_UNLOAD_RESPONSE)
				complete(&vmbus_connection.unload_event);

			vmbus_signal_eom(msg, message_type);
		}

		mdelay(10);
	}

	/*
	 * We're crashing and already got the UNLOAD_RESPONSE, cleanup all
	 * maybe-pending messages on all CPUs to be able to receive new
	 * messages after we reconnect.
	 */
	for_each_online_cpu(cpu) {
		page_addr = hv_context.synic_message_page[cpu];
		msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
		msg->header.message_type = HVMSG_NONE;
	}
}

/*
 * vmbus_unload_response - Handler for the unload response.
 */
static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
{
	/*
	 * This is a global event; just wakeup the waiting thread.
	 * Once we successfully unload, we can cleanup the monitor state.
	 */
	complete(&vmbus_connection.unload_event);
}

void vmbus_initiate_unload(bool crash)
{
	struct vmbus_channel_message_header hdr;

	/* Pre-Win2012R2 hosts don't support reconnect */
	if (vmbus_proto_version < VERSION_WIN8_1)
		return;

	init_completion(&vmbus_connection.unload_event);
	memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
	hdr.msgtype = CHANNELMSG_UNLOAD;
	vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header));

	/*
	 * vmbus_initiate_unload() is also called on crash and the crash can be
	 * happening in an interrupt context, where scheduling is impossible.
	 */
	if (!crash)
		wait_for_completion(&vmbus_connection.unload_event);
	else
		vmbus_wait_for_unload();
}

/*
 * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
 *
 */
static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_offer_channel *offer;
	struct vmbus_channel *newchannel;

	offer = (struct vmbus_channel_offer_channel *)hdr;

	/* Allocate the channel object and save this offer. */
	newchannel = alloc_channel();
	if (!newchannel) {
		pr_err("Unable to allocate channel object\n");
		return;
	}

	/*
	 * By default we setup state to enable batched
	 * reading. A specific service can choose to
	 * disable this prior to opening the channel.
	 */
	newchannel->batched_reading = true;
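	/*
	 * Illustrative note (sketch, assuming the set_channel_read_state()
	 * helper from hyperv.h): a service that prefers per-packet
	 * interrupts can turn batched reading off again before vmbus_open(),
	 * e.g. set_channel_read_state(channel, false);
	 */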

	/*
	 * Setup state for signalling the host.
	 */
	newchannel->sig_event = (struct hv_input_signal_event *)
				(ALIGN((unsigned long)
				&newchannel->sig_buf,
				HV_HYPERCALL_PARAM_ALIGN));

	newchannel->sig_event->connectionid.asu32 = 0;
	newchannel->sig_event->connectionid.u.id = VMBUS_EVENT_CONNECTION_ID;
	newchannel->sig_event->flag_number = 0;
	newchannel->sig_event->rsvdz = 0;

	if (vmbus_proto_version != VERSION_WS2008) {
		newchannel->is_dedicated_interrupt =
				(offer->is_dedicated_interrupt != 0);
		newchannel->sig_event->connectionid.u.id =
				offer->connection_id;
	}

	memcpy(&newchannel->offermsg, offer,
	       sizeof(struct vmbus_channel_offer_channel));
	newchannel->monitor_grp = (u8)offer->monitorid / 32;
	newchannel->monitor_bit = (u8)offer->monitorid % 32;

	vmbus_process_offer(newchannel);
}

/*
 * vmbus_onoffer_rescind - Rescind offer handler.
 *
 * We queue a work item to process this offer synchronously
 */
static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_rescind_offer *rescind;
	struct vmbus_channel *channel;
	unsigned long flags;
	struct device *dev;

	rescind = (struct vmbus_channel_rescind_offer *)hdr;

	mutex_lock(&vmbus_connection.channel_mutex);
	channel = relid2channel(rescind->child_relid);

	if (channel == NULL) {
		/*
		 * We should never get here, because in
		 * vmbus_process_offer() we have already invoked
		 * vmbus_release_relid() on error.
		 */
		goto out;
	}

	spin_lock_irqsave(&channel->lock, flags);
	channel->rescind = true;
	spin_unlock_irqrestore(&channel->lock, flags);

	if (channel->device_obj) {
		if (channel->chn_rescind_callback) {
			channel->chn_rescind_callback(channel);
			goto out;
		}
		/*
		 * We will have to unregister this device from the
		 * driver core.
		 */
		dev = get_device(&channel->device_obj->device);
		if (dev) {
			vmbus_device_unregister(channel->device_obj);
			put_device(dev);
		}
	} else {
		hv_process_channel_removal(channel,
			channel->offermsg.child_relid);
	}

out:
	mutex_unlock(&vmbus_connection.channel_mutex);
}

void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
{
	mutex_lock(&vmbus_connection.channel_mutex);

	BUG_ON(!is_hvsock_channel(channel));

	channel->rescind = true;
	vmbus_device_unregister(channel->device_obj);

	mutex_unlock(&vmbus_connection.channel_mutex);
}
EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister);


/*
 * vmbus_onoffers_delivered -
 * This is invoked when all offers have been delivered.
 *
 * Nothing to do here.
 */
static void vmbus_onoffers_delivered(
			struct vmbus_channel_message_header *hdr)
{
}

/*
 * vmbus_onopen_result - Open result handler.
 *
 * This is invoked when we received a response to our channel open request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_onopen_result(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_open_result *result;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_open_channel *openmsg;
	unsigned long flags;

	result = (struct vmbus_channel_open_result *)hdr;

	/*
	 * Find the open msg, copy the result and signal/unblock the wait event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
				msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_OPENCHANNEL) {
			openmsg =
			(struct vmbus_channel_open_channel *)msginfo->msg;
			if (openmsg->child_relid == result->child_relid &&
			    openmsg->openid == result->openid) {
				memcpy(&msginfo->response.open_result,
				       result,
				       sizeof(
					struct vmbus_channel_open_result));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_ongpadl_created - GPADL created handler.
 *
 * This is invoked when we received a response to our gpadl create request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_gpadl_created *gpadlcreated;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_gpadl_header *gpadlheader;
	unsigned long flags;

	gpadlcreated = (struct vmbus_channel_gpadl_created *)hdr;

	/*
	 * Find the establish msg, copy the result and signal/unblock the wait
	 * event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
				msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_GPADL_HEADER) {
			gpadlheader =
			(struct vmbus_channel_gpadl_header *)requestheader;

			if ((gpadlcreated->child_relid ==
			     gpadlheader->child_relid) &&
			    (gpadlcreated->gpadl == gpadlheader->gpadl)) {
				memcpy(&msginfo->response.gpadl_created,
				       gpadlcreated,
				       sizeof(
					struct vmbus_channel_gpadl_created));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_ongpadl_torndown - GPADL torndown handler.
 *
 * This is invoked when we received a response to our gpadl teardown request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_ongpadl_torndown(
			struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_gpadl_torndown *gpadl_torndown;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_gpadl_teardown *gpadl_teardown;
	unsigned long flags;

	gpadl_torndown = (struct vmbus_channel_gpadl_torndown *)hdr;

	/*
	 * Find the open msg, copy the result and signal/unblock the wait event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
				msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_GPADL_TEARDOWN) {
			gpadl_teardown =
			(struct vmbus_channel_gpadl_teardown *)requestheader;

			if (gpadl_torndown->gpadl == gpadl_teardown->gpadl) {
				memcpy(&msginfo->response.gpadl_torndown,
				       gpadl_torndown,
				       sizeof(
					struct vmbus_channel_gpadl_torndown));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_onversion_response - Version response handler
 *
 * This is invoked when we received a response to our initiate contact request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_onversion_response(
		struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_version_response *version_response;
	unsigned long flags;

	version_response = (struct vmbus_channel_version_response *)hdr;
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
				msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype ==
		    CHANNELMSG_INITIATE_CONTACT) {
			memcpy(&msginfo->response.version_response,
			      version_response,
			      sizeof(struct vmbus_channel_version_response));
			complete(&msginfo->waitevent);
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/* Channel message dispatch table */
struct vmbus_channel_message_table_entry
	channel_message_table[CHANNELMSG_COUNT] = {
	{CHANNELMSG_INVALID,			0, NULL},
	{CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer},
	{CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind},
	{CHANNELMSG_REQUESTOFFERS,		0, NULL},
	{CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered},
	{CHANNELMSG_OPENCHANNEL,		0, NULL},
	{CHANNELMSG_OPENCHANNEL_RESULT,		1, vmbus_onopen_result},
	{CHANNELMSG_CLOSECHANNEL,		0, NULL},
	{CHANNELMSG_GPADL_HEADER,		0, NULL},
	{CHANNELMSG_GPADL_BODY,			0, NULL},
	{CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created},
	{CHANNELMSG_GPADL_TEARDOWN,		0, NULL},
	{CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown},
	{CHANNELMSG_RELID_RELEASED,		0, NULL},
	{CHANNELMSG_INITIATE_CONTACT,		0, NULL},
	{CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response},
	{CHANNELMSG_UNLOAD,			0, NULL},
	{CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response},
	{CHANNELMSG_18,				0, NULL},
	{CHANNELMSG_19,				0, NULL},
	{CHANNELMSG_20,				0, NULL},
	{CHANNELMSG_TL_CONNECT_REQUEST,		0, NULL},
};

/*
 * vmbus_onmessage - Handler for channel protocol messages.
 *
 * This is invoked in the vmbus worker thread context.
 */
void vmbus_onmessage(void *context)
{
	struct hv_message *msg = context;
	struct vmbus_channel_message_header *hdr;
	int size;

	hdr = (struct vmbus_channel_message_header *)msg->u.payload;
	size = msg->header.payload_size;

	if (hdr->msgtype >= CHANNELMSG_COUNT) {
		pr_err("Received invalid channel message type %d size %d\n",
			   hdr->msgtype, size);
		print_hex_dump_bytes("", DUMP_PREFIX_NONE,
				     (unsigned char *)msg->u.payload, size);
		return;
	}

	if (channel_message_table[hdr->msgtype].message_handler)
		channel_message_table[hdr->msgtype].message_handler(hdr);
	else
		pr_err("Unhandled channel message type %d\n", hdr->msgtype);
}

/*
 * vmbus_request_offers - Send a request to get all our pending offers.
 */
int vmbus_request_offers(void)
{
	struct vmbus_channel_message_header *msg;
	struct vmbus_channel_msginfo *msginfo;
	int ret;

	msginfo = kmalloc(sizeof(*msginfo) +
			  sizeof(struct vmbus_channel_message_header),
			  GFP_KERNEL);
	if (!msginfo)
		return -ENOMEM;

	msg = (struct vmbus_channel_message_header *)msginfo->msg;

	msg->msgtype = CHANNELMSG_REQUESTOFFERS;


	ret = vmbus_post_msg(msg,
			       sizeof(struct vmbus_channel_message_header));
	if (ret != 0) {
		pr_err("Unable to request offers - %d\n", ret);

		goto cleanup;
	}

cleanup:
	kfree(msginfo);

	return ret;
}

/*
 * Retrieve the (sub) channel on which to send an outgoing request.
 * When a primary channel has multiple sub-channels, we try to
 * distribute the load equally amongst all available channels.
 */
struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
{
	struct list_head *cur, *tmp;
	int cur_cpu;
	struct vmbus_channel *cur_channel;
	struct vmbus_channel *outgoing_channel = primary;
	int next_channel;
	int i = 1;

	if (list_empty(&primary->sc_list))
		return outgoing_channel;

	next_channel = primary->next_oc++;

	if (next_channel > (primary->num_sc)) {
		primary->next_oc = 0;
		return outgoing_channel;
	}

	cur_cpu = hv_context.vp_index[get_cpu()];
	put_cpu();
	list_for_each_safe(cur, tmp, &primary->sc_list) {
		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
		if (cur_channel->state != CHANNEL_OPENED_STATE)
			continue;

		if (cur_channel->target_vp == cur_cpu)
			return cur_channel;

		if (i == next_channel)
			return cur_channel;

		i++;
	}

	return outgoing_channel;
}
EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
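
/*
 * Usage sketch (illustrative; device->channel, request and req_id are
 * hypothetical driver state): a storage or network driver that has opened
 * sub-channels typically picks the transmit channel per request:
 *
 *	struct vmbus_channel *outgoing;
 *
 *	outgoing = vmbus_get_outgoing_channel(device->channel);
 *	vmbus_sendpacket(outgoing, &request, sizeof(request), req_id,
 *			 VM_PKT_DATA_INBAND,
 *			 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 */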

static void invoke_sc_cb(struct vmbus_channel *primary_channel)
{
	struct list_head *cur, *tmp;
	struct vmbus_channel *cur_channel;

	if (primary_channel->sc_creation_callback == NULL)
		return;

	list_for_each_safe(cur, tmp, &primary_channel->sc_list) {
		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);

		primary_channel->sc_creation_callback(cur_channel);
	}
}

void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel,
				void (*sc_cr_cb)(struct vmbus_channel *new_sc))
{
	primary_channel->sc_creation_callback = sc_cr_cb;
}
EXPORT_SYMBOL_GPL(vmbus_set_sc_create_callback);

bool vmbus_are_subchannels_present(struct vmbus_channel *primary)
{
	bool ret;

	ret = !list_empty(&primary->sc_list);

	if (ret) {
		/*
		 * Invoke the callback on sub-channel creation.
		 * This will present a uniform interface to the
		 * clients.
		 */
		invoke_sc_cb(primary);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(vmbus_are_subchannels_present);
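
/*
 * Illustrative sketch (my_sc_open is a hypothetical handler): a driver that
 * wants sub-channels registers its creation callback on the primary channel
 * before requesting them, and can probe for ones that already exist:
 *
 *	vmbus_set_sc_create_callback(primary, my_sc_open);
 *	if (vmbus_are_subchannels_present(primary))
 *		pr_info("sub-channels already created\n");
 */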

void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel,
		void (*chn_rescind_cb)(struct vmbus_channel *))
{
	channel->chn_rescind_callback = chn_rescind_cb;
}
EXPORT_SYMBOL_GPL(vmbus_set_chn_rescind_callback);