blkback.c 36.6 KB
Newer Older
K
Konrad Rzeszutek Wilk 已提交
1 2 3 4 5 6
/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
J
Jeremy Fitzhardinge 已提交
41
#include <linux/freezer.h>
42
#include <linux/bitmap.h>
43

J
Jeremy Fitzhardinge 已提交
44 45
#include <xen/events.h>
#include <xen/page.h>
46
#include <xen/xen.h>
J
Jeremy Fitzhardinge 已提交
47 48
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
49
#include <xen/balloon.h>
K
Konrad Rzeszutek Wilk 已提交
50 51
#include "common.h"

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
/*
 * Maximum number of unused free pages to keep in the internal buffer.
 * Setting this to a value that is too low will reduce the memory used by
 * each backend, but can have a performance penalty.
 *
 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but it
 * can be set to a lower value, which might degrade performance on some
 * intensive IO workloads.
 *
 * Exposed as the "max_buffer_pages" module parameter (mode 0644).
 */

static int xen_blkif_max_buffer_pages = 704;
module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
MODULE_PARM_DESC(max_buffer_pages,
"Maximum number of free pages to keep in each block backend buffer");

67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
/*
 * Maximum number of grants to map persistently in blkback. For maximum
 * performance this should be the total number of grants that can be used
 * to fill the ring, but since this might become too high, especially with
 * the use of indirect descriptors, we set it to a value that provides good
 * performance without using too much memory.
 *
 * When the list of persistent grants is full we clean it up using an LRU
 * algorithm.
 *
 * Exposed as the "max_persistent_grants" module parameter (mode 0644).
 */

static int xen_blkif_max_pgrants = 352;
module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants,
                 "Maximum number of grants to map persistently");

/*
 * The LRU mechanism that cleans the list of persistent grants needs to
 * be executed periodically. LRU_INTERVAL is the time between two
 * consecutive executions of the purge mechanism, in milliseconds.
 */
#define LRU_INTERVAL 100

/*
 * When the persistent grants list is full we remove unused grants
 * from it. LRU_PERCENT_CLEAN is the percentage (of the configured
 * maximum number of persistent grants) to remove on each LRU execution.
 */
#define LRU_PERCENT_CLEAN 5

K
Konrad Rzeszutek Wilk 已提交
97
/* Run-time switchable: /sys/module/blkback/parameters/ */
98
static unsigned int log_stats;
K
Konrad Rzeszutek Wilk 已提交
99 100 101 102
module_param(log_stats, int, 0644);

#define BLKBACK_INVALID_HANDLE (~0)

103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
/* Number of free pages to remove on each call to free_xenballooned_pages */
#define NUM_BATCH_FREE_PAGES 10

/*
 * Obtain one page to back a grant mapping.
 *
 * If blkif's pool of free pages is not empty, the first page is removed
 * from the pool and stored in page[0]. Otherwise a new page is requested
 * through alloc_xenballooned_pages().
 *
 * Returns 0 when a pooled page was handed out; when the pool was empty,
 * returns whatever alloc_xenballooned_pages() returned.
 */
static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
{
	unsigned long flags;

	spin_lock_irqsave(&blkif->free_pages_lock, flags);
	if (list_empty(&blkif->free_pages)) {
		/* The counter and the list must always agree. */
		BUG_ON(blkif->free_pages_num != 0);
		spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
		return alloc_xenballooned_pages(1, page, false);
	}
	BUG_ON(blkif->free_pages_num == 0);
	page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
	list_del(&page[0]->lru);
	blkif->free_pages_num--;
	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);

	return 0;
}

static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
                                  int num)
K
Konrad Rzeszutek Wilk 已提交
127
{
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
	unsigned long flags;
	int i;

	spin_lock_irqsave(&blkif->free_pages_lock, flags);
	for (i = 0; i < num; i++)
		list_add(&page[i]->lru, &blkif->free_pages);
	blkif->free_pages_num += num;
	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
}

/*
 * Shrink blkif's pool of free pages until it holds at most 'num' pages.
 *
 * Pages are unlinked from the pool under free_pages_lock and collected
 * in a local array; every NUM_BATCH_FREE_PAGES pages the lock is dropped
 * and the batch is handed to free_xenballooned_pages(). Any remaining
 * partial batch is released after the loop.
 */
static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
{
	/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
	struct page *page[NUM_BATCH_FREE_PAGES];
	unsigned int num_pages = 0;
	unsigned long flags;

	spin_lock_irqsave(&blkif->free_pages_lock, flags);
	while (blkif->free_pages_num > num) {
		BUG_ON(list_empty(&blkif->free_pages));
		page[num_pages] = list_first_entry(&blkif->free_pages,
		                                   struct page, lru);
		list_del(&page[num_pages]->lru);
		blkif->free_pages_num--;
		if (++num_pages == NUM_BATCH_FREE_PAGES) {
			/* Release the batch without holding the lock. */
			spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
			free_xenballooned_pages(num_pages, page);
			spin_lock_irqsave(&blkif->free_pages_lock, flags);
			num_pages = 0;
		}
	}
	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
	if (num_pages != 0)
		free_xenballooned_pages(num_pages, page);
}

164 165
/* Kernel address of 'page' (via its PFN), as an unsigned long. */
#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))

166 167
static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
168 169
				struct blkif_request *req,
				struct pending_req *pending_req);
170
static void make_response(struct xen_blkif *blkif, u64 id,
K
Konrad Rzeszutek Wilk 已提交
171 172
			  unsigned short op, int st);

173 174
/*
 * Iterate over every entry of the red-black tree 'rbtree'.
 *
 * 'pos' is the cursor (a pointer to the containing structure), 'node' is
 * the name of the struct rb_node member inside that structure, and 'n'
 * is a struct rb_node pointer that always holds the successor of the
 * current entry, fetched before the loop body runs.
 *
 * The test "&(pos)->node != NULL" undoes the offset applied by
 * container_of(), so it is true exactly when the underlying rb_node
 * pointer returned by rb_first()/rb_next() was non-NULL.
 */
#define foreach_grant_safe(pos, n, rbtree, node) \
	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
	     &(pos)->node != NULL; \
	     (pos) = container_of(n, typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
179 180


181 182 183 184 185 186 187 188 189 190 191
/*
 * We don't need locking around the persistent grant helpers
 * because blkback uses a single-thread for each backed, so we
 * can be sure that this functions will never be called recursively.
 *
 * The only exception to that is put_persistent_grant, that can be called
 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 * bit operations to modify the flags of a persistent grant and to count
 * the number of used grants.
 */
static int add_persistent_gnt(struct xen_blkif *blkif,
192 193
			       struct persistent_gnt *persistent_gnt)
{
194
	struct rb_node **new = NULL, *parent = NULL;
195 196
	struct persistent_gnt *this;

197 198 199 200 201
	if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
		if (!blkif->vbd.overflow_max_grants)
			blkif->vbd.overflow_max_grants = 1;
		return -EBUSY;
	}
202
	/* Figure out where to put new node */
203
	new = &blkif->persistent_gnts.rb_node;
204 205 206 207 208 209 210 211 212
	while (*new) {
		this = container_of(*new, struct persistent_gnt, node);

		parent = *new;
		if (persistent_gnt->gnt < this->gnt)
			new = &((*new)->rb_left);
		else if (persistent_gnt->gnt > this->gnt)
			new = &((*new)->rb_right);
		else {
213 214
			pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
			return -EINVAL;
215 216 217
		}
	}

218 219
	bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
	set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
220 221
	/* Add new node and rebalance tree. */
	rb_link_node(&(persistent_gnt->node), parent, new);
222 223 224
	rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
	blkif->persistent_gnt_c++;
	atomic_inc(&blkif->persistent_gnt_in_use);
225
	return 0;
226 227
}

228
static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
229 230 231
						 grant_ref_t gref)
{
	struct persistent_gnt *data;
232
	struct rb_node *node = NULL;
233

234
	node = blkif->persistent_gnts.rb_node;
235 236 237 238 239 240 241
	while (node) {
		data = container_of(node, struct persistent_gnt, node);

		if (gref < data->gnt)
			node = node->rb_left;
		else if (gref > data->gnt)
			node = node->rb_right;
242 243 244 245 246 247 248
		else {
			if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
				pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
				return NULL;
			}
			set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
			atomic_inc(&blkif->persistent_gnt_in_use);
249
			return data;
250
		}
251 252 253 254
	}
	return NULL;
}

255 256 257 258 259 260 261 262 263 264
/*
 * Release a persistent grant previously claimed with get_persistent_gnt()
 * or add_persistent_gnt().
 *
 * The grant is flagged WAS_ACTIVE (so the LRU purge knows it has been
 * used recently), its ACTIVE flag is cleared and the in-use counter is
 * decremented. Only atomic bit/counter operations are used here. An
 * alert is logged if the grant was not marked ACTIVE on entry.
 */
static void put_persistent_gnt(struct xen_blkif *blkif,
                               struct persistent_gnt *persistent_gnt)
{
	if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
		pr_alert_ratelimited(DRV_PFX " freeing a grant already unused\n");
	set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
	clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
	atomic_dec(&blkif->persistent_gnt_in_use);
}

265 266
static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
                                 unsigned int num)
267 268 269 270
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt;
271
	struct rb_node *n;
272 273 274
	int ret = 0;
	int segs_to_unmap = 0;

275
	foreach_grant_safe(persistent_gnt, n, root, node) {
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);
		gnttab_set_unmap_op(&unmap[segs_to_unmap],
			(unsigned long) pfn_to_kaddr(page_to_pfn(
				persistent_gnt->page)),
			GNTMAP_host_map,
			persistent_gnt->handle);

		pages[segs_to_unmap] = persistent_gnt->page;

		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
			!rb_next(&persistent_gnt->node)) {
			ret = gnttab_unmap_refs(unmap, NULL, pages,
				segs_to_unmap);
			BUG_ON(ret);
291
			put_free_pages(blkif, pages, segs_to_unmap);
292 293
			segs_to_unmap = 0;
		}
294 295 296 297

		rb_erase(&persistent_gnt->node, root);
		kfree(persistent_gnt);
		num--;
298 299 300 301
	}
	BUG_ON(num != 0);
}

302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
/*
 * Work handler: unmap and free all grants queued on the blkif's
 * persistent_purge_list.
 *
 * 'work' is the persistent_purge_work member embedded in a struct
 * xen_blkif. Grants are taken off the list one by one; their unmap
 * operations are submitted in batches of at most
 * BLKIF_MAX_SEGMENTS_PER_REQUEST and the pages of each batch are returned
 * to the blkif's free-page pool.
 */
static void unmap_purged_grants(struct work_struct *work)
{
	struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
	struct list_head *purge_list = &blkif->persistent_purge_list;
	struct gnttab_unmap_grant_ref unmap_ops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *batch_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *gnt;
	int batched = 0;
	int err;

	for (;;) {
		if (list_empty(purge_list))
			break;

		gnt = list_first_entry(purge_list, struct persistent_gnt,
				       remove_node);
		list_del(&gnt->remove_node);

		gnttab_set_unmap_op(&unmap_ops[batched], vaddr(gnt->page),
				    GNTMAP_host_map, gnt->handle);
		batch_pages[batched] = gnt->page;
		batched++;

		/* Submit the batch as soon as it is full. */
		if (batched == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
			err = gnttab_unmap_refs(unmap_ops, NULL, batch_pages,
						batched);
			BUG_ON(err);
			put_free_pages(blkif, batch_pages, batched);
			batched = 0;
		}
		kfree(gnt);
	}

	/* Nothing left over from a partial batch. */
	if (batched <= 0)
		return;

	err = gnttab_unmap_refs(unmap_ops, NULL, batch_pages, batched);
	BUG_ON(err);
	put_free_pages(blkif, batch_pages, batched);
}

/*
 * LRU purge of blkif's tree of persistent grants.
 *
 * Nothing is done unless the tree holds more than xen_blkif_max_pgrants
 * grants, or exactly that many with the overflow flag set. The purge is
 * also skipped when the work item of a previous purge is still pending,
 * or when the number of grants to remove exceeds the number of grants
 * that are currently not in use.
 *
 * Selected grants are erased from the tree and queued on
 * persistent_purge_list; the actual unmapping is deferred to
 * unmap_purged_grants() via schedule_work().
 */
static void purge_persistent_gnt(struct xen_blkif *blkif)
{
	struct persistent_gnt *persistent_gnt;
	struct rb_node *n;
	unsigned int num_clean, total;
	bool scan_used = false;
	struct rb_root *root;

	if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
	    (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
	    !blkif->vbd.overflow_max_grants)) {
		return;
	}

	if (work_pending(&blkif->persistent_purge_work)) {
		pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
		return;
	}

	/*
	 * Target: LRU_PERCENT_CLEAN percent of the maximum (computed with
	 * integer division first), plus however many grants we are above
	 * the maximum, capped at the current number of grants.
	 */
	num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
	num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
	num_clean = min(blkif->persistent_gnt_c, num_clean);
	if (num_clean >
	    (blkif->persistent_gnt_c -
	    atomic_read(&blkif->persistent_gnt_in_use)))
		return;

	/*
	 * At this point, we can be sure that there will be no calls
	 * to get_persistent_gnt (because we are executing this code from
	 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
	 * which means that the number of currently used grants will go down,
	 * but never up, so we will always be able to remove the requested
	 * number of grants.
	 */

	total = num_clean;

	pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);

	INIT_LIST_HEAD(&blkif->persistent_purge_list);
	root = &blkif->persistent_gnts;
purge_list:
	foreach_grant_safe(persistent_gnt, n, root, node) {
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);

		/* Never purge a grant that is in use right now. */
		if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
			continue;
		/* First pass only: spare grants used since the last purge. */
		if (!scan_used &&
		    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
			continue;

		rb_erase(&persistent_gnt->node, root);
		list_add(&persistent_gnt->remove_node,
		         &blkif->persistent_purge_list);
		if (--num_clean == 0)
			goto finished;
	}
	/*
	 * If we get here it means we also need to start cleaning
	 * grants that were used since last purge in order to cope
	 * with the requested num
	 */
	if (!scan_used) {
		pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
		scan_used = true;
		goto purge_list;
	}
finished:
	/* Remove the "used" flag from all the persistent grants */
	foreach_grant_safe(persistent_gnt, n, root, node) {
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);
		clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
	}
	/* (total - num_clean) is the number of grants actually queued. */
	blkif->persistent_gnt_c -= (total - num_clean);
	blkif->vbd.overflow_max_grants = 0;

	/* We can defer this work */
	INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
	schedule_work(&blkif->persistent_purge_work);
	pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
	return;
}

425 426
/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
K
Konrad Rzeszutek Wilk 已提交
427
 */
428
static struct pending_req *alloc_req(struct xen_blkif *blkif)
K
Konrad Rzeszutek Wilk 已提交
429
{
430
	struct pending_req *req = NULL;
K
Konrad Rzeszutek Wilk 已提交
431 432
	unsigned long flags;

433 434 435
	spin_lock_irqsave(&blkif->pending_free_lock, flags);
	if (!list_empty(&blkif->pending_free)) {
		req = list_entry(blkif->pending_free.next, struct pending_req,
436
				 free_list);
K
Konrad Rzeszutek Wilk 已提交
437 438
		list_del(&req->free_list);
	}
439
	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
K
Konrad Rzeszutek Wilk 已提交
440 441 442
	return req;
}

443 444 445 446
/*
 * Return the 'pending_req' structure back to the freepool. We also
 * wake up the thread if it was waiting for a free page.
 */
447
static void free_req(struct xen_blkif *blkif, struct pending_req *req)
K
Konrad Rzeszutek Wilk 已提交
448 449 450 451
{
	unsigned long flags;
	int was_empty;

452 453 454 455
	spin_lock_irqsave(&blkif->pending_free_lock, flags);
	was_empty = list_empty(&blkif->pending_free);
	list_add(&req->free_list, &blkif->pending_free);
	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
K
Konrad Rzeszutek Wilk 已提交
456
	if (was_empty)
457
		wake_up(&blkif->pending_free_wq);
K
Konrad Rzeszutek Wilk 已提交
458 459
}

460 461 462
/*
 * Routines for managing virtual block devices (vbds).
 */
463 464
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
			     int operation)
465
{
466
	struct xen_vbd *vbd = &blkif->vbd;
467 468 469 470 471
	int rc = -EACCES;

	if ((operation != READ) && vbd->readonly)
		goto out;

472 473 474 475 476 477 478 479
	if (likely(req->nr_sects)) {
		blkif_sector_t end = req->sector_number + req->nr_sects;

		if (unlikely(end < req->sector_number))
			goto out;
		if (unlikely(end > vbd_sz(vbd)))
			goto out;
	}
480 481 482 483 484 485 486 487 488

	req->dev  = vbd->pdevice;
	req->bdev = vbd->bdev;
	rc = 0;

 out:
	return rc;
}

489
static void xen_vbd_resize(struct xen_blkif *blkif)
490
{
491
	struct xen_vbd *vbd = &blkif->vbd;
492 493
	struct xenbus_transaction xbt;
	int err;
494
	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
495
	unsigned long long new_size = vbd_sz(vbd);
496

497
	pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
498
		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
499
	pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
500 501 502 503
	vbd->size = new_size;
again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
504
		pr_warn(DRV_PFX "Error starting transaction");
505 506 507
		return;
	}
	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
508
			    (unsigned long long)vbd_sz(vbd));
509
	if (err) {
510
		pr_warn(DRV_PFX "Error writing new size");
511 512 513 514 515 516 517 518 519
		goto abort;
	}
	/*
	 * Write the current state; we will use this to synchronize
	 * the front-end. If the current state is "connected" the
	 * front-end will get the new size information online.
	 */
	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
	if (err) {
520
		pr_warn(DRV_PFX "Error writing the state");
521 522 523 524 525 526 527
		goto abort;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
528
		pr_warn(DRV_PFX "Error ending transaction");
529
	return;
530 531 532 533
abort:
	xenbus_transaction_end(xbt, 1);
}

534
/*
535 536
 * Notification from the guest OS.
 */
537
static void blkif_notify_work(struct xen_blkif *blkif)
K
Konrad Rzeszutek Wilk 已提交
538
{
539 540 541
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}
K
Konrad Rzeszutek Wilk 已提交
542

543
irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
544 545 546
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
K
Konrad Rzeszutek Wilk 已提交
547 548
}

549
/*
K
Konrad Rzeszutek Wilk 已提交
550 551 552
 * SCHEDULER FUNCTIONS
 */

553
static void print_stats(struct xen_blkif *blkif)
K
Konrad Rzeszutek Wilk 已提交
554
{
555
	pr_info("xen-blkback (%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
556
		 "  |  ds %4llu | pg: %4u/%4d\n",
557
		 current->comm, blkif->st_oo_req,
558
		 blkif->st_rd_req, blkif->st_wr_req,
559 560
		 blkif->st_f_req, blkif->st_ds_req,
		 blkif->persistent_gnt_c,
561
		 xen_blkif_max_pgrants);
K
Konrad Rzeszutek Wilk 已提交
562 563 564 565
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
566
	blkif->st_ds_req = 0;
K
Konrad Rzeszutek Wilk 已提交
567 568
}

569
int xen_blkif_schedule(void *arg)
K
Konrad Rzeszutek Wilk 已提交
570
{
571
	struct xen_blkif *blkif = arg;
572
	struct xen_vbd *vbd = &blkif->vbd;
573
	unsigned long timeout;
K
Konrad Rzeszutek Wilk 已提交
574

575
	xen_blkif_get(blkif);
K
Konrad Rzeszutek Wilk 已提交
576 577 578 579

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;
580
		if (unlikely(vbd->size != vbd_sz(vbd)))
581
			xen_vbd_resize(blkif);
K
Konrad Rzeszutek Wilk 已提交
582

583 584 585
		timeout = msecs_to_jiffies(LRU_INTERVAL);

		timeout = wait_event_interruptible_timeout(
K
Konrad Rzeszutek Wilk 已提交
586
			blkif->wq,
587 588 589 590 591
			blkif->waiting_reqs || kthread_should_stop(),
			timeout);
		if (timeout == 0)
			goto purge_gnt_list;
		timeout = wait_event_interruptible_timeout(
592 593
			blkif->pending_free_wq,
			!list_empty(&blkif->pending_free) ||
594 595 596 597
			kthread_should_stop(),
			timeout);
		if (timeout == 0)
			goto purge_gnt_list;
K
Konrad Rzeszutek Wilk 已提交
598 599 600 601 602 603 604

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;

605 606 607 608 609 610 611
purge_gnt_list:
		if (blkif->vbd.feature_gnt_persistent &&
		    time_after(jiffies, blkif->next_lru)) {
			purge_persistent_gnt(blkif);
			blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
		}

612 613 614
		/* Shrink if we have more than xen_blkif_max_buffer_pages */
		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);

K
Konrad Rzeszutek Wilk 已提交
615 616 617 618
		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

619 620 621
	/* Since we are shutting down remove all pages from the buffer */
	shrink_free_pagepool(blkif, 0 /* All */);

622
	/* Free all persistent grant pages */
623
	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
624
		free_persistent_gnts(blkif, &blkif->persistent_gnts,
625
			blkif->persistent_gnt_c);
626 627

	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
628
	blkif->persistent_gnt_c = 0;
629

K
Konrad Rzeszutek Wilk 已提交
630 631 632 633
	if (log_stats)
		print_stats(blkif);

	blkif->xenblkd = NULL;
634
	xen_blkif_put(blkif);
K
Konrad Rzeszutek Wilk 已提交
635 636 637 638

	return 0;
}

639
/*
 * Per-segment buffer description: 'offset' is filled by
 * xen_blkbk_map_seg() with the segment's first sector shifted left by 9.
 * NOTE(review): 'nsec' is presumably the number of sectors in the
 * segment; it is not assigned in this part of the file -- confirm.
 */
struct seg_buf {
	unsigned int offset;
	unsigned int nsec;
};
643 644 645
/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
646
 */
647 648 649 650 651
static void xen_blkbk_unmap(struct xen_blkif *blkif,
                            grant_handle_t handles[],
                            struct page *pages[],
                            struct persistent_gnt *persistent_gnts[],
                            int num)
652 653
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
654
	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
655 656 657
	unsigned int i, invcount = 0;
	int ret;

658 659 660
	for (i = 0; i < num; i++) {
		if (persistent_gnts[i] != NULL) {
			put_persistent_gnt(blkif, persistent_gnts[i]);
661
			continue;
662
		}
663
		if (handles[i] == BLKBACK_INVALID_HANDLE)
664
			continue;
665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680
		unmap_pages[invcount] = pages[i];
		gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]),
				    GNTMAP_host_map, handles[i]);
		handles[i] = BLKBACK_INVALID_HANDLE;
		if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
			                        invcount);
			BUG_ON(ret);
			put_free_pages(blkif, unmap_pages, invcount);
			invcount = 0;
		}
	}
	if (invcount) {
		ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
		BUG_ON(ret);
		put_free_pages(blkif, unmap_pages, invcount);
681 682
	}
}
683

684 685 686 687 688
/*
 * Map the 'num' grant references in grefs[] into this domain.
 *
 * For every segment i the function fills pages[i], persistent_gnts[i]
 * (NULL when the segment is not backed by a persistent grant) and, for
 * newly mapped grants, handles[i]. Segments are processed in batches of
 * at most BLKIF_MAX_SEGMENTS_PER_REQUEST new mappings.
 *
 * When the backend uses persistent grants, an already known grant is
 * reused; a newly mapped one is added to the tree while there is room,
 * otherwise it is simply used as a non-persistent grant. 'ro' requests a
 * read-only mapping, which is only honoured when persistent grants are
 * not in use.
 *
 * Returns 0 on success, a non-zero value if at least one grant in any
 * batch could not be mapped (its handle is then set to
 * BLKBACK_INVALID_HANDLE), or -ENOMEM if no page could be obtained.
 *
 * NOTE(review): on -ENOMEM only the pages of the current, not yet mapped
 * batch are returned to the pool here; releasing earlier batches is
 * presumably left to the caller -- confirm.
 */
static int xen_blkbk_map(struct xen_blkif *blkif, grant_ref_t grefs[],
			 struct persistent_gnt *persistent_gnts[],
			 grant_handle_t handles[],
			 struct page *pages[],
			 int num, bool ro)
{
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt = NULL;
	phys_addr_t addr = 0;
	int i, seg_idx, new_map_idx;
	int segs_to_map = 0;
	int ret = 0;
	int map_err;
	int last_map = 0, map_until = 0;
	int use_persistent_gnts;

	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

	/*
	 * Set up map[..] with the address of a page in our domain and the
	 * corresponding grant reference for each segment that is not
	 * already covered by a persistent grant.
	 */
again:
	for (i = map_until; i < num; i++) {
		uint32_t flags;

		if (use_persistent_gnts)
			persistent_gnt = get_persistent_gnt(
				blkif,
				grefs[i]);

		if (persistent_gnt) {
			/*
			 * We are using persistent grants and
			 * the grant is already mapped
			 */
			pages[i] = persistent_gnt->page;
			persistent_gnts[i] = persistent_gnt;
		} else {
			if (get_free_page(blkif, &pages[i]))
				goto out_of_memory;
			addr = vaddr(pages[i]);
			pages_to_gnt[segs_to_map] = pages[i];
			persistent_gnts[i] = NULL;
			flags = GNTMAP_host_map;
			if (!use_persistent_gnts && ro)
				flags |= GNTMAP_readonly;
			gnttab_set_map_op(&map[segs_to_map++], addr,
					  flags, grefs[i],
					  blkif->domid);
		}
		map_until = i + 1;
		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
			break;
	}

	if (segs_to_map) {
		/*
		 * Use a separate variable for the hypercall result so that
		 * a per-grant failure recorded in 'ret' by an earlier batch
		 * is not overwritten by a later one.
		 */
		map_err = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
		BUG_ON(map_err);
	}

	/*
	 * Now swizzle the MFN in our domain with the MFN from the other domain
	 * so that when we access vaddr(pending_req,i) it has the contents of
	 * the page from the other domain.
	 */
	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
		if (!persistent_gnts[seg_idx]) {
			/* This is a newly mapped grant */
			BUG_ON(new_map_idx >= segs_to_map);
			if (unlikely(map[new_map_idx].status != 0)) {
				pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
				handles[seg_idx] = BLKBACK_INVALID_HANDLE;
				ret |= 1;
				goto next;
			}
			handles[seg_idx] = map[new_map_idx].handle;
		} else {
			/* Reused persistent grant: nothing was mapped for it. */
			continue;
		}
		if (use_persistent_gnts &&
		    blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
			/*
			 * We are using persistent grants, the grant is
			 * not mapped but we might have room for it.
			 */
			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
				                 GFP_KERNEL);
			if (!persistent_gnt) {
				/*
				 * If we don't have enough memory to
				 * allocate the persistent_gnt struct
				 * map this grant non-persistently
				 */
				goto next;
			}
			persistent_gnt->gnt = map[new_map_idx].ref;
			persistent_gnt->handle = map[new_map_idx].handle;
			persistent_gnt->page = pages[seg_idx];
			if (add_persistent_gnt(blkif,
			                       persistent_gnt)) {
				kfree(persistent_gnt);
				persistent_gnt = NULL;
				goto next;
			}
			persistent_gnts[seg_idx] = persistent_gnt;
			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
				 persistent_gnt->gnt, blkif->persistent_gnt_c,
				 xen_blkif_max_pgrants);
			goto next;
		}
		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
			blkif->vbd.overflow_max_grants = 1;
			pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
			         blkif->domid, blkif->vbd.handle);
		}
		/*
		 * We could not map this grant persistently, so use it as
		 * a non-persistent grant.
		 */
next:
		new_map_idx++;
	}
	/* Start the next batch where this one stopped. */
	segs_to_map = 0;
	last_map = map_until;
	if (map_until != num)
		goto again;

	return ret;

out_of_memory:
	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
	put_free_pages(blkif, pages_to_gnt, segs_to_map);
	return -ENOMEM;
}

821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
/*
 * Map all segments of the read/write request 'req' for 'pending_req'.
 *
 * The grant references of the request are collected and handed to
 * xen_blkbk_map() together with the persistent grant, handle and page
 * arrays of 'pending_req'. The mapping is requested read-only for every
 * operation other than BLKIF_OP_READ. On success seg[i].offset is set to
 * the segment's first sector multiplied by 512.
 *
 * Returns 0 on success or the non-zero result of xen_blkbk_map().
 * The 'pages' argument is not used by this function.
 */
static int xen_blkbk_map_seg(struct blkif_request *req,
			     struct pending_req *pending_req,
			     struct seg_buf seg[],
			     struct page *pages[])
{
	grant_ref_t grefs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int idx;
	int err;

	/* Gather the grant reference of every segment. */
	idx = 0;
	while (idx < req->u.rw.nr_segments) {
		grefs[idx] = req->u.rw.seg[idx].gref;
		idx++;
	}

	err = xen_blkbk_map(pending_req->blkif,
			    grefs,
			    pending_req->persistent_gnts,
			    pending_req->grant_handles,
			    pending_req->pages,
			    req->u.rw.nr_segments,
			    (pending_req->operation != BLKIF_OP_READ));
	if (err != 0)
		return err;

	/* Byte offset of each segment inside its page. */
	idx = 0;
	while (idx < req->u.rw.nr_segments) {
		seg[idx].offset = (req->u.rw.seg[idx].first_sect << 9);
		idx++;
	}

	return 0;
}

846 847
static int dispatch_discard_io(struct xen_blkif *blkif,
				struct blkif_request *req)
848 849 850 851
{
	int err = 0;
	int status = BLKIF_RSP_OKAY;
	struct block_device *bdev = blkif->vbd.bdev;
852
	unsigned long secure;
853

854 855 856
	blkif->st_ds_req++;

	xen_blkif_get(blkif);
857 858 859 860 861 862 863
	secure = (blkif->vbd.discard_secure &&
		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
		 BLKDEV_DISCARD_SECURE : 0;

	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
				   req->u.discard.nr_sectors,
				   GFP_KERNEL, secure);
864 865 866 867 868 869 870

	if (err == -EOPNOTSUPP) {
		pr_debug(DRV_PFX "discard op failed, not supported\n");
		status = BLKIF_RSP_EOPNOTSUPP;
	} else if (err)
		status = BLKIF_RSP_ERROR;

871
	make_response(blkif, req->u.discard.id, req->operation, status);
872 873
	xen_blkif_put(blkif);
	return err;
874 875
}

876 877 878 879
static int dispatch_other_io(struct xen_blkif *blkif,
			     struct blkif_request *req,
			     struct pending_req *pending_req)
{
880
	free_req(blkif, pending_req);
881 882 883 884 885
	make_response(blkif, req->u.other.id, req->operation,
		      BLKIF_RSP_EOPNOTSUPP);
	return -EIO;
}

886 887 888 889
static void xen_blk_drain_io(struct xen_blkif *blkif)
{
	atomic_set(&blkif->drain, 1);
	do {
890 891 892 893
		/* The initial value is one, and one refcnt taken at the
		 * start of the xen_blkif_schedule thread. */
		if (atomic_read(&blkif->refcnt) <= 2)
			break;
894 895 896 897 898 899 900 901 902
		wait_for_completion_interruptible_timeout(
				&blkif->drain_complete, HZ);

		if (!atomic_read(&blkif->drain))
			break;
	} while (!kthread_should_stop());
	atomic_set(&blkif->drain, 0);
}

903 904
/*
 * Completion callback on the bio's. Called as bh->b_end_io()
K
Konrad Rzeszutek Wilk 已提交
905 906
 */

907
static void __end_block_io_op(struct pending_req *pending_req, int error)
K
Konrad Rzeszutek Wilk 已提交
908 909
{
	/* An error fails the entire request. */
910
	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
K
Konrad Rzeszutek Wilk 已提交
911
	    (error == -EOPNOTSUPP)) {
912
		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
913
		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
K
Konrad Rzeszutek Wilk 已提交
914
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
915 916 917 918 919
	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
		    (error == -EOPNOTSUPP)) {
		pr_debug(DRV_PFX "write barrier op failed, not supported\n");
		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
K
Konrad Rzeszutek Wilk 已提交
920
	} else if (error) {
921
		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
922
			 " error=%d\n", error);
K
Konrad Rzeszutek Wilk 已提交
923 924 925
		pending_req->status = BLKIF_RSP_ERROR;
	}

926 927
	/*
	 * If all of the bio's have completed it is time to unmap
928
	 * the grant references associated with 'request' and provide
929 930
	 * the proper response on the ring.
	 */
K
Konrad Rzeszutek Wilk 已提交
931
	if (atomic_dec_and_test(&pending_req->pendcnt)) {
932 933 934 935
		xen_blkbk_unmap(pending_req->blkif, pending_req->grant_handles,
		                pending_req->pages,
		                pending_req->persistent_gnts,
		                pending_req->nr_pages);
K
Konrad Rzeszutek Wilk 已提交
936 937
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
938
		xen_blkif_put(pending_req->blkif);
939 940 941 942
		if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
			if (atomic_read(&pending_req->blkif->drain))
				complete(&pending_req->blkif->drain_complete);
		}
943
		free_req(pending_req->blkif, pending_req);
K
Konrad Rzeszutek Wilk 已提交
944 945 946
	}
}

947 948 949
/*
 * bio callback.
 */
J
Jeremy Fitzhardinge 已提交
950
static void end_block_io_op(struct bio *bio, int error)
K
Konrad Rzeszutek Wilk 已提交
951 952 953 954 955 956 957
{
	__end_block_io_op(bio->bi_private, error);
	bio_put(bio);
}



958 959 960 961
/*
 * Function to copy the from the ring buffer the 'struct blkif_request'
 * (which has the sectors we want, number of them, grant references, etc),
 * and transmute  it to the block API to hand it over to the proper block disk.
K
Konrad Rzeszutek Wilk 已提交
962
 */
963 964
static int
__do_block_io_op(struct xen_blkif *blkif)
K
Konrad Rzeszutek Wilk 已提交
965
{
J
Jeremy Fitzhardinge 已提交
966 967
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	struct blkif_request req;
968
	struct pending_req *pending_req;
K
Konrad Rzeszutek Wilk 已提交
969 970 971 972 973 974 975 976 977 978 979 980
	RING_IDX rc, rp;
	int more_to_do = 0;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	while (rc != rp) {

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
			break;

981
		if (kthread_should_stop()) {
K
Konrad Rzeszutek Wilk 已提交
982 983 984 985
			more_to_do = 1;
			break;
		}

986
		pending_req = alloc_req(blkif);
987 988
		if (NULL == pending_req) {
			blkif->st_oo_req++;
K
Konrad Rzeszutek Wilk 已提交
989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
			more_to_do = 1;
			break;
		}

		switch (blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */

		/* Apply all sanity checks to /private copy/ of request. */
		barrier();
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019

		switch (req.operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
		case BLKIF_OP_WRITE_BARRIER:
		case BLKIF_OP_FLUSH_DISKCACHE:
			if (dispatch_rw_block_io(blkif, &req, pending_req))
				goto done;
			break;
		case BLKIF_OP_DISCARD:
1020
			free_req(blkif, pending_req);
1021
			if (dispatch_discard_io(blkif, &req))
1022
				goto done;
K
Konrad Rzeszutek Wilk 已提交
1023
			break;
1024 1025 1026 1027 1028
		default:
			if (dispatch_other_io(blkif, &req, pending_req))
				goto done;
			break;
		}
K
Konrad Rzeszutek Wilk 已提交
1029 1030 1031 1032

		/* Yield point for this unbounded loop. */
		cond_resched();
	}
1033
done:
K
Konrad Rzeszutek Wilk 已提交
1034 1035 1036
	return more_to_do;
}

1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
/*
 * Keep running __do_block_io_op() until either it reports that it had to
 * stop early (its non-zero result is returned) or the final ring check
 * finds no further requests (0 is returned).
 */
static int
do_block_io_op(struct xen_blkif *blkif)
{
	union blkif_back_rings *rings = &blkif->blk_rings;
	int pending;

	for (;;) {
		pending = __do_block_io_op(blkif);
		if (pending)
			return pending;

		RING_FINAL_CHECK_FOR_REQUESTS(&rings->common, pending);
		if (!pending)
			return 0;
	}
}
1053
/*
1054 1055
 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
 * and call the 'submit_bio' to pass it to the underlying storage.
1056
 */
1057 1058 1059
static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct blkif_request *req,
				struct pending_req *pending_req)
K
Konrad Rzeszutek Wilk 已提交
1060 1061
{
	struct phys_req preq;
1062
	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
K
Konrad Rzeszutek Wilk 已提交
1063 1064
	unsigned int nseg;
	struct bio *bio = NULL;
1065
	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
1066
	int i, nbio = 0;
K
Konrad Rzeszutek Wilk 已提交
1067
	int operation;
1068
	struct blk_plug plug;
1069
	bool drain = false;
1070
	struct page **pages = pending_req->pages;
K
Konrad Rzeszutek Wilk 已提交
1071 1072 1073

	switch (req->operation) {
	case BLKIF_OP_READ:
1074
		blkif->st_rd_req++;
K
Konrad Rzeszutek Wilk 已提交
1075 1076 1077
		operation = READ;
		break;
	case BLKIF_OP_WRITE:
1078
		blkif->st_wr_req++;
1079
		operation = WRITE_ODIRECT;
K
Konrad Rzeszutek Wilk 已提交
1080
		break;
1081 1082
	case BLKIF_OP_WRITE_BARRIER:
		drain = true;
1083
	case BLKIF_OP_FLUSH_DISKCACHE:
1084
		blkif->st_f_req++;
1085
		operation = WRITE_FLUSH;
K
Konrad Rzeszutek Wilk 已提交
1086 1087 1088
		break;
	default:
		operation = 0; /* make gcc happy */
1089 1090
		goto fail_response;
		break;
K
Konrad Rzeszutek Wilk 已提交
1091 1092
	}

1093 1094
	/* Check that the number of segments is sane. */
	nseg = req->u.rw.nr_segments;
1095

1096
	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
K
Konrad Rzeszutek Wilk 已提交
1097
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
1098
		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
1099
			 nseg);
1100
		/* Haven't submitted any bio's yet. */
K
Konrad Rzeszutek Wilk 已提交
1101 1102 1103
		goto fail_response;
	}

1104
	preq.sector_number = req->u.rw.sector_number;
K
Konrad Rzeszutek Wilk 已提交
1105 1106 1107
	preq.nr_sects      = 0;

	pending_req->blkif     = blkif;
1108
	pending_req->id        = req->u.rw.id;
K
Konrad Rzeszutek Wilk 已提交
1109 1110 1111
	pending_req->operation = req->operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;
1112

K
Konrad Rzeszutek Wilk 已提交
1113
	for (i = 0; i < nseg; i++) {
1114 1115 1116 1117
		seg[i].nsec = req->u.rw.seg[i].last_sect -
			req->u.rw.seg[i].first_sect + 1;
		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
K
Konrad Rzeszutek Wilk 已提交
1118 1119
			goto fail_response;
		preq.nr_sects += seg[i].nsec;
1120

K
Konrad Rzeszutek Wilk 已提交
1121 1122
	}

1123
	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
1124
		pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
1125 1126
			 operation == READ ? "read" : "write",
			 preq.sector_number,
1127 1128
			 preq.sector_number + preq.nr_sects,
			 blkif->vbd.pdevice);
1129
		goto fail_response;
K
Konrad Rzeszutek Wilk 已提交
1130
	}
1131 1132

	/*
1133
	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
1134 1135
	 * is set there.
	 */
1136 1137 1138
	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1139
			pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
1140
				 blkif->domid);
1141 1142 1143
			goto fail_response;
		}
	}
1144

1145 1146 1147 1148 1149 1150
	/* Wait on all outstanding I/O's and once that has been completed
	 * issue the WRITE_FLUSH.
	 */
	if (drain)
		xen_blk_drain_io(pending_req->blkif);

1151 1152
	/*
	 * If we have failed at this point, we need to undo the M2P override,
1153 1154
	 * set gnttab_set_unmap_op on all of the grant references and perform
	 * the hypercall to unmap the grants - that is all done in
1155
	 * xen_blkbk_unmap.
1156
	 */
1157
	if (xen_blkbk_map_seg(req, pending_req, seg, pages))
K
Konrad Rzeszutek Wilk 已提交
1158 1159
		goto fail_flush;

1160 1161 1162 1163
	/*
	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
	 */
1164
	xen_blkif_get(blkif);
K
Konrad Rzeszutek Wilk 已提交
1165 1166 1167 1168

	for (i = 0; i < nseg; i++) {
		while ((bio == NULL) ||
		       (bio_add_page(bio,
1169
				     pages[i],
K
Konrad Rzeszutek Wilk 已提交
1170
				     seg[i].nsec << 9,
1171
				     seg[i].offset) == 0)) {
1172

1173
			bio = bio_alloc(GFP_KERNEL, nseg-i);
K
Konrad Rzeszutek Wilk 已提交
1174 1175 1176
			if (unlikely(bio == NULL))
				goto fail_put_bio;

1177
			biolist[nbio++] = bio;
K
Konrad Rzeszutek Wilk 已提交
1178 1179 1180 1181 1182 1183 1184 1185 1186
			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}

1187
	/* This will be hit if the operation was a flush or discard. */
K
Konrad Rzeszutek Wilk 已提交
1188
	if (!bio) {
1189
		BUG_ON(operation != WRITE_FLUSH);
1190

1191 1192 1193
		bio = bio_alloc(GFP_KERNEL, 0);
		if (unlikely(bio == NULL))
			goto fail_put_bio;
K
Konrad Rzeszutek Wilk 已提交
1194

1195 1196 1197 1198
		biolist[nbio++] = bio;
		bio->bi_bdev    = preq.bdev;
		bio->bi_private = pending_req;
		bio->bi_end_io  = end_block_io_op;
K
Konrad Rzeszutek Wilk 已提交
1199 1200
	}

1201
	atomic_set(&pending_req->pendcnt, nbio);
1202 1203
	blk_start_plug(&plug);

1204 1205 1206
	for (i = 0; i < nbio; i++)
		submit_bio(operation, biolist[i]);

1207
	/* Let the I/Os go.. */
1208
	blk_finish_plug(&plug);
1209

K
Konrad Rzeszutek Wilk 已提交
1210 1211
	if (operation == READ)
		blkif->st_rd_sect += preq.nr_sects;
1212
	else if (operation & WRITE)
K
Konrad Rzeszutek Wilk 已提交
1213 1214
		blkif->st_wr_sect += preq.nr_sects;

1215
	return 0;
K
Konrad Rzeszutek Wilk 已提交
1216 1217

 fail_flush:
1218 1219 1220
	xen_blkbk_unmap(blkif, pending_req->grant_handles,
	                pending_req->pages, pending_req->persistent_gnts,
	                pending_req->nr_pages);
K
Konrad Rzeszutek Wilk 已提交
1221
 fail_response:
1222
	/* Haven't submitted any bio's yet. */
1223
	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
1224
	free_req(blkif, pending_req);
K
Konrad Rzeszutek Wilk 已提交
1225
	msleep(1); /* back off a bit */
1226
	return -EIO;
K
Konrad Rzeszutek Wilk 已提交
1227 1228

 fail_put_bio:
1229
	for (i = 0; i < nbio; i++)
1230
		bio_put(biolist[i]);
1231
	atomic_set(&pending_req->pendcnt, 1);
K
Konrad Rzeszutek Wilk 已提交
1232 1233
	__end_block_io_op(pending_req, -EINVAL);
	msleep(1); /* back off a bit */
1234
	return -EIO;
K
Konrad Rzeszutek Wilk 已提交
1235 1236 1237 1238
}



1239 1240
/*
 * Put a response on the ring on how the operation fared.
K
Konrad Rzeszutek Wilk 已提交
1241
 */
1242
static void make_response(struct xen_blkif *blkif, u64 id,
K
Konrad Rzeszutek Wilk 已提交
1243 1244
			  unsigned short op, int st)
{
J
Jeremy Fitzhardinge 已提交
1245
	struct blkif_response  resp;
K
Konrad Rzeszutek Wilk 已提交
1246
	unsigned long     flags;
J
Jeremy Fitzhardinge 已提交
1247
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
K
Konrad Rzeszutek Wilk 已提交
1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
	int notify;

	resp.id        = id;
	resp.operation = op;
	resp.status    = st;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain. */
	switch (blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_32:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_64:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	default:
		BUG();
	}
	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}

1279
/*
 * Module initialisation.
 *
 * Returns -ENODEV when not running in a Xen domain; otherwise the first
 * non-zero result of xen_blkif_interface_init() or
 * xen_blkif_xenbus_init(), or 0 when both succeed.
 */
static int __init xen_blkif_init(void)
{
	int rc;

	if (!xen_domain())
		return -ENODEV;

	rc = xen_blkif_interface_init();
	if (rc)
		return rc;

	return xen_blkif_xenbus_init();
}

1298
/* Register xen_blkif_init() as the initialisation routine of this module. */
module_init(xen_blkif_init);
K
Konrad Rzeszutek Wilk 已提交
1299 1300

/* Licence declaration; matches the dual-licence notice in the file header. */
MODULE_LICENSE("Dual BSD/GPL");
1301
/*
 * Alias under which the module can be requested.
 * NOTE(review): presumably used to auto-load this backend for Xen vbd
 * devices - confirm against the xenbus backend modalias format.
 */
MODULE_ALIAS("xen-backend:vbd");