virtio_blk.c 23.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
R
Rusty Russell 已提交
2 3
//#define DEBUG
#include <linux/spinlock.h>
4
#include <linux/slab.h>
R
Rusty Russell 已提交
5 6
#include <linux/blkdev.h>
#include <linux/hdreg.h>
7
#include <linux/module.h>
8
#include <linux/mutex.h>
9
#include <linux/interrupt.h>
R
Rusty Russell 已提交
10 11
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
12
#include <linux/scatterlist.h>
13
#include <linux/string_helpers.h>
14
#include <linux/idr.h>
J
Jens Axboe 已提交
15
#include <linux/blk-mq.h>
16
#include <linux/blk-mq-virtio.h>
J
Jens Axboe 已提交
17
#include <linux/numa.h>
18

19
/* Low bits of a minor number address partitions within one disk. */
#define PART_BITS 4
/* Size of the per-virtqueue name buffer, holds "req.%d". */
#define VQ_NAME_LEN 16
/* Driver-side cap on discard segments per request, whatever the device says. */
#define MAX_DISCARD_SEGMENTS 256u
R
Rusty Russell 已提交
22

23 24 25
/* Block major number handed out by register_blkdev() in init(). */
static int major;
/* Allocator for per-device indices; drives minor numbers and vdX names. */
static DEFINE_IDA(vd_index_ida);

/* Process-context workqueue for config-space change handling. */
static struct workqueue_struct *virtblk_wq;
27

28 29 30 31 32 33
/*
 * Per-virtqueue state: the queue itself, a lock serializing access to it,
 * and its name as registered with the virtio core.  Cacheline-aligned so
 * different queues don't false-share under multiqueue load.
 */
struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;
	char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;

34
/* Per-device driver state, hung off vdev->priv. */
struct virtio_blk {
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/* What host tells us, plus 2 for header & tailer. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* num of vqs */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

57
/*
 * Per-request driver data, allocated by blk-mq as the request pdu
 * (tag_set.cmd_size).  out_hdr is read by the device, status is written
 * back by it, and sg[] is the flexible scatterlist sized to sg_elems.
 */
struct virtblk_req {
	struct virtio_blk_outhdr out_hdr;
	u8 status;
	struct scatterlist sg[];
};

63
static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
64 65 66
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
67
		return BLK_STS_OK;
68
	case VIRTIO_BLK_S_UNSUPP:
69
		return BLK_STS_NOTSUPP;
70
	default:
71
		return BLK_STS_IOERR;
72 73 74
	}
}

75 76 77 78 79 80 81 82
/*
 * Queue one request on @vq as up to three scatterlist groups, in the order
 * the virtio-blk spec requires: out_hdr (driver->device), optional data
 * (direction depends on the command), then the status byte (device->driver).
 *
 * Called with the virtqueue lock held; returns what virtqueue_add_sgs()
 * returns (0 or a negative errno such as -ENOSPC when the ring is full).
 */
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;

	if (have_data) {
		/* OUT commands send data to the device; everything else reads. */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	/* Status byte is always the final, device-writable element. */
	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
/*
 * Build the discard/write-zeroes range payload for @req: one
 * virtio_blk_discard_write_zeroes element per bio, attached to the request
 * as a special payload so it is transferred like ordinary data.
 *
 * The range buffer is kmalloc'ed here and freed in virtblk_request_done()
 * once the device has completed the request.  Returns 0 or -ENOMEM.
 */
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	/* UNMAP is only meaningful (and only passed) for write-zeroes. */
	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	__rq_for_each_bio(bio, req) {
		u64 sector = bio->bi_iter.bi_sector;
		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

		range[n].flags = cpu_to_le32(flags);
		range[n].num_sectors = cpu_to_le32(num_sectors);
		range[n].sector = cpu_to_le64(sector);
		n++;
	}

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

130
/*
 * blk-mq ->complete handler: free the discard/write-zeroes range buffer
 * (allocated in virtblk_setup_discard_write_zeroes()) if one was attached,
 * then end the request with the status the device wrote back.
 */
static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		/* bv_page/bv_offset reconstruct the kmalloc'ed range pointer. */
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

/*
 * Virtqueue interrupt callback: drain all completed requests and hand them
 * to blk-mq.  The disable/enable callback loop closes the race where the
 * device adds a buffer between our final get_buf and re-enabling the
 * interrupt — virtqueue_enable_cb() returning false means "more work
 * appeared, go around again".
 */
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

170 171 172 173 174 175 176 177 178 179 180 181 182 183
/*
 * blk-mq ->commit_rqs: the block layer batched several ->queue_rq calls
 * with bd->last unset, so no kick happened yet — notify the device now.
 * The kick-prepare must run under the vq lock; the notify itself must not.
 */
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
	bool needs_notify;

	spin_lock_irq(&vq->lock);
	needs_notify = virtqueue_kick_prepare(vq->vq);
	spin_unlock_irq(&vq->lock);

	if (!needs_notify)
		return;

	virtqueue_notify(vq->vq);
}

184
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
185
			   const struct blk_mq_queue_data *bd)
R
Rusty Russell 已提交
186
{
J
Jens Axboe 已提交
187
	struct virtio_blk *vblk = hctx->queue->queuedata;
188
	struct request *req = bd->rq;
189
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
J
Jens Axboe 已提交
190
	unsigned long flags;
191
	unsigned int num;
192
	int qid = hctx->queue_num;
193
	int err;
194
	bool notify = false;
195
	bool unmap = false;
196
	u32 type;
R
Rusty Russell 已提交
197

J
Jens Axboe 已提交
198
	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
R
Rusty Russell 已提交
199

200 201 202 203 204 205 206 207
	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
208 209 210 211 212 213 214
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
215 216 217 218 219
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
220
		return BLK_STS_IOERR;
R
Rusty Russell 已提交
221 222
	}

223 224 225 226 227
	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

228 229
	blk_mq_start_request(req);

230 231 232 233 234 235
	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

236
	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
237
	if (num) {
238
		if (rq_data_dir(req) == WRITE)
M
Michael S. Tsirkin 已提交
239
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
240
		else
M
Michael S. Tsirkin 已提交
241
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
R
Rusty Russell 已提交
242 243
	}

244
	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
245
	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
246
	if (err) {
247
		virtqueue_kick(vblk->vqs[qid].vq);
J
Jens Axboe 已提交
248
		blk_mq_stop_hw_queue(hctx);
249
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
250 251 252
		/* Out of mem doesn't actually happen, since we fall back
		 * to direct descriptors */
		if (err == -ENOMEM || err == -ENOSPC)
253
			return BLK_STS_DEV_RESOURCE;
254
		return BLK_STS_IOERR;
255 256
	}

257
	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
258
		notify = true;
259
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
260 261

	if (notify)
262
		virtqueue_notify(vblk->vqs[qid].vq);
263
	return BLK_STS_OK;
264 265
}

266 267 268 269 270
/* return id (s/n) string for *disk to *id_str
 *
 * Issues a VIRTIO_BLK_T_GET_ID command (as a REQ_OP_DRV_IN request, see
 * virtio_queue_rq) and synchronously waits for it.  @id_str must have room
 * for VIRTIO_BLK_ID_BYTES.  Returns 0 or a negative errno.
 */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* Map the caller's buffer so the device writes the ID into it. */
	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

290 291 292
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
293 294 295
	struct virtio_blk *vblk = bd->bd_disk->private_data;

	/* see if the host passed in geometry config */
296 297 298 299 300 301 302
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
303 304 305 306 307 308
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
309 310 311
	return 0;
}

312
/* Minimal block_device_operations: only legacy geometry is supported. */
static const struct block_device_operations virtblk_fops = {
	.owner  = THIS_MODULE,
	.getgeo = virtblk_getgeo,
};

317 318 319 320 321
/* First minor for device @index: each disk owns 1 << PART_BITS minors. */
static int index_to_minor(int index)
{
	return index * (1 << PART_BITS);
}

322 323 324 325 326
/* Inverse of index_to_minor(): recover the device index from a minor. */
static int minor_to_index(int minor)
{
	return minor / (1 << PART_BITS);
}

327 328
/* sysfs "serial" attribute: fetch the device ID string into @buf. */
static ssize_t serial_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	/* The ID may use all VIRTIO_BLK_ID_BYTES; terminate past the end. */
	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}

static DEVICE_ATTR_RO(serial);
348

349 350
/* The queue's logical block size must be set before calling this */
/*
 * Read the capacity from config space, log it in human-readable units, and
 * propagate it to the gendisk.  @resize selects the "new size:" wording for
 * runtime resize events versus initial probe.
 */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	/* capacity is in 512-byte sectors; convert to logical blocks. */
	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity(vblk->disk, capacity);
}

static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);
	char *envp[] = { "RESIZE=1", NULL };

	virtblk_update_capacity(vblk, true);
394
	revalidate_disk(vblk->disk);
395
	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
396 397 398 399 400 401 402 403 404
}

/* Config-change callback (interrupt context): defer to the workqueue. */
static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

405 406
/*
 * Discover and initialize the device's virtqueues.  Honors the negotiated
 * VIRTIO_BLK_F_MQ queue count (capped at nr_cpu_ids, defaulting to 1) and
 * fills vblk->vqs/num_vqs.  The temporary names/callbacks/vqs arrays exist
 * only to feed virtio_find_vqs() and are freed on all paths.
 *
 * Returns 0 or a negative errno; on failure vblk->vqs is freed.
 */
static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;

	/* No point in more queues than CPUs. */
	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration.  */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Encodes @index in bijective base-26 after @prefix: 0 -> "vda",
 * 25 -> "vdz", 26 -> "vdaa", ...  Digits are generated least-significant
 * first at the tail of @buf, then slid up against the prefix.
 * Returns 0, or -EINVAL if @buflen cannot hold the result.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	size_t plen = strlen(prefix);
	char *tail = buf + buflen;	/* one past the last usable byte */
	char *p = tail - 1;

	*p = '\0';
	for (;;) {
		if (p == buf + plen)
			return -EINVAL;	/* ran into the prefix: no room */
		*--p = 'a' + (index % base);
		index = (index / base) - 1;
		if (index < 0)
			break;
	}

	/* Slide "digits\0" up to sit right after the prefix, then stamp it. */
	memmove(buf + plen, p, tail - p);
	memcpy(buf, prefix, plen);

	return 0;
}

490 491 492 493 494
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

495 496 497
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);
498 499 500 501 502

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
503
	if (err)
504
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
505 506 507 508 509 510 511 512 513

	return writeback;
}

/* Push the device's current cache mode into the block layer and revalidate. */
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	/* FUA is never advertised; only the writeback flag changes. */
	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk(vblk->disk);
}

/* Index in this table == value written to the 'wce' config field. */
static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

/*
 * sysfs "cache_type" store: parse the human-readable mode, write it to the
 * device's config space, and re-sync the block layer's view.  Only reachable
 * when VIRTIO_BLK_F_CONFIG_WCE was negotiated (see virtblk_attrs_are_visible).
 */
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
		 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

/* sysfs "cache_type" show: report the current mode as a string. */
static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
static DEVICE_ATTR_RW(cache_type);

static struct attribute *virtblk_attrs[] = {
	&dev_attr_serial.attr,
	&dev_attr_cache_type.attr,
	NULL,
};

/* Demote cache_type to read-only when the device can't change WCE. */
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;

	if (a == &dev_attr_cache_type.attr &&
	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		return S_IRUGO;

	return a->mode;
}

static const struct attribute_group virtblk_attr_group = {
	.attrs = virtblk_attrs,
	.is_visible = virtblk_attrs_are_visible,
};

/* Passed to device_add_disk() so the attrs appear atomically with the disk. */
static const struct attribute_group *virtblk_attr_groups[] = {
	&virtblk_attr_group,
	NULL,
};
584

585 586
/*
 * blk-mq ->init_request: one-time setup of the per-request pdu.  The pdu was
 * allocated with room for sg_elems scatterlist entries (tag_set.cmd_size).
 */
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

595 596 597 598
static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

599 600
	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					vblk->vdev, 0);
601 602
}

603
/* blk-mq callbacks wiring the block layer to this driver. */
static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
	.map_queues	= virtblk_map_queues,
};

611 612
/* Queue depth override; 0 (default) sizes the queue from the virtqueue ring. */
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
J
Jens Axboe 已提交
613

614
/*
 * Probe: allocate the driver state, discover virtqueues, set up the blk-mq
 * tag set and request queue, apply every negotiated limit/topology feature,
 * and register the gendisk.  Teardown on failure unwinds in strict reverse
 * order via the goto chain at the bottom.
 */
static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	struct request_queue *q;
	int err, index;

	u32 v, blk_size, max_size, sg_elems, opt_io_size;
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;

	/* Config space access is mandatory for this driver. */
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;

	/* We need to know how many segments before we allocate. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
		sg_elems = 1;

	/* We need an extra sg elements at head and tail. */
	sg_elems += 2;
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
	if (!vblk) {
		err = -ENOMEM;
		goto out_free_index;
	}

	vblk->vdev = vdev;
	vblk->sg_elems = sg_elems;

	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);

	err = init_vq(vblk);
	if (err)
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	/* Default queue sizing is to fill the ring. */
	if (!virtblk_queue_depth) {
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
			virtblk_queue_depth /= 2;
	}

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	/* pdu = virtblk_req header plus the flexible scatterlist array. */
	vblk->tag_set.cmd_size =
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		err = -ENOMEM;
		goto out_free_tags;
	}
	vblk->disk->queue = q;

	q->queuedata = vblk;

	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

	vblk->disk->major = major;
	vblk->disk->first_minor = index_to_minor(index);
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
	vblk->index = index;

	/* configure queue flush support */
	virtblk_update_cache_mode(vdev);

	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

	/* We can handle whatever the host told us to handle. */
	blk_queue_max_segments(q, vblk->sg_elems-2);

	/* No real sector limit. */
	blk_queue_max_hw_sectors(q, -1U);

	max_size = virtio_max_dma_size(vdev);

	/* Host can optionally specify maximum segment size and number of
	 * segments. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
	if (!err)
		max_size = min(max_size, v);

	blk_queue_max_segment_size(q, max_size);

	/* Host can optionally specify the block size of the device */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
	if (!err)
		blk_queue_logical_block_size(q, blk_size);
	else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);
		blk_queue_max_discard_segments(q,
					       min_not_zero(v,
							    MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

	virtblk_update_capacity(vblk, false);
	virtio_device_ready(vdev);

	device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
	return 0;

out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
	vdev->config->del_vqs(vdev);
out_free_vblk:
	kfree(vblk);
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
out:
	return err;
}

815
/*
 * Device removal: tear down in reverse of probe.  The IDA index is released
 * only when we hold the last reference to the disk's device, so a still-open
 * disk keeps its index reserved.
 */
static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int index = vblk->index;
	int refc;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Snapshot the refcount before put_disk() may drop the final ref. */
	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
	kfree(vblk);

	/* Only free device id if we don't have any users */
	if (refc == 1)
		ida_simple_remove(&vd_index_ida, index);
}

843
#ifdef CONFIG_PM_SLEEP
844 845 846 847 848 849 850
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

851
	/* Make sure no work handler is accessing the device. */
852 853
	flush_work(&vblk->config_work);

854
	blk_mq_quiesce_queue(vblk->disk->queue);
855 856 857 858 859 860 861 862 863 864 865

	vdev->config->del_vqs(vdev);
	return 0;
}

static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	ret = init_vq(vdev->priv);
866 867 868 869
	if (ret)
		return ret;

	virtio_device_ready(vdev);
J
Jens Axboe 已提交
870

871
	blk_mq_unquiesce_queue(vblk->disk->queue);
872
	return 0;
873 874 875
}
#endif

876
/* Match any virtio block device, regardless of vendor. */
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

M
Michael S. Tsirkin 已提交
881
/* Features negotiable with legacy (pre-VIRTIO-1.0) devices. */
static unsigned int features_legacy[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};
/* Features negotiable with modern (VIRTIO 1.0+) devices. */
static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

895
/* Driver registration record tying the callbacks above to the virtio core. */
static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

/*
 * Module init: create the config-change workqueue, claim a dynamic block
 * major, and register the virtio driver.  Each failure unwinds everything
 * set up before it.
 */
static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	/* 0 asks the block core for a dynamically assigned major. */
	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

/* Module exit: undo init() in reverse order. */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");