// SPDX-License-Identifier: GPL-2.0-only
//#define DEBUG
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-virtio.h>
#include <linux/numa.h>
#include <uapi/linux/virtio_ring.h>

#define PART_BITS 4
#define VQ_NAME_LEN 16
#define MAX_DISCARD_SEGMENTS 256u

static int major;
static DEFINE_IDA(vd_index_ida);

static struct workqueue_struct *virtblk_wq;

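/*
 * Per-virtqueue state: virtqueue operations are not self-synchronizing,
 * so each queue carries its own lock, plus a stable name for its
 * interrupt handler. The cacheline alignment keeps queues serviced on
 * different CPUs from false-sharing.
 */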
struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;
	char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;

struct virtio_blk {
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/* What the host tells us, plus 2 for the request header & status byte. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* Number of virtqueues. */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

struct virtblk_req {
	struct virtio_blk_outhdr out_hdr;
	u8 status;
	struct scatterlist sg[];
};

static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
		return BLK_STS_OK;
	case VIRTIO_BLK_S_UNSUPP:
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}
}

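/*
 * Build the descriptor chain for one request. The layout is fixed by
 * the virtio-blk spec: the out_hdr first (driver->device), then the
 * data scatterlist if any (direction depends on the request type), and
 * the one-byte status (device->driver) last.
 */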
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;
	if (have_data) {
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

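/*
 * Discard and write-zeroes commands carry their payload as an array of
 * (sector, num_sectors, flags) ranges, one entry per bio in the
 * (possibly merged) request. The array is attached as the request's
 * special payload so the normal sg-mapping path hands it to the device
 * as data.
 */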
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	__rq_for_each_bio(bio, req) {
		u64 sector = bio->bi_iter.bi_sector;
		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

		range[n].flags = cpu_to_le32(flags);
		range[n].num_sectors = cpu_to_le32(num_sectors);
		range[n].sector = cpu_to_le64(sector);
		n++;
	}

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

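/*
 * blk-mq ->commit_rqs: ->queue_rq() defers the doorbell kick while
 * bd->last is false; if dispatch then stops early (e.g. on resource
 * exhaustion), the block layer calls this to notify the device of the
 * requests already queued.
 */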
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
	bool kick;

	spin_lock_irq(&vq->lock);
	kick = virtqueue_kick_prepare(vq->vq);
	spin_unlock_irq(&vq->lock);

	if (kick)
		virtqueue_notify(vq->vq);
}

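/*
 * blk-mq ->queue_rq: translate one request into a virtio-blk command,
 * map its data into the per-request scatterlist and add it to this
 * hardware context's virtqueue, batching the doorbell kick until
 * bd->last.
 */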
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
			   const struct blk_mq_queue_data *bd)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct request *req = bd->rq;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
	unsigned long flags;
	unsigned int num;
	int qid = hctx->queue_num;
	int err;
	bool notify = false;
	bool unmap = false;
	u32 type;

	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);

	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

	blk_mq_start_request(req);

	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
	if (num) {
		if (rq_data_dir(req) == WRITE)
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
		else
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
	}

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
	if (err) {
		virtqueue_kick(vblk->vqs[qid].vq);
		/* Don't stop the queue if -ENOMEM: we may have failed to
		 * bounce the buffer due to global resource outage.
		 */
		if (err == -ENOSPC)
			blk_mq_stop_hw_queue(hctx);
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
		switch (err) {
		case -ENOSPC:
			return BLK_STS_DEV_RESOURCE;
		case -ENOMEM:
			return BLK_STS_RESOURCE;
		default:
			return BLK_STS_IOERR;
		}
	}

	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
		notify = true;
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);

	if (notify)
		virtqueue_notify(vblk->vqs[qid].vq);
	return BLK_STS_OK;
}

/* Return the serial number (s/n) string of *disk in *id_str. */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;

	/* see if the host passed in geometry config */
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
	return 0;
}

static const struct block_device_operations virtblk_fops = {
	.owner  = THIS_MODULE,
	.getgeo = virtblk_getgeo,
};

static int index_to_minor(int index)
{
	return index << PART_BITS;
}

static int minor_to_index(int minor)
{
	return minor >> PART_BITS;
}

static ssize_t serial_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}

static DEVICE_ATTR_RO(serial);

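/*
 * Note that the device reports capacity in 512-byte sectors regardless
 * of the logical block size, hence the ">> 9" conversion below.
 */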
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity_revalidate_and_notify(vblk->disk, capacity, true);
}

static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);

	virtblk_update_capacity(vblk, true);
}

static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;

	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration. */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 */
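/*
 * For example: index 0 -> "vda", 25 -> "vdz", 26 -> "vdaa", 27 -> "vdab"
 * (bijective base-26, like spreadsheet column names).
 */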
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	char *begin = buf + strlen(prefix);
	char *end = buf + buflen;
	char *p;
	int unit;

	p = end - 1;
	*p = '\0';
	unit = base;
	do {
		if (p == begin)
			return -EINVAL;
		*--p = 'a' + (index % unit);
		index = (index / unit) - 1;
	} while (index >= 0);

	memmove(begin, p, end - p);
	memcpy(buf, prefix, strlen(prefix));

	return 0;
}

static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
	if (err)
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);

	return writeback;
}

static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk(vblk->disk);
}

static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
		 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

static DEVICE_ATTR_RW(cache_type);

static struct attribute *virtblk_attrs[] = {
	&dev_attr_serial.attr,
	&dev_attr_cache_type.attr,
	NULL,
};

static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;

	if (a == &dev_attr_cache_type.attr &&
	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		return S_IRUGO;

	return a->mode;
}

static const struct attribute_group virtblk_attr_group = {
	.attrs = virtblk_attrs,
	.is_visible = virtblk_attrs_are_visible,
};

static const struct attribute_group *virtblk_attr_groups[] = {
	&virtblk_attr_group,
	NULL,
};

static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					vblk->vdev, 0);
}

static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
	.map_queues	= virtblk_map_queues,
};

static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);

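/*
 * Probe: negotiate features, set up the virtqueues and the blk-mq tag
 * set, apply every queue limit the device advertises, then publish the
 * disk.
 */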
static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	struct request_queue *q;
	int err, index;

	u32 v, blk_size, max_size, sg_elems, opt_io_size;
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;

	/* We need to know how many segments before we allocate. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
		sg_elems = 1;

	/* We need an extra sg element at head and tail. */
	sg_elems += 2;
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
	if (!vblk) {
		err = -ENOMEM;
		goto out_free_index;
	}

	vblk->vdev = vdev;
	vblk->sg_elems = sg_elems;

	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);

	err = init_vq(vblk);
	if (err)
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	/* Default queue sizing is to fill the ring. */
	if (!virtblk_queue_depth) {
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
			virtblk_queue_depth /= 2;
	}

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	vblk->tag_set.cmd_size =
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		err = -ENOMEM;
		goto out_free_tags;
	}
	vblk->disk->queue = q;

	q->queuedata = vblk;

	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

	vblk->disk->major = major;
	vblk->disk->first_minor = index_to_minor(index);
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
	vblk->index = index;

	/* configure queue flush support */
	virtblk_update_cache_mode(vdev);

	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

	/* We can handle whatever the host told us to handle. */
	blk_queue_max_segments(q, vblk->sg_elems - 2);

	/* No real sector limit. */
	blk_queue_max_hw_sectors(q, -1U);

	max_size = virtio_max_dma_size(vdev);

	/* Host can optionally specify maximum segment size and number of
	 * segments. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
	if (!err)
		max_size = min(max_size, v);

	blk_queue_max_segment_size(q, max_size);

	/* Host can optionally specify the block size of the device */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
	if (!err)
		blk_queue_logical_block_size(q, blk_size);
	else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

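	/*
	 * Discard/write-zeroes limits: a zero in the config space means
	 * the device stated no limit, so fall back to UINT_MAX or the
	 * driver's own cap.
	 */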
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);
		blk_queue_max_discard_segments(q,
					       min_not_zero(v,
							    MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

	virtblk_update_capacity(vblk, false);
	virtio_device_ready(vdev);

	device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
	return 0;

out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
	vdev->config->del_vqs(vdev);
out_free_vblk:
	kfree(vblk);
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
out:
	return err;
}

static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int index = vblk->index;
	int refc;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
	kfree(vblk);

	/* Only free device id if we don't have any users */
	if (refc == 1)
		ida_simple_remove(&vd_index_ida, index);
}

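/*
 * Suspend/resume: freeze tears down the virtqueues once I/O is
 * quiesced; restore rebuilds them and restarts the queue.
 */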
#ifdef CONFIG_PM_SLEEP
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	blk_mq_quiesce_queue(vblk->disk->queue);

	vdev->config->del_vqs(vdev);
	return 0;
}

static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	ret = init_vq(vdev->priv);
	if (ret)
		return ret;

	virtio_device_ready(vdev);

	blk_mq_unquiesce_queue(vblk->disk->queue);
	return 0;
}
#endif

static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static unsigned int features_legacy[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");