virtio_blk.c 23.9 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
R
Rusty Russell 已提交
2 3
//#define DEBUG
#include <linux/spinlock.h>
4
#include <linux/slab.h>
R
Rusty Russell 已提交
5 6
#include <linux/blkdev.h>
#include <linux/hdreg.h>
7
#include <linux/module.h>
8
#include <linux/mutex.h>
9
#include <linux/interrupt.h>
R
Rusty Russell 已提交
10 11
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
12
#include <linux/scatterlist.h>
13
#include <linux/string_helpers.h>
14
#include <linux/idr.h>
J
Jens Axboe 已提交
15
#include <linux/blk-mq.h>
16
#include <linux/blk-mq-virtio.h>
J
Jens Axboe 已提交
17
#include <linux/numa.h>
18

19
#define PART_BITS 4
20
#define VQ_NAME_LEN 16
21
#define MAX_DISCARD_SEGMENTS 256u
R
Rusty Russell 已提交
22

23 24 25
static int major;
static DEFINE_IDA(vd_index_ida);

26
static struct workqueue_struct *virtblk_wq;
27

28 29 30 31 32 33
/* Per-virtqueue state; cacheline-aligned to avoid false sharing between queues. */
struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;		/* serializes submissions/completions on vq */
	char name[VQ_NAME_LEN];		/* "req.N", handed to virtio_find_vqs() */
} ____cacheline_aligned_in_smp;

34
/* Per-device driver state, hung off vdev->priv. */
struct virtio_blk {
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/* What host tells us, plus 2 for header & trailer. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* num of vqs */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

57
/* Per-request driver data, allocated by blk-mq as the request PDU. */
struct virtblk_req {
	struct virtio_blk_outhdr out_hdr;	/* device-readable command header */
	u8 status;				/* device-writable completion status */
	struct scatterlist sg[];		/* flexible array, sized via tag_set.cmd_size */
};

63
static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
64 65 66
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
67
		return BLK_STS_OK;
68
	case VIRTIO_BLK_S_UNSUPP:
69
		return BLK_STS_NOTSUPP;
70
	default:
71
		return BLK_STS_IOERR;
72 73 74
	}
}

75 76 77 78 79 80 81 82
/*
 * Post one request onto @vq as a scatterlist chain of up to three parts:
 * device-readable out header, optional data, then the device-writable
 * status byte.  Called with the virtqueue lock held; hence GFP_ATOMIC.
 */
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;

	if (have_data) {
		/* Device reads the data for writes, writes it for reads. */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	/* Status byte is always device-writable and always last. */
	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
/*
 * Build the payload for a DISCARD/WRITE_ZEROES command: one
 * virtio_blk_discard_write_zeroes range descriptor per bio, attached to
 * the request as a special payload (freed in virtblk_request_done()).
 * Returns 0 or -ENOMEM.
 */
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	/* GFP_ATOMIC: we are called from the queue_rq dispatch path. */
	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	__rq_for_each_bio(bio, req) {
		u64 sector = bio->bi_iter.bi_sector;
		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

		/* Range fields are little-endian per the virtio spec. */
		range[n].flags = cpu_to_le32(flags);
		range[n].num_sectors = cpu_to_le32(num_sectors);
		range[n].sector = cpu_to_le64(sector);
		n++;
	}

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

130
/* blk-mq ->complete: free any discard/write-zeroes payload, end the request. */
static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		/* Range buffer allocated by virtblk_setup_discard_write_zeroes(). */
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

/*
 * Virtqueue interrupt callback: reap every completed request on this
 * queue and hand each to the block layer for completion.
 */
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		/* Suppress further callbacks while we drain the used ring. */
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
		/* Loop again if completions raced in while callbacks were off. */
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

170 171 172 173 174 175 176 177 178 179 180 181 182 183
/*
 * blk-mq ->commit_rqs: ring the doorbell once for a batch of requests
 * that were queued with bd->last == false.
 */
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
	bool must_notify;

	/* kick_prepare must run under the vq lock; notify must not. */
	spin_lock_irq(&vq->lock);
	must_notify = virtqueue_kick_prepare(vq->vq);
	spin_unlock_irq(&vq->lock);

	if (!must_notify)
		return;

	virtqueue_notify(vq->vq);
}

184
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
185
			   const struct blk_mq_queue_data *bd)
R
Rusty Russell 已提交
186
{
J
Jens Axboe 已提交
187
	struct virtio_blk *vblk = hctx->queue->queuedata;
188
	struct request *req = bd->rq;
189
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
J
Jens Axboe 已提交
190
	unsigned long flags;
191
	unsigned int num;
192
	int qid = hctx->queue_num;
193
	int err;
194
	bool notify = false;
195
	bool unmap = false;
196
	u32 type;
R
Rusty Russell 已提交
197

J
Jens Axboe 已提交
198
	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
R
Rusty Russell 已提交
199

200 201 202 203 204 205 206 207
	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
208 209 210 211 212 213 214
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
215 216 217 218 219
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
220
		return BLK_STS_IOERR;
R
Rusty Russell 已提交
221 222
	}

223 224 225 226 227
	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

228 229
	blk_mq_start_request(req);

230 231 232 233 234 235
	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

236
	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
237
	if (num) {
238
		if (rq_data_dir(req) == WRITE)
M
Michael S. Tsirkin 已提交
239
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
240
		else
M
Michael S. Tsirkin 已提交
241
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
R
Rusty Russell 已提交
242 243
	}

244
	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
245
	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
246
	if (err) {
247
		virtqueue_kick(vblk->vqs[qid].vq);
248 249 250 251 252
		/* Don't stop the queue if -ENOMEM: we may have failed to
		 * bounce the buffer due to global resource outage.
		 */
		if (err == -ENOSPC)
			blk_mq_stop_hw_queue(hctx);
253
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
254
		if (err == -ENOMEM || err == -ENOSPC)
255
			return BLK_STS_DEV_RESOURCE;
256
		return BLK_STS_IOERR;
257 258
	}

259
	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
260
		notify = true;
261
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
262 263

	if (notify)
264
		virtqueue_notify(vblk->vqs[qid].vq);
265
	return BLK_STS_OK;
266 267
}

268 269 270 271 272
/* return id (s/n) string for *disk to *id_str
 *
 * Issues a VIRTIO_BLK_T_GET_ID command (via REQ_OP_DRV_IN) and copies up
 * to VIRTIO_BLK_ID_BYTES into @id_str.  Returns 0 or a negative errno.
 */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* Map the caller's buffer so the device can DMA the ID into it. */
	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	/* Synchronous execution; the result lands in the request PDU. */
	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

292 293 294
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
295 296 297
	struct virtio_blk *vblk = bd->bd_disk->private_data;

	/* see if the host passed in geometry config */
298 299 300 301 302 303 304
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
305 306 307 308 309 310
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
311 312 313
	return 0;
}

314
/* Minimal block_device_operations: only legacy geometry reporting. */
static const struct block_device_operations virtblk_fops = {
	.owner  = THIS_MODULE,
	.getgeo = virtblk_getgeo,
};

319 320 321 322 323
/* Each disk owns 1 << PART_BITS minors; map a disk index to its first minor. */
static int index_to_minor(int index)
{
	return index << PART_BITS;
}

324 325 326 327 328
/* Inverse of index_to_minor(): which disk index does this minor belong to? */
static int minor_to_index(int minor)
{
	return minor >> PART_BITS;
}

329 330
/* sysfs "serial" attribute: expose the device ID string (empty if unsupported). */
static ssize_t serial_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	/* Ensure termination: the device may fill all VIRTIO_BLK_ID_BYTES. */
	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}
348

349
static DEVICE_ATTR_RO(serial);
350

351 352
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	/* Capacity is in 512-byte sectors; convert to logical blocks. */
	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	/* Human-readable sizes in both binary and decimal units. */
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity(vblk->disk, capacity);
}

static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);
	char *envp[] = { "RESIZE=1", NULL };

	virtblk_update_capacity(vblk, true);
396
	revalidate_disk(vblk->disk);
397
	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
398 399 400 401 402 403 404 405 406
}

/* Config-change interrupt: defer the actual work to process context. */
static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

407 408
/*
 * Negotiate the number of request virtqueues (VIRTIO_BLK_F_MQ, capped at
 * nr_cpu_ids), discover them, and populate vblk->vqs / vblk->num_vqs.
 * Returns 0 or a negative errno; on failure nothing is left allocated.
 */
static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;	/* MQ feature not offered: single queue */

	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	/* Temporary arrays used only for virtqueue discovery. */
	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration.  */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Encodes @index as a base-26 suffix ("a".."z", "aa", "ab", ...) appended
 * to @prefix in @buf.  Returns 0, or -EINVAL if @buflen cannot hold the
 * prefix plus suffix plus terminator.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int radix = 'z' - 'a' + 1;
	const size_t prefix_len = strlen(prefix);
	char *const first = buf + prefix_len;
	char *cursor = buf + buflen - 1;

	/* Build the suffix right-to-left at the end of the buffer. */
	*cursor = '\0';
	while (1) {
		if (cursor == first)
			return -EINVAL;
		*--cursor = 'a' + (index % radix);
		index = index / radix - 1;
		if (index < 0)
			break;
	}

	/* Slide the suffix down next to where the prefix will go. */
	memmove(first, cursor, buf + buflen - cursor);
	memcpy(buf, prefix, prefix_len);

	return 0;
}

492 493 494 495 496
/* Returns 1 if the device uses a writeback cache, 0 for write-through. */
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
	if (err)
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);

	return writeback;
}

/* Propagate the device's current cache mode to the block layer. */
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	/* Second argument (FUA) is always false: virtio-blk has no FUA bit. */
	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk(vblk->disk);
}

static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

/*
 * sysfs "cache_type" store: switch between "write through" and
 * "write back" by writing the wce byte into the device config space.
 */
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
		 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	/* Attribute is writable only when WCE is configurable (see is_visible). */
	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

static ssize_t
544
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
545 546 547 548 549 550 551 552 553
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585
static DEVICE_ATTR_RW(cache_type);

/* sysfs attributes attached to the gendisk device at add_disk time. */
static struct attribute *virtblk_attrs[] = {
	&dev_attr_serial.attr,
	&dev_attr_cache_type.attr,
	NULL,
};

/* Demote cache_type to read-only when the host doesn't allow changing WCE. */
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;

	if (a == &dev_attr_cache_type.attr &&
	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		return S_IRUGO;

	return a->mode;
}

static const struct attribute_group virtblk_attr_group = {
	.attrs = virtblk_attrs,
	.is_visible = virtblk_attrs_are_visible,
};

static const struct attribute_group *virtblk_attr_groups[] = {
	&virtblk_attr_group,
	NULL,
};
586

587 588
/* blk-mq ->init_request: prepare the trailing scatterlist in the request PDU. */
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

597 598 599 600
static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

601 602
	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					vblk->vdev, 0);
603 604
}

605
/* blk-mq dispatch hooks backing every virtio-blk hardware queue. */
static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
	.map_queues	= virtblk_map_queues,
};

613 614
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
J
Jens Axboe 已提交
615

616
static int virtblk_probe(struct virtio_device *vdev)
R
Rusty Russell 已提交
617 618
{
	struct virtio_blk *vblk;
619
	struct request_queue *q;
620
	int err, index;
621

622
	u32 v, blk_size, max_size, sg_elems, opt_io_size;
623 624
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;
R
Rusty Russell 已提交
625

626 627 628 629 630 631
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

632 633 634 635 636
	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;
637

638
	/* We need to know how many segments before we allocate. */
639 640 641
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);
642 643 644

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
645 646 647 648
		sg_elems = 1;

	/* We need an extra sg elements at head and tail. */
	sg_elems += 2;
J
Jens Axboe 已提交
649
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
R
Rusty Russell 已提交
650 651
	if (!vblk) {
		err = -ENOMEM;
652
		goto out_free_index;
R
Rusty Russell 已提交
653 654 655
	}

	vblk->vdev = vdev;
656
	vblk->sg_elems = sg_elems;
657

658
	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
R
Rusty Russell 已提交
659

660 661
	err = init_vq(vblk);
	if (err)
R
Rusty Russell 已提交
662 663 664
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
665
	vblk->disk = alloc_disk(1 << PART_BITS);
R
Rusty Russell 已提交
666 667
	if (!vblk->disk) {
		err = -ENOMEM;
J
Jens Axboe 已提交
668
		goto out_free_vq;
R
Rusty Russell 已提交
669 670
	}

671
	/* Default queue sizing is to fill the ring. */
672
	if (!virtblk_queue_depth) {
673
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
674 675
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
676
			virtblk_queue_depth /= 2;
677
	}
678 679 680 681 682 683 684

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	vblk->tag_set.cmd_size =
J
Jens Axboe 已提交
685 686
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
687
	vblk->tag_set.driver_data = vblk;
688
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
J
Jens Axboe 已提交
689

690 691 692 693
	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

694
	q = blk_mq_init_queue(&vblk->tag_set);
695
	if (IS_ERR(q)) {
R
Rusty Russell 已提交
696
		err = -ENOMEM;
697
		goto out_free_tags;
R
Rusty Russell 已提交
698
	}
699
	vblk->disk->queue = q;
R
Rusty Russell 已提交
700

701
	q->queuedata = vblk;
702

703
	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
704

R
Rusty Russell 已提交
705
	vblk->disk->major = major;
706
	vblk->disk->first_minor = index_to_minor(index);
R
Rusty Russell 已提交
707 708
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
709
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
710
	vblk->index = index;
711

712
	/* configure queue flush support */
713
	virtblk_update_cache_mode(vdev);
R
Rusty Russell 已提交
714

715 716 717 718
	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

719
	/* We can handle whatever the host told us to handle. */
720
	blk_queue_max_segments(q, vblk->sg_elems-2);
721

722
	/* No real sector limit. */
723
	blk_queue_max_hw_sectors(q, -1U);
724

725 726
	max_size = virtio_max_dma_size(vdev);

727 728
	/* Host can optionally specify maximum segment size and number of
	 * segments. */
729 730
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
R
Rusty Russell 已提交
731
	if (!err)
732 733 734
		max_size = min(max_size, v);

	blk_queue_max_segment_size(q, max_size);
R
Rusty Russell 已提交
735

736
	/* Host can optionally specify the block size of the device */
737 738 739
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
740
	if (!err)
741 742 743 744 745
		blk_queue_logical_block_size(q, blk_size);
	else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
746 747 748
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
749 750 751 752
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

753 754 755
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
756 757 758
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

759 760 761
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
762 763 764
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

765 766 767
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
768 769 770
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);
		blk_queue_max_discard_segments(q,
					       min_not_zero(v,
							    MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

797
	virtblk_update_capacity(vblk, false);
M
Michael S. Tsirkin 已提交
798 799
	virtio_device_ready(vdev);

800
	device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
R
Rusty Russell 已提交
801 802
	return 0;

803 804
out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
R
Rusty Russell 已提交
805 806 807
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
808
	vdev->config->del_vqs(vdev);
R
Rusty Russell 已提交
809 810
out_free_vblk:
	kfree(vblk);
811 812
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
R
Rusty Russell 已提交
813 814 815 816
out:
	return err;
}

817
/* Tear a device down: the reverse of virtblk_probe(). */
static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int index = vblk->index;
	int refc;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Sample the disk refcount before dropping our own reference. */
	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
	kfree(vblk);

	/* Only free device id if we don't have any users */
	if (refc == 1)
		ida_simple_remove(&vd_index_ida, index);
}

845
#ifdef CONFIG_PM_SLEEP
846 847 848 849 850 851 852
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

853
	/* Make sure no work handler is accessing the device. */
854 855
	flush_work(&vblk->config_work);

856
	blk_mq_quiesce_queue(vblk->disk->queue);
857 858 859 860 861 862 863 864 865 866 867

	vdev->config->del_vqs(vdev);
	return 0;
}

/* PM resume hook: rebuild the virtqueues and restart request dispatch. */
static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	/* Re-allocates vblk->vqs and rediscovers the virtqueues. */
	ret = init_vq(vdev->priv);
	if (ret)
		return ret;

	virtio_device_ready(vdev);

	blk_mq_unquiesce_queue(vblk->disk->queue);
	return 0;
}
#endif

878
/* Devices this driver binds to: any virtio block device. */
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

M
Michael S. Tsirkin 已提交
883
static unsigned int features_legacy[] = {
884
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
885
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
886
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
887
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
M
Michael S. Tsirkin 已提交
888 889 890 891 892
}
;
static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
893
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
894
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
895 896
};

897
/* Virtio driver glue: feature negotiation tables and lifecycle callbacks. */
static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

/* Module init: workqueue, block major number, then driver registration. */
static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	/* Major 0 asks the block layer to pick a free major number. */
	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

/* Module exit: unwind init() in reverse order. */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");