virtio_blk.c 26.4 KB
Newer Older
R
Rusty Russell 已提交
1 2
//#define DEBUG
#include <linux/spinlock.h>
3
#include <linux/slab.h>
R
Rusty Russell 已提交
4 5
#include <linux/blkdev.h>
#include <linux/hdreg.h>
6
#include <linux/module.h>
7
#include <linux/mutex.h>
8
#include <linux/interrupt.h>
R
Rusty Russell 已提交
9 10
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
11
#include <linux/scatterlist.h>
12
#include <linux/string_helpers.h>
13
#include <scsi/scsi_cmnd.h>
14
#include <linux/idr.h>
J
Jens Axboe 已提交
15
#include <linux/blk-mq.h>
16
#include <linux/blk-mq-virtio.h>
J
Jens Axboe 已提交
17
#include <linux/numa.h>
18

19
#define PART_BITS 4
20
#define VQ_NAME_LEN 16
21
#define MAX_DISCARD_SEGMENTS 256u
R
Rusty Russell 已提交
22

23 24 25
/* Block major number allocated by register_blkdev() in init(). */
static int major;
/* Allocator for per-device indexes (drives minor numbers and vdX names). */
static DEFINE_IDA(vd_index_ida);

/* Process-context workqueue for config-change handling. */
static struct workqueue_struct *virtblk_wq;
27

28 29 30 31 32 33
/*
 * Per-virtqueue state: the queue itself, a lock serializing submissions
 * and completions on it, and its interrupt name.  Cacheline-aligned to
 * avoid false sharing between queues on SMP.
 */
struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;
	char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;

34
/* Per-device driver state, hung off vdev->priv. */
struct virtio_blk {
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/* What host tells us, plus 2 for header & tailer. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* num of vqs */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

57
/*
 * Per-request driver data, allocated by blk-mq as the request pdu.
 * The flexible scatterlist is sized via tag_set.cmd_size (sg_elems entries).
 */
struct virtblk_req {
#ifdef CONFIG_VIRTIO_BLK_SCSI
	struct scsi_request sreq;	/* for SCSI passthrough, must be first */
	u8 sense[SCSI_SENSE_BUFFERSIZE];
	struct virtio_scsi_inhdr in_hdr;
#endif
	struct virtio_blk_outhdr out_hdr;
	u8 status;	/* written by the device, read by virtblk_result() */
	struct scatterlist sg[];
};

68
static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
69 70 71
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
72
		return BLK_STS_OK;
73
	case VIRTIO_BLK_S_UNSUPP:
74
		return BLK_STS_NOTSUPP;
75
	default:
76
		return BLK_STS_IOERR;
77 78 79
	}
}

80 81 82 83 84 85 86 87 88
/*
 * If this is a packet command we need a couple of additional headers.  Behind
 * the normal outhdr we put a segment with the scsi command block, and before
 * the normal inhdr we put the sense data and the inhdr with additional status
 * information.
 */
#ifdef CONFIG_VIRTIO_BLK_SCSI
/* Queue a SCSI passthrough request: outhdr, cdb, data, sense, inhdr, status. */
static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;
	sg_init_one(&cmd, vbr->sreq.cmd, vbr->sreq.cmd_len);
	sgs[num_out++] = &cmd;

	if (have_data) {
		/* Writes are device-readable (out), reads device-writable (in). */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE);
	sgs[num_out + num_in++] = &sense;
	sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
	sgs[num_out + num_in++] = &inhdr;
	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

/* Copy residual/sense/result from the virtio inhdr into the scsi_request. */
static inline void virtblk_scsi_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
	struct virtio_blk *vblk = req->q->queuedata;
	struct scsi_request *sreq = &vbr->sreq;

	sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
	sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
	sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
}

/* Forward generic SCSI ioctls when the host advertises VIRTIO_BLK_F_SCSI. */
static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
			     unsigned int cmd, unsigned long data)
{
	struct gendisk *disk = bdev->bd_disk;
	struct virtio_blk *vblk = disk->private_data;

	/*
	 * Only allow the generic SCSI ioctls if the host can support it.
	 */
	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
		return -ENOTTY;

	return scsi_cmd_blk_ioctl(bdev, mode, cmd,
				  (void __user *)data);
}
#else
/* SCSI passthrough compiled out: stubs keep callers unconditional. */
static inline int virtblk_add_req_scsi(struct virtqueue *vq,
		struct virtblk_req *vbr, struct scatterlist *data_sg,
		bool have_data)
{
	return -EIO;
}
static inline void virtblk_scsi_request_done(struct request *req)
{
}
#define virtblk_ioctl	NULL
#endif /* CONFIG_VIRTIO_BLK_SCSI */

/*
 * Queue a normal block request: out header, optional data segments,
 * and the device-written status byte.  Caller holds the vq lock.
 */
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;

	if (have_data) {
		/* Writes are device-readable (out), reads device-writable (in). */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
/*
 * Build the discard/write-zeroes range array for @req and attach it as the
 * request's special payload.  The array is freed in virtblk_request_done()
 * once the device has completed the request.  Returns 0 or -ENOMEM.
 */
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	/* GFP_ATOMIC: called from the blk-mq submission path. */
	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	/* One range entry per bio in the (possibly merged) request. */
	__rq_for_each_bio(bio, req) {
		u64 sector = bio->bi_iter.bi_sector;
		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

		range[n].flags = cpu_to_le32(flags);
		range[n].num_sectors = cpu_to_le32(num_sectors);
		range[n].sector = cpu_to_le64(sector);
		n++;
	}

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

209
/*
 * blk-mq ->complete handler: free any discard/write-zeroes payload,
 * post-process SCSI passthrough requests, then end the request with
 * the status reported by the device.
 */
static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		/* Range array allocated by virtblk_setup_discard_write_zeroes(). */
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	switch (req_op(req)) {
	case REQ_OP_SCSI_IN:
	case REQ_OP_SCSI_OUT:
		virtblk_scsi_request_done(req);
		break;
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

/*
 * Virtqueue callback: drain completed requests and hand them to blk-mq.
 * The disable_cb/enable_cb loop closes the race where the device adds a
 * buffer after the final get_buf but before callbacks are re-enabled.
 */
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

256
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
257
			   const struct blk_mq_queue_data *bd)
R
Rusty Russell 已提交
258
{
J
Jens Axboe 已提交
259
	struct virtio_blk *vblk = hctx->queue->queuedata;
260
	struct request *req = bd->rq;
261
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
J
Jens Axboe 已提交
262
	unsigned long flags;
263
	unsigned int num;
264
	int qid = hctx->queue_num;
265
	int err;
266
	bool notify = false;
267
	bool unmap = false;
268
	u32 type;
R
Rusty Russell 已提交
269

J
Jens Axboe 已提交
270
	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
R
Rusty Russell 已提交
271

272 273 274 275 276 277 278 279
	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
280 281 282 283 284 285 286
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
287 288 289 290 291 292 293 294 295
	case REQ_OP_SCSI_IN:
	case REQ_OP_SCSI_OUT:
		type = VIRTIO_BLK_T_SCSI_CMD;
		break;
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
296
		return BLK_STS_IOERR;
R
Rusty Russell 已提交
297 298
	}

299 300 301 302 303
	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

304 305
	blk_mq_start_request(req);

306 307 308 309 310 311
	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

312
	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
313
	if (num) {
314
		if (rq_data_dir(req) == WRITE)
M
Michael S. Tsirkin 已提交
315
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
316
		else
M
Michael S. Tsirkin 已提交
317
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
R
Rusty Russell 已提交
318 319
	}

320
	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
321
	if (blk_rq_is_scsi(req))
322 323 324
		err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
	else
		err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
325
	if (err) {
326
		virtqueue_kick(vblk->vqs[qid].vq);
J
Jens Axboe 已提交
327
		blk_mq_stop_hw_queue(hctx);
328
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
329 330 331
		/* Out of mem doesn't actually happen, since we fall back
		 * to direct descriptors */
		if (err == -ENOMEM || err == -ENOSPC)
332
			return BLK_STS_DEV_RESOURCE;
333
		return BLK_STS_IOERR;
334 335
	}

336
	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
337
		notify = true;
338
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
339 340

	if (notify)
341
		virtqueue_notify(vblk->vqs[qid].vq);
342
	return BLK_STS_OK;
343 344
}

345 346 347 348 349
/* return id (s/n) string for *disk to *id_str
 *
 * Issues a synchronous VIRTIO_BLK_T_GET_ID request (REQ_OP_DRV_IN) and
 * copies up to VIRTIO_BLK_ID_BYTES into @id_str.  Returns 0 or -errno.
 */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	/* Wait for completion, then translate the device status. */
	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

369 370 371
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;

	/* see if the host passed in geometry config */
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
	return 0;
}

391
/* Block device operations; .ioctl is NULL when SCSI passthrough is off. */
static const struct block_device_operations virtblk_fops = {
	.ioctl  = virtblk_ioctl,
	.owner  = THIS_MODULE,
	.getgeo = virtblk_getgeo,
};

397 398 399 400 401
/* Each device index owns a contiguous run of 1 << PART_BITS minors. */
static int index_to_minor(int index)
{
	return index << PART_BITS;
}

/* Inverse of index_to_minor(). */
static int minor_to_index(int minor)
{
	return minor >> PART_BITS;
}

407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
/* sysfs 'serial' attribute: report the device ID string, empty if
 * the host does not implement VIRTIO_BLK_T_GET_ID. */
static ssize_t virtblk_serial_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	/* The ID need not be NUL-terminated by the device. */
	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}

static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL);
428

429 430
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	/* capacity is in 512-byte sectors; convert to logical blocks. */
	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity(vblk->disk, capacity);
}

/* Workqueue handler: re-read capacity and notify userspace of the resize. */
static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);
	char *envp[] = { "RESIZE=1", NULL };

	virtblk_update_capacity(vblk, true);
	revalidate_disk(vblk->disk);
	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
}

/* Config-change interrupt: defer to process context via the workqueue. */
static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

485 486
/*
 * Discover and set up the device's virtqueues (one per hardware queue).
 * The number of queues comes from VIRTIO_BLK_F_MQ, defaulting to 1.
 * On success vblk->vqs/num_vqs are populated; on failure everything is
 * freed and an -errno is returned.
 */
static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	/* Temporary arrays for virtio_find_vqs(); freed below. */
	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration.  */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Encodes @index as a base-26 letter suffix ("a".."z", "aa".."zz", ...)
 * appended to @prefix in @buf.  Returns 0, or -EINVAL if @buflen is too
 * small to hold the result.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int radix = 'z' - 'a' + 1;
	size_t prefix_len = strlen(prefix);
	char *suffix_start = buf + prefix_len;
	char *buf_end = buf + buflen;
	char *cursor = buf_end - 1;

	/* Build the suffix right-to-left at the end of the buffer. */
	*cursor = '\0';
	for (;;) {
		if (cursor == suffix_start)
			return -EINVAL;
		*--cursor = 'a' + (index % radix);
		index = index / radix - 1;
		if (index < 0)
			break;
	}

	/* Slide the suffix down next to the prefix, then write the prefix. */
	memmove(suffix_start, cursor, buf_end - cursor);
	memcpy(buf, prefix, prefix_len);

	return 0;
}

568 569 570 571 572
/* Read the writeback-cache setting: config WCE byte if configurable,
 * otherwise inferred from the FLUSH feature. Returns 0 or 1. */
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
	if (err)
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);

	return writeback;
}

/* Propagate the current cache mode to the block layer. */
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk(vblk->disk);
}

/* Index matches the WCE config byte: 0 = write through, 1 = write back. */
static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

/* sysfs 'cache_type' store: write the WCE byte and refresh the queue. */
static ssize_t
virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	/* Attribute is only registered RW when CONFIG_WCE is negotiated. */
	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

/* sysfs 'cache_type' show: report the current cache mode string. */
static ssize_t
virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

/* Read-only variant used when the host does not support CONFIG_WCE. */
static const struct device_attribute dev_attr_cache_type_ro =
	__ATTR(cache_type, 0444,
	       virtblk_cache_type_show, NULL);
static const struct device_attribute dev_attr_cache_type_rw =
	__ATTR(cache_type, 0644,
	       virtblk_cache_type_show, virtblk_cache_type_store);

638 639
/* blk-mq ->init_request: one-time setup of the per-request pdu. */
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

#ifdef CONFIG_VIRTIO_BLK_SCSI
	vbr->sreq.sense = vbr->sense;
#endif
	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

/* blk-mq ->map_queues: spread hw queues across the device's MSI vectors. */
static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

	return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
}

658 659 660 661 662 663 664 665 666
#ifdef CONFIG_VIRTIO_BLK_SCSI
/* blk-mq ->initialize_rq_fn: reset the embedded scsi_request per issue. */
static void virtblk_initialize_rq(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	scsi_req_init(&vbr->sreq);
}
#endif

static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
	.initialize_rq_fn = virtblk_initialize_rq,
#endif
	.map_queues	= virtblk_map_queues,
};

/* 0 (default) means "size to the virtqueue ring" — see virtblk_probe(). */
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
J
Jens Axboe 已提交
679

680
static int virtblk_probe(struct virtio_device *vdev)
R
Rusty Russell 已提交
681 682
{
	struct virtio_blk *vblk;
683
	struct request_queue *q;
684
	int err, index;
685

686 687 688
	u32 v, blk_size, sg_elems, opt_io_size;
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;
R
Rusty Russell 已提交
689

690 691 692 693 694 695
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

696 697 698 699 700
	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;
701

702
	/* We need to know how many segments before we allocate. */
703 704 705
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);
706 707 708

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
709 710 711 712
		sg_elems = 1;

	/* We need an extra sg elements at head and tail. */
	sg_elems += 2;
J
Jens Axboe 已提交
713
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
R
Rusty Russell 已提交
714 715
	if (!vblk) {
		err = -ENOMEM;
716
		goto out_free_index;
R
Rusty Russell 已提交
717 718 719
	}

	vblk->vdev = vdev;
720
	vblk->sg_elems = sg_elems;
721

722
	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
R
Rusty Russell 已提交
723

724 725
	err = init_vq(vblk);
	if (err)
R
Rusty Russell 已提交
726 727 728
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
729
	vblk->disk = alloc_disk(1 << PART_BITS);
R
Rusty Russell 已提交
730 731
	if (!vblk->disk) {
		err = -ENOMEM;
J
Jens Axboe 已提交
732
		goto out_free_vq;
R
Rusty Russell 已提交
733 734
	}

735
	/* Default queue sizing is to fill the ring. */
736
	if (!virtblk_queue_depth) {
737
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
738 739
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
740
			virtblk_queue_depth /= 2;
741
	}
742 743 744 745 746 747 748

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	vblk->tag_set.cmd_size =
J
Jens Axboe 已提交
749 750
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
751
	vblk->tag_set.driver_data = vblk;
752
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
J
Jens Axboe 已提交
753

754 755 756 757
	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

758
	q = blk_mq_init_queue(&vblk->tag_set);
759
	if (IS_ERR(q)) {
R
Rusty Russell 已提交
760
		err = -ENOMEM;
761
		goto out_free_tags;
R
Rusty Russell 已提交
762
	}
763
	vblk->disk->queue = q;
R
Rusty Russell 已提交
764

765
	q->queuedata = vblk;
766

767
	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
768

R
Rusty Russell 已提交
769
	vblk->disk->major = major;
770
	vblk->disk->first_minor = index_to_minor(index);
R
Rusty Russell 已提交
771 772
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
773
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
774
	vblk->index = index;
775

776
	/* configure queue flush support */
777
	virtblk_update_cache_mode(vdev);
R
Rusty Russell 已提交
778

779 780 781 782
	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

783
	/* We can handle whatever the host told us to handle. */
784
	blk_queue_max_segments(q, vblk->sg_elems-2);
785

786
	/* No real sector limit. */
787
	blk_queue_max_hw_sectors(q, -1U);
788

789 790
	/* Host can optionally specify maximum segment size and number of
	 * segments. */
791 792
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
R
Rusty Russell 已提交
793
	if (!err)
794
		blk_queue_max_segment_size(q, v);
795
	else
796
		blk_queue_max_segment_size(q, -1U);
R
Rusty Russell 已提交
797

798
	/* Host can optionally specify the block size of the device */
799 800 801
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
802
	if (!err)
803 804 805 806 807
		blk_queue_logical_block_size(q, blk_size);
	else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
808 809 810
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
811 812 813 814
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

815 816 817
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
818 819 820
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

821 822 823
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
824 825 826
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

827 828 829
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
830 831 832
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);
		blk_queue_max_discard_segments(q,
					       min_not_zero(v,
							    MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

859
	virtblk_update_capacity(vblk, false);
M
Michael S. Tsirkin 已提交
860 861
	virtio_device_ready(vdev);

862
	device_add_disk(&vdev->dev, vblk->disk);
863 864 865 866
	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
	if (err)
		goto out_del_disk;

867 868 869 870 871 872 873 874
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		err = device_create_file(disk_to_dev(vblk->disk),
					 &dev_attr_cache_type_rw);
	else
		err = device_create_file(disk_to_dev(vblk->disk),
					 &dev_attr_cache_type_ro);
	if (err)
		goto out_del_disk;
R
Rusty Russell 已提交
875 876
	return 0;

877 878 879
out_del_disk:
	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);
880 881
out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
R
Rusty Russell 已提交
882 883 884
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
885
	vdev->config->del_vqs(vdev);
R
Rusty Russell 已提交
886 887
out_free_vblk:
	kfree(vblk);
888 889
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
R
Rusty Russell 已提交
890 891 892 893
out:
	return err;
}

894
/* Tear down a device: unregister the disk, reset the device and free
 * everything allocated in probe.  The ida index is kept while the disk's
 * kobject still has other users so the name is not reused underneath them. */
static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int index = vblk->index;
	int refc;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Sample the refcount before put_disk() may drop the last ref. */
	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
	kfree(vblk);

	/* Only free device id if we don't have any users */
	if (refc == 1)
		ida_simple_remove(&vd_index_ida, index);
}

922
#ifdef CONFIG_PM_SLEEP
923 924 925 926 927 928 929
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

930
	/* Make sure no work handler is accessing the device. */
931 932
	flush_work(&vblk->config_work);

933
	blk_mq_quiesce_queue(vblk->disk->queue);
934 935 936 937 938 939 940 941 942 943 944

	vdev->config->del_vqs(vdev);
	return 0;
}

static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	ret = init_vq(vdev->priv);
945 946 947 948
	if (ret)
		return ret;

	virtio_device_ready(vdev);
J
Jens Axboe 已提交
949

950
	blk_mq_unquiesce_queue(vblk->disk->queue);
951
	return 0;
952 953 954
}
#endif

955
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Features negotiated with legacy (pre-1.0) devices; includes SCSI
 * passthrough, which virtio 1.0 removed. */
static unsigned int features_legacy[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
#ifdef CONFIG_VIRTIO_BLK_SCSI
	VIRTIO_BLK_F_SCSI,
#endif
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

/* Features negotiated with modern (virtio 1.0+) devices. */
static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

/* Module init: workqueue, then major number, then driver registration;
 * unwound in reverse order on failure. */
static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

/* Module exit: mirror of init(), in reverse order. */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");