/*
 * virtio_blk.c - driver for virtio block devices.
 */
//#define DEBUG
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <scsi/scsi_cmnd.h>
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-virtio.h>
#include <linux/numa.h>

#define PART_BITS 4
#define VQ_NAME_LEN 16
#define MAX_DISCARD_SEGMENTS 256u

static int major;
static DEFINE_IDA(vd_index_ida);

static struct workqueue_struct *virtblk_wq;

28 29 30 31 32 33
/*
 * Per-virtqueue state: the virtqueue, a spinlock serializing access to
 * it, and the name handed to virtio core when the queue is requested.
 */
struct virtio_blk_vq {
	struct virtqueue *vq;
	spinlock_t lock;
	char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;

34
/* Per-device driver state, hung off vdev->priv. */
struct virtio_blk {
	struct virtio_device *vdev;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Block layer tags. */
	struct blk_mq_tag_set tag_set;

	/* Process context for config space updates */
	struct work_struct config_work;

	/* What host tells us, plus 2 for header & tailer. */
	unsigned int sg_elems;

	/* Ida index - used to track minor number allocations. */
	int index;

	/* num of vqs */
	int num_vqs;
	struct virtio_blk_vq *vqs;
};

57
/*
 * Per-request driver data; blk-mq allocates it in front of the request
 * (tag_set.cmd_size covers this struct plus the flexible sg[] array).
 */
struct virtblk_req {
#ifdef CONFIG_VIRTIO_BLK_SCSI
	struct scsi_request sreq;	/* for SCSI passthrough, must be first */
	u8 sense[SCSI_SENSE_BUFFERSIZE];
	struct virtio_scsi_inhdr in_hdr;
#endif
	struct virtio_blk_outhdr out_hdr;
	/* Status byte written by the device (device-writable buffer). */
	u8 status;
	struct scatterlist sg[];
};

68
static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
69 70 71
{
	switch (vbr->status) {
	case VIRTIO_BLK_S_OK:
72
		return BLK_STS_OK;
73
	case VIRTIO_BLK_S_UNSUPP:
74
		return BLK_STS_NOTSUPP;
75
	default:
76
		return BLK_STS_IOERR;
77 78 79
	}
}

80 81 82 83 84 85 86 87 88
/*
 * If this is a packet command we need a couple of additional headers.  Behind
 * the normal outhdr we put a segment with the scsi command block, and before
 * the normal inhdr we put the sense data and the inhdr with additional status
 * information.
 */
#ifdef CONFIG_VIRTIO_BLK_SCSI
/*
 * Queue a SCSI passthrough request.  Descriptor layout (order matters,
 * it is the device ABI): outhdr, cmd, [data], sense, inhdr, status.
 */
static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;
	sg_init_one(&cmd, vbr->sreq.cmd, vbr->sreq.cmd_len);
	sgs[num_out++] = &cmd;

	if (have_data) {
		/* Writes: data is device-readable and goes with the out
		 * buffers; reads: it is device-writable and goes with the
		 * in buffers. */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE);
	sgs[num_out + num_in++] = &sense;
	sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
	sgs[num_out + num_in++] = &inhdr;
	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

/* Copy SCSI completion details out of the virtio in-header. */
static inline void virtblk_scsi_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
	struct virtio_blk *vblk = req->q->queuedata;
	struct scsi_request *sreq = &vbr->sreq;

	sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
	sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
	sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
}

static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
			     unsigned int cmd, unsigned long data)
{
	struct gendisk *disk = bdev->bd_disk;
	struct virtio_blk *vblk = disk->private_data;

	/*
	 * Only allow the generic SCSI ioctls if the host can support it.
	 */
	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
		return -ENOTTY;

	return scsi_cmd_blk_ioctl(bdev, mode, cmd,
				  (void __user *)data);
}
#else
/* Stubs used when SCSI passthrough support is compiled out. */
static inline int virtblk_add_req_scsi(struct virtqueue *vq,
		struct virtblk_req *vbr, struct scatterlist *data_sg,
		bool have_data)
{
	return -EIO;
}
static inline void virtblk_scsi_request_done(struct request *req)
{
}
#define virtblk_ioctl	NULL
#endif /* CONFIG_VIRTIO_BLK_SCSI */

/*
 * Queue a normal block request.  Descriptor layout (device ABI):
 * outhdr, [data], status.
 */
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
		struct scatterlist *data_sg, bool have_data)
{
	struct scatterlist hdr, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;

	if (have_data) {
		/* Data is device-readable for writes, device-writable for
		 * reads. */
		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
			sgs[num_out++] = data_sg;
		else
			sgs[num_out + num_in++] = data_sg;
	}

	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;

	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
/*
 * Build the array of virtio_blk_discard_write_zeroes range descriptors
 * for a DISCARD or WRITE_ZEROES request and attach it to the request as
 * a special payload (freed in virtblk_request_done()).
 *
 * @unmap: set the UNMAP flag on every range (write-zeroes only; the
 *         caller passes false for discard).
 * Returns 0 or a negative errno.
 */
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);
	unsigned short n = 0;
	struct virtio_blk_discard_write_zeroes *range;
	struct bio *bio;
	u32 flags = 0;

	if (unmap)
		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;

	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	__rq_for_each_bio(bio, req) {
		u64 sector = bio->bi_iter.bi_sector;
		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;

		range[n].flags = cpu_to_le32(flags);
		range[n].num_sectors = cpu_to_le32(num_sectors);
		range[n].sector = cpu_to_le64(sector);
		n++;
	}

	/*
	 * If the bio walk produced a different number of ranges than the
	 * block layer reported, the array would contain uninitialized or
	 * truncated descriptors - fail rather than hand them to the device.
	 */
	if (WARN_ON_ONCE(n != segments)) {
		kfree(range);
		return -EIO;
	}

	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range) * segments;
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}

209
/*
 * blk-mq ->complete handler: free the discard/write-zeroes payload if
 * any, propagate SCSI completion data, and end the request with the
 * status reported by the device.
 */
static inline void virtblk_request_done(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		/* Range array allocated in virtblk_setup_discard_write_zeroes(). */
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
	}

	switch (req_op(req)) {
	case REQ_OP_SCSI_IN:
	case REQ_OP_SCSI_OUT:
		virtblk_scsi_request_done(req);
		break;
	}

	blk_mq_end_request(req, virtblk_result(vbr));
}

/*
 * Virtqueue completion callback.  Drains all finished buffers from the
 * queue, completing their requests; loops with callbacks disabled until
 * virtqueue_enable_cb() confirms no more completions raced in.
 */
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	bool req_done = false;
	int qid = vq->index;
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
			struct request *req = blk_mq_rq_from_pdu(vbr);

			blk_mq_complete_request(req);
			req_done = true;
		}
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));

	/* In case queue is stopped waiting for more buffers. */
	if (req_done)
		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

256 257 258 259 260 261 262 263 264 265 266 267 268 269
/*
 * blk-mq ->commit_rqs handler: notify the device about requests queued
 * earlier without a kick (bd->last was false in ->queue_rq).
 */
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct virtio_blk_vq *vbq = &vblk->vqs[hctx->queue_num];
	bool must_notify;

	/* kick_prepare must run under the vq lock; notify must not. */
	spin_lock_irq(&vbq->lock);
	must_notify = virtqueue_kick_prepare(vbq->vq);
	spin_unlock_irq(&vbq->lock);

	if (must_notify)
		virtqueue_notify(vbq->vq);
}

270
/*
 * blk-mq ->queue_rq handler: translate the block request into a
 * virtio-blk command, map its data into the per-request scatterlist and
 * add it to this hardware queue's virtqueue.  The device is only
 * notified for the last request of a batch (bd->last); earlier requests
 * are flushed by virtio_commit_rqs().
 */
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
			   const struct blk_mq_queue_data *bd)
{
	struct virtio_blk *vblk = hctx->queue->queuedata;
	struct request *req = bd->rq;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
	unsigned long flags;
	unsigned int num;
	int qid = hctx->queue_num;
	int err;
	bool notify = false;
	bool unmap = false;
	u32 type;

	/* sg_elems includes the two slots reserved for header/status. */
	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);

	/* Pick the virtio command type for this operation. */
	switch (req_op(req)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
		type = 0;
		break;
	case REQ_OP_FLUSH:
		type = VIRTIO_BLK_T_FLUSH;
		break;
	case REQ_OP_DISCARD:
		type = VIRTIO_BLK_T_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		type = VIRTIO_BLK_T_WRITE_ZEROES;
		unmap = !(req->cmd_flags & REQ_NOUNMAP);
		break;
	case REQ_OP_SCSI_IN:
	case REQ_OP_SCSI_OUT:
		type = VIRTIO_BLK_T_SCSI_CMD;
		break;
	case REQ_OP_DRV_IN:
		type = VIRTIO_BLK_T_GET_ID;
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	/* Only plain read/write (type == 0) carries a sector number. */
	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
	vbr->out_hdr.sector = type ?
		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));

	blk_mq_start_request(req);

	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
		err = virtblk_setup_discard_write_zeroes(req, unmap);
		if (err)
			return BLK_STS_RESOURCE;
	}

	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
	if (num) {
		if (rq_data_dir(req) == WRITE)
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
		else
			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
	}

	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
	if (blk_rq_is_scsi(req))
		err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
	else
		err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
	if (err) {
		/* Ring is full: kick what is queued and stop the hw queue
		 * until completions free up descriptors. */
		virtqueue_kick(vblk->vqs[qid].vq);
		blk_mq_stop_hw_queue(hctx);
		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
		/* Out of mem doesn't actually happen, since we fall back
		 * to direct descriptors */
		if (err == -ENOMEM || err == -ENOSPC)
			return BLK_STS_DEV_RESOURCE;
		return BLK_STS_IOERR;
	}

	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
		notify = true;
	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);

	/* Notify outside the lock to keep the critical section short. */
	if (notify)
		virtqueue_notify(vblk->vqs[qid].vq);
	return BLK_STS_OK;
}

359 360 361 362 363
/* return id (s/n) string for *disk to *id_str
 *
 * Issues a synchronous VIRTIO_BLK_T_GET_ID (REQ_OP_DRV_IN) request;
 * id_str must have room for VIRTIO_BLK_ID_BYTES.  Returns 0 or a
 * negative errno.
 */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *req;
	int err;

	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (err)
		goto out;

	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
	blk_put_request(req);
	return err;
}

383 384 385
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;

	/* see if the host passed in geometry config */
	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.cylinders, &geo->cylinders);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.heads, &geo->heads);
		virtio_cread(vblk->vdev, struct virtio_blk_config,
			     geometry.sectors, &geo->sectors);
	} else {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	}
	return 0;
}

405
/* Block device operations; .ioctl is NULL without SCSI passthrough. */
static const struct block_device_operations virtblk_fops = {
	.ioctl  = virtblk_ioctl,
	.owner  = THIS_MODULE,
	.getgeo = virtblk_getgeo,
};

411 412 413 414 415
/* Convert a device index to its first minor number (PART_BITS minors
 * per disk are reserved for partitions). */
static int index_to_minor(int index)
{
	return index << PART_BITS;
}

/* Inverse of index_to_minor(). */
static int minor_to_index(int minor)
{
	return minor >> PART_BITS;
}

421 422
/* sysfs "serial" attribute: expose the device ID string. */
static ssize_t serial_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	/* The ID may not be NUL-terminated by the device. */
	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);
	if (!err)
		return strlen(buf);

	if (err == -EIO) /* Unsupported? Make it empty. */
		return 0;

	return err;
}

static DEVICE_ATTR_RO(serial);
442

443 444
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
	struct virtio_device *vdev = vblk->vdev;
	struct request_queue *q = vblk->disk->queue;
	char cap_str_2[10], cap_str_10[10];
	unsigned long long nblocks;
	u64 capacity;

	/* Host must always specify the capacity. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)capacity != capacity) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)capacity);
		capacity = (sector_t)-1;
	}

	/* capacity is in 512-byte sectors; convert to logical blocks. */
	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);

	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
	string_get_size(nblocks, queue_logical_block_size(q),
			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));

	dev_notice(&vdev->dev,
		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
		   vblk->disk->disk_name,
		   resize ? "new size: " : "",
		   nblocks,
		   queue_logical_block_size(q),
		   cap_str_10,
		   cap_str_2);

	set_capacity(vblk->disk, capacity);
}

/* Workqueue handler: re-read capacity and emit a resize uevent. */
static void virtblk_config_changed_work(struct work_struct *work)
{
	struct virtio_blk *vblk =
		container_of(work, struct virtio_blk, config_work);
	char *envp[] = { "RESIZE=1", NULL };

	virtblk_update_capacity(vblk, true);
	revalidate_disk(vblk->disk);
	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
}

/* Config-change interrupt: defer the work to process context. */
static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	queue_work(virtblk_wq, &vblk->config_work);
}

499 500
/*
 * Allocate and discover the device's virtqueues (one per hardware
 * queue, capped at nr_cpu_ids).  On success vblk->vqs/num_vqs are set;
 * on failure everything allocated here is freed.
 */
static int init_vq(struct virtio_blk *vblk)
{
	int err;
	int i;
	vq_callback_t **callbacks;
	const char **names;
	struct virtqueue **vqs;
	unsigned short num_vqs;
	struct virtio_device *vdev = vblk->vdev;
	struct irq_affinity desc = { 0, };

	/* Without VIRTIO_BLK_F_MQ the device has a single request queue. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
				   struct virtio_blk_config, num_queues,
				   &num_vqs);
	if (err)
		num_vqs = 1;

	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);

	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
	if (!vblk->vqs)
		return -ENOMEM;

	/* Temporary arrays only needed for virtio_find_vqs(). */
	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!names || !callbacks || !vqs) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Discover virtqueues and write information to configuration.  */
	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
	if (err)
		goto out;

	for (i = 0; i < num_vqs; i++) {
		spin_lock_init(&vblk->vqs[i].lock);
		vblk->vqs[i].vq = vqs[i];
	}
	vblk->num_vqs = num_vqs;

out:
	kfree(vqs);
	kfree(callbacks);
	kfree(names);
	if (err)
		kfree(vblk->vqs);
	return err;
}

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Encodes @index as a bijective base-26 suffix after @prefix into @buf
 * (vda, vdb, ..., vdz, vdaa, ...).  Returns 0, or -EINVAL if @buflen is
 * too small.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	char *begin = buf + strlen(prefix);
	char *end = buf + buflen;
	char *p = end - 1;

	/* Build the suffix right-to-left at the end of the buffer. */
	*p = '\0';
	for (;;) {
		if (p == begin)
			return -EINVAL;
		*--p = 'a' + (index % base);
		index = index / base - 1;
		if (index < 0)
			break;
	}

	/* Slide the suffix down to follow the prefix, then write the prefix. */
	memmove(begin, p, end - p);
	memcpy(buf, prefix, strlen(prefix));

	return 0;
}

584 585 586 587 588
/* Read the writeback-cache setting (1 = write back, 0 = write through). */
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 writeback;
	int err;

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
				   struct virtio_blk_config, wce,
				   &writeback);

	/*
	 * If WCE is not configurable and flush is not available,
	 * assume no writeback cache is in use.
	 */
	if (err)
		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);

	return writeback;
}

/* Propagate the device's cache mode into the block layer. */
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	u8 writeback = virtblk_get_cache_mode(vdev);
	struct virtio_blk *vblk = vdev->priv;

	blk_queue_write_cache(vblk->disk->queue, writeback, false);
	revalidate_disk(vblk->disk);
}

/* Indexed by the cache mode value: 0 = write through, 1 = write back. */
static const char *const virtblk_cache_types[] = {
	"write through", "write back"
};

/* sysfs "cache_type" store: write the chosen mode to config space. */
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
		 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	int i;

	/* Attribute is read-only without CONFIG_WCE (see is_visible). */
	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
	i = sysfs_match_string(virtblk_cache_types, buf);
	if (i < 0)
		return i;

	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
	virtblk_update_cache_mode(vdev);
	return count;
}

/* sysfs "cache_type" show: report the current cache mode name. */
static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	u8 writeback = virtblk_get_cache_mode(vblk->vdev);

	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}

646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
static DEVICE_ATTR_RW(cache_type);

static struct attribute *virtblk_attrs[] = {
	&dev_attr_serial.attr,
	&dev_attr_cache_type.attr,
	NULL,
};

/* Make cache_type read-only when the device can't change WCE. */
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;

	if (a == &dev_attr_cache_type.attr &&
	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
		return S_IRUGO;

	return a->mode;
}

static const struct attribute_group virtblk_attr_group = {
	.attrs = virtblk_attrs,
	.is_visible = virtblk_attrs_are_visible,
};

static const struct attribute_group *virtblk_attr_groups[] = {
	&virtblk_attr_group,
	NULL,
};
678

679 680
/* blk-mq ->init_request: one-time setup of the per-request pdu. */
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct virtio_blk *vblk = set->driver_data;
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);

#ifdef CONFIG_VIRTIO_BLK_SCSI
	/* Point the scsi_request at our embedded sense buffer. */
	vbr->sreq.sense = vbr->sense;
#endif
	sg_init_table(vbr->sg, vblk->sg_elems);
	return 0;
}

692 693 694 695
/* blk-mq ->map_queues: map hw queues per the virtio IRQ affinity. */
static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					vblk->vdev, 0);
}

700 701 702 703 704 705 706 707 708
#ifdef CONFIG_VIRTIO_BLK_SCSI
/* Reset the embedded scsi_request each time a request is reused. */
static void virtblk_initialize_rq(struct request *req)
{
	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);

	scsi_req_init(&vbr->sreq);
}
#endif

/* blk-mq operations table for virtio-blk. */
static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
	.initialize_rq_fn = virtblk_initialize_rq,
#endif
	.map_queues	= virtblk_map_queues,
};

720 721
/* Module parameter: override the per-queue depth (0 = size to ring). */
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);

/*
 * Probe a virtio block device: allocate driver state and virtqueues,
 * set up the blk-mq tag set and request queue, read the optional
 * geometry/limit config fields, and register the gendisk.
 */
static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	struct request_queue *q;
	int err, index;

	u32 v, blk_size, max_size, sg_elems, opt_io_size;
	u16 min_io_size;
	u8 physical_block_exp, alignment_offset;

	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
			     GFP_KERNEL);
	if (err < 0)
		goto out;
	index = err;

	/* We need to know how many segments before we allocate. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
				   struct virtio_blk_config, seg_max,
				   &sg_elems);

	/* We need at least one SG element, whatever they say. */
	if (err || !sg_elems)
		sg_elems = 1;

	/* We need an extra sg elements at head and tail. */
	sg_elems += 2;
	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
	if (!vblk) {
		err = -ENOMEM;
		goto out_free_index;
	}

	vblk->vdev = vdev;
	vblk->sg_elems = sg_elems;

	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);

	err = init_vq(vblk);
	if (err)
		goto out_free_vblk;

	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	/* Default queue sizing is to fill the ring. */
	if (!virtblk_queue_depth) {
		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
		/* ... but without indirect descs, we use 2 descs per req */
		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
			virtblk_queue_depth /= 2;
	}

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	/* Per-request pdu: virtblk_req followed by the scatterlist. */
	vblk->tag_set.cmd_size =
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		err = -ENOMEM;
		goto out_free_tags;
	}
	vblk->disk->queue = q;

	q->queuedata = vblk;

	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

	vblk->disk->major = major;
	vblk->disk->first_minor = index_to_minor(index);
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
	vblk->disk->flags |= GENHD_FL_EXT_DEVT;
	vblk->index = index;

	/* configure queue flush support */
	virtblk_update_cache_mode(vdev);

	/* If disk is read-only in the host, the guest should obey */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
		set_disk_ro(vblk->disk, 1);

	/* We can handle whatever the host told us to handle. */
	blk_queue_max_segments(q, vblk->sg_elems-2);

	/* No real sector limit. */
	blk_queue_max_hw_sectors(q, -1U);

	max_size = virtio_max_dma_size(vdev);

	/* Host can optionally specify maximum segment size and number of
	 * segments. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
				   struct virtio_blk_config, size_max, &v);
	if (!err)
		max_size = min(max_size, v);

	blk_queue_max_segment_size(q, max_size);

	/* Host can optionally specify the block size of the device */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
	if (!err)
		blk_queue_logical_block_size(q, blk_size);
	else
		blk_size = queue_logical_block_size(q);

	/* Use topology information if available */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, physical_block_exp,
				   &physical_block_exp);
	if (!err && physical_block_exp)
		blk_queue_physical_block_size(q,
				blk_size * (1 << physical_block_exp));

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, alignment_offset,
				   &alignment_offset);
	if (!err && alignment_offset)
		blk_queue_alignment_offset(q, blk_size * alignment_offset);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, min_io_size,
				   &min_io_size);
	if (!err && min_io_size)
		blk_queue_io_min(q, blk_size * min_io_size);

	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
				   struct virtio_blk_config, opt_io_size,
				   &opt_io_size);
	if (!err && opt_io_size)
		blk_queue_io_opt(q, blk_size * opt_io_size);

	/* Export discard limits if the device supports DISCARD. */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		q->limits.discard_granularity = blk_size;

		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &v);
		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);

		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
			     &v);
		blk_queue_max_discard_segments(q,
					       min_not_zero(v,
							    MAX_DISCARD_SEGMENTS));

		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
		virtio_cread(vdev, struct virtio_blk_config,
			     max_write_zeroes_sectors, &v);
		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
	}

	virtblk_update_capacity(vblk, false);
	virtio_device_ready(vdev);

	device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
	return 0;

out_free_tags:
	blk_mq_free_tag_set(&vblk->tag_set);
out_put_disk:
	put_disk(vblk->disk);
out_free_vq:
	vdev->config->del_vqs(vdev);
out_free_vblk:
	kfree(vblk);
out_free_index:
	ida_simple_remove(&vd_index_ida, index);
out:
	return err;
}

924
/* Tear down a device in the reverse order of virtblk_probe(). */
static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int index = vblk->index;
	int refc;

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	del_gendisk(vblk->disk);
	blk_cleanup_queue(vblk->disk->queue);

	blk_mq_free_tag_set(&vblk->tag_set);

	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);

	/* Sample the refcount before put_disk() may drop the last ref. */
	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
	put_disk(vblk->disk);
	vdev->config->del_vqs(vdev);
	kfree(vblk->vqs);
	kfree(vblk);

	/* Only free device id if we don't have any users */
	if (refc == 1)
		ida_simple_remove(&vd_index_ida, index);
}

952
#ifdef CONFIG_PM_SLEEP
953 954 955 956 957 958 959
static int virtblk_freeze(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;

	/* Ensure we don't receive any more interrupts */
	vdev->config->reset(vdev);

960
	/* Make sure no work handler is accessing the device. */
961 962
	flush_work(&vblk->config_work);

963
	blk_mq_quiesce_queue(vblk->disk->queue);
964 965 966 967 968 969 970 971 972 973 974

	vdev->config->del_vqs(vdev);
	return 0;
}

static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int ret;

	ret = init_vq(vdev->priv);
975 976 977 978
	if (ret)
		return ret;

	virtio_device_ready(vdev);
J
Jens Axboe 已提交
979

980
	blk_mq_unquiesce_queue(vblk->disk->queue);
981
	return 0;
982 983 984
}
#endif

985
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Features negotiated with legacy (pre-1.0) devices. */
static unsigned int features_legacy[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
#ifdef CONFIG_VIRTIO_BLK_SCSI
	VIRTIO_BLK_F_SCSI,
#endif
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

/* Features for modern devices (no SCSI passthrough in virtio 1.0+). */
static unsigned int features[] = {
	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};

static struct virtio_driver virtio_blk = {
	.feature_table			= features,
	.feature_table_size		= ARRAY_SIZE(features),
	.feature_table_legacy		= features_legacy,
	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
	.driver.name			= KBUILD_MODNAME,
	.driver.owner			= THIS_MODULE,
	.id_table			= id_table,
	.probe				= virtblk_probe,
	.remove				= virtblk_remove,
	.config_changed			= virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze				= virtblk_freeze,
	.restore			= virtblk_restore,
#endif
};

/* Module init: workqueue, block major, then driver registration. */
static int __init init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (error)
		goto out_unregister_blkdev;
	return 0;

out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}

/* Module exit: undo init() in reverse order. */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");