virtio-blk.c 31.5 KB
Newer Older
A
aliguori 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
15
#include "qapi/error.h"
16
#include "qemu-common.h"
17
#include "qemu/iov.h"
18
#include "qemu/error-report.h"
19
#include "trace.h"
P
Paolo Bonzini 已提交
20
#include "hw/block/block.h"
21
#include "sysemu/block-backend.h"
22
#include "sysemu/blockdev.h"
P
Paolo Bonzini 已提交
23
#include "hw/virtio/virtio-blk.h"
24
#include "dataplane/virtio-blk.h"
P
Paolo Bonzini 已提交
25
#include "block/scsi.h"
26 27 28
#ifdef __linux__
# include <scsi/sg.h>
#endif
P
Paolo Bonzini 已提交
29
#include "hw/virtio/virtio-bus.h"
30
#include "hw/virtio/virtio-access.h"
A
aliguori 已提交
31

G
Greg Kurz 已提交
32 33
/* Reset the per-request fields of @req before it is (re)used on
 * virtqueue @vq of device @s. */
static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq,
                                    VirtIOBlockReq *req)
{
    req->dev = s;
    req->vq = vq;
    req->qiov.size = 0;
    req->in_len = 0;
    req->next = NULL;    /* not linked on the restart queue (s->rq) */
    req->mr_next = NULL; /* not part of a merged multi-request chain */
}

G
Greg Kurz 已提交
43
/* Release a request previously obtained from virtio_blk_get_request()
 * or rebuilt by virtio_blk_load_device(). */
static void virtio_blk_free_request(VirtIOBlockReq *req)
{
    /* g_free() is documented as a no-op on NULL, so the guard that was
     * here was redundant. */
    g_free(req);
}

50
/* Complete @req: store @status in the guest-visible inhdr, push the
 * element back onto its virtqueue and notify the guest.  Does not free
 * the request; callers do that separately. */
static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    trace_virtio_blk_req_complete(req, status);

    stb_p(&req->in->status, status);
    virtqueue_push(req->vq, &req->elem, req->in_len);
    if (s->dataplane_started && !s->dataplane_disabled) {
        /* The dataplane owns this queue, so notify through its path. */
        virtio_blk_data_plane_notify(s->dataplane, req->vq);
    } else {
        virtio_notify(vdev, req->vq);
    }
}

K
Kevin Wolf 已提交
66
/* Apply the configured rerror/werror policy to a failed request.
 *
 * Returns nonzero when the caller must not complete @req itself (it was
 * either queued on s->rq for retry or already completed with IOERR),
 * zero when the error is ignored and processing should continue.
 */
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
    bool is_read)
{
    BlockErrorAction action = blk_get_error_action(req->dev->blk,
                                                   is_read, error);
    VirtIOBlock *s = req->dev;

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* Break the link as the next request is going to be parsed from the
         * ring again. Otherwise we may end up doing a double completion! */
        req->mr_next = NULL;
        req->next = s->rq;
        s->rq = req;
    } else if (action == BLOCK_ERROR_ACTION_REPORT) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        block_acct_failed(blk_get_stats(s->blk), &req->acct);
        virtio_blk_free_request(req);
    }

    blk_error_action(s->blk, action, is_read, error);
    return action != BLOCK_ERROR_ACTION_IGNORE;
}

A
aliguori 已提交
89 90
/* AIO completion callback for reads/writes.  @opaque is the head of a
 * chain of merged requests linked via mr_next; all of them completed
 * with the same @ret. */
static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *next = opaque;

    while (next) {
        VirtIOBlockReq *req = next;
        next = req->mr_next;
        trace_virtio_blk_rw_complete(req, ret);

        if (req->qiov.nalloc != -1) {
            /* If nalloc is != -1 req->qiov is a local copy of the original
             * external iovec. It was allocated in submit_requests
             * to be able to merge requests. */
            qemu_iovec_destroy(&req->qiov);
        }

        if (ret) {
            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
            bool is_read = !(p & VIRTIO_BLK_T_OUT);
            /* Note that memory may be dirtied on read failure.  If the
             * virtio request is not completed here, as is the case for
             * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
             * correctly during live migration.  While this is ugly,
             * it is acceptable because the device is free to write to
             * the memory until the request is completed (which will
             * happen on the other side of the migration).
             */
            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
                continue;
            }
        }

        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
        virtio_blk_free_request(req);
    }
}
A
aliguori 已提交
126

127 128 129 130
/* AIO completion callback for VIRTIO_BLK_T_FLUSH requests. */
static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    if (ret) {
        /* Flush errors go through the write error policy (is_read = 0). */
        if (virtio_blk_handle_rw_error(req, -ret, 0)) {
            return;
        }
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
    virtio_blk_free_request(req);
}

F
Fam Zheng 已提交
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
#ifdef __linux__

/* Bundles a request with the SG_IO header passed to blk_aio_ioctl(),
 * so the completion callback can recover both. */
typedef struct {
    VirtIOBlockReq *req;
    struct sg_io_hdr hdr;
} VirtIOBlockIoctlReq;

/* AIO completion callback for SG_IO passthrough requests: translate the
 * sg_io_hdr results into the guest-visible virtio_scsi_inhdr, then
 * complete and free the request. */
static void virtio_blk_ioctl_complete(void *opaque, int status)
{
    VirtIOBlockIoctlReq *ioctl_req = opaque;
    VirtIOBlockReq *req = ioctl_req->req;
    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
    struct virtio_scsi_inhdr *scsi;
    struct sg_io_hdr *hdr;

    /* The scsi inhdr sits in the second-to-last input segment. */
    scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;

    if (status) {
        status = VIRTIO_BLK_S_UNSUPP;
        virtio_stl_p(vdev, &scsi->errors, 255);
        goto out;
    }

    hdr = &ioctl_req->hdr;
    /*
     * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
     * clear the masked_status field [hence status gets cleared too, see
     * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
     * status has occurred.  However they do set DRIVER_SENSE in driver_status
     * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
     */
    if (hdr->status == 0 && hdr->sb_len_wr > 0) {
        hdr->status = CHECK_CONDITION;
    }

    virtio_stl_p(vdev, &scsi->errors,
                 hdr->status | (hdr->msg_status << 8) |
                 (hdr->host_status << 16) | (hdr->driver_status << 24));
    virtio_stl_p(vdev, &scsi->residual, hdr->resid);
    virtio_stl_p(vdev, &scsi->sense_len, hdr->sb_len_wr);
    virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);

out:
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
    g_free(ioctl_req);
}

#endif

192
/* Pop the next element off @vq and initialize it as a VirtIOBlockReq.
 * Returns NULL when the queue is empty. */
static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s, VirtQueue *vq)
{
    VirtIOBlockReq *req = virtqueue_pop(vq, sizeof(VirtIOBlockReq));

    if (req) {
        virtio_blk_init_request(s, vq, req);
    }
    return req;
}

202
/* Handle a VIRTIO_BLK_T_SCSI_CMD request by issuing an asynchronous
 * SG_IO ioctl on the backend (Linux only).
 *
 * Returns -EINPROGRESS when the ioctl was submitted (completion happens
 * in virtio_blk_ioctl_complete), otherwise a VIRTIO_BLK_S_* status the
 * caller should use to complete the request itself.
 */
static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
{
    int status = VIRTIO_BLK_S_OK;
    struct virtio_scsi_inhdr *scsi = NULL;
    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
    VirtQueueElement *elem = &req->elem;
    VirtIOBlock *blk = req->dev;

#ifdef __linux__
    int i;
    VirtIOBlockIoctlReq *ioctl_req;
    BlockAIOCB *acb;
#endif

    /*
     * We require at least one output segment each for the virtio_blk_outhdr
     * and the SCSI command block.
     *
     * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
     * and the sense buffer pointer in the input segments.
     */
    if (elem->out_num < 2 || elem->in_num < 3) {
        status = VIRTIO_BLK_S_IOERR;
        goto fail;
    }

    /*
     * The scsi inhdr is placed in the second-to-last input segment, just
     * before the regular inhdr.
     */
    scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;

    if (!blk->conf.scsi) {
        /* SCSI passthrough disabled by configuration. */
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

    /*
     * No support for bidirection commands yet.
     */
    if (elem->out_num > 2 && elem->in_num > 3) {
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

#ifdef __linux__
    ioctl_req = g_new0(VirtIOBlockIoctlReq, 1);
    ioctl_req->req = req;
    ioctl_req->hdr.interface_id = 'S';
    /* The CDB lives in the second output segment. */
    ioctl_req->hdr.cmd_len = elem->out_sg[1].iov_len;
    ioctl_req->hdr.cmdp = elem->out_sg[1].iov_base;
    ioctl_req->hdr.dxfer_len = 0;

    if (elem->out_num > 2) {
        /*
         * If there are more than the minimally required 2 output segments
         * there is write payload starting from the third iovec.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_TO_DEV;
        ioctl_req->hdr.iovec_count = elem->out_num - 2;

        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->out_sg[i + 2].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->out_sg + 2;

    } else if (elem->in_num > 3) {
        /*
         * If we have more than 3 input segments the guest wants to actually
         * read data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        ioctl_req->hdr.iovec_count = elem->in_num - 3;
        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->in_sg[i].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->in_sg;
    } else {
        /*
         * Some SCSI commands don't actually transfer any data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_NONE;
    }

    /* The sense buffer is the third-to-last input segment. */
    ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
    ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;

    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
                        virtio_blk_ioctl_complete, ioctl_req);
    if (!acb) {
        g_free(ioctl_req);
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }
    return -EINPROGRESS;
#else
    /* Unreachable: without Linux SG_IO we bail out above via !blk->conf.scsi
     * or the segment checks -- TODO confirm scsi=on is rejected at realize
     * time on non-Linux hosts. */
    abort();
#endif

fail:
    /* Just put anything nonzero so that the ioctl fails in the guest.  */
    if (scsi) {
        virtio_stl_p(vdev, &scsi->errors, 255);
    }
    return status;
}

/* Dispatch a SCSI passthrough request and complete it synchronously
 * unless it was submitted asynchronously (-EINPROGRESS). */
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    int status = virtio_blk_handle_scsi_req(req);

    if (status == -EINPROGRESS) {
        /* virtio_blk_ioctl_complete() will finish this request later. */
        return;
    }
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
}

P
Peter Lieven 已提交
322 323
/* Issue @num_reqs requests starting at mrb->reqs[start] as a single AIO
 * operation.  When more than one request is merged, a fresh qiov with
 * @niov entries is built in reqs[start] (destroyed again in
 * virtio_blk_rw_complete) and the requests are chained via mr_next. */
static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
                                   int start, int num_reqs, int niov)
{
    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
    int64_t sector_num = mrb->reqs[start]->sector_num;
    bool is_write = mrb->is_write;

    if (num_reqs > 1) {
        int i;
        struct iovec *tmp_iov = qiov->iov;
        int tmp_niov = qiov->niov;

        /* mrb->reqs[start]->qiov was initialized from external so we can't
         * modify it here. We need to initialize it locally and then add the
         * external iovecs. */
        qemu_iovec_init(qiov, niov);

        for (i = 0; i < tmp_niov; i++) {
            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
        }

        /* Append the other requests' iovecs and chain them for completion. */
        for (i = start + 1; i < start + num_reqs; i++) {
            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
                              mrb->reqs[i]->qiov.size);
            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
        }

        trace_virtio_blk_submit_multireq(mrb, start, num_reqs,
                                         sector_num << BDRV_SECTOR_BITS,
                                         qiov->size, is_write);
        block_acct_merge_done(blk_get_stats(blk),
                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
                              num_reqs - 1);
    }

    if (is_write) {
        blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
                        virtio_blk_rw_complete, mrb->reqs[start]);
    } else {
        blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
                       virtio_blk_rw_complete, mrb->reqs[start]);
    }
}

static int multireq_compare(const void *a, const void *b)
{
    const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
                         *req2 = *(VirtIOBlockReq **)b;

    /*
     * Note that we can't simply subtract sector_num1 from sector_num2
     * here as that could overflow the return value.
     */
    if (req1->sector_num > req2->sector_num) {
        return 1;
    } else if (req1->sector_num < req2->sector_num) {
        return -1;
    } else {
        return 0;
    }
}

G
Greg Kurz 已提交
384
/* Sort the queued requests by sector, merge sequential runs that fit the
 * backend's iov/transfer limits, and submit each run.  Resets
 * mrb->num_reqs to 0 on return. */
static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
{
    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
    uint32_t max_transfer;
    int64_t sector_num = 0;

    if (mrb->num_reqs == 1) {
        /* Single request: nothing to merge, submit directly. */
        submit_requests(blk, mrb, 0, 1, -1);
        mrb->num_reqs = 0;
        return;
    }

    /* NOTE(review): reqs[0]->dev->blk should be the same backend as @blk
     * here -- confirm before simplifying. */
    max_transfer = blk_get_max_transfer(mrb->reqs[0]->dev->blk);

    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
          &multireq_compare);

    for (i = 0; i < mrb->num_reqs; i++) {
        VirtIOBlockReq *req = mrb->reqs[i];
        if (num_reqs > 0) {
            /*
             * NOTE: We cannot merge the requests in below situations:
             * 1. requests are not sequential
             * 2. merge would exceed maximum number of IOVs
             * 3. merge would exceed maximum transfer length of backend device
             */
            if (sector_num + nb_sectors != req->sector_num ||
                niov > blk_get_max_iov(blk) - req->qiov.niov ||
                req->qiov.size > max_transfer ||
                nb_sectors > (max_transfer -
                              req->qiov.size) / BDRV_SECTOR_SIZE) {
                submit_requests(blk, mrb, start, num_reqs, niov);
                num_reqs = 0;
            }
        }

        if (num_reqs == 0) {
            /* Start a new run at the current request. */
            sector_num = req->sector_num;
            nb_sectors = niov = 0;
            start = i;
        }

        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
        niov += req->qiov.niov;
        num_reqs++;
    }

    /* Flush the final run. */
    submit_requests(blk, mrb, start, num_reqs, niov);
    mrb->num_reqs = 0;
}
434

435
/* Handle a VIRTIO_BLK_T_FLUSH request: submit any pending merged writes
 * first so the flush covers them, then issue the async flush. */
static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 0,
                     BLOCK_ACCT_FLUSH);

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    if (mrb->is_write && mrb->num_reqs > 0) {
        virtio_blk_submit_multireq(req->dev->blk, mrb);
    }
    blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
}

449 450 451
/* Validate a guest-supplied I/O range: request size limit, alignment to
 * the logical block size, and containment within the device capacity.
 * Returns true when the request at @sector of @size bytes is acceptable. */
static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
                                     uint64_t sector, size_t size)
{
    uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
    uint64_t total_sectors;

    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return false;
    }
    if (sector & dev->sector_mask) {
        /* Start sector not aligned to the logical block size. */
        return false;
    }
    if (size % dev->conf.conf.logical_block_size) {
        return false;
    }
    blk_get_geometry(dev->blk, &total_sectors);
    /* Written to avoid overflow in sector + nb_sectors. */
    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
        return false;
    }
    return true;
}

471
/* Parse and dispatch one request popped from the virtqueue.
 *
 * Reads the outhdr, locates the guest-visible inhdr status byte, then
 * dispatches on the request type.  Reads/writes are queued into @mrb
 * for merging; flush/SCSI/get-id are handled directly.
 *
 * Returns 0 on success, -1 when the request is malformed (the device is
 * marked broken via virtio_error and the caller must detach the element).
 */
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    uint32_t type;
    struct iovec *in_iov = req->elem.in_sg;
    struct iovec *iov = req->elem.out_sg;
    unsigned in_num = req->elem.in_num;
    unsigned out_num = req->elem.out_num;
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        virtio_error(vdev, "virtio-blk missing headers");
        return -1;
    }

    if (unlikely(iov_to_buf(iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        virtio_error(vdev, "virtio-blk request outhdr too short");
        return -1;
    }

    /* Skip past the outhdr; iov/out_num now describe only the payload. */
    iov_discard_front(&iov, &out_num, sizeof(req->out));

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        virtio_error(vdev, "virtio-blk request inhdr too short");
        return -1;
    }

    /* We always touch the last byte, so just see how big in_iov is.  */
    req->in_len = iov_size(in_iov, in_num);
    req->in = (void *)in_iov[in_num - 1].iov_base
              + in_iov[in_num - 1].iov_len
              - sizeof(struct virtio_blk_inhdr);
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);

    /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
     * is an optional flag. Although a guest should not send this flag if
     * not negotiated we ignored it in the past. So keep ignoring it. */
    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
    case VIRTIO_BLK_T_IN:
    {
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
                                       &req->out.sector);

        if (is_write) {
            qemu_iovec_init_external(&req->qiov, iov, out_num);
            trace_virtio_blk_handle_write(req, req->sector_num,
                                          req->qiov.size / BDRV_SECTOR_SIZE);
        } else {
            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
            trace_virtio_blk_handle_read(req, req->sector_num,
                                         req->qiov.size / BDRV_SECTOR_SIZE);
        }

        if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
                                      req->qiov.size)) {
            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
            block_acct_invalid(blk_get_stats(req->dev->blk),
                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
            virtio_blk_free_request(req);
            return 0;
        }

        block_acct_start(blk_get_stats(req->dev->blk),
                         &req->acct, req->qiov.size,
                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);

        /* merge would exceed maximum number of requests or IO direction
         * changes */
        if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
                                  is_write != mrb->is_write ||
                                  !req->dev->conf.request_merging)) {
            virtio_blk_submit_multireq(req->dev->blk, mrb);
        }

        assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
        mrb->reqs[mrb->num_reqs++] = req;
        mrb->is_write = is_write;
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        virtio_blk_handle_flush(req, mrb);
        break;
    case VIRTIO_BLK_T_SCSI_CMD:
        virtio_blk_handle_scsi(req);
        break;
    case VIRTIO_BLK_T_GET_ID:
    {
        VirtIOBlock *s = req->dev;

        /*
         * NB: per existing s/n string convention the string is
         * terminated by '\0' only when shorter than buffer.
         */
        const char *serial = s->conf.serial ? s->conf.serial : "";
        size_t size = MIN(strlen(serial) + 1,
                          MIN(iov_size(in_iov, in_num),
                              VIRTIO_BLK_ID_BYTES));
        iov_from_buf(in_iov, in_num, 0, serial, size);
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        virtio_blk_free_request(req);
        break;
    }
    default:
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        virtio_blk_free_request(req);
    }
    return 0;
}

584
/* Drain @vq, handling every available request.  I/O is plugged for the
 * duration so merged requests are submitted in as few ops as possible. */
void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
{
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {};

    blk_io_plug(s->blk);

    while ((req = virtio_blk_get_request(s, vq))) {
        if (virtio_blk_handle_request(req, &mrb)) {
            /* Malformed request: device is broken, drop the element. */
            virtqueue_detach_element(req->vq, &req->elem, 0);
            virtio_blk_free_request(req);
            break;
        }
    }

    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }

    blk_io_unplug(s->blk);
}

606 607 608 609 610 611 612 613
/* Virtqueue kick handler registered with the virtio core. */
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *s = (VirtIOBlock *)vdev;

    if (s->dataplane) {
        /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
         * dataplane here instead of waiting for .set_status().
         */
        virtio_device_start_ioeventfd(vdev);
        if (!s->dataplane_disabled) {
            /* Dataplane took over; it will process the queue. */
            return;
        }
    }
    virtio_blk_handle_vq(s, vq);
}

622
/* Bottom half scheduled by virtio_blk_dma_restart_cb: re-run the
 * requests that were queued on s->rq while the VM was stopped. */
static void virtio_blk_dma_restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
    MultiReqBuffer mrb = {};

    qemu_bh_delete(s->bh);
    s->bh = NULL;

    /* Take ownership of the queued list before reprocessing it. */
    s->rq = NULL;

    while (req) {
        VirtIOBlockReq *next = req->next;
        if (virtio_blk_handle_request(req, &mrb)) {
            /* Device is now broken and won't do any processing until it gets
             * reset. Already queued requests will be lost: let's purge them.
             */
            while (req) {
                next = req->next;
                virtqueue_detach_element(req->vq, &req->elem, 0);
                virtio_blk_free_request(req);
                req = next;
            }
            break;
        }
        req = next;
    }

    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }
}

655 656
/* VM state change callback: when the VM resumes, schedule a bottom half
 * to retry the requests parked on s->rq. */
static void virtio_blk_dma_restart_cb(void *opaque, int running,
                                      RunState state)
{
    VirtIOBlock *s = opaque;

    if (!running) {
        return;
    }

    if (!s->bh) {
        /* Run the restart in the block backend's AioContext. */
        s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
                           virtio_blk_dma_restart_bh, s);
        qemu_bh_schedule(s->bh);
    }
}

A
aliguori 已提交
671 672
/* Device reset: drain in-flight I/O, drop requests queued for retry and
 * restore the original write-cache setting. */
static void virtio_blk_reset(VirtIODevice *vdev)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    AioContext *ctx;
    VirtIOBlockReq *req;

    ctx = blk_get_aio_context(s->blk);
    aio_context_acquire(ctx);
    blk_drain(s->blk);

    /* We drop queued requests after blk_drain() because blk_drain() itself can
     * produce them. */
    while (s->rq) {
        req = s->rq;
        s->rq = req->next;
        virtqueue_detach_element(req->vq, &req->elem, 0);
        virtio_blk_free_request(req);
    }

    aio_context_release(ctx);

    /* Dataplane must already have been stopped by the virtio core. */
    assert(!s->dataplane_started);
    blk_set_enable_write_cache(s->blk, s->original_wce);
}

696 697
/* coalesce internal state, copy to pci i/o region 0
 */
A
aliguori 已提交
698 699
/* coalesce internal state, copy to pci i/o region 0
 *
 * Fills the guest-visible virtio_blk_config from the current backend
 * geometry and BlockConf settings.
 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    BlockConf *conf = &s->conf.conf;
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int blk_size = conf->logical_block_size;

    blk_get_geometry(s->blk, &capacity);
    memset(&blkcfg, 0, sizeof(blkcfg));
    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
    /* NOTE(review): 128 - 2 reserves two descriptors for the out/in
     * headers -- presumably tied to a 128-entry queue; confirm. */
    virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
    virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
    virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
    blkcfg.geometry.heads = conf->heads;
    /*
     * We must ensure that the block device capacity is a multiple of
     * the logical block size. If that is not the case, let's use
     * sector_mask to adopt the geometry to have a correct picture.
     * For those devices where the capacity is ok for the given geometry
     * we don't touch the sector value of the geometry, since some devices
     * (like s390 dasd) need a specific value. Here the capacity is already
     * cyls*heads*secs*blk_size and the sector value is not block size
     * divided by 512 - instead it is the amount of blk_size blocks
     * per track (cylinder).
     */
    if (blk_getlength(s->blk) /  conf->heads / conf->secs % blk_size) {
        blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
    } else {
        blkcfg.geometry.sectors = conf->secs;
    }
    blkcfg.size_max = 0;
    blkcfg.physical_block_exp = get_physical_block_exp(conf);
    blkcfg.alignment_offset = 0;
    blkcfg.wce = blk_enable_write_cache(s->blk);
    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

739 740
/* Guest wrote the config space: the only writable field is wce, which
 * toggles the backend write cache. */
static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    struct virtio_blk_config blkcfg;

    memcpy(&blkcfg, config, sizeof(blkcfg));

    /* Serialize against the backend's AioContext (dataplane). */
    aio_context_acquire(blk_get_aio_context(s->blk));
    blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
    aio_context_release(blk_get_aio_context(s->blk));
}

J
Jason Wang 已提交
751 752
/* Report the host feature bits, adding device capabilities on top of
 * the transport-provided @features.  Returns 0 and sets @errp when the
 * configuration is incompatible with the requested virtio version. */
static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
    if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
        /* SCSI passthrough was dropped from the virtio 1.0 spec. */
        if (s->conf.scsi) {
            error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
            return 0;
        }
    } else {
        virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
        virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
    }

    if (s->conf.config_wce) {
        virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
    }
    if (blk_enable_write_cache(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
    }
    if (blk_is_read_only(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
    }
    if (s->conf.num_queues > 1) {
        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
    }

    return features;
}

786 787
/* React to guest status writes; once DRIVER_OK is set, fix up the
 * write-cache mode for drivers that cannot toggle it via config space. */
static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    if (!(status & (VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK))) {
        assert(!s->dataplane_started);
    }

    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
        return;
    }

    /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
     * cache flushes.  Thus, the "auto writethrough" behavior is never
     * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
     * Leaving it enabled would break the following sequence:
     *
     *     Guest started with "-drive cache=writethrough"
     *     Guest sets status to 0
     *     Guest sets DRIVER bit in status field
     *     Guest reads host features (WCE=0, CONFIG_WCE=1)
     *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
     *     Guest writes 1 to the WCE configuration field (writeback mode)
     *     Guest sets DRIVER_OK bit in status field
     *
     * s->blk would erroneously be placed in writethrough mode.
     */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
        aio_context_acquire(blk_get_aio_context(s->blk));
        blk_set_enable_write_cache(s->blk,
                                   virtio_vdev_has_feature(vdev,
                                                           VIRTIO_BLK_F_WCE));
        aio_context_release(blk_get_aio_context(s->blk));
    }
}

822 823 824 825 826
/* Migration: serialize the list of requests queued on s->rq.  Each entry
 * is a 1 marker, an optional queue index (multiqueue only, to stay
 * compatible with older streams), and the virtqueue element; the list is
 * terminated by a 0 marker. */
static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    VirtIOBlockReq *req = s->rq;

    while (req) {
        qemu_put_sbyte(f, 1);

        if (s->conf.num_queues > 1) {
            qemu_put_be32(f, virtio_get_queue_index(req->vq));
        }

        qemu_put_virtqueue_element(f, &req->elem);
        req = req->next;
    }
    qemu_put_sbyte(f, 0);
}

840 841 842 843
/* Restore the in-flight request list written by virtio_blk_save_device().
 *
 * Reads 1-prefixed request records until the terminating 0 byte.  The
 * per-request virtqueue index is only present in the stream when the
 * device is configured with more than one queue; it is validated against
 * the current queue count.  Requests are pushed onto s->rq (reversing
 * the saved order) for the dma-restart handler to resubmit.
 *
 * Returns 0 on success, -EINVAL on a malformed stream.
 */
static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
                                  int version_id)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    while (qemu_get_sbyte(f)) {
        unsigned nvqs = s->conf.num_queues;
        unsigned vq_idx = 0;
        VirtIOBlockReq *req;

        if (nvqs > 1) {
            vq_idx = qemu_get_be32(f);

            if (vq_idx >= nvqs) {
                error_report("Invalid virtqueue index in request list: %#x",
                             vq_idx);
                return -EINVAL;
            }
        }

        req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
        virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
        req->next = s->rq;
        s->rq = req;
    }

    return 0;
}

static void virtio_blk_resize(void *opaque)
870
{
K
KONRAD Frederic 已提交
871
    VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
872

K
KONRAD Frederic 已提交
873
    virtio_notify_config(vdev);
874 875
}

static const BlockDevOps virtio_block_ops = {
877
    .resize_cb = virtio_blk_resize,
878 879
};

/* Realize the virtio-blk device.
 *
 * Validates the configuration (backing drive present and inserted,
 * num-queues > 0), applies the generic block-layer configuration,
 * initializes the virtio transport with one virtqueue per configured
 * queue, and optionally sets up the iothread dataplane.  On any failure
 * an error is set in @errp and already-acquired virtio state is torn
 * down before returning.
 */
static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOBlock *s = VIRTIO_BLK(dev);
    VirtIOBlkConf *conf = &s->conf;
    Error *err = NULL;
    unsigned i;

    if (!conf->conf.blk) {
        error_setg(errp, "drive property not set");
        return;
    }
    if (!blk_is_inserted(conf->conf.blk)) {
        error_setg(errp, "Device needs media, but drive is empty");
        return;
    }
    if (!conf->num_queues) {
        error_setg(errp, "num-queues property must be larger than 0");
        return;
    }

    blkconf_serial(&conf->conf, &conf->serial);
    blkconf_apply_backend_options(&conf->conf);
    /* Remember the backend's cache mode so reset can restore it after a
     * guest toggles WCE through the config space. */
    s->original_wce = blk_enable_write_cache(conf->conf.blk);
    blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    blkconf_blocksizes(&conf->conf);

    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                sizeof(struct virtio_blk_config));

    s->blk = conf->conf.blk;
    s->rq = NULL;
    s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;

    for (i = 0; i < conf->num_queues; i++) {
        virtio_add_queue(vdev, 128, virtio_blk_handle_output);
    }
    virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
    if (err != NULL) {
        error_propagate(errp, err);
        virtio_cleanup(vdev);
        return;
    }

    s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
    blk_set_dev_ops(s->blk, &virtio_block_ops, s);
    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);

    blk_iostatus_enable(s->blk);
}

/* Tear down the device: destroy the dataplane first (it references the
 * virtqueues), drop the vm-change-state handler, schedule the backend
 * for deletion, then release the virtio transport state. */
static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOBlock *s = VIRTIO_BLK(dev);

    virtio_blk_data_plane_destroy(s->dataplane);
    s->dataplane = NULL;
    qemu_del_vm_change_state_handler(s->change);
    blockdev_mark_auto_del(s->blk);
    virtio_cleanup(vdev);
}

/* QOM instance init: register the "iothread" link property (must be set
 * before realize) and the per-disk "bootindex" property. */
static void virtio_blk_instance_init(Object *obj)
{
    VirtIOBlock *s = VIRTIO_BLK(obj);

    object_property_add_link(obj, "iothread", TYPE_IOTHREAD,
                             (Object **)&s->conf.iothread,
                             qdev_prop_allow_set_link_before_realize,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE, NULL);
    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
                                  "bootindex", "/disk@0,0",
                                  DEVICE(obj), NULL);
}

/* Migration description.  Device-specific state (the in-flight request
 * list) travels through the virtio core via the save/load callbacks
 * registered in virtio_blk_class_init(), so VMSTATE_VIRTIO_DEVICE is
 * the only field needed here. */
static const VMStateDescription vmstate_virtio_blk = {
    .name = "virtio-blk",
    .minimum_version_id = 2,
    .version_id = 2,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
};

static Property virtio_blk_properties[] = {
971
    DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
972
    DEFINE_BLOCK_ERROR_PROPERTIES(VirtIOBlock, conf.conf),
973 974 975
    DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
    DEFINE_PROP_BIT("config-wce", VirtIOBlock, conf.config_wce, 0, true),
976
#ifdef __linux__
977
    DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, false),
978
#endif
979 980
    DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
                    true),
981
    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
982 983 984 985 986 987 988
    DEFINE_PROP_END_OF_LIST(),
};

/* QOM class init: wire up qdev properties/vmsd and the VirtioDeviceClass
 * callbacks (realize/unrealize, config space, features, status, reset,
 * migration save/load and dataplane ioeventfd start/stop). */
static void virtio_blk_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

    dc->props = virtio_blk_properties;
    dc->vmsd = &vmstate_virtio_blk;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    vdc->realize = virtio_blk_device_realize;
    vdc->unrealize = virtio_blk_device_unrealize;
    vdc->get_config = virtio_blk_update_config;
    vdc->set_config = virtio_blk_set_config;
    vdc->get_features = virtio_blk_get_features;
    vdc->set_status = virtio_blk_set_status;
    vdc->reset = virtio_blk_reset;
    vdc->save = virtio_blk_save_device;
    vdc->load = virtio_blk_load_device;
    vdc->start_ioeventfd = virtio_blk_data_plane_start;
    vdc->stop_ioeventfd = virtio_blk_data_plane_stop;
}

static const TypeInfo virtio_blk_info = {
1007 1008 1009
    .name = TYPE_VIRTIO_BLK,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIOBlock),
1010
    .instance_init = virtio_blk_instance_init,
1011 1012 1013 1014 1015
    .class_init = virtio_blk_class_init,
};

static void virtio_register_types(void)
{
1016
    type_register_static(&virtio_blk_info);
1017 1018 1019
}

type_init(virtio_register_types)