virtio-blk.c 33.2 KB
Newer Older
A
aliguori 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
14
#include "qemu/osdep.h"
15
#include "qapi/error.h"
16
#include "qemu-common.h"
17
#include "qemu/iov.h"
18
#include "qemu/error-report.h"
19
#include "trace.h"
P
Paolo Bonzini 已提交
20
#include "hw/block/block.h"
21
#include "sysemu/blockdev.h"
P
Paolo Bonzini 已提交
22
#include "hw/virtio/virtio-blk.h"
23
#include "dataplane/virtio-blk.h"
24
#include "scsi/constants.h"
25 26 27
#ifdef __linux__
# include <scsi/sg.h>
#endif
P
Paolo Bonzini 已提交
28
#include "hw/virtio/virtio-bus.h"
29
#include "hw/virtio/virtio-access.h"
A
aliguori 已提交
30

G
Greg Kurz 已提交
31 32
/* Prepare a freshly popped request for processing: remember which device
 * and virtqueue it belongs to and clear all per-request bookkeeping. */
static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq,
                                    VirtIOBlockReq *req)
{
    req->dev = s;
    req->vq = vq;
    /* No payload parsed yet and no bytes to push back to the guest. */
    req->qiov.size = 0;
    req->in_len = 0;
    /* Not on the restart list and not part of a merge chain. */
    req->next = NULL;
    req->mr_next = NULL;
}

G
Greg Kurz 已提交
42
/* Release a request previously allocated by virtqueue_pop(). */
static void virtio_blk_free_request(VirtIOBlockReq *req)
{
    g_free(req);
}

47
/*
 * Complete @req: write the status byte the guest will read, push the
 * element back onto its virtqueue and raise the completion notification
 * (through the dataplane when it is active, otherwise the normal path).
 */
static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    trace_virtio_blk_req_complete(vdev, req, status);

    /* Status is the last byte of the in header mapped in req->in. */
    stb_p(&req->in->status, status);
    virtqueue_push(req->vq, &req->elem, req->in_len);
    if (s->dataplane_started && !s->dataplane_disabled) {
        /* Dataplane owns the guest notifier; use its notify path. */
        virtio_blk_data_plane_notify(s->dataplane, req->vq);
    } else {
        virtio_notify(vdev, req->vq);
    }
}

K
Kevin Wolf 已提交
63
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
64
    bool is_read)
65
{
66 67
    BlockErrorAction action = blk_get_error_action(req->dev->blk,
                                                   is_read, error);
68 69
    VirtIOBlock *s = req->dev;

W
Wenchao Xia 已提交
70
    if (action == BLOCK_ERROR_ACTION_STOP) {
71 72 73
        /* Break the link as the next request is going to be parsed from the
         * ring again. Otherwise we may end up doing a double completion! */
        req->mr_next = NULL;
74 75
        req->next = s->rq;
        s->rq = req;
W
Wenchao Xia 已提交
76
    } else if (action == BLOCK_ERROR_ACTION_REPORT) {
77
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
78
        block_acct_failed(blk_get_stats(s->blk), &req->acct);
79
        virtio_blk_free_request(req);
80 81
    }

82
    blk_error_action(s->blk, action, is_read, error);
W
Wenchao Xia 已提交
83
    return action != BLOCK_ERROR_ACTION_IGNORE;
84 85
}

A
aliguori 已提交
86 87
/*
 * AIO completion callback for reads and writes.  @opaque is the head of a
 * chain of merged requests (linked through mr_next by submit_requests());
 * every request in the chain shares the same result code @ret.
 */
static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *next = opaque;
    VirtIOBlock *s = next->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
    while (next) {
        VirtIOBlockReq *req = next;
        next = req->mr_next;
        trace_virtio_blk_rw_complete(vdev, req, ret);

        if (req->qiov.nalloc != -1) {
            /* If nalloc is != -1 req->qiov is a local copy of the original
             * external iovec. It was allocated in submit_requests
             * to be able to merge requests. */
            qemu_iovec_destroy(&req->qiov);
        }

        if (ret) {
            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
            bool is_read = !(p & VIRTIO_BLK_T_OUT);
            /* Note that memory may be dirtied on read failure.  If the
             * virtio request is not completed here, as is the case for
             * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
             * correctly during live migration.  While this is ugly,
             * it is acceptable because the device is free to write to
             * the memory until the request is completed (which will
             * happen on the other side of the migration).
             */
            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
                continue;
            }
        }

        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
        virtio_blk_free_request(req);
    }
    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
A
aliguori 已提交
127

128 129 130
static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;
131
    VirtIOBlock *s = req->dev;
132

133
    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
134 135
    if (ret) {
        if (virtio_blk_handle_rw_error(req, -ret, 0)) {
136
            goto out;
137 138 139 140
        }
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
141
    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
142
    virtio_blk_free_request(req);
143 144 145

out:
    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
A
aliguori 已提交
146 147
}

F
Fam Zheng 已提交
148 149 150 151 152 153 154 155 156 157 158
#ifdef __linux__

/* Tracks one in-flight SG_IO ioctl together with the virtio request that
 * triggered it; freed in virtio_blk_ioctl_complete(). */
typedef struct {
    VirtIOBlockReq *req;
    struct sg_io_hdr hdr;
} VirtIOBlockIoctlReq;

/*
 * Completion callback for blk_aio_ioctl(SG_IO).  Translates the kernel's
 * sg_io_hdr result into the virtio_scsi_inhdr layout the guest expects
 * and completes the originating request.
 */
static void virtio_blk_ioctl_complete(void *opaque, int status)
{
    VirtIOBlockIoctlReq *ioctl_req = opaque;
    VirtIOBlockReq *req = ioctl_req->req;
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);
    struct virtio_scsi_inhdr *scsi;
    struct sg_io_hdr *hdr;

    /* The scsi inhdr sits in the second-to-last input segment (see
     * virtio_blk_handle_scsi_req). */
    scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;

    if (status) {
        status = VIRTIO_BLK_S_UNSUPP;
        virtio_stl_p(vdev, &scsi->errors, 255);
        goto out;
    }

    hdr = &ioctl_req->hdr;
    /*
     * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
     * clear the masked_status field [hence status gets cleared too, see
     * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
     * status has occurred.  However they do set DRIVER_SENSE in driver_status
     * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
     */
    if (hdr->status == 0 && hdr->sb_len_wr > 0) {
        hdr->status = CHECK_CONDITION;
    }

    virtio_stl_p(vdev, &scsi->errors,
                 hdr->status | (hdr->msg_status << 8) |
                 (hdr->host_status << 16) | (hdr->driver_status << 24));
    virtio_stl_p(vdev, &scsi->residual, hdr->resid);
    virtio_stl_p(vdev, &scsi->sense_len, hdr->sb_len_wr);
    virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);

out:
    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
    g_free(ioctl_req);
}

#endif

201
/* Pop the next pending request from @vq, initialized and ready for
 * processing; returns NULL when the queue is empty. */
static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s, VirtQueue *vq)
{
    VirtIOBlockReq *req = virtqueue_pop(vq, sizeof(VirtIOBlockReq));

    if (req != NULL) {
        virtio_blk_init_request(s, vq, req);
    }
    return req;
}

211
/*
 * Handle a VIRTIO_BLK_T_SCSI_CMD request by forwarding it to the host
 * device via the SG_IO ioctl (Linux only).
 *
 * Returns -EINPROGRESS when the ioctl was issued asynchronously (the
 * request is then completed by virtio_blk_ioctl_complete()), otherwise a
 * VIRTIO_BLK_S_* status the caller must complete the request with.
 */
static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
{
    int status = VIRTIO_BLK_S_OK;
    struct virtio_scsi_inhdr *scsi = NULL;
    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
    VirtQueueElement *elem = &req->elem;
    VirtIOBlock *blk = req->dev;

#ifdef __linux__
    int i;
    VirtIOBlockIoctlReq *ioctl_req;
    BlockAIOCB *acb;
#endif

    /*
     * We require at least one output segment each for the virtio_blk_outhdr
     * and the SCSI command block.
     *
     * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
     * and the sense buffer pointer in the input segments.
     */
    if (elem->out_num < 2 || elem->in_num < 3) {
        status = VIRTIO_BLK_S_IOERR;
        goto fail;
    }

    /*
     * The scsi inhdr is placed in the second-to-last input segment, just
     * before the regular inhdr.
     */
    scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;

    /* SCSI passthrough must have been enabled with scsi=on. */
    if (!blk->conf.scsi) {
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

    /*
     * No support for bidirection commands yet.
     */
    if (elem->out_num > 2 && elem->in_num > 3) {
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

#ifdef __linux__
    ioctl_req = g_new0(VirtIOBlockIoctlReq, 1);
    ioctl_req->req = req;
    ioctl_req->hdr.interface_id = 'S';
    /* The CDB is always the second output segment. */
    ioctl_req->hdr.cmd_len = elem->out_sg[1].iov_len;
    ioctl_req->hdr.cmdp = elem->out_sg[1].iov_base;
    ioctl_req->hdr.dxfer_len = 0;

    if (elem->out_num > 2) {
        /*
         * If there are more than the minimally required 2 output segments
         * there is write payload starting from the third iovec.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_TO_DEV;
        ioctl_req->hdr.iovec_count = elem->out_num - 2;

        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->out_sg[i + 2].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->out_sg + 2;

    } else if (elem->in_num > 3) {
        /*
         * If we have more than 3 input segments the guest wants to actually
         * read data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        ioctl_req->hdr.iovec_count = elem->in_num - 3;
        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->in_sg[i].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->in_sg;
    } else {
        /*
         * Some SCSI commands don't actually transfer any data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_NONE;
    }

    /* Sense buffer is the third-to-last input segment. */
    ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
    ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;

    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
                        virtio_blk_ioctl_complete, ioctl_req);
    if (!acb) {
        g_free(ioctl_req);
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }
    return -EINPROGRESS;
#else
    abort();
#endif

fail:
    /* Just put anything nonzero so that the ioctl fails in the guest.  */
    if (scsi) {
        virtio_stl_p(vdev, &scsi->errors, 255);
    }
    return status;
}

/* Service a VIRTIO_BLK_T_SCSI_CMD request and complete it immediately
 * unless it is being handled asynchronously. */
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    int status = virtio_blk_handle_scsi_req(req);

    /* -EINPROGRESS: completion happens in virtio_blk_ioctl_complete(). */
    if (status == -EINPROGRESS) {
        return;
    }
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
}

P
Peter Lieven 已提交
331 332
/*
 * Issue mrb->reqs[start .. start+num_reqs-1] to the backend as a single
 * AIO request.  For num_reqs > 1 the individual qiovs are concatenated
 * into the first request's qiov and the requests are chained through
 * mr_next so virtio_blk_rw_complete() can complete the whole batch.
 * @niov is the combined iovec count (ignored for a single request;
 * callers pass -1 in that case).
 */
static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
                                   int start, int num_reqs, int niov)
{
    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
    int64_t sector_num = mrb->reqs[start]->sector_num;
    bool is_write = mrb->is_write;

    if (num_reqs > 1) {
        int i;
        struct iovec *tmp_iov = qiov->iov;
        int tmp_niov = qiov->niov;

        /* mrb->reqs[start]->qiov was initialized from external so we can't
         * modify it here. We need to initialize it locally and then add the
         * external iovecs. */
        qemu_iovec_init(qiov, niov);

        for (i = 0; i < tmp_niov; i++) {
            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
        }

        for (i = start + 1; i < start + num_reqs; i++) {
            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
                              mrb->reqs[i]->qiov.size);
            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
        }

        trace_virtio_blk_submit_multireq(VIRTIO_DEVICE(mrb->reqs[start]->dev),
                                         mrb, start, num_reqs,
                                         sector_num << BDRV_SECTOR_BITS,
                                         qiov->size, is_write);
        /* num_reqs requests collapsed into one: account the merges. */
        block_acct_merge_done(blk_get_stats(blk),
                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
                              num_reqs - 1);
    }

    if (is_write) {
        blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
                        virtio_blk_rw_complete, mrb->reqs[start]);
    } else {
        blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
                       virtio_blk_rw_complete, mrb->reqs[start]);
    }
}

static int multireq_compare(const void *a, const void *b)
{
    const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
                         *req2 = *(VirtIOBlockReq **)b;

    /*
     * Note that we can't simply subtract sector_num1 from sector_num2
     * here as that could overflow the return value.
     */
    if (req1->sector_num > req2->sector_num) {
        return 1;
    } else if (req1->sector_num < req2->sector_num) {
        return -1;
    } else {
        return 0;
    }
}

G
Greg Kurz 已提交
394
/*
 * Flush the batch collected in @mrb: sort the requests by start sector,
 * greedily merge runs that are sequential and fit the backend's iov and
 * transfer-length limits, and hand each run to submit_requests().
 * Resets mrb->num_reqs to 0.
 */
static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
{
    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
    uint32_t max_transfer;
    int64_t sector_num = 0;

    if (mrb->num_reqs == 1) {
        /* Nothing to merge: submit directly, skipping the sort. */
        submit_requests(blk, mrb, 0, 1, -1);
        mrb->num_reqs = 0;
        return;
    }

    max_transfer = blk_get_max_transfer(mrb->reqs[0]->dev->blk);

    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
          &multireq_compare);

    for (i = 0; i < mrb->num_reqs; i++) {
        VirtIOBlockReq *req = mrb->reqs[i];
        if (num_reqs > 0) {
            /*
             * NOTE: We cannot merge the requests in below situations:
             * 1. requests are not sequential
             * 2. merge would exceed maximum number of IOVs
             * 3. merge would exceed maximum transfer length of backend device
             */
            if (sector_num + nb_sectors != req->sector_num ||
                niov > blk_get_max_iov(blk) - req->qiov.niov ||
                req->qiov.size > max_transfer ||
                nb_sectors > (max_transfer -
                              req->qiov.size) / BDRV_SECTOR_SIZE) {
                submit_requests(blk, mrb, start, num_reqs, niov);
                num_reqs = 0;
            }
        }

        if (num_reqs == 0) {
            /* Start a new run at the current request. */
            sector_num = req->sector_num;
            nb_sectors = niov = 0;
            start = i;
        }

        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
        niov += req->qiov.niov;
        num_reqs++;
    }

    /* Submit the final run. */
    submit_requests(blk, mrb, start, num_reqs, niov);
    mrb->num_reqs = 0;
}
444

445
/* Queue a flush to the backend, first submitting any batched writes so
 * the flush covers them. */
static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    BlockBackend *blk = req->dev->blk;

    block_acct_start(blk_get_stats(blk), &req->acct, 0,
                     BLOCK_ACCT_FLUSH);

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    if (mrb->is_write && mrb->num_reqs > 0) {
        virtio_blk_submit_multireq(blk, mrb);
    }
    blk_aio_flush(blk, virtio_blk_flush_complete, req);
}

459 460 461
/*
 * Validate that a guest transfer of @size bytes starting at @sector is
 * properly aligned and fits entirely inside the device.
 */
static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
                                     uint64_t sector, size_t size)
{
    uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
    uint64_t total_sectors;

    /* Reject oversized, misaligned or non-block-multiple transfers. */
    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS ||
        (sector & dev->sector_mask) ||
        size % dev->conf.conf.logical_block_size) {
        return false;
    }
    blk_get_geometry(dev->blk, &total_sectors);
    /* Overflow-safe form of "sector + nb_sectors <= total_sectors". */
    return sector <= total_sectors && nb_sectors <= total_sectors - sector;
}

481
/*
 * Parse and dispatch one request popped from a virtqueue.  Reads and
 * writes are batched into @mrb for possible merging; flush, SCSI and
 * GET_ID requests are handled (and completed) immediately.
 *
 * Returns 0 on success, -1 when the request layout is malformed -- in
 * that case the device has been marked broken via virtio_error() and the
 * caller must stop processing the queue.
 */
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    uint32_t type;
    struct iovec *in_iov = req->elem.in_sg;
    struct iovec *iov = req->elem.out_sg;
    unsigned in_num = req->elem.in_num;
    unsigned out_num = req->elem.out_num;
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    /* Every request needs at least an outhdr and an inhdr. */
    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        virtio_error(vdev, "virtio-blk missing headers");
        return -1;
    }

    if (unlikely(iov_to_buf(iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        virtio_error(vdev, "virtio-blk request outhdr too short");
        return -1;
    }

    /* Skip past the outhdr; what remains of iov is write payload. */
    iov_discard_front(&iov, &out_num, sizeof(req->out));

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        virtio_error(vdev, "virtio-blk request inhdr too short");
        return -1;
    }

    /* We always touch the last byte, so just see how big in_iov is.  */
    req->in_len = iov_size(in_iov, in_num);
    req->in = (void *)in_iov[in_num - 1].iov_base
              + in_iov[in_num - 1].iov_len
              - sizeof(struct virtio_blk_inhdr);
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);

    /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
     * is an optional flag. Although a guest should not send this flag if
     * not negotiated we ignored it in the past. So keep ignoring it. */
    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
    case VIRTIO_BLK_T_IN:
    {
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
                                       &req->out.sector);

        if (is_write) {
            qemu_iovec_init_external(&req->qiov, iov, out_num);
            trace_virtio_blk_handle_write(vdev, req, req->sector_num,
                                          req->qiov.size / BDRV_SECTOR_SIZE);
        } else {
            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
            trace_virtio_blk_handle_read(vdev, req, req->sector_num,
                                         req->qiov.size / BDRV_SECTOR_SIZE);
        }

        /* Out-of-range requests are an error status, not device breakage. */
        if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
                                      req->qiov.size)) {
            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
            block_acct_invalid(blk_get_stats(req->dev->blk),
                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
            virtio_blk_free_request(req);
            return 0;
        }

        block_acct_start(blk_get_stats(req->dev->blk),
                         &req->acct, req->qiov.size,
                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);

        /* merge would exceed maximum number of requests or IO direction
         * changes */
        if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
                                  is_write != mrb->is_write ||
                                  !req->dev->conf.request_merging)) {
            virtio_blk_submit_multireq(req->dev->blk, mrb);
        }

        assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
        mrb->reqs[mrb->num_reqs++] = req;
        mrb->is_write = is_write;
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        virtio_blk_handle_flush(req, mrb);
        break;
    case VIRTIO_BLK_T_SCSI_CMD:
        virtio_blk_handle_scsi(req);
        break;
    case VIRTIO_BLK_T_GET_ID:
    {
        VirtIOBlock *s = req->dev;

        /*
         * NB: per existing s/n string convention the string is
         * terminated by '\0' only when shorter than buffer.
         */
        const char *serial = s->conf.serial ? s->conf.serial : "";
        size_t size = MIN(strlen(serial) + 1,
                          MIN(iov_size(in_iov, in_num),
                              VIRTIO_BLK_ID_BYTES));
        iov_from_buf(in_iov, in_num, 0, serial, size);
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        virtio_blk_free_request(req);
        break;
    }
    default:
        /* Unknown request type: report VIRTIO_BLK_S_UNSUPP to the guest. */
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        virtio_blk_free_request(req);
    }
    return 0;
}

594
/*
 * Drain @vq, batching sequential reads/writes through a MultiReqBuffer.
 * Guest notifications are suppressed while the inner loop runs to reduce
 * vmexits; the outer loop re-checks the queue after re-enabling them to
 * close the race with a concurrently kicking guest.
 *
 * Returns true if at least one request was processed.
 */
bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
{
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {};
    bool progress = false;

    aio_context_acquire(blk_get_aio_context(s->blk));
    blk_io_plug(s->blk);

    do {
        virtio_queue_set_notification(vq, 0);

        while ((req = virtio_blk_get_request(s, vq))) {
            progress = true;
            if (virtio_blk_handle_request(req, &mrb)) {
                /* Device broken: drop the request and stop processing. */
                virtqueue_detach_element(req->vq, &req->elem, 0);
                virtio_blk_free_request(req);
                break;
            }
        }

        virtio_queue_set_notification(vq, 1);
    } while (!virtio_queue_empty(vq));

    /* Submit whatever is still batched. */
    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }

    blk_io_unplug(s->blk);
    aio_context_release(blk_get_aio_context(s->blk));
    return progress;
}

/* Non-dataplane queue processing: run the queue to completion. */
static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq)
{
    virtio_blk_handle_vq(s, vq);
}

632 633 634 635 636 637 638 639
/* Virtqueue kick handler registered with the virtio core. */
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *s = (VirtIOBlock *)vdev;

    if (s->dataplane) {
        /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
         * dataplane here instead of waiting for .set_status().
         */
        virtio_device_start_ioeventfd(vdev);
        if (!s->dataplane_disabled) {
            /* Dataplane took over; it will process the queue itself. */
            return;
        }
    }
    virtio_blk_handle_output_do(s, vq);
}

648
/*
 * Bottom half scheduled by virtio_blk_dma_restart_cb(): re-submit the
 * requests that were parked on s->rq by the STOP error policy while the
 * VM was paused.
 */
static void virtio_blk_dma_restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
    MultiReqBuffer mrb = {};

    qemu_bh_delete(s->bh);
    s->bh = NULL;

    /* Take ownership of the whole list before re-processing it. */
    s->rq = NULL;

    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
    while (req) {
        VirtIOBlockReq *next = req->next;
        if (virtio_blk_handle_request(req, &mrb)) {
            /* Device is now broken and won't do any processing until it gets
             * reset. Already queued requests will be lost: let's purge them.
             */
            while (req) {
                next = req->next;
                virtqueue_detach_element(req->vq, &req->elem, 0);
                virtio_blk_free_request(req);
                req = next;
            }
            break;
        }
        req = next;
    }

    /* Submit any reads/writes that ended up batched. */
    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }
    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}

683 684
/*
 * VM run-state change callback: when the guest resumes, schedule a bottom
 * half that re-submits requests queued by the STOP error policy.
 */
static void virtio_blk_dma_restart_cb(void *opaque, int running,
                                      RunState state)
{
    VirtIOBlock *s = opaque;

    if (!running || s->bh) {
        /* Still stopped, or a restart bottom half is already pending. */
        return;
    }

    s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
                       virtio_blk_dma_restart_bh, s);
    qemu_bh_schedule(s->bh);
}

A
aliguori 已提交
699 700
/*
 * Device reset: drain outstanding I/O, discard requests parked by the
 * STOP error policy and restore the original write-cache setting.
 */
static void virtio_blk_reset(VirtIODevice *vdev)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    AioContext *ctx;
    VirtIOBlockReq *req;

    ctx = blk_get_aio_context(s->blk);
    aio_context_acquire(ctx);
    blk_drain(s->blk);

    /* We drop queued requests after blk_drain() because blk_drain() itself can
     * produce them. */
    while (s->rq) {
        req = s->rq;
        s->rq = req->next;
        virtqueue_detach_element(req->vq, &req->elem, 0);
        virtio_blk_free_request(req);
    }

    aio_context_release(ctx);

    /* Dataplane must have been stopped by the virtio core by now. */
    assert(!s->dataplane_started);
    blk_set_enable_write_cache(s->blk, s->original_wce);
}

724 725
/* coalesce internal state, copy to pci i/o region 0
 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    BlockConf *conf = &s->conf.conf;
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int64_t length;
    int blk_size = conf->logical_block_size;

    blk_get_geometry(s->blk, &capacity);
    memset(&blkcfg, 0, sizeof(blkcfg));
    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
    /* seg_max leaves room for the outhdr and inhdr descriptors. */
    virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
    virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
    virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
    blkcfg.geometry.heads = conf->heads;
    /*
     * We must ensure that the block device capacity is a multiple of
     * the logical block size. If that is not the case, let's use
     * sector_mask to adopt the geometry to have a correct picture.
     * For those devices where the capacity is ok for the given geometry
     * we don't touch the sector value of the geometry, since some devices
     * (like s390 dasd) need a specific value. Here the capacity is already
     * cyls*heads*secs*blk_size and the sector value is not block size
     * divided by 512 - instead it is the amount of blk_size blocks
     * per track (cylinder).
     */
    length = blk_getlength(s->blk);
    if (length > 0 && length / conf->heads / conf->secs % blk_size) {
        blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
    } else {
        blkcfg.geometry.sectors = conf->secs;
    }
    blkcfg.size_max = 0;
    blkcfg.physical_block_exp = get_physical_block_exp(conf);
    blkcfg.alignment_offset = 0;
    blkcfg.wce = blk_enable_write_cache(s->blk);
    virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

769 770
/* Guest wrote the config space; only the write-cache flag is writable. */
static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    AioContext *ctx = blk_get_aio_context(s->blk);
    struct virtio_blk_config blkcfg;

    memcpy(&blkcfg, config, sizeof(blkcfg));

    aio_context_acquire(ctx);
    blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
    aio_context_release(ctx);
}

J
Jason Wang 已提交
781 782
/*
 * Build the device's feature bitmap starting from the transport-provided
 * @features.  SCSI passthrough is a legacy-only feature and conflicts
 * with virtio 1.0; the remaining bits mirror the current configuration.
 */
static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
    if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
        /* VIRTIO_BLK_F_SCSI was dropped from the virtio 1.0 spec. */
        if (s->conf.scsi) {
            error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
            return 0;
        }
    } else {
        virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
        virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
    }

    if (s->conf.config_wce) {
        virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
    }
    if (blk_enable_write_cache(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
    }
    if (blk_is_read_only(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
    }
    if (s->conf.num_queues > 1) {
        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
    }

    return features;
}

816 817
/*
 * Status-byte transition handler.  Once the guest reaches DRIVER_OK we
 * may need to revert the "auto writethrough" compatibility behavior --
 * see the comment below.
 */
static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    if (!(status & (VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK))) {
        /* Device stopped/reset: dataplane must already be torn down. */
        assert(!s->dataplane_started);
    }

    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
        return;
    }

    /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
     * cache flushes.  Thus, the "auto writethrough" behavior is never
     * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
     * Leaving it enabled would break the following sequence:
     *
     *     Guest started with "-drive cache=writethrough"
     *     Guest sets status to 0
     *     Guest sets DRIVER bit in status field
     *     Guest reads host features (WCE=0, CONFIG_WCE=1)
     *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
     *     Guest writes 1 to the WCE configuration field (writeback mode)
     *     Guest sets DRIVER_OK bit in status field
     *
     * s->blk would erroneously be placed in writethrough mode.
     */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
        aio_context_acquire(blk_get_aio_context(s->blk));
        blk_set_enable_write_cache(s->blk,
                                   virtio_vdev_has_feature(vdev,
                                                           VIRTIO_BLK_F_WCE));
        aio_context_release(blk_get_aio_context(s->blk));
    }
}

852 853 854 855 856
/* VirtioDeviceClass->save callback: serialize the list of in-flight
 * requests.  Wire format: a 1 byte marks another entry, followed (only for
 * multiqueue devices) by the big-endian virtqueue index and the virtqueue
 * element; a 0 byte terminates the list.
 */
static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    VirtIOBlockReq *req;

    for (req = s->rq; req != NULL; req = req->next) {
        qemu_put_sbyte(f, 1);

        /* Single-queue streams omit the index for cross-version
         * compatibility with pre-multiqueue QEMUs. */
        if (s->conf.num_queues > 1) {
            qemu_put_be32(f, virtio_get_queue_index(req->vq));
        }

        qemu_put_virtqueue_element(f, &req->elem);
    }

    qemu_put_sbyte(f, 0);
}

870 871 872 873
/* VirtioDeviceClass->load callback: reconstruct the in-flight request list
 * written by virtio_blk_save_device().  Requests are pushed onto s->rq in
 * stream order (so the list ends up reversed relative to the wire).
 *
 * Returns 0 on success, -EINVAL if the stream names a virtqueue index that
 * this device does not have.
 */
static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
                                  int version_id)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    /* Each non-zero marker byte introduces one more saved request. */
    while (qemu_get_sbyte(f)) {
        unsigned nvqs = s->conf.num_queues;
        unsigned vq_idx = 0;
        VirtIOBlockReq *req;

        /* The queue index is only present on the wire for multiqueue
         * devices; single-queue requests implicitly belong to queue 0. */
        if (nvqs > 1) {
            vq_idx = qemu_get_be32(f);

            if (vq_idx >= nvqs) {
                error_report("Invalid virtqueue index in request list: %#x",
                             vq_idx);
                return -EINVAL;
            }
        }

        req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
        virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
        /* Prepend to the pending-request list. */
        req->next = s->rq;
        s->rq = req;
    }

    return 0;
}

899
/* BlockDevOps->resize_cb: the block backend changed size, so prod the guest
 * to re-read the configuration space (capacity field).
 */
static void virtio_blk_resize(void *opaque)
{
    virtio_notify_config(VIRTIO_DEVICE(opaque));
}

906
/* Callbacks registered with the block backend in realize; only resize
 * notification is needed for virtio-blk. */
static const BlockDevOps virtio_block_ops = {
    .resize_cb = virtio_blk_resize,
};

910
/* VirtioDeviceClass->realize callback: validate the configuration, set up
 * the virtio device state, create the virtqueues and (optionally) the
 * iothread dataplane, and wire the device to its block backend.
 *
 * On failure, sets *errp and returns with the device unrealized; any state
 * initialized by virtio_init() is undone with virtio_cleanup().
 */
static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOBlock *s = VIRTIO_BLK(dev);
    VirtIOBlkConf *conf = &s->conf;
    Error *err = NULL;
    unsigned i;

    /* --- Configuration validation, all before any state is created --- */
    if (!conf->conf.blk) {
        error_setg(errp, "drive property not set");
        return;
    }
    if (!blk_is_inserted(conf->conf.blk)) {
        error_setg(errp, "Device needs media, but drive is empty");
        return;
    }
    if (!conf->num_queues) {
        error_setg(errp, "num-queues property must be larger than 0");
        return;
    }
    /* Virtqueue ring sizes must be powers of two per the virtio spec. */
    if (!is_power_of_2(conf->queue_size) ||
        conf->queue_size > VIRTQUEUE_MAX_SIZE) {
        error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
                   "must be a power of 2 (max %d)",
                   conf->queue_size, VIRTQUEUE_MAX_SIZE);
        return;
    }

    if (!blkconf_apply_backend_options(&conf->conf,
                                       blk_is_read_only(conf->conf.blk), true,
                                       errp)) {
        return;
    }
    /* Remember the backend's cache mode so reset can restore it after the
     * guest toggles WCE through the config space. */
    s->original_wce = blk_enable_write_cache(conf->conf.blk);
    if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
        return;
    }

    blkconf_blocksizes(&conf->conf);

    if (conf->conf.logical_block_size >
        conf->conf.physical_block_size) {
        error_setg(errp,
                   "logical_block_size > physical_block_size not supported");
        return;
    }

    /* --- Device construction --- */
    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                sizeof(struct virtio_blk_config));

    s->blk = conf->conf.blk;
    s->rq = NULL;
    /* Mask for checking sector alignment against the logical block size. */
    s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;

    for (i = 0; i < conf->num_queues; i++) {
        virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
    }
    virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
    if (err != NULL) {
        error_propagate(errp, err);
        virtio_cleanup(vdev);
        return;
    }

    /* Restart pending requests when the VM resumes. */
    s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
    blk_set_dev_ops(s->blk, &virtio_block_ops, s);
    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);

    blk_iostatus_enable(s->blk);
}

981
/* VirtioDeviceClass->unrealize callback: undo realize, tearing state down
 * in roughly the reverse order it was set up.
 */
static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
{
    VirtIOBlock *s = VIRTIO_BLK(dev);

    virtio_blk_data_plane_destroy(s->dataplane);
    s->dataplane = NULL;
    qemu_del_vm_change_state_handler(s->change);
    blockdev_mark_auto_del(s->blk);
    virtio_cleanup(VIRTIO_DEVICE(dev));
}

993 994 995 996
/* QOM instance_init: runs at object creation, before properties are set.
 * Exposes a "bootindex" property backed by conf.conf.bootindex so the
 * firmware boot order can reference this disk at "/disk@0,0".
 */
static void virtio_blk_instance_init(Object *obj)
{
    VirtIOBlkConf *conf = &VIRTIO_BLK(obj)->conf;

    device_add_bootindex_property(obj, &conf->conf.bootindex,
                                  "bootindex", "/disk@0,0",
                                  DEVICE(obj), NULL);
}

1002 1003 1004 1005 1006 1007 1008 1009 1010
/* Migration description: all device state travels through the generic
 * virtio section (VMSTATE_VIRTIO_DEVICE), which ends up calling the
 * save/load callbacks above for the virtio-blk specific parts. */
static const VMStateDescription vmstate_virtio_blk = {
    .name = "virtio-blk",
    .minimum_version_id = 2,
    .version_id = 2,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
};
1011

1012
/* qdev properties for virtio-blk devices.  "config-wce" controls the
 * VIRTIO_BLK_F_CONFIG_WCE feature, "scsi" the (Linux-only) SCSI passthrough
 * command support, and "iothread" selects a dataplane IOThread. */
static Property virtio_blk_properties[] = {
    DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_BLOCK_ERROR_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
    DEFINE_PROP_BIT("config-wce", VirtIOBlock, conf.config_wce, 0, true),
#ifdef __linux__
    DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, false),
#endif
    DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
                    true),
    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
    DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
    DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
                     IOThread *),
    DEFINE_PROP_END_OF_LIST(),
};

/* QOM class_init: install the qdev device class hooks (properties, vmstate,
 * category) and the VirtioDeviceClass callbacks implemented in this file. */
static void virtio_blk_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

    /* Generic qdev plumbing. */
    dc->props = virtio_blk_properties;
    dc->vmsd = &vmstate_virtio_blk;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    /* Virtio device lifecycle and guest-visible operations. */
    vdc->realize = virtio_blk_device_realize;
    vdc->unrealize = virtio_blk_device_unrealize;
    vdc->get_config = virtio_blk_update_config;
    vdc->set_config = virtio_blk_set_config;
    vdc->get_features = virtio_blk_get_features;
    vdc->set_status = virtio_blk_set_status;
    vdc->reset = virtio_blk_reset;
    vdc->save = virtio_blk_save_device;
    vdc->load = virtio_blk_load_device;
    /* Dataplane (iothread) ioeventfd handling. */
    vdc->start_ioeventfd = virtio_blk_data_plane_start;
    vdc->stop_ioeventfd = virtio_blk_data_plane_stop;
}

1051
/* QOM type registration record for TYPE_VIRTIO_BLK. */
static const TypeInfo virtio_blk_info = {
    .name = TYPE_VIRTIO_BLK,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIOBlock),
    .instance_init = virtio_blk_instance_init,
    .class_init = virtio_blk_class_init,
};

/* Register the virtio-blk type with QOM at module-init time. */
static void virtio_register_types(void)
{
    type_register_static(&virtio_blk_info);
}

type_init(virtio_register_types)