virtio-blk.c 13.5 KB
Newer Older
A
aliguori 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

14 15
#include <qemu-common.h>
#include <sysemu.h>
A
aliguori 已提交
16 17
#include "virtio-blk.h"
#include "block_int.h"
18 19 20
#ifdef __linux__
# include <scsi/sg.h>
#endif
A
aliguori 已提交
21 22 23 24 25 26

typedef struct VirtIOBlock
{
    VirtIODevice vdev;
    BlockDriverState *bs;
    VirtQueue *vq;
27
    void *rq;
28
    QEMUBH *bh;
29
    BlockConf *conf;
30
    unsigned short sector_mask;
A
aliguori 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43
} VirtIOBlock;

/*
 * Convert a generic VirtIODevice pointer into the VirtIOBlock pointer
 * for the same device. This is a plain pointer cast.
 */
static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
{
    VirtIOBlock *blk = (VirtIOBlock *)vdev;

    return blk;
}

typedef struct VirtIOBlockReq
{
    VirtIOBlock *dev;
    VirtQueueElement elem;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
44
    struct virtio_scsi_inhdr *scsi;
45
    QEMUIOVector qiov;
46
    struct VirtIOBlockReq *next;
A
aliguori 已提交
47 48
} VirtIOBlockReq;

49 50 51 52 53
/*
 * Finish a request: store the status byte in the guest-visible header,
 * push the element back onto the virtqueue, notify the guest and release
 * the request.
 *
 * req is freed here and must not be touched by the caller afterwards.
 */
static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
{
    VirtIOBlock *blk = req->dev;
    VirtQueue *queue = blk->vq;

    req->in->status = status;

    virtqueue_push(queue, &req->elem, sizeof(*req->in) + req->qiov.size);
    virtio_notify(&blk->vdev, queue);

    qemu_free(req);
}

K
Kevin Wolf 已提交
60 61
/*
 * Apply the drive's configured error policy to a failed read or write.
 *
 * req:     the failed request
 * error:   positive errno value describing the failure
 * is_read: non-zero if the failed request was a read
 *
 * Returns 0 if the error is to be ignored; req is still valid and the
 * caller is expected to complete it. Returns 1 if the request has been
 * dealt with here: it was either put on the device's restart list (and the
 * VM stopped) or completed with VIRTIO_BLK_S_IOERR, in which case req has
 * been freed.
 */
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
    int is_read)
{
    VirtIOBlock *s = req->dev;
    BlockInterfaceErrorAction action = drive_get_on_error(s->bs, is_read);

    if (action == BLOCK_ERR_IGNORE) {
        bdrv_mon_event(s->bs, BDRV_ACTION_IGNORE, is_read);
        return 0;
    }

    if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
            || action == BLOCK_ERR_STOP_ANY) {
        /* Keep the request so it can be resubmitted once the VM resumes. */
        req->next = s->rq;
        s->rq = req;
        bdrv_mon_event(s->bs, BDRV_ACTION_STOP, is_read);
        vm_stop(0);
    } else {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        /*
         * virtio_blk_req_complete() frees req, so the block device must be
         * reached through s rather than through req->dev from here on.
         */
        bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
    }

    return 1;
}

A
aliguori 已提交
86 87 88 89
static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

K
Kevin Wolf 已提交
90 91 92
    if (ret) {
        int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
        if (virtio_blk_handle_rw_error(req, -ret, is_read))
93
            return;
A
aliguori 已提交
94 95
    }

K
Kevin Wolf 已提交
96
    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
97
}
A
aliguori 已提交
98

99 100 101 102 103 104 105
/*
 * Completion callback for flush requests: a non-zero ret completes the
 * request with VIRTIO_BLK_S_IOERR, zero with VIRTIO_BLK_S_OK.
 */
static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;
    int status = VIRTIO_BLK_S_OK;

    if (ret) {
        status = VIRTIO_BLK_S_IOERR;
    }

    virtio_blk_req_complete(req, status);
}

106 107 108
/*
 * Allocate a request object with qemu_mallocz() and bind it to device s.
 */
static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
{
    VirtIOBlockReq *request;

    request = qemu_mallocz(sizeof(VirtIOBlockReq));
    request->dev = s;

    return request;
}

/*
 * Fetch the next request from the device's virtqueue.
 *
 * Returns a newly allocated request holding the popped element, or NULL
 * when virtqueue_pop() reports nothing to pop (the allocation is released
 * again in that case).
 */
static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = virtio_blk_alloc_request(s);

    if (req == NULL) {
        return NULL;
    }

    if (virtqueue_pop(s->vq, &req->elem)) {
        return req;
    }

    qemu_free(req);
    return NULL;
}

127 128 129 130
#ifdef __linux__
/*
 * Pass a SCSI command from the guest through to the block backend via the
 * SG_IO ioctl and complete the request with the outcome.
 *
 * Expected segment layout:
 *   out_sg[0]            virtio_blk_outhdr
 *   out_sg[1]            SCSI command block
 *   out_sg[2..]          write payload (optional)
 *   in_sg[0..in_num-4]   read payload (optional)
 *   in_sg[in_num-3]      sense buffer
 *   in_sg[in_num-2]      virtio_scsi_inhdr
 *   in_sg[in_num-1]      virtio_blk_inhdr
 */
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    VirtQueueElement *elem = &req->elem;
    struct sg_io_hdr hdr;
    int status;
    int rc;
    int n;

    /*
     * The two output headers and the three trailing input segments listed
     * above are mandatory.
     */
    if (elem->out_num < 2 || elem->in_num < 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        return;
    }

    /*
     * Payload in both directions would be a bidirectional command, which
     * is not supported yet.
     */
    if (elem->out_num > 2 && elem->in_num > 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        return;
    }

    /* The SCSI inhdr sits right before the regular inhdr. */
    req->scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;

    memset(&hdr, 0, sizeof(hdr));
    hdr.interface_id = 'S';
    hdr.cmdp = elem->out_sg[1].iov_base;
    hdr.cmd_len = elem->out_sg[1].iov_len;
    hdr.dxfer_len = 0;

    if (elem->out_num > 2) {
        /* Write payload starts at the third output segment. */
        hdr.dxfer_direction = SG_DXFER_TO_DEV;
        hdr.iovec_count = elem->out_num - 2;
        hdr.dxferp = elem->out_sg + 2;

        for (n = 0; n < hdr.iovec_count; n++) {
            hdr.dxfer_len += elem->out_sg[n + 2].iov_len;
        }
    } else if (elem->in_num > 3) {
        /* Read payload occupies the leading input segments. */
        hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        hdr.iovec_count = elem->in_num - 3;
        hdr.dxferp = elem->in_sg;

        for (n = 0; n < hdr.iovec_count; n++) {
            hdr.dxfer_len += elem->in_sg[n].iov_len;
        }
    } else {
        /* Command without any data transfer. */
        hdr.dxfer_direction = SG_DXFER_NONE;
    }

    hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
    hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;

    rc = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
    if (rc) {
        /* The ioctl itself failed: report that nothing was transferred. */
        status = VIRTIO_BLK_S_UNSUPP;
        hdr.status = rc;
        hdr.resid = hdr.dxfer_len;
    } else if (hdr.status) {
        status = VIRTIO_BLK_S_IOERR;
    } else {
        status = VIRTIO_BLK_S_OK;
    }

    req->scsi->errors = hdr.status;
    req->scsi->residual = hdr.resid;
    req->scsi->sense_len = hdr.sb_len_wr;
    req->scsi->data_len = hdr.dxfer_len;

    virtio_blk_req_complete(req, status);
}
#else
/*
 * Variant for builds where __linux__ is not defined: SCSI command requests
 * are always completed with VIRTIO_BLK_S_UNSUPP.
 */
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    int status = VIRTIO_BLK_S_UNSUPP;

    virtio_blk_req_complete(req, status);
}
#endif /* __linux__ */

K
Kevin Wolf 已提交
226 227
/*
 * Submit a batch of write requests with bdrv_aio_multiwrite().
 *
 * If the submission returns non-zero, every entry whose error field is set
 * is completed through virtio_blk_rw_complete() with -EIO.
 */
static void do_multiwrite(BlockDriverState *bs, BlockRequest *blkreq,
    int num_writes)
{
    int idx;

    if (bdrv_aio_multiwrite(bs, blkreq, num_writes) == 0) {
        return;
    }

    for (idx = 0; idx < num_writes; idx++) {
        if (!blkreq[idx].error) {
            continue;
        }
        virtio_blk_rw_complete(blkreq[idx].opaque, -EIO);
    }
}
240

241 242 243 244 245 246 247 248 249 250
/*
 * Start an asynchronous flush for the request. If bdrv_aio_flush() hands
 * back no AIOCB, the request is completed with VIRTIO_BLK_S_IOERR right
 * away; otherwise virtio_blk_flush_complete() finishes it later.
 */
static void virtio_blk_handle_flush(VirtIOBlockReq *req)
{
    BlockDriverAIOCB *aiocb =
        bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);

    if (aiocb != NULL) {
        return;
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
}

K
Kevin Wolf 已提交
251 252 253
/*
 * Queue a write request into the multiwrite batch.
 *
 * blkreq:     batch array
 * num_writes: in/out, number of entries currently in the batch
 * req:        the write request to add
 * old_bs:     in/out, block device the current batch is destined for
 *
 * A start sector that has any bit of the device's sector_mask set fails the
 * request with -EIO. The pending batch is submitted first when the target
 * device changes or the batch already holds 32 entries.
 */
static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes,
    VirtIOBlockReq *req, BlockDriverState **old_bs)
{
    BlockRequest *slot;

    if (req->out->sector & req->dev->sector_mask) {
        virtio_blk_rw_complete(req, -EIO);
        return;
    }

    if (req->dev->bs != *old_bs || *num_writes == 32) {
        if (*old_bs != NULL) {
            do_multiwrite(*old_bs, blkreq, *num_writes);
        }
        *num_writes = 0;
        *old_bs = req->dev->bs;
    }

    slot = &blkreq[*num_writes];
    slot->sector = req->out->sector;
    slot->nb_sectors = req->qiov.size / 512;
    slot->qiov = &req->qiov;
    slot->cb = virtio_blk_rw_complete;
    slot->opaque = req;
    slot->error = 0;

    *num_writes += 1;
}
276

277 278
/*
 * Start an asynchronous read for the request.
 *
 * The request fails with -EIO when its start sector has any bit of the
 * device's sector_mask set, or when bdrv_aio_readv() returns no AIOCB.
 */
static void virtio_blk_handle_read(VirtIOBlockReq *req)
{
    BlockDriverAIOCB *aiocb;
    int misaligned = (req->out->sector & req->dev->sector_mask) != 0;

    if (!misaligned) {
        aiocb = bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
                               req->qiov.size / 512, virtio_blk_rw_complete,
                               req);
        if (aiocb != NULL) {
            return;
        }
    }

    virtio_blk_rw_complete(req, -EIO);
}

293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
/*
 * Write requests collected while a series of guest requests is processed,
 * so that they can be handed to do_multiwrite() as one batch.
 */
typedef struct MultiReqBuffer {
    BlockRequest blkreq[32];    /* batched writes; virtio_blk_handle_write()
                                 * submits the batch once 32 are queued */
    int num_writes;             /* number of valid entries in blkreq */
    BlockDriverState *old_bs;   /* device the batch is destined for, or
                                 * NULL while no write has been queued */
} MultiReqBuffer;

/*
 * Validate a request popped from the virtqueue and dispatch it by type.
 *
 * The first output segment must hold the virtio_blk_outhdr and the last
 * input segment the virtio_blk_inhdr; if either is missing or too small,
 * a message is printed and the process exits.
 *
 * The type bits are tested in this order: flush, SCSI command, write
 * (added to the batch in mrb); anything else is treated as a read.
 */
static void virtio_blk_handle_request(VirtIOBlockReq *req,
    MultiReqBuffer *mrb)
{
    VirtQueueElement *elem = &req->elem;

    if (elem->out_num < 1 || elem->in_num < 1) {
        fprintf(stderr, "virtio-blk missing headers\n");
        exit(1);
    }

    if (elem->out_sg[0].iov_len < sizeof(*req->out) ||
        elem->in_sg[elem->in_num - 1].iov_len < sizeof(*req->in)) {
        fprintf(stderr, "virtio-blk header not in correct element\n");
        exit(1);
    }

    req->out = (void *)elem->out_sg[0].iov_base;
    req->in = (void *)elem->in_sg[elem->in_num - 1].iov_base;

    if (req->out->type & VIRTIO_BLK_T_FLUSH) {
        virtio_blk_handle_flush(req);
        return;
    }

    if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
        virtio_blk_handle_scsi(req);
        return;
    }

    if (req->out->type & VIRTIO_BLK_T_OUT) {
        /* Write data follows the out header. */
        qemu_iovec_init_external(&req->qiov, &elem->out_sg[1],
                                 elem->out_num - 1);
        virtio_blk_handle_write(mrb->blkreq, &mrb->num_writes,
            req, &mrb->old_bs);
        return;
    }

    /* Read data precedes the in header. */
    qemu_iovec_init_external(&req->qiov, &elem->in_sg[0],
                             elem->in_num - 1);
    virtio_blk_handle_read(req);
}

A
aliguori 已提交
332 333 334 335
/*
 * Virtqueue handler: process every request currently available on the
 * device's queue and submit the writes batched up along the way.
 */
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *blk = to_virtio_blk(vdev);
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {
        .num_writes = 0,
        .old_bs = NULL,
    };

    for (;;) {
        req = virtio_blk_get_request(blk);
        if (req == NULL) {
            break;
        }
        virtio_blk_handle_request(req, &mrb);
    }

    /* Submit whatever is left in the write batch. */
    if (mrb.num_writes > 0) {
        do_multiwrite(mrb.old_bs, mrb.blkreq, mrb.num_writes);
    }

    /*
     * FIXME: Want to check for completions before returning to guest mode,
     * so cached reads and writes are reported as quickly as possible. But
     * that should be done in the generic block layer.
     */
}

356
static void virtio_blk_dma_restart_bh(void *opaque)
357 358 359
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
360 361 362 363
    MultiReqBuffer mrb = {
        .num_writes = 0,
        .old_bs = NULL,
    };
364

365 366
    qemu_bh_delete(s->bh);
    s->bh = NULL;
367 368 369 370

    s->rq = NULL;

    while (req) {
371
        virtio_blk_handle_request(req, &mrb);
372 373
        req = req->next;
    }
374 375 376 377

    if (mrb.num_writes > 0) {
        do_multiwrite(mrb.old_bs, mrb.blkreq, mrb.num_writes);
    }
378 379
}

380 381 382 383 384 385 386 387 388 389 390 391 392
/*
 * VM state change handler. When the VM is running and no restart bottom
 * half exists yet, create and schedule one to resubmit parked requests.
 * Nothing happens when the VM is not running.
 */
static void virtio_blk_dma_restart_cb(void *opaque, int running, int reason)
{
    VirtIOBlock *blk = opaque;

    if (running && blk->bh == NULL) {
        blk->bh = qemu_bh_new(virtio_blk_dma_restart_bh, blk);
        qemu_bh_schedule(blk->bh);
    }
}

A
aliguori 已提交
393 394 395 396 397 398 399 400 401
/*
 * Device reset handler.
 *
 * Ideally pending requests would be cancelled here, but that cannot be done
 * nicely until there are per-device request lists; for now only
 * qemu_aio_flush() is called.
 */
static void virtio_blk_reset(VirtIODevice *vdev)
{
    (void)vdev;

    qemu_aio_flush();
}

402 403
/* coalesce internal state, copy to pci i/o region 0
 */
A
aliguori 已提交
404 405 406 407 408 409 410 411 412
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int cylinders, heads, secs;

    bdrv_get_geometry(s->bs, &capacity);
    bdrv_get_geometry_hint(s->bs, &cylinders, &heads, &secs);
G
Gerd Hoffmann 已提交
413
    memset(&blkcfg, 0, sizeof(blkcfg));
A
aliguori 已提交
414 415 416 417
    stq_raw(&blkcfg.capacity, capacity);
    stl_raw(&blkcfg.seg_max, 128 - 2);
    stw_raw(&blkcfg.cylinders, cylinders);
    blkcfg.heads = heads;
418 419
    blkcfg.sectors = secs & ~s->sector_mask;
    blkcfg.blk_size = s->conf->logical_block_size;
420
    blkcfg.size_max = 0;
421 422
    blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
    blkcfg.alignment_offset = 0;
423 424
    blkcfg.min_io_size = s->conf->min_io_size / blkcfg.blk_size;
    blkcfg.opt_io_size = s->conf->opt_io_size / blkcfg.blk_size;
425
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
A
aliguori 已提交
426 427
}

428
/*
 * Return the given feature bits with the block device's features added.
 *
 * SEG_MAX, GEOMETRY, TOPOLOGY and BLK_SIZE are always set; WCACHE and RO
 * depend on what the block backend reports.
 */
static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
{
    VirtIOBlock *blk = to_virtio_blk(vdev);

    features |= (1 << VIRTIO_BLK_F_SEG_MAX)
              | (1 << VIRTIO_BLK_F_GEOMETRY)
              | (1 << VIRTIO_BLK_F_TOPOLOGY)
              | (1 << VIRTIO_BLK_F_BLK_SIZE);

    if (bdrv_enable_write_cache(blk->bs)) {
        features |= (1 << VIRTIO_BLK_F_WCACHE);
    }

    if (bdrv_is_read_only(blk->bs)) {
        features |= (1 << VIRTIO_BLK_F_RO);
    }

    return features;
}

/*
 * Save the device state: the generic virtio state first, then each request
 * on the restart list as a 1 byte followed by its raw virtqueue element,
 * and finally a 0 byte as terminator.
 */
static void virtio_blk_save(QEMUFile *f, void *opaque)
{
    VirtIOBlock *blk = opaque;
    VirtIOBlockReq *cur = blk->rq;

    virtio_save(&blk->vdev, f);

    for (; cur != NULL; cur = cur->next) {
        qemu_put_sbyte(f, 1);
        qemu_put_buffer(f, (unsigned char *)&cur->elem, sizeof(cur->elem));
    }

    qemu_put_sbyte(f, 0);
}

/*
 * Load the device state written by virtio_blk_save().
 *
 * f:          stream to read from
 * opaque:     the VirtIOBlock to restore
 * version_id: format version of the saved state; only version 2 is accepted
 *
 * Returns 0 on success or -EINVAL for an unsupported version.
 *
 * After the generic virtio state, the stream holds one non-zero marker byte
 * plus a raw virtqueue element per parked request, terminated by a zero
 * byte. Each request read is pushed onto the device's restart list.
 */
static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
{
    VirtIOBlock *s = opaque;

    if (version_id != 2)
        return -EINVAL;

    virtio_load(&s->vdev, f);
    while (qemu_get_sbyte(f)) {
        VirtIOBlockReq *req = virtio_blk_alloc_request(s);
        qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
        /* Push onto the head of the list; the new request becomes s->rq. */
        req->next = s->rq;
        s->rq = req;
    }

    return 0;
}

479
/*
 * Create and set up a virtio block device.
 *
 * dev:  not referenced in this function
 * conf: block configuration; conf->dinfo->bdrv becomes the backing device
 *
 * Installs the config/feature/reset callbacks, derives the sector mask from
 * the logical block size, sets a geometry hint, adds the request queue and
 * registers the VM state change handler and the savevm handlers.
 *
 * Returns the generic VirtIODevice embedded in the new device.
 */
VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
{
    static int virtio_blk_id;
    VirtIOBlock *blk;
    int cyls, heads, secs;

    blk = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
                                            sizeof(struct virtio_blk_config),
                                            sizeof(VirtIOBlock));

    blk->vdev.get_config = virtio_blk_update_config;
    blk->vdev.get_features = virtio_blk_get_features;
    blk->vdev.reset = virtio_blk_reset;

    blk->bs = conf->dinfo->bdrv;
    blk->conf = conf;
    blk->rq = NULL;
    blk->sector_mask = (blk->conf->logical_block_size / 512) - 1;

    bdrv_guess_geometry(blk->bs, &cyls, &heads, &secs);
    bdrv_set_geometry_hint(blk->bs, cyls, heads, secs);

    blk->vq = virtio_add_queue(&blk->vdev, 128, virtio_blk_handle_output);

    qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, blk);

    /* Each device instance registers under its own, increasing id. */
    register_savevm("virtio-blk", virtio_blk_id, 2,
                    virtio_blk_save, virtio_blk_load, blk);
    virtio_blk_id++;

    return &blk->vdev;
}