/*
 *  xen paravirt block device backend
 *
 *  (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 *  Contributions after 2012-01-13 are licensed under the terms of the
 *  GNU GPL, version 2 or (at your option) any later version.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <time.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include "hw/hw.h"
#include "hw/xen/xen_backend.h"
#include "xen_blkif.h"
#include "sysemu/blockdev.h"

/* ------------------------------------------------------------- */

static int batch_maps   = 0;

static int max_requests = 32;

/* ------------------------------------------------------------- */

#define BLOCK_SIZE  512
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

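/*
 * A grant page that stays mapped across requests when the frontend
 * negotiates the persistent-grants extension; destroy_grant() unmaps it
 * when the entry is dropped from the per-device tree.
 */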
struct PersistentGrant {
    void *page;
    struct XenBlkDev *blkdev;
};

typedef struct PersistentGrant PersistentGrant;

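/*
 * Per-request state: the frontend request as read from the ring, the
 * parsed iovec, the grant mappings backing it, and the aio completion
 * bookkeeping.
 */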
struct ioreq {
    blkif_request_t     req;
    int16_t             status;

    /* parsed request */
    off_t               start;
    QEMUIOVector        v;
    int                 presync;
    int                 postsync;
    uint8_t             mapped;

    /* grant mapping */
    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int                 prot;
    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    void                *pages;
    int                 num_unmap;

    /* aio status */
    int                 aio_inflight;
    int                 aio_errors;

    struct XenBlkDev    *blkdev;
    QLIST_ENTRY(ioreq)   list;
    BlockAcctCookie     acct;
};

struct XenBlkDev {
    struct XenDevice    xendev;  /* must be first */
    char                *params;
    char                *mode;
    char                *type;
    char                *dev;
    char                *devtype;
    bool                directiosafe;
    const char          *fileproto;
    const char          *filename;
    int                 ring_ref;
    void                *sring;
    int64_t             file_blk;
    int64_t             file_size;
    int                 protocol;
    blkif_back_rings_t  rings;
    int                 more_work;
    int                 cnt_map;

    /* request lists */
    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int                 requests_total;
    int                 requests_inflight;
    int                 requests_finished;

    /* Persistent grants extension */
    gboolean            feature_persistent;
    GTree               *persistent_gnts;
    unsigned int        persistent_gnt_count;
    unsigned int        max_grants;

    /* qemu block driver */
    DriveInfo           *dinfo;
    BlockDriverState    *bs;
    QEMUBH              *bh;
};

/* ------------------------------------------------------------- */

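/* return an ioreq to a pristine state so it can be reused from the freelist */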
static void ioreq_reset(struct ioreq *ioreq)
{
    memset(&ioreq->req, 0, sizeof(ioreq->req));
    ioreq->status = 0;
    ioreq->start = 0;
    ioreq->presync = 0;
    ioreq->postsync = 0;
    ioreq->mapped = 0;

    memset(ioreq->domids, 0, sizeof(ioreq->domids));
    memset(ioreq->refs, 0, sizeof(ioreq->refs));
    ioreq->prot = 0;
    memset(ioreq->page, 0, sizeof(ioreq->page));
    ioreq->pages = NULL;

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    ioreq->blkdev = NULL;
    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
}

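/* three-way compare of grant refs, used to order the persistent grant tree */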
static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    uint ua = GPOINTER_TO_UINT(a);
    uint ub = GPOINTER_TO_UINT(b);
    return (ua > ub) - (ua < ub);
}

static void destroy_grant(gpointer pgnt)
{
    PersistentGrant *grant = pgnt;
    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;

    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
        xen_be_printf(&grant->blkdev->xendev, 0,
                      "xc_gnttab_munmap failed: %s\n",
                      strerror(errno));
    }
    grant->blkdev->persistent_gnt_count--;
    xen_be_printf(&grant->blkdev->xendev, 3,
                  "unmapped grant %p\n", grant->page);
    g_free(grant);
}

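/*
 * Fetch an unused ioreq from the freelist, or allocate a new one as long
 * as fewer than max_requests exist; returns NULL once that limit is hit.
 */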
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}

static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}

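/* reset a completed ioreq and return it to the freelist */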
static void ioreq_release(struct ioreq *ioreq, bool finish)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq_reset(ioreq);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
}

/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    uintptr_t mem;
    size_t len;
    int i;

    xen_be_printf(&blkdev->xendev, 3,
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                  ioreq->req.operation, ioreq->req.nr_segments,
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        ioreq->prot = PROT_WRITE; /* to memory */
        break;
    case BLKIF_OP_FLUSH_DISKCACHE:
        ioreq->presync = 1;
        if (!ioreq->req.nr_segments) {
            return 0;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        ioreq->prot = PROT_READ; /* from memory */
        break;
    default:
        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                      ioreq->req.operation);
        goto err;
    };

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
            goto err;
        }

        ioreq->domids[i] = blkdev->xendev.dom;
        ioreq->refs[i]   = ioreq->req.seg[i].gref;

        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        qemu_iovec_add(&ioreq->v, (void*)mem, len);
    }
    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

static void ioreq_unmap(struct ioreq *ioreq)
{
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
        return;
    }
    if (batch_maps) {
        if (!ioreq->pages) {
            return;
        }
        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
            xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                          strerror(errno));
        }
        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
        ioreq->pages = NULL;
    } else {
        for (i = 0; i < ioreq->num_unmap; i++) {
            if (!ioreq->page[i]) {
                continue;
            }
            if (xc_gnttab_munmap(gnt, ioreq->page[i], 1) != 0) {
                xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                              strerror(errno));
            }
            ioreq->blkdev->cnt_map--;
            ioreq->page[i] = NULL;
        }
    }
    ioreq->mapped = 0;
}

static int ioreq_map(struct ioreq *ioreq)
{
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, j, new_maps = 0;
    PersistentGrant *grant;
    /* domids and refs variables will contain the information necessary
     * to map the grants that are needed to fulfill this request.
     *
     * After mapping the needed grants, the page array will contain the
     * memory address of each granted page in the order specified in ioreq
     * (disregarding if it's a persistent grant or not).
     */

    if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
        return 0;
    }
    if (ioreq->blkdev->feature_persistent) {
        for (i = 0; i < ioreq->v.niov; i++) {
            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
                                    GUINT_TO_POINTER(ioreq->refs[i]));

            if (grant != NULL) {
                page[i] = grant->page;
                xen_be_printf(&ioreq->blkdev->xendev, 3,
                              "using persistent-grant %" PRIu32 "\n",
                              ioreq->refs[i]);
            } else {
                    /* Add the grant to the list of grants that
                     * should be mapped
                     */
                    domids[new_maps] = ioreq->domids[i];
                    refs[new_maps] = ioreq->refs[i];
                    page[i] = NULL;
                    new_maps++;
            }
        }
        /* Set the protection to RW, since grants may be reused later
         * with a different protection than the one needed for this request
         */
        ioreq->prot = PROT_WRITE | PROT_READ;
    } else {
        /* All grants in the request should be mapped */
        memcpy(refs, ioreq->refs, sizeof(refs));
        memcpy(domids, ioreq->domids, sizeof(domids));
        memset(page, 0, sizeof(page));
        new_maps = ioreq->v.niov;
    }

    if (batch_maps && new_maps) {
        ioreq->pages = xc_gnttab_map_grant_refs
            (gnt, new_maps, domids, refs, ioreq->prot);
        if (ioreq->pages == NULL) {
            xen_be_printf(&ioreq->blkdev->xendev, 0,
                          "can't map %d grant refs (%s, %d maps)\n",
                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
            return -1;
        }
        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
            if (page[i] == NULL) {
                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
            }
        }
        ioreq->blkdev->cnt_map += new_maps;
    } else if (new_maps)  {
        for (i = 0; i < new_maps; i++) {
            ioreq->page[i] = xc_gnttab_map_grant_ref
                (gnt, domids[i], refs[i], ioreq->prot);
            if (ioreq->page[i] == NULL) {
                xen_be_printf(&ioreq->blkdev->xendev, 0,
                              "can't map grant ref %d (%s, %d maps)\n",
                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                ioreq_unmap(ioreq);
                return -1;
            }
            ioreq->blkdev->cnt_map++;
        }
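        /* fill the remaining (non-persistent) slots with the grants we
         * have just mapped one at a time */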
        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
            if (page[i] == NULL) {
                page[i] = ioreq->page[j++];
            }
        }
    }
    if (ioreq->blkdev->feature_persistent) {
        while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
              && new_maps) {
            /* Go through the list of newly mapped grants and add as many
             * as possible to the list of persistently mapped grants.
             *
             * Since we start at the end of ioreq->page(s), we only need
             * to decrease new_maps to prevent these granted pages from
             * being unmapped in ioreq_unmap.
             */
            grant = g_malloc0(sizeof(*grant));
            new_maps--;
            if (batch_maps) {
                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
            } else {
                grant->page = ioreq->page[new_maps];
            }
            grant->blkdev = ioreq->blkdev;
            xen_be_printf(&ioreq->blkdev->xendev, 3,
                          "adding grant %" PRIu32 " page: %p\n",
                          refs[new_maps], grant->page);
            g_tree_insert(ioreq->blkdev->persistent_gnts,
                          GUINT_TO_POINTER(refs[new_maps]),
                          grant);
            ioreq->blkdev->persistent_gnt_count++;
        }
    }
    for (i = 0; i < ioreq->v.niov; i++) {
        ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
    }
    ioreq->mapped = 1;
    ioreq->num_unmap = new_maps;
    return 0;
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq);

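/*
 * Completion callback for all aio submitted for a request; it drives the
 * presync flush -> data -> postsync flush sequence and completes the
 * request once every piece has finished.
 */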
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;

    if (ret != 0) {
        xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
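    /* this completion was the pre-flush; now submit the actual I/O */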
    if (ioreq->presync) {
        ioreq->presync = 0;
        ioreq_runio_qemu_aio(ioreq);
        return;
    }
    if (ioreq->aio_inflight > 0) {
        return;
    }
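    /* all data aio has completed; issue the post-write flush if requested */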
    if (ioreq->postsync) {
        ioreq->postsync = 0;
        ioreq->aio_inflight++;
        bdrv_aio_flush(ioreq->blkdev->bs, qemu_aio_complete, ioreq);
        return;
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    ioreq_unmap(ioreq);
    ioreq_finish(ioreq);
    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
    qemu_bh_schedule(ioreq->blkdev->bh);
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
        goto err_no_map;
    }

    ioreq->aio_inflight++;
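    /* flush the disk cache first; qemu_aio_complete() re-enters this
     * function to submit the data phase once the flush is done */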
    if (ioreq->presync) {
        bdrv_aio_flush(ioreq->blkdev->bs, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
        ioreq->aio_inflight++;
        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
        ioreq->aio_inflight++;
        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                        qemu_aio_complete, ioreq);
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_unmap(ioreq);
err_no_map:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev  *blkdev = ioreq->blkdev;
    int               send_notify   = 0;
    int               have_requests = 0;
    blkif_response_t  resp;
    void              *dst;

    resp.id        = ioreq->req.id;
    resp.operation = ioreq->req.operation;
    resp.status    = ioreq->status;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
                                blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
                                blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        dst = NULL;
    }
    memcpy(dst, &resp, sizeof(resp));
    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}

/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);
    }
    if (send_notify) {
        xen_be_send_notify(&blkdev->xendev);
    }
}

static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    return 0;
}

static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {
            if (blk_send_response_one(ioreq)) {
                xen_be_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq, false);
            continue;
        }

        ioreq_runio_qemu_aio(ioreq);
    }

    if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}

/* ------------------------------------------------------------- */

static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;
    blk_handle_requests(blkdev);
}

/*
 * We need to account for the grant allocations requiring contiguous
 * chunks; the worst case number would be
 *     max_req * max_seg + (max_req - 1) * (max_seg - 1) + 1,
 * but in order to keep things simple just use
 *     2 * max_req * max_seg.
 */
#define MAX_GRANTS(max_req, max_seg) (2 * (max_req) * (max_seg))

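/* device allocation hook: init request lists, create the bottom half, set the grant map limit */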
static void blk_alloc(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);
    blkdev->bh = qemu_bh_new(blk_bh, blkdev);
    if (xen_mode != XEN_EMULATE) {
        batch_maps = 1;
    }
    if (xc_gnttab_set_max_grants(xendev->gnttabdev,
            MAX_GRANTS(max_requests, BLKIF_MAX_SEGMENTS_PER_REQUEST)) < 0) {
        xen_be_printf(xendev, 0, "xc_gnttab_set_max_grants failed: %s\n",
                      strerror(errno));
    }
}

static int blk_init(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int info = 0;
    char *directiosafe = NULL;

    /* read xenstore entries */
    if (blkdev->params == NULL) {
        char *h = NULL;
        blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
        if (blkdev->params != NULL) {
            h = strchr(blkdev->params, ':');
        }
        if (h != NULL) {
            blkdev->fileproto = blkdev->params;
            blkdev->filename  = h+1;
            *h = 0;
        } else {
            blkdev->fileproto = "<unset>";
            blkdev->filename  = blkdev->params;
        }
    }
    if (!strcmp("aio", blkdev->fileproto)) {
        blkdev->fileproto = "raw";
    }
    if (blkdev->mode == NULL) {
        blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
    }
    if (blkdev->type == NULL) {
        blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
    }
    if (blkdev->dev == NULL) {
        blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
    }
    if (blkdev->devtype == NULL) {
        blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
    }
    directiosafe = xenstore_read_be_str(&blkdev->xendev, "direct-io-safe");
    blkdev->directiosafe = (directiosafe && atoi(directiosafe));

    /* do we have all we need? */
    if (blkdev->params == NULL ||
        blkdev->mode == NULL   ||
        blkdev->type == NULL   ||
        blkdev->dev == NULL) {
        goto out_error;
    }

    /* read-only ? */
    if (strcmp(blkdev->mode, "w")) {
        info  |= VDISK_READONLY;
    }

    /* cdrom ? */
    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
        info  |= VDISK_CDROM;
    }

    blkdev->file_blk  = BLOCK_SIZE;

    /* fill info
     * blk_connect supplies sector-size and sectors
     */
    xenstore_write_be_int(&blkdev->xendev, "feature-flush-cache", 1);
    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
    xenstore_write_be_int(&blkdev->xendev, "info", info);

    g_free(directiosafe);
    return 0;

out_error:
    g_free(blkdev->params);
    blkdev->params = NULL;
    g_free(blkdev->mode);
    blkdev->mode = NULL;
    g_free(blkdev->type);
    blkdev->type = NULL;
    g_free(blkdev->dev);
    blkdev->dev = NULL;
    g_free(blkdev->devtype);
    blkdev->devtype = NULL;
    g_free(directiosafe);
    blkdev->directiosafe = false;
    return -1;
}

static int blk_connect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int pers, index, qflags;
    bool readonly = true;

    /* read-only ? */
    if (blkdev->directiosafe) {
        qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
    } else {
        qflags = BDRV_O_CACHE_WB;
    }
    if (strcmp(blkdev->mode, "w") == 0) {
        qflags |= BDRV_O_RDWR;
        readonly = false;
    }

    /* init qemu block driver */
    index = (blkdev->xendev.dev - 202 * 256) / 16;
    blkdev->dinfo = drive_get(IF_XEN, 0, index);
    if (!blkdev->dinfo) {
        /* setup via xenbus -> create new block driver instance */
        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
        blkdev->bs = bdrv_new(blkdev->dev);
        if (blkdev->bs) {
            Error *local_err = NULL;
            BlockDriver *drv = bdrv_find_whitelisted_format(blkdev->fileproto,
                                                           readonly);
            if (bdrv_open(blkdev->bs,
                          blkdev->filename, NULL, qflags, drv, &local_err) != 0)
            {
                xen_be_printf(&blkdev->xendev, 0, "error: %s\n",
                              error_get_pretty(local_err));
                error_free(local_err);
                bdrv_unref(blkdev->bs);
                blkdev->bs = NULL;
            }
        }
        if (!blkdev->bs) {
            return -1;
        }
    } else {
        /* setup via qemu cmdline -> already setup for us */
        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
        blkdev->bs = blkdev->dinfo->bdrv;
        /* blkdev->bs is not created by us, we get a reference
         * so we can bdrv_unref() unconditionally */
        bdrv_ref(blkdev->bs);
    }
    bdrv_attach_dev_nofail(blkdev->bs, blkdev);
    blkdev->file_size = bdrv_getlength(blkdev->bs);
    if (blkdev->file_size < 0) {
        xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n",
                      (int)blkdev->file_size, strerror(-blkdev->file_size),
                      bdrv_get_format_name(blkdev->bs) ?: "-");
        blkdev->file_size = 0;
    }

    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                  " size %" PRId64 " (%" PRId64 " MB)\n",
                  blkdev->type, blkdev->fileproto, blkdev->filename,
                  blkdev->file_size, blkdev->file_size >> 20);

    /* fill in the sector size and the number of sectors */
    xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
    xenstore_write_be_int64(&blkdev->xendev, "sectors",
                            blkdev->file_size / blkdev->file_blk);

    if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
        return -1;
    }
    if (xenstore_read_fe_int(&blkdev->xendev, "event-channel",
                             &blkdev->xendev.remote_port) == -1) {
        return -1;
    }
    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
        blkdev->feature_persistent = FALSE;
    } else {
        blkdev->feature_persistent = !!pers;
    }

    blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    if (blkdev->xendev.protocol) {
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
            blkdev->protocol = BLKIF_PROTOCOL_X86_32;
        }
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
            blkdev->protocol = BLKIF_PROTOCOL_X86_64;
        }
    }

    blkdev->sring = xc_gnttab_map_grant_ref(blkdev->xendev.gnttabdev,
                                            blkdev->xendev.dom,
                                            blkdev->ring_ref,
                                            PROT_READ | PROT_WRITE);
    if (!blkdev->sring) {
        return -1;
    }
    blkdev->cnt_map++;

    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkif_sring_t *sring_native = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.native, sring_native, XC_PAGE_SIZE);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, XC_PAGE_SIZE);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, XC_PAGE_SIZE);
        break;
907 908 909
    }
    }

    if (blkdev->feature_persistent) {
        /* Init persistent grants */
        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
                                             NULL, NULL,
                                             (GDestroyNotify)destroy_grant);
        blkdev->persistent_gnt_count = 0;
    }

    xen_be_bind_evtchn(&blkdev->xendev);

    xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
                  "remote port %d, local port %d\n",
                  blkdev->xendev.protocol, blkdev->ring_ref,
                  blkdev->xendev.remote_port, blkdev->xendev.local_port);
    return 0;
}

static void blk_disconnect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    if (blkdev->bs) {
        bdrv_detach_dev(blkdev->bs, blkdev);
        bdrv_unref(blkdev->bs);
        blkdev->bs = NULL;
    }
    xen_be_unbind_evtchn(&blkdev->xendev);

    if (blkdev->sring) {
        xc_gnttab_munmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
        blkdev->cnt_map--;
        blkdev->sring = NULL;
    }
}

static int blk_free(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    if (blkdev->bs || blkdev->sring) {
        blk_disconnect(xendev);
    }

    /* Free persistent grants */
    if (blkdev->feature_persistent) {
        g_tree_destroy(blkdev->persistent_gnts);
    }

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    return 0;
}

static void blk_event(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}

struct XenDevOps xen_blkdev_ops = {
    .size       = sizeof(struct XenBlkDev),
    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
    .alloc      = blk_alloc,
    .init       = blk_init,
    .initialise    = blk_connect,
    .disconnect = blk_disconnect,
    .event      = blk_event,
    .free       = blk_free,
};