qed.c 49.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * QEMU Enhanced Disk Format
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
15
#include "qemu/osdep.h"
16
#include "block/qdict.h"
17
#include "qapi/error.h"
18
#include "qemu/timer.h"
19
#include "qemu/bswap.h"
20
#include "qemu/main-loop.h"
21
#include "qemu/module.h"
22
#include "qemu/option.h"
S
Stefan Hajnoczi 已提交
23
#include "trace.h"
24
#include "qed.h"
25
#include "sysemu/block-backend.h"
K
Kevin Wolf 已提交
26 27 28 29 30
#include "qapi/qmp/qdict.h"
#include "qapi/qobject-input-visitor.h"
#include "qapi/qapi-visit-block-core.h"

static QemuOptsList qed_create_opts;
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85

static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
                          const char *filename)
{
    const QEDHeader *header = (const QEDHeader *)buf;

    if (buf_size < sizeof(*header)) {
        return 0;
    }
    if (le32_to_cpu(header->magic) != QED_MAGIC) {
        return 0;
    }
    return 100;
}

/**
 * Check whether an image format is raw
 *
 * @fmt:    Backing file format, may be NULL
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    return fmt && strcmp(fmt, "raw") == 0;
}

static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
{
    cpu->magic = le32_to_cpu(le->magic);
    cpu->cluster_size = le32_to_cpu(le->cluster_size);
    cpu->table_size = le32_to_cpu(le->table_size);
    cpu->header_size = le32_to_cpu(le->header_size);
    cpu->features = le64_to_cpu(le->features);
    cpu->compat_features = le64_to_cpu(le->compat_features);
    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
    cpu->image_size = le64_to_cpu(le->image_size);
    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
}

static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
{
    le->magic = cpu_to_le32(cpu->magic);
    le->cluster_size = cpu_to_le32(cpu->cluster_size);
    le->table_size = cpu_to_le32(cpu->table_size);
    le->header_size = cpu_to_le32(cpu->header_size);
    le->features = cpu_to_le64(cpu->features);
    le->compat_features = cpu_to_le64(cpu->compat_features);
    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
    le->image_size = cpu_to_le64(cpu->image_size);
    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
}

86
int qed_write_header_sync(BDRVQEDState *s)
87 88 89 90 91
{
    QEDHeader le;
    int ret;

    qed_header_cpu_to_le(&s->header, &le);
92
    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
93 94 95 96 97 98
    if (ret != sizeof(le)) {
        return ret;
    }
    return 0;
}

S
Stefan Hajnoczi 已提交
99 100 101 102 103
/**
 * Update header in-place (does not rewrite backing filename or other strings)
 *
 * This function only updates known header fields in-place and does not affect
 * extra data after the QED header.
104 105
 *
 * No new allocating reqs can start while this function runs.
S
Stefan Hajnoczi 已提交
106
 */
107
static int coroutine_fn qed_write_header(BDRVQEDState *s)
S
Stefan Hajnoczi 已提交
108 109 110 111 112 113 114
{
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
     * the data following the header if an unrecognized compat feature is
     * active.  Therefore, first read the sectors containing the header, update
     * them, and write back.
     */

L
Laurent Vivier 已提交
115
    int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
S
Stefan Hajnoczi 已提交
116
    size_t len = nsectors * BDRV_SECTOR_SIZE;
117 118 119
    uint8_t *buf;
    int ret;

120 121
    assert(s->allocating_acb || s->allocating_write_reqs_plugged);

122 123
    buf = qemu_blockalign(s->bs, len);

124
    ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
125 126 127 128 129 130 131
    if (ret < 0) {
        goto out;
    }

    /* Update header */
    qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);

132
    ret = bdrv_co_pwrite(s->bs->file, 0, len,  buf, 0);
133 134 135 136 137 138 139
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(buf);
140
    return ret;
S
Stefan Hajnoczi 已提交
141 142
}

143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
    l2_size = table_entries * cluster_size;

    return l2_size * table_entries;
}

static bool qed_is_cluster_size_valid(uint32_t cluster_size)
{
    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
        cluster_size > QED_MAX_CLUSTER_SIZE) {
        return false;
    }
    if (cluster_size & (cluster_size - 1)) {
        return false; /* not power of 2 */
    }
    return true;
}

static bool qed_is_table_size_valid(uint32_t table_size)
{
    if (table_size < QED_MIN_TABLE_SIZE ||
        table_size > QED_MAX_TABLE_SIZE) {
        return false;
    }
    if (table_size & (table_size - 1)) {
        return false; /* not power of 2 */
    }
    return true;
}

static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
                                    uint32_t table_size)
{
    if (image_size % BDRV_SECTOR_SIZE != 0) {
        return false; /* not multiple of sector size */
    }
    if (image_size > qed_max_image_size(cluster_size, table_size)) {
        return false; /* image is too large */
    }
    return true;
}

/**
 * Read a string of known length from the image file
 *
 * @file:       Image file
 * @offset:     File offset to start of string, in bytes
 * @n:          String length in bytes
 * @buf:        Destination buffer
 * @buflen:     Destination buffer length in bytes
 * @ret:        0 on success, -errno on failure
 *
 * The string is NUL-terminated.
 */
202
static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
203 204 205 206 207 208 209 210 211 212 213 214 215 216
                           char *buf, size_t buflen)
{
    int ret;
    if (n >= buflen) {
        return -EINVAL;
    }
    ret = bdrv_pread(file, offset, buf, n);
    if (ret < 0) {
        return ret;
    }
    buf[n] = '\0';
    return 0;
}

S
Stefan Hajnoczi 已提交
217 218 219 220 221 222 223 224 225 226
/**
 * Allocate new clusters
 *
 * @s:          QED state
 * @n:          Number of contiguous clusters to allocate
 * @ret:        Offset of first allocated cluster
 *
 * This function only produces the offset where the new clusters should be
 * written.  It updates BDRVQEDState but does not make any changes to the image
 * file.
227 228
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
229 230 231 232 233 234 235 236
 */
static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
{
    uint64_t offset = s->file_size;
    s->file_size += n * s->header.cluster_size;
    return offset;
}

237 238 239 240 241 242 243
QEDTable *qed_alloc_table(BDRVQEDState *s)
{
    /* Honor O_DIRECT memory alignment requirements */
    return qemu_blockalign(s->bs,
                           s->header.cluster_size * s->header.table_size);
}

S
Stefan Hajnoczi 已提交
244 245
/**
 * Allocate a new zeroed L2 table
246 247
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
248 249 250 251 252 253 254 255 256 257 258 259 260
 */
static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
{
    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);

    l2_table->table = qed_alloc_table(s);
    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);

    memset(l2_table->table->offsets, 0,
           s->header.cluster_size * s->header.table_size);
    return l2_table;
}

261
static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
262
{
263 264 265
    qemu_co_mutex_lock(&s->table_lock);

    /* No reentrancy is allowed.  */
266
    assert(!s->allocating_write_reqs_plugged);
267 268
    if (s->allocating_acb != NULL) {
        /* Another allocating write came concurrently.  This cannot happen
269
         * from bdrv_qed_co_drain_begin, but it can happen when the timer runs.
270 271 272 273
         */
        qemu_co_mutex_unlock(&s->table_lock);
        return false;
    }
274 275

    s->allocating_write_reqs_plugged = true;
276 277
    qemu_co_mutex_unlock(&s->table_lock);
    return true;
278 279 280 281
}

static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
{
282
    qemu_co_mutex_lock(&s->table_lock);
283 284
    assert(s->allocating_write_reqs_plugged);
    s->allocating_write_reqs_plugged = false;
285 286
    qemu_co_queue_next(&s->allocating_write_reqs);
    qemu_co_mutex_unlock(&s->table_lock);
287 288
}

289
static void coroutine_fn qed_need_check_timer_entry(void *opaque)
290 291
{
    BDRVQEDState *s = opaque;
292 293 294
    int ret;

    trace_qed_need_check_timer_cb(s);
295

296 297 298
    if (!qed_plug_allocating_write_reqs(s)) {
        return;
    }
299 300 301 302

    /* Ensure writes are on disk before clearing flag */
    ret = bdrv_co_flush(s->bs->file->bs);
    if (ret < 0) {
303 304 305 306 307
        qed_unplug_allocating_write_reqs(s);
        return;
    }

    s->header.features &= ~QED_F_NEED_CHECK;
308 309 310 311 312
    ret = qed_write_header(s);
    (void) ret;

    qed_unplug_allocating_write_reqs(s);

313
    ret = bdrv_co_flush(s->bs);
314
    (void) ret;
315 316 317 318
}

static void qed_need_check_timer_cb(void *opaque)
{
319 320
    Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
    qemu_coroutine_enter(co);
321 322
}

323 324 325 326
static void qed_start_need_check_timer(BDRVQEDState *s)
{
    trace_qed_start_need_check_timer(s);

327
    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
328 329
     * migration.
     */
330
    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
331
                   NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
332 333 334 335 336 337
}

/* It's okay to call this multiple times or when no timer is started */
static void qed_cancel_need_check_timer(BDRVQEDState *s)
{
    trace_qed_cancel_need_check_timer(s);
338
    timer_del(s->need_check_timer);
339 340
}

341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    qed_cancel_need_check_timer(s);
    timer_free(s->need_check_timer);
}

static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
                                        AioContext *new_context)
{
    BDRVQEDState *s = bs->opaque;

    s->need_check_timer = aio_timer_new(new_context,
                                        QEMU_CLOCK_VIRTUAL, SCALE_NS,
                                        qed_need_check_timer_cb, s);
    if (s->header.features & QED_F_NEED_CHECK) {
        qed_start_need_check_timer(s);
    }
}

362
static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
F
Fam Zheng 已提交
363 364 365 366 367 368 369 370
{
    BDRVQEDState *s = bs->opaque;

    /* Fire the timer immediately in order to start doing I/O as soon as the
     * header is flushed.
     */
    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
        qed_cancel_need_check_timer(s);
371
        qed_need_check_timer_entry(s);
F
Fam Zheng 已提交
372 373 374
    }
}

375 376 377 378 379 380
static void bdrv_qed_init_state(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    memset(s, 0, sizeof(BDRVQEDState));
    s->bs = bs;
381
    qemu_co_mutex_init(&s->table_lock);
382 383 384
    qemu_co_queue_init(&s->allocating_write_reqs);
}

385 386 387
/* Called with table_lock held.  */
static int coroutine_fn bdrv_qed_do_open(BlockDriverState *bs, QDict *options,
                                         int flags, Error **errp)
388 389 390 391 392 393
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader le_header;
    int64_t file_size;
    int ret;

394
    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
395 396 397 398 399 400
    if (ret < 0) {
        return ret;
    }
    qed_header_le_to_cpu(&le_header, &s->header);

    if (s->header.magic != QED_MAGIC) {
P
Paolo Bonzini 已提交
401 402
        error_setg(errp, "Image not in QED format");
        return -EINVAL;
403 404
    }
    if (s->header.features & ~QED_FEATURE_MASK) {
405
        /* image uses unsupported feature bits */
406 407
        error_setg(errp, "Unsupported QED features: %" PRIx64,
                   s->header.features & ~QED_FEATURE_MASK);
408
        return -ENOTSUP;
409 410 411 412 413 414
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
        return -EINVAL;
    }

    /* Round down file size to the last cluster */
K
Kevin Wolf 已提交
415
    file_size = bdrv_getlength(bs->file->bs);
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
    if (file_size < 0) {
        return file_size;
    }
    s->file_size = qed_start_of_cluster(s, file_size);

    if (!qed_is_table_size_valid(s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(s->header.image_size,
                                 s->header.cluster_size,
                                 s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
        return -EINVAL;
    }

    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
                      sizeof(uint64_t);
435
    s->l2_shift = ctz32(s->header.cluster_size);
436
    s->l2_mask = s->table_nelems - 1;
437
    s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
438

439 440 441 442 443
    /* Header size calculation must not overflow uint32_t */
    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
        return -EINVAL;
    }

444 445 446 447 448 449 450
    if ((s->header.features & QED_F_BACKING_FILE)) {
        if ((uint64_t)s->header.backing_filename_offset +
            s->header.backing_filename_size >
            s->header.cluster_size * s->header.header_size) {
            return -EINVAL;
        }

451
        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
M
Max Reitz 已提交
452 453 454
                              s->header.backing_filename_size,
                              bs->auto_backing_file,
                              sizeof(bs->auto_backing_file));
455 456 457
        if (ret < 0) {
            return ret;
        }
M
Max Reitz 已提交
458 459
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
460 461 462 463 464 465 466 467 468 469 470 471 472

        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
        }
    }

    /* Reset unknown autoclear feature bits.  This is a backwards
     * compatibility mechanism that allows images to be opened by older
     * programs, which "knock out" unknown feature bits.  When an image is
     * opened by a newer program again it can detect that the autoclear
     * feature is no longer valid.
     */
    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
473
        !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
474 475 476 477 478 479 480 481
        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;

        ret = qed_write_header_sync(s);
        if (ret) {
            return ret;
        }

        /* From here on only known autoclear feature bits are valid */
K
Kevin Wolf 已提交
482
        bdrv_flush(bs->file->bs);
483 484
    }

485 486 487 488
    s->l1_table = qed_alloc_table(s);
    qed_init_l2_cache(&s->l2_cache);

    ret = qed_read_l1_table_sync(s);
S
Stefan Hajnoczi 已提交
489 490 491 492 493
    if (ret) {
        goto out;
    }

    /* If image was not closed cleanly, check consistency */
494
    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
S
Stefan Hajnoczi 已提交
495 496 497 498 499
        /* Read-only images cannot be fixed.  There is no risk of corruption
         * since write operations are not possible.  Therefore, allow
         * potentially inconsistent images to be opened read-only.  This can
         * aid data recovery from an otherwise inconsistent image.
         */
K
Kevin Wolf 已提交
500
        if (!bdrv_is_read_only(bs->file->bs) &&
501
            !(flags & BDRV_O_INACTIVE)) {
S
Stefan Hajnoczi 已提交
502 503 504
            BdrvCheckResult result = {0};

            ret = qed_check(s, &result, true);
505 506 507
            if (ret) {
                goto out;
            }
S
Stefan Hajnoczi 已提交
508 509 510
        }
    }

511
    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
512

S
Stefan Hajnoczi 已提交
513
out:
514 515 516 517
    if (ret) {
        qed_free_l2_cache(&s->l2_cache);
        qemu_vfree(s->l1_table);
    }
518 519 520
    return ret;
}

521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
typedef struct QEDOpenCo {
    BlockDriverState *bs;
    QDict *options;
    int flags;
    Error **errp;
    int ret;
} QEDOpenCo;

static void coroutine_fn bdrv_qed_open_entry(void *opaque)
{
    QEDOpenCo *qoc = opaque;
    BDRVQEDState *s = qoc->bs->opaque;

    qemu_co_mutex_lock(&s->table_lock);
    qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
    qemu_co_mutex_unlock(&s->table_lock);
}

539 540 541
static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
{
542 543 544 545 546 547 548 549
    QEDOpenCo qoc = {
        .bs = bs,
        .options = options,
        .flags = flags,
        .errp = errp,
        .ret = -EINPROGRESS
    };

550 551 552 553 554 555
    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

556
    bdrv_qed_init_state(bs);
557 558 559
    if (qemu_in_coroutine()) {
        bdrv_qed_open_entry(&qoc);
    } else {
560
        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
561 562 563 564 565
        qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
    }
    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
    return qoc.ret;
566 567
}

568
static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
569 570 571
{
    BDRVQEDState *s = bs->opaque;

572
    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
573 574
}

J
Jeff Cody 已提交
575 576 577 578 579 580 581 582
/* We have nothing to do for QED reopen, stubs just return
 * success */
static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
                                   BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

583 584
static void bdrv_qed_close(BlockDriverState *bs)
{
585 586
    BDRVQEDState *s = bs->opaque;

587
    bdrv_qed_detach_aio_context(bs);
588

S
Stefan Hajnoczi 已提交
589
    /* Ensure writes reach stable storage */
K
Kevin Wolf 已提交
590
    bdrv_flush(bs->file->bs);
S
Stefan Hajnoczi 已提交
591 592 593 594 595 596 597

    /* Clean shutdown, no check required on next open */
    if (s->header.features & QED_F_NEED_CHECK) {
        s->header.features &= ~QED_F_NEED_CHECK;
        qed_write_header_sync(s);
    }

598 599
    qed_free_l2_cache(&s->l2_cache);
    qemu_vfree(s->l1_table);
600 601
}

K
Kevin Wolf 已提交
602 603
static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
                                           Error **errp)
604
{
K
Kevin Wolf 已提交
605 606 607 608 609
    BlockdevCreateOptionsQed *qed_opts;
    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;

    QEDHeader header;
610 611
    QEDHeader le_header;
    uint8_t *l1_table = NULL;
K
Kevin Wolf 已提交
612
    size_t l1_size;
613 614
    int ret = 0;

K
Kevin Wolf 已提交
615 616 617 618 619 620 621 622 623
    assert(opts->driver == BLOCKDEV_DRIVER_QED);
    qed_opts = &opts->u.qed;

    /* Validate options and set default values */
    if (!qed_opts->has_cluster_size) {
        qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
    }
    if (!qed_opts->has_table_size) {
        qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
624 625
    }

K
Kevin Wolf 已提交
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
    if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
        error_setg(errp, "QED cluster size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
        return -EINVAL;
    }
    if (!qed_is_table_size_valid(qed_opts->table_size)) {
        error_setg(errp, "QED table size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
                                 qed_opts->table_size))
    {
        error_setg(errp, "QED image size must be a non-zero multiple of "
                         "cluster size and less than %" PRIu64 " bytes",
                   qed_max_image_size(qed_opts->cluster_size,
                                      qed_opts->table_size));
        return -EINVAL;
    }

    /* Create BlockBackend to write to the image */
    bs = bdrv_open_blockdev_ref(qed_opts->file, errp);
    if (bs == NULL) {
651
        return -EIO;
652 653
    }

K
Kevin Wolf 已提交
654 655
    blk = blk_new(bdrv_get_aio_context(bs),
                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
K
Kevin Wolf 已提交
656 657 658 659
    ret = blk_insert_bs(blk, bs, errp);
    if (ret < 0) {
        goto out;
    }
660 661
    blk_set_allow_write_beyond_eof(blk, true);

K
Kevin Wolf 已提交
662 663 664 665 666 667 668 669 670 671 672 673 674 675
    /* Prepare image format */
    header = (QEDHeader) {
        .magic = QED_MAGIC,
        .cluster_size = qed_opts->cluster_size,
        .table_size = qed_opts->table_size,
        .header_size = 1,
        .features = 0,
        .compat_features = 0,
        .l1_table_offset = qed_opts->cluster_size,
        .image_size = qed_opts->size,
    };

    l1_size = header.cluster_size * header.table_size;

676 677 678 679 680
    /*
     * The QED format associates file length with allocation status,
     * so a new file (which is empty) must have a length of 0.
     */
    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
681 682 683 684
    if (ret < 0) {
        goto out;
    }

K
Kevin Wolf 已提交
685
    if (qed_opts->has_backing_file) {
686 687
        header.features |= QED_F_BACKING_FILE;
        header.backing_filename_offset = sizeof(le_header);
K
Kevin Wolf 已提交
688
        header.backing_filename_size = strlen(qed_opts->backing_file);
689

K
Kevin Wolf 已提交
690 691 692 693 694
        if (qed_opts->has_backing_fmt) {
            const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
            if (qed_fmt_is_raw(backing_fmt)) {
                header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
            }
695 696 697 698
        }
    }

    qed_header_cpu_to_le(&header, &le_header);
699
    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
700 701 702
    if (ret < 0) {
        goto out;
    }
K
Kevin Wolf 已提交
703
    ret = blk_pwrite(blk, sizeof(le_header), qed_opts->backing_file,
704
                     header.backing_filename_size, 0);
705 706 707 708
    if (ret < 0) {
        goto out;
    }

709
    l1_table = g_malloc0(l1_size);
710
    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
711 712 713 714 715 716
    if (ret < 0) {
        goto out;
    }

    ret = 0; /* success */
out:
717
    g_free(l1_table);
718
    blk_unref(blk);
K
Kevin Wolf 已提交
719
    bdrv_unref(bs);
720 721 722
    return ret;
}

723 724 725
static int coroutine_fn bdrv_qed_co_create_opts(const char *filename,
                                                QemuOpts *opts,
                                                Error **errp)
726
{
K
Kevin Wolf 已提交
727
    BlockdevCreateOptions *create_options = NULL;
728
    QDict *qdict;
K
Kevin Wolf 已提交
729 730 731
    Visitor *v;
    BlockDriverState *bs = NULL;
    Error *local_err = NULL;
732 733
    int ret;

K
Kevin Wolf 已提交
734 735 736 737 738 739 740 741 742 743 744 745
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_TABLE_SIZE,         "table-size" },
        { NULL, NULL },
    };

    /* Parse options and convert legacy syntax */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
746
        ret = -EINVAL;
K
Kevin Wolf 已提交
747
        goto fail;
748
    }
K
Kevin Wolf 已提交
749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767

    /* Create and open the file (protocol layer) */
    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs = bdrv_open(filename, NULL, NULL,
                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (bs == NULL) {
        ret = -EIO;
        goto fail;
    }

    /* Now get the QAPI type BlockdevCreateOptions */
    qdict_put_str(qdict, "driver", "qed");
    qdict_put_str(qdict, "file", bs->node_name);

768 769
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (!v) {
770
        ret = -EINVAL;
K
Kevin Wolf 已提交
771
        goto fail;
772
    }
K
Kevin Wolf 已提交
773 774 775 776 777 778

    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
    visit_free(v);

    if (local_err) {
        error_propagate(errp, local_err);
779
        ret = -EINVAL;
K
Kevin Wolf 已提交
780
        goto fail;
781 782
    }

K
Kevin Wolf 已提交
783 784 785 786 787 788 789
    /* Silently round up size */
    assert(create_options->driver == BLOCKDEV_DRIVER_QED);
    create_options->u.qed.size =
        ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);

    /* Create the qed image (format layer) */
    ret = bdrv_qed_co_create(create_options, errp);
790

K
Kevin Wolf 已提交
791
fail:
792
    qobject_unref(qdict);
K
Kevin Wolf 已提交
793 794
    bdrv_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
795
    return ret;
796 797
}

798 799 800 801 802
static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
                                                 bool want_zero,
                                                 int64_t pos, int64_t bytes,
                                                 int64_t *pnum, int64_t *map,
                                                 BlockDriverState **file)
803
{
804 805 806 807 808 809 810 811 812 813 814
    BDRVQEDState *s = bs->opaque;
    size_t len = MIN(bytes, SIZE_MAX);
    int status;
    QEDRequest request = { .l2_table = NULL };
    uint64_t offset;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
    ret = qed_find_cluster(s, &request, pos, &len, &offset);

    *pnum = len;
815 816
    switch (ret) {
    case QED_CLUSTER_FOUND:
817 818 819
        *map = offset | qed_offset_into_cluster(s, pos);
        status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
        *file = bs->file->bs;
820 821
        break;
    case QED_CLUSTER_ZERO:
822
        status = BDRV_BLOCK_ZERO;
823 824 825
        break;
    case QED_CLUSTER_L2:
    case QED_CLUSTER_L1:
826
        status = 0;
827 828 829
        break;
    default:
        assert(ret < 0);
830
        status = ret;
831 832 833
        break;
    }

834
    qed_unref_l2_cache_entry(request.l2_table);
835
    qemu_co_mutex_unlock(&s->table_lock);
836

837
    return status;
838 839
}

S
Stefan Hajnoczi 已提交
840 841
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
{
K
Kevin Wolf 已提交
842
    return acb->bs->opaque;
S
Stefan Hajnoczi 已提交
843 844 845 846 847
}

/**
 * Read from the backing file or zero-fill if no backing file
 *
848 849 850 851 852 853
 * @s:              QED state
 * @pos:            Byte position in device
 * @qiov:           Destination I/O vector
 * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here
 * @cb:             Completion function
 * @opaque:         User data for completion function
S
Stefan Hajnoczi 已提交
854 855 856 857
 *
 * This function reads qiov->size bytes starting at pos from the backing file.
 * If there is no backing file then zeroes are read.
 */
858 859 860
static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                              QEMUIOVector *qiov,
                                              QEMUIOVector **backing_qiov)
S
Stefan Hajnoczi 已提交
861 862 863
{
    uint64_t backing_length = 0;
    size_t size;
864
    int ret;
S
Stefan Hajnoczi 已提交
865 866 867 868

    /* If there is a backing file, get its length.  Treat the absence of a
     * backing file like a zero length backing file.
     */
869 870
    if (s->bs->backing) {
        int64_t l = bdrv_getlength(s->bs->backing->bs);
S
Stefan Hajnoczi 已提交
871
        if (l < 0) {
872
            return l;
S
Stefan Hajnoczi 已提交
873 874 875 876 877 878 879
        }
        backing_length = l;
    }

    /* Zero all sectors if reading beyond the end of the backing file */
    if (pos >= backing_length ||
        pos + qiov->size > backing_length) {
880
        qemu_iovec_memset(qiov, 0, 0, qiov->size);
S
Stefan Hajnoczi 已提交
881 882 883 884
    }

    /* Complete now if there are no backing file sectors to read */
    if (pos >= backing_length) {
885
        return 0;
S
Stefan Hajnoczi 已提交
886 887 888 889 890
    }

    /* If the read straddles the end of the backing file, shorten it */
    size = MIN((uint64_t)backing_length - pos, qiov->size);

891 892 893 894 895
    assert(*backing_qiov == NULL);
    *backing_qiov = g_new(QEMUIOVector, 1);
    qemu_iovec_init(*backing_qiov, qiov->niov);
    qemu_iovec_concat(*backing_qiov, qiov, 0, size);

P
Paolo Bonzini 已提交
896
    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
897
    ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
898 899 900 901
    if (ret < 0) {
        return ret;
    }
    return 0;
S
Stefan Hajnoczi 已提交
902 903 904 905 906 907 908 909 910 911
}

/**
 * Copy data from backing file into the image
 *
 * @s:          QED state
 * @pos:        Byte position in device
 * @len:        Number of bytes
 * @offset:     Byte offset in image file
 */
912 913 914
static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
                                                   uint64_t pos, uint64_t len,
                                                   uint64_t offset)
S
Stefan Hajnoczi 已提交
915
{
916 917
    QEMUIOVector qiov;
    QEMUIOVector *backing_qiov = NULL;
918
    int ret;
S
Stefan Hajnoczi 已提交
919 920 921

    /* Skip copy entirely if there is no work to do */
    if (len == 0) {
922
        return 0;
S
Stefan Hajnoczi 已提交
923 924
    }

925
    qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
926 927 928 929 930 931 932 933 934 935 936 937

    ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);

    if (backing_qiov) {
        qemu_iovec_destroy(backing_qiov);
        g_free(backing_qiov);
        backing_qiov = NULL;
    }

    if (ret) {
        goto out;
    }
S
Stefan Hajnoczi 已提交
938

939
    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
940
    ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
941 942 943 944 945
    if (ret < 0) {
        goto out;
    }
    ret = 0;
out:
946
    qemu_vfree(qemu_iovec_buf(&qiov));
947
    return ret;
S
Stefan Hajnoczi 已提交
948 949 950 951 952 953 954 955 956
}

/**
 * Link one or more contiguous clusters into a table
 *
 * @s:              QED state
 * @table:          L2 table
 * @index:          First cluster index
 * @n:              Number of contiguous clusters
957 958 959 960
 * @cluster:        First cluster offset
 *
 * The cluster offset may be an allocated byte offset in the image file, the
 * zero cluster marker, or the unallocated cluster marker.
961 962
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
963
 */
964 965 966
static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
                                             int index, unsigned int n,
                                             uint64_t cluster)
S
Stefan Hajnoczi 已提交
967 968 969 970
{
    int i;
    for (i = index; i < index + n; i++) {
        table->offsets[i] = cluster;
971 972 973 974
        if (!qed_offset_is_unalloc_cluster(cluster) &&
            !qed_offset_is_zero_cluster(cluster)) {
            cluster += s->header.cluster_size;
        }
S
Stefan Hajnoczi 已提交
975 976 977
    }
}

978
/* Called with table_lock held.  */
979
static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
980
{
981
    BDRVQEDState *s = acb_to_s(acb);
S
Stefan Hajnoczi 已提交
982 983 984 985 986

    /* Free resources */
    qemu_iovec_destroy(&acb->cur_qiov);
    qed_unref_l2_cache_entry(acb->request.l2_table);

987 988 989 990 991 992
    /* Free the buffer we may have allocated for zero writes */
    if (acb->flags & QED_AIOCB_ZERO) {
        qemu_vfree(acb->qiov->iov[0].iov_base);
        acb->qiov->iov[0].iov_base = NULL;
    }

S
Stefan Hajnoczi 已提交
993 994 995 996 997 998
    /* Start next allocating write request waiting behind this one.  Note that
     * requests enqueue themselves when they first hit an unallocated cluster
     * but they wait until the entire request is finished before waking up the
     * next request in the queue.  This ensures that we don't cycle through
     * requests multiple times but rather finish one at a time completely.
     */
999 1000 1001
    if (acb == s->allocating_acb) {
        s->allocating_acb = NULL;
        if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
1002
            qemu_co_queue_next(&s->allocating_write_reqs);
1003 1004
        } else if (s->header.features & QED_F_NEED_CHECK) {
            qed_start_need_check_timer(s);
S
Stefan Hajnoczi 已提交
1005 1006 1007 1008 1009
        }
    }
}

/**
K
Kevin Wolf 已提交
1010
 * Update L1 table with new L2 table offset and write it out
1011 1012
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1013
 */
1014
static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
1015 1016 1017
{
    BDRVQEDState *s = acb_to_s(acb);
    CachedL2Table *l2_table = acb->request.l2_table;
1018
    uint64_t l2_offset = l2_table->offset;
1019
    int index, ret;
S
Stefan Hajnoczi 已提交
1020

K
Kevin Wolf 已提交
1021 1022 1023 1024 1025 1026
    index = qed_l1_index(s, acb->cur_pos);
    s->l1_table->offsets[index] = l2_table->offset;

    ret = qed_write_l1_table(s, index, 1);

    /* Commit the current L2 table to the cache */
S
Stefan Hajnoczi 已提交
1027 1028 1029 1030 1031
    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

    /* This is guaranteed to succeed because we just committed the entry to the
     * cache.
     */
1032
    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
S
Stefan Hajnoczi 已提交
1033 1034
    assert(acb->request.l2_table != NULL);

1035
    return ret;
S
Stefan Hajnoczi 已提交
1036 1037 1038 1039 1040
}


/**
 * Update L2 table with new cluster offsets and write them out
1041 1042
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1043
 */
1044
static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
S
Stefan Hajnoczi 已提交
1045 1046 1047
{
    BDRVQEDState *s = acb_to_s(acb);
    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1048
    int index, ret;
S
Stefan Hajnoczi 已提交
1049 1050 1051 1052 1053 1054 1055 1056

    if (need_alloc) {
        qed_unref_l2_cache_entry(acb->request.l2_table);
        acb->request.l2_table = qed_new_l2_table(s);
    }

    index = qed_l2_index(s, acb->cur_pos);
    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1057
                         offset);
S
Stefan Hajnoczi 已提交
1058 1059 1060

    if (need_alloc) {
        /* Write out the whole new L2 table */
1061
        ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1062
        if (ret) {
1063
            return ret;
1064
        }
1065
        return qed_aio_write_l1_update(acb);
S
Stefan Hajnoczi 已提交
1066 1067
    } else {
        /* Write out only the updated part of the L2 table */
1068 1069
        ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
                                 false);
1070 1071 1072
        if (ret) {
            return ret;
        }
S
Stefan Hajnoczi 已提交
1073
    }
1074
    return 0;
S
Stefan Hajnoczi 已提交
1075 1076 1077 1078
}

/**
 * Write data to the image file
1079 1080
 *
 * Called with table_lock *not* held.
S
Stefan Hajnoczi 已提交
1081
 */
1082
static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
1083 1084 1085 1086 1087
{
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos);

1088
    trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
S
Stefan Hajnoczi 已提交
1089

1090
    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1091 1092
    return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
S
Stefan Hajnoczi 已提交
1093 1094 1095
}

/**
1096
 * Populate untouched regions of new data cluster
1097 1098
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1099
 */
1100
static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
1101 1102
{
    BDRVQEDState *s = acb_to_s(acb);
1103
    uint64_t start, len, offset;
1104
    int ret;
S
Stefan Hajnoczi 已提交
1105

1106 1107
    qemu_co_mutex_unlock(&s->table_lock);

1108 1109 1110 1111 1112 1113
    /* Populate front untouched region of new data cluster */
    start = qed_start_of_cluster(s, acb->cur_pos);
    len = qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
    ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1114
    if (ret < 0) {
1115
        goto out;
S
Stefan Hajnoczi 已提交
1116 1117
    }

1118 1119 1120 1121 1122 1123
    /* Populate back untouched region of new data cluster */
    start = acb->cur_pos + acb->cur_qiov.size;
    len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
    offset = acb->cur_cluster +
             qed_offset_into_cluster(s, acb->cur_pos) +
             acb->cur_qiov.size;
S
Stefan Hajnoczi 已提交
1124

1125 1126
    trace_qed_aio_write_postfill(s, acb, start, len, offset);
    ret = qed_copy_from_backing_file(s, start, len, offset);
1127
    if (ret < 0) {
1128
        goto out;
1129
    }
1130

1131 1132
    ret = qed_aio_write_main(acb);
    if (ret < 0) {
1133
        goto out;
1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
    }

    if (s->bs->backing) {
        /*
         * Flush new data clusters before updating the L2 table
         *
         * This flush is necessary when a backing file is in use.  A crash
         * during an allocating write could result in empty clusters in the
         * image.  If the write only touched a subregion of the cluster,
         * then backing image sectors have been lost in the untouched
         * region.  The solution is to flush after writing a new data
         * cluster and before updating the L2 table.
         */
        ret = bdrv_co_flush(s->bs->file->bs);
    }

1150 1151 1152
out:
    qemu_co_mutex_lock(&s->table_lock);
    return ret;
S
Stefan Hajnoczi 已提交
1153 1154
}

1155 1156 1157 1158 1159 1160
/**
 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
 */
static bool qed_should_set_need_check(BDRVQEDState *s)
{
    /* The flush before L2 update path ensures consistency */
1161
    if (s->bs->backing) {
1162 1163 1164 1165 1166 1167
        return false;
    }

    return !(s->header.features & QED_F_NEED_CHECK);
}

S
Stefan Hajnoczi 已提交
1168 1169 1170 1171 1172 1173 1174
/**
 * Write new data cluster
 *
 * @acb:        Write request
 * @len:        Length in bytes
 *
 * This path is taken when writing to previously unallocated clusters.
1175 1176
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1177
 */
1178
static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
S
Stefan Hajnoczi 已提交
1179 1180
{
    BDRVQEDState *s = acb_to_s(acb);
1181
    int ret;
S
Stefan Hajnoczi 已提交
1182

1183
    /* Cancel timer when the first allocating request comes in */
1184
    if (s->allocating_acb == NULL) {
1185 1186 1187
        qed_cancel_need_check_timer(s);
    }

S
Stefan Hajnoczi 已提交
1188
    /* Freeze this request if another allocating write is in progress */
1189 1190
    if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
        if (s->allocating_acb != NULL) {
1191
            qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1192 1193 1194 1195
            assert(s->allocating_acb == NULL);
        }
        s->allocating_acb = acb;
        return -EAGAIN; /* start over with looking up table entries */
S
Stefan Hajnoczi 已提交
1196 1197 1198 1199
    }

    acb->cur_nclusters = qed_bytes_to_clusters(s,
            qed_offset_into_cluster(s, acb->cur_pos) + len);
1200
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
S
Stefan Hajnoczi 已提交
1201

1202 1203 1204
    if (acb->flags & QED_AIOCB_ZERO) {
        /* Skip ahead if the clusters are already zero */
        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1205
            return 0;
1206
        }
1207
        acb->cur_cluster = 1;
1208 1209 1210 1211
    } else {
        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
    }

1212 1213
    if (qed_should_set_need_check(s)) {
        s->header.features |= QED_F_NEED_CHECK;
1214
        ret = qed_write_header(s);
1215
        if (ret < 0) {
1216
            return ret;
1217 1218 1219
        }
    }

1220
    if (!(acb->flags & QED_AIOCB_ZERO)) {
1221
        ret = qed_aio_write_cow(acb);
1222 1223 1224
        if (ret < 0) {
            return ret;
        }
S
Stefan Hajnoczi 已提交
1225
    }
1226 1227

    return qed_aio_write_l2_update(acb, acb->cur_cluster);
S
Stefan Hajnoczi 已提交
1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
}

/**
 * Write data cluster in place
 *
 * @acb:        Write request
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
 *
 * This path is taken when writing to already allocated clusters.
1238 1239
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1240
 */
1241 1242
static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
                                              size_t len)
S
Stefan Hajnoczi 已提交
1243
{
1244 1245 1246 1247 1248
    BDRVQEDState *s = acb_to_s(acb);
    int r;

    qemu_co_mutex_unlock(&s->table_lock);

1249 1250 1251 1252 1253
    /* Allocate buffer for zero writes */
    if (acb->flags & QED_AIOCB_ZERO) {
        struct iovec *iov = acb->qiov->iov;

        if (!iov->iov_base) {
K
Kevin Wolf 已提交
1254
            iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1255
            if (iov->iov_base == NULL) {
1256 1257
                r = -ENOMEM;
                goto out;
1258
            }
1259 1260 1261 1262
            memset(iov->iov_base, 0, iov->iov_len);
        }
    }

S
Stefan Hajnoczi 已提交
1263 1264
    /* Calculate the I/O vector */
    acb->cur_cluster = offset;
1265
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
S
Stefan Hajnoczi 已提交
1266

1267 1268 1269 1270 1271
    /* Do the actual write.  */
    r = qed_aio_write_main(acb);
out:
    qemu_co_mutex_lock(&s->table_lock);
    return r;
S
Stefan Hajnoczi 已提交
1272 1273 1274 1275 1276 1277
}

/**
 * Write data cluster
 *
 * @opaque:     Write request
1278
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
S
Stefan Hajnoczi 已提交
1279 1280
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
1281 1282
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1283
 */
1284 1285
static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
                                           uint64_t offset, size_t len)
S
Stefan Hajnoczi 已提交
1286 1287 1288 1289 1290 1291 1292 1293 1294
{
    QEDAIOCB *acb = opaque;

    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);

    acb->find_cluster_ret = ret;

    switch (ret) {
    case QED_CLUSTER_FOUND:
1295
        return qed_aio_write_inplace(acb, offset, len);
S
Stefan Hajnoczi 已提交
1296 1297 1298

    case QED_CLUSTER_L2:
    case QED_CLUSTER_L1:
1299
    case QED_CLUSTER_ZERO:
1300
        return qed_aio_write_alloc(acb, len);
S
Stefan Hajnoczi 已提交
1301 1302

    default:
1303
        g_assert_not_reached();
1304
    }
S
Stefan Hajnoczi 已提交
1305 1306 1307 1308 1309 1310
}

/**
 * Read data cluster
 *
 * @opaque:     Read request
1311
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
S
Stefan Hajnoczi 已提交
1312 1313
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
1314 1315
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1316
 */
1317 1318
static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
                                          uint64_t offset, size_t len)
S
Stefan Hajnoczi 已提交
1319 1320 1321
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
K
Kevin Wolf 已提交
1322
    BlockDriverState *bs = acb->bs;
1323 1324 1325
    int r;

    qemu_co_mutex_unlock(&s->table_lock);
S
Stefan Hajnoczi 已提交
1326 1327 1328 1329 1330 1331

    /* Adjust offset into cluster */
    offset += qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_read_data(s, acb, ret, offset, len);

1332
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
S
Stefan Hajnoczi 已提交
1333

1334 1335 1336
    /* Handle zero cluster and backing file reads, otherwise read
     * data cluster directly.
     */
1337
    if (ret == QED_CLUSTER_ZERO) {
1338
        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1339
        r = 0;
1340
    } else if (ret != QED_CLUSTER_FOUND) {
1341 1342 1343 1344 1345 1346
        r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
                                  &acb->backing_qiov);
    } else {
        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
        r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
S
Stefan Hajnoczi 已提交
1347 1348
    }

1349 1350
    qemu_co_mutex_lock(&s->table_lock);
    return r;
S
Stefan Hajnoczi 已提交
1351 1352 1353 1354 1355
}

/**
 * Begin next I/O or complete the request
 */
1356
static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
1357 1358
{
    BDRVQEDState *s = acb_to_s(acb);
1359 1360
    uint64_t offset;
    size_t len;
1361
    int ret;
S
Stefan Hajnoczi 已提交
1362

1363
    qemu_co_mutex_lock(&s->table_lock);
1364 1365
    while (1) {
        trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
S
Stefan Hajnoczi 已提交
1366

1367 1368 1369 1370 1371
        if (acb->backing_qiov) {
            qemu_iovec_destroy(acb->backing_qiov);
            g_free(acb->backing_qiov);
            acb->backing_qiov = NULL;
        }
1372

1373 1374 1375
        acb->qiov_offset += acb->cur_qiov.size;
        acb->cur_pos += acb->cur_qiov.size;
        qemu_iovec_reset(&acb->cur_qiov);
S
Stefan Hajnoczi 已提交
1376

1377 1378
        /* Complete request */
        if (acb->cur_pos >= acb->end_pos) {
K
Kevin Wolf 已提交
1379 1380
            ret = 0;
            break;
1381
        }
S
Stefan Hajnoczi 已提交
1382

1383 1384 1385 1386
        /* Find next cluster and start I/O */
        len = acb->end_pos - acb->cur_pos;
        ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
        if (ret < 0) {
K
Kevin Wolf 已提交
1387
            break;
1388
        }
1389

1390 1391 1392 1393 1394
        if (acb->flags & QED_AIOCB_WRITE) {
            ret = qed_aio_write_data(acb, ret, offset, len);
        } else {
            ret = qed_aio_read_data(acb, ret, offset, len);
        }
1395

1396
        if (ret < 0 && ret != -EAGAIN) {
K
Kevin Wolf 已提交
1397
            break;
1398 1399
        }
    }
S
Stefan Hajnoczi 已提交
1400

K
Kevin Wolf 已提交
1401 1402
    trace_qed_aio_complete(s, acb, ret);
    qed_aio_complete(acb);
1403
    qemu_co_mutex_unlock(&s->table_lock);
K
Kevin Wolf 已提交
1404
    return ret;
1405 1406 1407 1408 1409 1410
}

static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
                                       QEMUIOVector *qiov, int nb_sectors,
                                       int flags)
{
K
Kevin Wolf 已提交
1411 1412 1413 1414 1415 1416
    QEDAIOCB acb = {
        .bs         = bs,
        .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
        .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
        .qiov       = qiov,
        .flags      = flags,
1417
    };
K
Kevin Wolf 已提交
1418
    qemu_iovec_init(&acb.cur_qiov, qiov->niov);
S
Stefan Hajnoczi 已提交
1419

K
Kevin Wolf 已提交
1420
    trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
S
Stefan Hajnoczi 已提交
1421 1422

    /* Start request */
K
Kevin Wolf 已提交
1423
    return qed_aio_next_io(&acb);
1424 1425
}

1426 1427 1428
static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *qiov)
1429
{
1430
    return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1431 1432
}

1433 1434
static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
                                           int64_t sector_num, int nb_sectors,
1435
                                           QEMUIOVector *qiov, int flags)
1436
{
1437
    assert(!flags);
1438
    return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1439 1440
}

1441 1442
static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
                                                  int64_t offset,
1443
                                                  int bytes,
1444
                                                  BdrvRequestFlags flags)
1445
{
1446
    BDRVQEDState *s = bs->opaque;
1447 1448 1449 1450 1451 1452

    /*
     * Zero writes start without an I/O buffer.  If a buffer becomes necessary
     * then it will be allocated during request processing.
     */
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1453

1454 1455
    /* Fall back if the request is not aligned */
    if (qed_offset_into_cluster(s, offset) ||
1456
        qed_offset_into_cluster(s, bytes)) {
1457
        return -ENOTSUP;
1458 1459
    }

1460
    return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1461
                          bytes >> BDRV_SECTOR_BITS,
1462
                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1463 1464
}

1465 1466
static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
                                             int64_t offset,
1467
                                             bool exact,
1468 1469
                                             PreallocMode prealloc,
                                             Error **errp)
1470
{
1471 1472 1473 1474
    BDRVQEDState *s = bs->opaque;
    uint64_t old_image_size;
    int ret;

1475 1476
    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
1477
                   PreallocMode_str(prealloc));
1478 1479 1480
        return -ENOTSUP;
    }

1481 1482
    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
                                 s->header.table_size)) {
1483
        error_setg(errp, "Invalid image size specified");
1484 1485 1486 1487
        return -EINVAL;
    }

    if ((uint64_t)offset < s->header.image_size) {
1488
        error_setg(errp, "Shrinking images is currently not supported");
1489 1490 1491 1492 1493 1494 1495 1496
        return -ENOTSUP;
    }

    old_image_size = s->header.image_size;
    s->header.image_size = offset;
    ret = qed_write_header_sync(s);
    if (ret < 0) {
        s->header.image_size = old_image_size;
1497
        error_setg_errno(errp, -ret, "Failed to update the image size");
1498 1499
    }
    return ret;
1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513
}

static int64_t bdrv_qed_getlength(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;
    return s->header.image_size;
}

static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQEDState *s = bs->opaque;

    memset(bdi, 0, sizeof(*bdi));
    bdi->cluster_size = s->header.cluster_size;
D
Dong Xu Wang 已提交
1514
    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1515
    bdi->unallocated_blocks_are_zero = true;
1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570
    return 0;
}

static int bdrv_qed_change_backing_file(BlockDriverState *bs,
                                        const char *backing_file,
                                        const char *backing_fmt)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader new_header, le_header;
    void *buffer;
    size_t buffer_len, backing_file_len;
    int ret;

    /* Refuse to set backing filename if unknown compat feature bits are
     * active.  If the image uses an unknown compat feature then we may not
     * know the layout of data following the header structure and cannot safely
     * add a new string.
     */
    if (backing_file && (s->header.compat_features &
                         ~QED_COMPAT_FEATURE_MASK)) {
        return -ENOTSUP;
    }

    memcpy(&new_header, &s->header, sizeof(new_header));

    new_header.features &= ~(QED_F_BACKING_FILE |
                             QED_F_BACKING_FORMAT_NO_PROBE);

    /* Adjust feature flags */
    if (backing_file) {
        new_header.features |= QED_F_BACKING_FILE;

        if (qed_fmt_is_raw(backing_fmt)) {
            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    /* Calculate new header size */
    backing_file_len = 0;

    if (backing_file) {
        backing_file_len = strlen(backing_file);
    }

    buffer_len = sizeof(new_header);
    new_header.backing_filename_offset = buffer_len;
    new_header.backing_filename_size = backing_file_len;
    buffer_len += backing_file_len;

    /* Make sure we can rewrite header without failing */
    if (buffer_len > new_header.header_size * new_header.cluster_size) {
        return -ENOSPC;
    }

    /* Prepare new header */
1571
    buffer = g_malloc(buffer_len);
1572 1573 1574 1575 1576

    qed_header_cpu_to_le(&new_header, &le_header);
    memcpy(buffer, &le_header, sizeof(le_header));
    buffer_len = sizeof(le_header);

P
Pavel Borzenkov 已提交
1577 1578 1579 1580
    if (backing_file) {
        memcpy(buffer + buffer_len, backing_file, backing_file_len);
        buffer_len += backing_file_len;
    }
1581 1582

    /* Write new header */
1583
    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
1584
    g_free(buffer);
1585 1586 1587 1588 1589 1590
    if (ret == 0) {
        memcpy(&s->header, &new_header, sizeof(new_header));
    }
    return ret;
}

1591 1592
static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
                                                      Error **errp)
1593
{
1594
    BDRVQEDState *s = bs->opaque;
1595 1596
    Error *local_err = NULL;
    int ret;
1597 1598

    bdrv_qed_close(bs);
1599

1600
    bdrv_qed_init_state(bs);
1601
    qemu_co_mutex_lock(&s->table_lock);
1602
    ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
1603
    qemu_co_mutex_unlock(&s->table_lock);
1604
    if (local_err) {
1605 1606
        error_propagate_prepend(errp, local_err,
                                "Could not reopen qed layer: ");
1607 1608 1609 1610 1611
        return;
    } else if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not reopen qed layer");
        return;
    }
1612 1613
}

1614 1615 1616
static int coroutine_fn bdrv_qed_co_check(BlockDriverState *bs,
                                          BdrvCheckResult *result,
                                          BdrvCheckMode fix)
1617
{
S
Stefan Hajnoczi 已提交
1618
    BDRVQEDState *s = bs->opaque;
1619
    int ret;
S
Stefan Hajnoczi 已提交
1620

1621 1622 1623 1624 1625
    qemu_co_mutex_lock(&s->table_lock);
    ret = qed_check(s, result, !!fix);
    qemu_co_mutex_unlock(&s->table_lock);

    return ret;
1626 1627
}

1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659
static QemuOptsList qed_create_opts = {
    .name = "qed-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image"
        },
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Cluster size (in bytes)",
            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
        },
        {
            .name = BLOCK_OPT_TABLE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "L1/L2 table size (in clusters)"
        },
        { /* end of list */ }
    }
1660 1661 1662 1663 1664
};

static BlockDriver bdrv_qed = {
    .format_name              = "qed",
    .instance_size            = sizeof(BDRVQEDState),
1665
    .create_opts              = &qed_create_opts,
1666
    .supports_backing         = true,
1667 1668 1669 1670

    .bdrv_probe               = bdrv_qed_probe,
    .bdrv_open                = bdrv_qed_open,
    .bdrv_close               = bdrv_qed_close,
J
Jeff Cody 已提交
1671
    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1672
    .bdrv_child_perm          = bdrv_format_default_perms,
K
Kevin Wolf 已提交
1673
    .bdrv_co_create           = bdrv_qed_co_create,
1674
    .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
1675
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1676
    .bdrv_has_zero_init_truncate = bdrv_has_zero_init_1,
1677
    .bdrv_co_block_status     = bdrv_qed_co_block_status,
1678 1679
    .bdrv_co_readv            = bdrv_qed_co_readv,
    .bdrv_co_writev           = bdrv_qed_co_writev,
1680
    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
1681
    .bdrv_co_truncate         = bdrv_qed_co_truncate,
1682 1683
    .bdrv_getlength           = bdrv_qed_getlength,
    .bdrv_get_info            = bdrv_qed_get_info,
1684
    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1685
    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1686
    .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
1687
    .bdrv_co_check            = bdrv_qed_co_check,
1688 1689
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1690
    .bdrv_co_drain_begin      = bdrv_qed_co_drain_begin,
1691 1692 1693 1694 1695 1696 1697 1698
};

static void bdrv_qed_init(void)
{
    bdrv_register(&bdrv_qed);
}

block_init(bdrv_qed_init);