qed.c 46.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 * QEMU Enhanced Disk Format
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

P
Peter Maydell 已提交
15
#include "qemu/osdep.h"
16
#include "qapi/error.h"
17
#include "qemu/timer.h"
18
#include "qemu/bswap.h"
S
Stefan Hajnoczi 已提交
19
#include "trace.h"
20
#include "qed.h"
21
#include "qapi/qmp/qerror.h"
22
#include "sysemu/block-backend.h"
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77

/**
 * Probe callback: score how likely a buffer is a QED image
 *
 * Returns 100 when the buffer is large enough to hold a header and the
 * magic matches, otherwise 0.  @filename is unused.
 */
static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
                          const char *filename)
{
    const QEDHeader *header = (const QEDHeader *)buf;

    /* Too short to contain a header, or wrong magic: not QED.
     * Short-circuit keeps us from reading past the buffer. */
    if (buf_size < sizeof(*header) ||
        le32_to_cpu(header->magic) != QED_MAGIC) {
        return 0;
    }
    return 100;
}

/**
 * Check whether an image format is raw
 *
 * @fmt:    Backing file format, may be NULL
 *
 * Returns true only for the exact string "raw".
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    if (fmt == NULL) {
        return false;
    }
    return strcmp(fmt, "raw") == 0;
}

/**
 * Convert an on-disk little-endian QED header to native CPU byte order
 *
 * @le:     Source header in little-endian (on-disk) byte order
 * @cpu:    Destination header in native byte order
 *
 * Converts every known header field; @le and @cpu may not alias safely
 * field-by-field, so callers pass distinct structs.
 */
static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
{
    cpu->magic = le32_to_cpu(le->magic);
    cpu->cluster_size = le32_to_cpu(le->cluster_size);
    cpu->table_size = le32_to_cpu(le->table_size);
    cpu->header_size = le32_to_cpu(le->header_size);
    cpu->features = le64_to_cpu(le->features);
    cpu->compat_features = le64_to_cpu(le->compat_features);
    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
    cpu->image_size = le64_to_cpu(le->image_size);
    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
}

/**
 * Convert a native CPU byte order QED header to on-disk little-endian order
 *
 * @cpu:    Source header in native byte order
 * @le:     Destination header in little-endian (on-disk) byte order
 *
 * Inverse of qed_header_le_to_cpu().
 */
static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
{
    le->magic = cpu_to_le32(cpu->magic);
    le->cluster_size = cpu_to_le32(cpu->cluster_size);
    le->table_size = cpu_to_le32(cpu->table_size);
    le->header_size = cpu_to_le32(cpu->header_size);
    le->features = cpu_to_le64(cpu->features);
    le->compat_features = cpu_to_le64(cpu->compat_features);
    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
    le->image_size = cpu_to_le64(cpu->image_size);
    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
}

78
/**
 * Synchronously write the QED header at offset 0 of the image file
 *
 * The in-memory header is converted to little-endian byte order before
 * writing.  Returns 0 on success, negative errno on failure.
 */
int qed_write_header_sync(BDRVQEDState *s)
{
    QEDHeader le;
    int ret;

    qed_header_cpu_to_le(&s->header, &le);
    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
    /* A short/failed write returns its value; a full write maps to 0 */
    if (ret != sizeof(le)) {
        return ret;
    }
    return 0;
}

S
Stefan Hajnoczi 已提交
91 92 93 94 95
/**
 * Update header in-place (does not rewrite backing filename or other strings)
 *
 * This function only updates known header fields in-place and does not affect
 * extra data after the QED header.
 *
 * No new allocating reqs can start while this function runs.
 */
static int coroutine_fn qed_write_header(BDRVQEDState *s)
{
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
     * the data following the header if an unrecognized compat feature is
     * active.  Therefore, first read the sectors containing the header, update
     * them, and write back.
     */
    int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
    size_t len = nsectors * BDRV_SECTOR_SIZE;
    uint8_t *buf;
    struct iovec iov;
    QEMUIOVector qiov;
    int ret;

    /* Caller must have serialized allocating writes (see
     * qed_plug_allocating_write_reqs) before mutating the header. */
    assert(s->allocating_acb || s->allocating_write_reqs_plugged);

    buf = qemu_blockalign(s->bs, len);
    iov = (struct iovec) {
        .iov_base = buf,
        .iov_len = len,
    };
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Read the whole header sector(s) so unknown trailing data survives */
    ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    /* Update header */
    qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);

    ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size,  &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(buf);
    return ret;
}

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
/**
 * Compute the maximum image size addressable by the given geometry
 *
 * @cluster_size:   Cluster size in bytes
 * @table_size:     Table size in clusters
 *
 * Returns table_entries (L1) * table_entries (L2) * cluster_size, i.e. the
 * number of bytes a fully populated L1/L2 table hierarchy can map.
 */
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    /* Widen before multiplying: table_size * cluster_size would otherwise
     * be computed in 32-bit arithmetic and could wrap for large, not yet
     * validated, header values.
     */
    table_entries = ((uint64_t)table_size * cluster_size) / sizeof(uint64_t);
    l2_size = table_entries * cluster_size;

    return l2_size * table_entries;
}

/**
 * Validate a cluster size
 *
 * Must be a power of two within [QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE].
 */
static bool qed_is_cluster_size_valid(uint32_t cluster_size)
{
    bool in_range = cluster_size >= QED_MIN_CLUSTER_SIZE &&
                    cluster_size <= QED_MAX_CLUSTER_SIZE;
    bool power_of_2 = (cluster_size & (cluster_size - 1)) == 0;

    return in_range && power_of_2;
}

/**
 * Validate a table size
 *
 * Must be a power of two within [QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE].
 */
static bool qed_is_table_size_valid(uint32_t table_size)
{
    bool in_range = table_size >= QED_MIN_TABLE_SIZE &&
                    table_size <= QED_MAX_TABLE_SIZE;
    bool power_of_2 = (table_size & (table_size - 1)) == 0;

    return in_range && power_of_2;
}

/**
 * Validate an image size against the given cluster/table geometry
 *
 * The size must be sector-aligned and must not exceed the maximum image
 * size addressable with this geometry.
 */
static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
                                    uint32_t table_size)
{
    return image_size % BDRV_SECTOR_SIZE == 0 &&
           image_size <= qed_max_image_size(cluster_size, table_size);
}

/**
 * Read a string of known length from the image file
 *
 * @file:       Image file
 * @offset:     File offset to start of string, in bytes
 * @n:          String length in bytes
 * @buf:        Destination buffer
 * @buflen:     Destination buffer length in bytes
 * @ret:        0 on success, -errno on failure
 *
 * The string is NUL-terminated.
 */
static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
                           char *buf, size_t buflen)
{
    int ret;
    /* Reject lengths that leave no room for the NUL terminator */
    if (n >= buflen) {
        return -EINVAL;
    }
    ret = bdrv_pread(file, offset, buf, n);
    if (ret < 0) {
        return ret;
    }
    buf[n] = '\0';
    return 0;
}

S
Stefan Hajnoczi 已提交
216 217 218 219 220 221 222 223 224 225
/**
 * Allocate new clusters
 *
 * @s:          QED state
 * @n:          Number of contiguous clusters to allocate
 * @ret:        Offset of first allocated cluster
 *
 * This function only produces the offset where the new clusters should be
 * written.  It updates BDRVQEDState but does not make any changes to the image
 * file.
 *
 * Called with table_lock held.
 */
static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
{
    /* Clusters are always allocated at the current end of file */
    uint64_t offset = s->file_size;
    s->file_size += n * s->header.cluster_size;
    return offset;
}

236 237 238 239 240 241 242
/**
 * Allocate a buffer sized for one L1/L2 table
 *
 * Honors O_DIRECT memory alignment requirements.
 */
QEDTable *qed_alloc_table(BDRVQEDState *s)
{
    size_t table_bytes = s->header.cluster_size * s->header.table_size;

    return qemu_blockalign(s->bs, table_bytes);
}

S
Stefan Hajnoczi 已提交
243 244
/**
 * Allocate a new zeroed L2 table
 *
 * Allocates clusters for the table in the image file and a cache entry for
 * it; the table contents are zeroed in memory but not yet written out.
 *
 * Called with table_lock held.
 */
static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
{
    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);

    l2_table->table = qed_alloc_table(s);
    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);

    memset(l2_table->table->offsets, 0,
           s->header.cluster_size * s->header.table_size);
    return l2_table;
}

260
/**
 * Stop new allocating write requests from starting
 *
 * Returns true if requests were successfully plugged; returns false when an
 * allocating write is already in flight, in which case nothing was plugged.
 */
static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
{
    qemu_co_mutex_lock(&s->table_lock);

    /* No reentrancy is allowed.  */
    assert(!s->allocating_write_reqs_plugged);
    if (s->allocating_acb != NULL) {
        /* Another allocating write came concurrently.  This cannot happen
         * from bdrv_qed_co_drain, but it can happen when the timer runs.
         */
        qemu_co_mutex_unlock(&s->table_lock);
        return false;
    }

    s->allocating_write_reqs_plugged = true;
    qemu_co_mutex_unlock(&s->table_lock);
    return true;
}

/**
 * Allow allocating write requests to proceed again
 *
 * Counterpart of qed_plug_allocating_write_reqs(); wakes the next queued
 * allocating write, if any.
 */
static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
{
    qemu_co_mutex_lock(&s->table_lock);
    assert(s->allocating_write_reqs_plugged);
    s->allocating_write_reqs_plugged = false;
    qemu_co_queue_next(&s->allocating_write_reqs);
    qemu_co_mutex_unlock(&s->table_lock);
}

288
/**
 * Coroutine body for the need-check timer: clear QED_F_NEED_CHECK
 *
 * Once the image has been idle long enough, flush outstanding writes and
 * rewrite the header without the need-check flag so the next open does not
 * trigger a consistency check.
 */
static void coroutine_fn qed_need_check_timer_entry(void *opaque)
{
    BDRVQEDState *s = opaque;
    int ret;

    trace_qed_need_check_timer_cb(s);

    /* Bail out if an allocating write is in flight; the flag stays set */
    if (!qed_plug_allocating_write_reqs(s)) {
        return;
    }

    /* Ensure writes are on disk before clearing flag */
    ret = bdrv_co_flush(s->bs->file->bs);
    if (ret < 0) {
        qed_unplug_allocating_write_reqs(s);
        return;
    }

    s->header.features &= ~QED_F_NEED_CHECK;
    /* Header write failure is ignored here; the flag simply stays set on
     * disk and the check runs again on next open. */
    ret = qed_write_header(s);
    (void) ret;

    qed_unplug_allocating_write_reqs(s);

    ret = bdrv_co_flush(s->bs);
    (void) ret;
}

static void qed_need_check_timer_cb(void *opaque)
{
318 319
    Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
    qemu_coroutine_enter(co);
320 321
}

322 323 324 325
/**
 * Arm the timer that eventually clears QED_F_NEED_CHECK
 *
 * Fires QED_NEED_CHECK_TIMEOUT seconds from now.
 */
static void qed_start_need_check_timer(BDRVQEDState *s)
{
    trace_qed_start_need_check_timer(s);

    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
     * migration.
     */
    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
}

/* It's okay to call this multiple times or when no timer is started */
static void qed_cancel_need_check_timer(BDRVQEDState *s)
{
    trace_qed_cancel_need_check_timer(s);
    timer_del(s->need_check_timer);
}

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
/**
 * Detach from the current AioContext: stop and free the need-check timer
 */
static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    qed_cancel_need_check_timer(s);
    timer_free(s->need_check_timer);
}

/**
 * Attach to a new AioContext: recreate the need-check timer there
 *
 * If the image still has the need-check flag set, re-arm the timer so the
 * flag gets cleared after an idle period.
 */
static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
                                        AioContext *new_context)
{
    BDRVQEDState *s = bs->opaque;

    s->need_check_timer = aio_timer_new(new_context,
                                        QEMU_CLOCK_VIRTUAL, SCALE_NS,
                                        qed_need_check_timer_cb, s);
    if (s->header.features & QED_F_NEED_CHECK) {
        qed_start_need_check_timer(s);
    }
}

361
/**
 * Drain callback: complete pending need-check work immediately
 */
static void coroutine_fn bdrv_qed_co_drain(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    /* Fire the timer immediately in order to start doing I/O as soon as the
     * header is flushed.
     */
    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
        qed_cancel_need_check_timer(s);
        qed_need_check_timer_entry(s);
    }
}

374 375 376 377 378 379
/**
 * Zero-initialize the driver state and its synchronization primitives
 *
 * Must run before bdrv_qed_do_open() so the lock and queue are usable.
 */
static void bdrv_qed_init_state(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    memset(s, 0, sizeof(BDRVQEDState));
    s->bs = bs;
    qemu_co_mutex_init(&s->table_lock);
    qemu_co_queue_init(&s->allocating_write_reqs);
}

384 385
/**
 * Open a QED image whose "file" child is already attached
 *
 * Reads and validates the header, handles unknown autoclear feature bits,
 * loads the L1 table, and runs a consistency check if the image was not
 * closed cleanly.  Returns 0 on success, negative errno on failure.
 */
static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
                            Error **errp)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader le_header;
    int64_t file_size;
    int ret;

    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        return ret;
    }
    qed_header_le_to_cpu(&le_header, &s->header);

    if (s->header.magic != QED_MAGIC) {
        error_setg(errp, "Image not in QED format");
        return -EINVAL;
    }
    if (s->header.features & ~QED_FEATURE_MASK) {
        /* image uses unsupported feature bits */
        error_setg(errp, "Unsupported QED features: %" PRIx64,
                   s->header.features & ~QED_FEATURE_MASK);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
        return -EINVAL;
    }

    /* Round down file size to the last cluster */
    file_size = bdrv_getlength(bs->file->bs);
    if (file_size < 0) {
        return file_size;
    }
    s->file_size = qed_start_of_cluster(s, file_size);

    if (!qed_is_table_size_valid(s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(s->header.image_size,
                                 s->header.cluster_size,
                                 s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
        return -EINVAL;
    }

    /* Cache derived geometry for fast L1/L2 index calculation */
    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
                      sizeof(uint64_t);
    s->l2_shift = ctz32(s->header.cluster_size);
    s->l2_mask = s->table_nelems - 1;
    s->l1_shift = s->l2_shift + ctz32(s->table_nelems);

    /* Header size calculation must not overflow uint32_t */
    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
        return -EINVAL;
    }

    if ((s->header.features & QED_F_BACKING_FILE)) {
        /* Backing filename must lie entirely within the header area */
        if ((uint64_t)s->header.backing_filename_offset +
            s->header.backing_filename_size >
            s->header.cluster_size * s->header.header_size) {
            return -EINVAL;
        }

        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
                              s->header.backing_filename_size, bs->backing_file,
                              sizeof(bs->backing_file));
        if (ret < 0) {
            return ret;
        }

        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
        }
    }

    /* Reset unknown autoclear feature bits.  This is a backwards
     * compatibility mechanism that allows images to be opened by older
     * programs, which "knock out" unknown feature bits.  When an image is
     * opened by a newer program again it can detect that the autoclear
     * feature is no longer valid.
     */
    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
        !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;

        ret = qed_write_header_sync(s);
        if (ret) {
            return ret;
        }

        /* From here on only known autoclear feature bits are valid */
        bdrv_flush(bs->file->bs);
    }

    s->l1_table = qed_alloc_table(s);
    qed_init_l2_cache(&s->l2_cache);

    ret = qed_read_l1_table_sync(s);
    if (ret) {
        goto out;
    }

    /* If image was not closed cleanly, check consistency */
    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
        /* Read-only images cannot be fixed.  There is no risk of corruption
         * since write operations are not possible.  Therefore, allow
         * potentially inconsistent images to be opened read-only.  This can
         * aid data recovery from an otherwise inconsistent image.
         */
        if (!bdrv_is_read_only(bs->file->bs) &&
            !(flags & BDRV_O_INACTIVE)) {
            BdrvCheckResult result = {0};

            ret = qed_check(s, &result, true);
            if (ret) {
                goto out;
            }
        }
    }

    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));

out:
    if (ret) {
        qed_free_l2_cache(&s->l2_cache);
        qemu_vfree(s->l1_table);
    }
    return ret;
}

516 517 518 519 520 521 522 523 524
/**
 * BlockDriver .bdrv_open callback: attach the file child then open the image
 */
static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
{
    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

    bdrv_qed_init_state(bs);
    return bdrv_qed_do_open(bs, options, flags, errp);
}

529
/**
 * Report driver I/O limits: zero writes must be cluster aligned
 */
static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQEDState *s = bs->opaque;

    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
}

J
Jeff Cody 已提交
536 537 538 539 540 541 542 543
/* We have nothing to do for QED reopen, stubs just return
 * success */
static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
                                   BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

544 545
/**
 * Close the image: flush, mark the image clean, and free driver state
 */
static void bdrv_qed_close(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    bdrv_qed_detach_aio_context(bs);

    /* Ensure writes reach stable storage */
    bdrv_flush(bs->file->bs);

    /* Clean shutdown, no check required on next open */
    if (s->header.features & QED_F_NEED_CHECK) {
        s->header.features &= ~QED_F_NEED_CHECK;
        qed_write_header_sync(s);
    }

    qed_free_l2_cache(&s->l2_cache);
    qemu_vfree(s->l1_table);
}

/**
 * Create a new QED image file
 *
 * @filename:       Path of the image file to create
 * @cluster_size:   Cluster size in bytes (already validated by caller)
 * @image_size:     Virtual disk size in bytes (already validated)
 * @table_size:     Table size in clusters (already validated)
 * @backing_file:   Backing filename, may be NULL
 * @backing_fmt:    Backing format name, may be NULL
 * @opts:           Creation options for the protocol layer
 * @errp:           Error destination
 *
 * Writes the header, optional backing filename, and a zeroed L1 table.
 * Returns 0 on success, negative errno on failure.
 */
static int qed_create(const char *filename, uint32_t cluster_size,
                      uint64_t image_size, uint32_t table_size,
                      const char *backing_file, const char *backing_fmt,
                      QemuOpts *opts, Error **errp)
{
    QEDHeader header = {
        .magic = QED_MAGIC,
        .cluster_size = cluster_size,
        .table_size = table_size,
        .header_size = 1,
        .features = 0,
        .compat_features = 0,
        .l1_table_offset = cluster_size,
        .image_size = image_size,
    };
    QEDHeader le_header;
    uint8_t *l1_table = NULL;
    size_t l1_size = header.cluster_size * header.table_size;
    Error *local_err = NULL;
    int ret = 0;
    BlockBackend *blk;

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return ret;
    }

    blk = blk_new_open(filename, NULL, NULL,
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        return -EIO;
    }

    blk_set_allow_write_beyond_eof(blk, true);

    /* File must start empty and grow, check truncate is supported */
    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
    if (ret < 0) {
        goto out;
    }

    if (backing_file) {
        header.features |= QED_F_BACKING_FILE;
        header.backing_filename_offset = sizeof(le_header);
        header.backing_filename_size = strlen(backing_file);

        if (qed_fmt_is_raw(backing_fmt)) {
            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    qed_header_cpu_to_le(&header, &le_header);
    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
    if (ret < 0) {
        goto out;
    }
    /* With no backing file this is a zero-length write */
    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
                     header.backing_filename_size, 0);
    if (ret < 0) {
        goto out;
    }

    l1_table = g_malloc0(l1_size);
    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
    if (ret < 0) {
        goto out;
    }

    ret = 0; /* success */
out:
    g_free(l1_table);
    blk_unref(blk);
    return ret;
}

641
/**
 * BlockDriver .bdrv_create callback: parse options, validate, and create
 *
 * Extracts size/backing/cluster/table options, validates them, then
 * delegates to qed_create().  Returns 0 on success, negative errno on
 * failure.
 */
static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
{
    uint64_t image_size = 0;
    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
    char *backing_file = NULL;
    char *backing_fmt = NULL;
    int ret;

    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
    cluster_size = qemu_opt_get_size_del(opts,
                                         BLOCK_OPT_CLUSTER_SIZE,
                                         QED_DEFAULT_CLUSTER_SIZE);
    table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE,
                                       QED_DEFAULT_TABLE_SIZE);

    if (!qed_is_cluster_size_valid(cluster_size)) {
        error_setg(errp, "QED cluster size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
        ret = -EINVAL;
        goto finish;
    }
    if (!qed_is_table_size_valid(table_size)) {
        error_setg(errp, "QED table size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
        ret = -EINVAL;
        goto finish;
    }
    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
        error_setg(errp, "QED image size must be a non-zero multiple of "
                         "cluster size and less than %" PRIu64 " bytes",
                   qed_max_image_size(cluster_size, table_size));
        ret = -EINVAL;
        goto finish;
    }

    ret = qed_create(filename, cluster_size, image_size, table_size,
                     backing_file, backing_fmt, opts, errp);

finish:
    g_free(backing_file);
    g_free(backing_fmt);
    return ret;
}

691
/* State shared between bdrv_qed_co_get_block_status and its callback */
typedef struct {
    BlockDriverState *bs;    /* device being queried */
    Coroutine *co;           /* coroutine to wake when the callback runs */
    uint64_t pos;            /* byte position of the query */
    int64_t status;          /* BDRV_BLOCK_* result or negative errno;
                              * BDRV_BLOCK_OFFSET_MASK means "not set yet" */
    int *pnum;               /* output: number of sectors covered */
    BlockDriverState **file; /* output: BDS holding the data, if any */
} QEDIsAllocatedCB;

700
/* Called with table_lock held.  */
static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
{
    QEDIsAllocatedCB *cb = opaque;
    BDRVQEDState *s = cb->bs->opaque;
    *cb->pnum = len / BDRV_SECTOR_SIZE;
    /* Translate the cluster lookup result into BDRV_BLOCK_* status bits */
    switch (ret) {
    case QED_CLUSTER_FOUND:
        offset |= qed_offset_into_cluster(s, cb->pos);
        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
        *cb->file = cb->bs->file->bs;
        break;
    case QED_CLUSTER_ZERO:
        cb->status = BDRV_BLOCK_ZERO;
        break;
    case QED_CLUSTER_L2:
    case QED_CLUSTER_L1:
        /* Unallocated in this image (may be present in the backing file) */
        cb->status = 0;
        break;
    default:
        assert(ret < 0);
        cb->status = ret;
        break;
    }

    if (cb->co) {
        aio_co_wake(cb->co);
    }
}

730
/**
 * BlockDriver .bdrv_co_get_block_status callback
 *
 * Looks up the cluster containing @sector_num and reports its allocation
 * status; *pnum receives the number of contiguous sectors with that status.
 */
static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
                                                 int64_t sector_num,
                                                 int nb_sectors, int *pnum,
                                                 BlockDriverState **file)
{
    BDRVQEDState *s = bs->opaque;
    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
    QEDIsAllocatedCB cb = {
        .bs = bs,
        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
        .status = BDRV_BLOCK_OFFSET_MASK,
        .pnum = pnum,
        .file = file,
    };
    QEDRequest request = { .l2_table = NULL };
    uint64_t offset;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
    ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
    qed_is_allocated_cb(&cb, ret, offset, len);

    /* The callback was invoked immediately */
    assert(cb.status != BDRV_BLOCK_OFFSET_MASK);

    qed_unref_l2_cache_entry(request.l2_table);
    qemu_co_mutex_unlock(&s->table_lock);

    return cb.status;
}

S
Stefan Hajnoczi 已提交
761 762
/* Map an AIO control block to the driver state of its BDS */
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
{
    return acb->bs->opaque;
}

/**
 * Read from the backing file or zero-fill if no backing file
 *
 * @s:              QED state
 * @pos:            Byte position in device
 * @qiov:           Destination I/O vector
 * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here
 *
 * This function reads qiov->size bytes starting at pos from the backing file.
 * If there is no backing file then zeroes are read.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                              QEMUIOVector *qiov,
                                              QEMUIOVector **backing_qiov)
{
    uint64_t backing_length = 0;
    size_t size;
    int ret;

    /* If there is a backing file, get its length.  Treat the absence of a
     * backing file like a zero length backing file.
     */
    if (s->bs->backing) {
        int64_t l = bdrv_getlength(s->bs->backing->bs);
        if (l < 0) {
            return l;
        }
        backing_length = l;
    }

    /* Zero all sectors if reading beyond the end of the backing file */
    if (pos >= backing_length ||
        pos + qiov->size > backing_length) {
        qemu_iovec_memset(qiov, 0, 0, qiov->size);
    }

    /* Complete now if there are no backing file sectors to read */
    if (pos >= backing_length) {
        return 0;
    }

    /* If the read straddles the end of the backing file, shorten it */
    size = MIN((uint64_t)backing_length - pos, qiov->size);

    /* Caller owns *backing_qiov and must destroy/free it afterwards */
    assert(*backing_qiov == NULL);
    *backing_qiov = g_new(QEMUIOVector, 1);
    qemu_iovec_init(*backing_qiov, qiov->niov);
    qemu_iovec_concat(*backing_qiov, qiov, 0, size);

    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
    ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

/**
 * Copy data from backing file into the image
 *
 * @s:          QED state
 * @pos:        Byte position in device
 * @len:        Number of bytes
 * @offset:     Byte offset in image file
 *
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
                                                   uint64_t pos, uint64_t len,
                                                   uint64_t offset)
{
    QEMUIOVector qiov;
    QEMUIOVector *backing_qiov = NULL;
    struct iovec iov;
    int ret;

    /* Skip copy entirely if there is no work to do */
    if (len == 0) {
        return 0;
    }

    iov = (struct iovec) {
        .iov_base = qemu_blockalign(s->bs, len),
        .iov_len = len,
    };
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);

    /* The shortened copy is no longer needed once the read completed */
    if (backing_qiov) {
        qemu_iovec_destroy(backing_qiov);
        g_free(backing_qiov);
        backing_qiov = NULL;
    }

    if (ret) {
        goto out;
    }

    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
    if (ret < 0) {
        goto out;
    }
    ret = 0;
out:
    qemu_vfree(iov.iov_base);
    return ret;
}

/**
 * Link one or more contiguous clusters into a table
 *
 * @s:              QED state
 * @table:          L2 table
 * @index:          First cluster index
 * @n:              Number of contiguous clusters
 * @cluster:        First cluster offset
 *
 * The cluster offset may be an allocated byte offset in the image file, the
 * zero cluster marker, or the unallocated cluster marker.
 *
 * Called with table_lock held.
 */
static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
                                             int index, unsigned int n,
                                             uint64_t cluster)
{
    int i;
    for (i = index; i < index + n; i++) {
        table->offsets[i] = cluster;
        /* Marker values are repeated verbatim; only real allocations
         * advance by one cluster per entry. */
        if (!qed_offset_is_unalloc_cluster(cluster) &&
            !qed_offset_is_zero_cluster(cluster)) {
            cluster += s->header.cluster_size;
        }
    }
}

904
/* Called with table_lock held.  */
static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
{
    BDRVQEDState *s = acb_to_s(acb);

    /* Free resources */
    qemu_iovec_destroy(&acb->cur_qiov);
    qed_unref_l2_cache_entry(acb->request.l2_table);

    /* Free the buffer we may have allocated for zero writes */
    if (acb->flags & QED_AIOCB_ZERO) {
        qemu_vfree(acb->qiov->iov[0].iov_base);
        acb->qiov->iov[0].iov_base = NULL;
    }

    /* Start next allocating write request waiting behind this one.  Note that
     * requests enqueue themselves when they first hit an unallocated cluster
     * but they wait until the entire request is finished before waking up the
     * next request in the queue.  This ensures that we don't cycle through
     * requests multiple times but rather finish one at a time completely.
     */
    if (acb == s->allocating_acb) {
        s->allocating_acb = NULL;
        if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
            qemu_co_queue_next(&s->allocating_write_reqs);
        } else if (s->header.features & QED_F_NEED_CHECK) {
            /* Idle again: arm the timer that will clear the flag */
            qed_start_need_check_timer(s);
        }
    }
}

/**
 * Update L1 table with new L2 table offset and write it out
 *
 * @acb:        Write request whose cached L2 table was newly allocated
 *
 * Returns 0 on success or the negative error code from writing the L1 table.
 *
 * Called with table_lock held.
 */
static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
{
    BDRVQEDState *s = acb_to_s(acb);
    CachedL2Table *l2_table = acb->request.l2_table;
    uint64_t l2_offset = l2_table->offset;
    int index, ret;

    index = qed_l1_index(s, acb->cur_pos);
    s->l1_table->offsets[index] = l2_table->offset;

    ret = qed_write_l1_table(s, index, 1);

    /* Commit the current L2 table to the cache */
    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

    /* This is guaranteed to succeed because we just committed the entry to the
     * cache.
     */
    /* NOTE(review): the commit above appears to transfer our reference to the
     * cache, so we re-acquire one via lookup — confirm against the l2_cache
     * ownership rules in qed-l2-cache.c.
     */
    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
    assert(acb->request.l2_table != NULL);

    return ret;
}


/**
 * Update L2 table with new cluster offsets and write them out
967 968
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
969
 */
970
static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
S
Stefan Hajnoczi 已提交
971 972 973
{
    BDRVQEDState *s = acb_to_s(acb);
    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
974
    int index, ret;
S
Stefan Hajnoczi 已提交
975 976 977 978 979 980 981 982

    if (need_alloc) {
        qed_unref_l2_cache_entry(acb->request.l2_table);
        acb->request.l2_table = qed_new_l2_table(s);
    }

    index = qed_l2_index(s, acb->cur_pos);
    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
983
                         offset);
S
Stefan Hajnoczi 已提交
984 985 986

    if (need_alloc) {
        /* Write out the whole new L2 table */
987
        ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
988
        if (ret) {
989
            return ret;
990
        }
991
        return qed_aio_write_l1_update(acb);
S
Stefan Hajnoczi 已提交
992 993
    } else {
        /* Write out only the updated part of the L2 table */
994 995
        ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
                                 false);
996 997 998
        if (ret) {
            return ret;
        }
S
Stefan Hajnoczi 已提交
999
    }
1000
    return 0;
S
Stefan Hajnoczi 已提交
1001 1002 1003 1004
}

/**
 * Write data to the image file
1005 1006
 *
 * Called with table_lock *not* held.
S
Stefan Hajnoczi 已提交
1007
 */
1008
static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
S
Stefan Hajnoczi 已提交
1009 1010 1011 1012 1013
{
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos);

1014
    trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
S
Stefan Hajnoczi 已提交
1015

1016
    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1017 1018
    return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
S
Stefan Hajnoczi 已提交
1019 1020 1021
}

/**
 * Populate untouched regions of new data cluster
 *
 * Copies the head and tail of the cluster from the backing file, writes the
 * request's own data, and flushes when a backing file is present so the L2
 * update never points at incompletely-populated clusters.
 *
 * Called with table_lock held; the lock is dropped for the duration of the
 * I/O and re-acquired before returning.
 */
static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
{
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t start, len, offset;
    int ret;

    /* Drop the lock: backing-file reads and data writes do not touch table
     * metadata and may take a long time.
     */
    qemu_co_mutex_unlock(&s->table_lock);

    /* Populate front untouched region of new data cluster */
    start = qed_start_of_cluster(s, acb->cur_pos);
    len = qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
    ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
    if (ret < 0) {
        goto out;
    }

    /* Populate back untouched region of new data cluster */
    start = acb->cur_pos + acb->cur_qiov.size;
    len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
    offset = acb->cur_cluster +
             qed_offset_into_cluster(s, acb->cur_pos) +
             acb->cur_qiov.size;

    trace_qed_aio_write_postfill(s, acb, start, len, offset);
    ret = qed_copy_from_backing_file(s, start, len, offset);
    if (ret < 0) {
        goto out;
    }

    ret = qed_aio_write_main(acb);
    if (ret < 0) {
        goto out;
    }

    if (s->bs->backing) {
        /*
         * Flush new data clusters before updating the L2 table
         *
         * This flush is necessary when a backing file is in use.  A crash
         * during an allocating write could result in empty clusters in the
         * image.  If the write only touched a subregion of the cluster,
         * then backing image sectors have been lost in the untouched
         * region.  The solution is to flush after writing a new data
         * cluster and before updating the L2 table.
         */
        ret = bdrv_co_flush(s->bs->file->bs);
    }

out:
    /* Re-acquire the lock the caller expects to still hold */
    qemu_co_mutex_lock(&s->table_lock);
    return ret;
}

1081 1082 1083 1084 1085 1086
/**
 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
 *
 * With a backing file the flush-before-L2-update path already guarantees
 * consistency, so the flag is never needed; otherwise it is needed exactly
 * when it is not already set.
 */
static bool qed_should_set_need_check(BDRVQEDState *s)
{
    return !s->bs->backing && !(s->header.features & QED_F_NEED_CHECK);
}

S
Stefan Hajnoczi 已提交
1094 1095 1096 1097 1098 1099 1100
/**
 * Write new data cluster
 *
 * @acb:        Write request
 * @len:        Length in bytes
 *
 * This path is taken when writing to previously unallocated clusters.
 *
 * Returns 0 on success, -EAGAIN if the request must restart cluster lookup
 * after waiting for another allocating write, or a negative error code.
 *
 * Called with table_lock held.
 */
static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
{
    BDRVQEDState *s = acb_to_s(acb);
    int ret;

    /* Cancel timer when the first allocating request comes in */
    if (s->allocating_acb == NULL) {
        qed_cancel_need_check_timer(s);
    }

    /* Freeze this request if another allocating write is in progress */
    if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
        if (s->allocating_acb != NULL) {
            /* Sleep on the queue; table_lock is released while waiting */
            qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
            assert(s->allocating_acb == NULL);
        }
        s->allocating_acb = acb;
        return -EAGAIN; /* start over with looking up table entries */
    }

    acb->cur_nclusters = qed_bytes_to_clusters(s,
            qed_offset_into_cluster(s, acb->cur_pos) + len);
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);

    if (acb->flags & QED_AIOCB_ZERO) {
        /* Skip ahead if the clusters are already zero */
        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
            return 0;
        }
        /* NOTE(review): 1 appears to be the zero-cluster marker stored in the
         * L2 table (cf. qed_offset_is_zero_cluster) — confirm in qed.h.
         */
        acb->cur_cluster = 1;
    } else {
        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
    }

    /* Persist the dirty flag before the image is modified */
    if (qed_should_set_need_check(s)) {
        s->header.features |= QED_F_NEED_CHECK;
        ret = qed_write_header(s);
        if (ret < 0) {
            return ret;
        }
    }

    /* Zero writes store only a marker; data writes must populate clusters */
    if (!(acb->flags & QED_AIOCB_ZERO)) {
        ret = qed_aio_write_cow(acb);
        if (ret < 0) {
            return ret;
        }
    }

    return qed_aio_write_l2_update(acb, acb->cur_cluster);
}

/**
 * Write data cluster in place
 *
 * @acb:        Write request
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
 *
 * This path is taken when writing to already allocated clusters.
1164 1165
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1166
 */
1167 1168
static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
                                              size_t len)
S
Stefan Hajnoczi 已提交
1169
{
1170 1171 1172 1173 1174
    BDRVQEDState *s = acb_to_s(acb);
    int r;

    qemu_co_mutex_unlock(&s->table_lock);

1175 1176 1177 1178 1179
    /* Allocate buffer for zero writes */
    if (acb->flags & QED_AIOCB_ZERO) {
        struct iovec *iov = acb->qiov->iov;

        if (!iov->iov_base) {
K
Kevin Wolf 已提交
1180
            iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1181
            if (iov->iov_base == NULL) {
1182 1183
                r = -ENOMEM;
                goto out;
1184
            }
1185 1186 1187 1188
            memset(iov->iov_base, 0, iov->iov_len);
        }
    }

S
Stefan Hajnoczi 已提交
1189 1190
    /* Calculate the I/O vector */
    acb->cur_cluster = offset;
1191
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
S
Stefan Hajnoczi 已提交
1192

1193 1194 1195 1196 1197
    /* Do the actual write.  */
    r = qed_aio_write_main(acb);
out:
    qemu_co_mutex_lock(&s->table_lock);
    return r;
S
Stefan Hajnoczi 已提交
1198 1199 1200 1201 1202 1203
}

/**
 * Write data cluster
 *
 * @opaque:     Write request
1204
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
S
Stefan Hajnoczi 已提交
1205 1206
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
1207 1208
 *
 * Called with table_lock held.
S
Stefan Hajnoczi 已提交
1209
 */
1210 1211
static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
                                           uint64_t offset, size_t len)
S
Stefan Hajnoczi 已提交
1212 1213 1214 1215 1216 1217 1218 1219 1220
{
    QEDAIOCB *acb = opaque;

    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);

    acb->find_cluster_ret = ret;

    switch (ret) {
    case QED_CLUSTER_FOUND:
1221
        return qed_aio_write_inplace(acb, offset, len);
S
Stefan Hajnoczi 已提交
1222 1223 1224

    case QED_CLUSTER_L2:
    case QED_CLUSTER_L1:
1225
    case QED_CLUSTER_ZERO:
1226
        return qed_aio_write_alloc(acb, len);
S
Stefan Hajnoczi 已提交
1227 1228

    default:
1229
        g_assert_not_reached();
1230
    }
S
Stefan Hajnoczi 已提交
1231 1232 1233 1234 1235 1236
}

/**
 * Read data cluster
 *
 * @opaque:     Read request
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
 * @offset:     Cluster offset in bytes
 * @len:        Length in bytes
 *
 * Called with table_lock held; the lock is dropped for the duration of the
 * read and re-acquired before returning.
 */
static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
                                          uint64_t offset, size_t len)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    BlockDriverState *bs = acb->bs;
    int r;

    /* Data reads do not touch table metadata; release the lock meanwhile */
    qemu_co_mutex_unlock(&s->table_lock);

    /* Adjust offset into cluster */
    offset += qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_read_data(s, acb, ret, offset, len);

    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);

    /* Handle zero cluster and backing file reads, otherwise read
     * data cluster directly.
     */
    if (ret == QED_CLUSTER_ZERO) {
        /* Zero-marked cluster: no I/O, just fill the buffer with zeroes */
        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
        r = 0;
    } else if (ret != QED_CLUSTER_FOUND) {
        /* Unallocated: fall through to the backing file (or zeroes).
         * acb->backing_qiov is owned by the request and freed by
         * qed_aio_next_io() on the next iteration.
         */
        r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
                                  &acb->backing_qiov);
    } else {
        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
        r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
                           &acb->cur_qiov, 0);
    }

    qemu_co_mutex_lock(&s->table_lock);
    return r;
}

/**
 * Begin next I/O or complete the request
 *
 * Drives the whole request: repeatedly finds the next run of clusters and
 * issues the read or write for it until the request range is exhausted or an
 * error occurs.  Returns 0 on success or a negative error code.
 */
static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
{
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset;
    size_t len;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
    while (1) {
        trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);

        /* Release the scratch qiov a backing-file read may have left behind */
        if (acb->backing_qiov) {
            qemu_iovec_destroy(acb->backing_qiov);
            g_free(acb->backing_qiov);
            acb->backing_qiov = NULL;
        }

        /* Advance past the portion completed by the previous iteration */
        acb->qiov_offset += acb->cur_qiov.size;
        acb->cur_pos += acb->cur_qiov.size;
        qemu_iovec_reset(&acb->cur_qiov);

        /* Complete request */
        if (acb->cur_pos >= acb->end_pos) {
            ret = 0;
            break;
        }

        /* Find next cluster and start I/O */
        len = acb->end_pos - acb->cur_pos;
        ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
        if (ret < 0) {
            break;
        }

        if (acb->flags & QED_AIOCB_WRITE) {
            ret = qed_aio_write_data(acb, ret, offset, len);
        } else {
            ret = qed_aio_read_data(acb, ret, offset, len);
        }

        /* -EAGAIN means an allocating write queued itself and must redo the
         * table lookup from the top of the loop; anything else negative is
         * fatal for the request.
         */
        if (ret < 0 && ret != -EAGAIN) {
            break;
        }
    }

    trace_qed_aio_complete(s, acb, ret);
    qed_aio_complete(acb);
    qemu_co_mutex_unlock(&s->table_lock);
    return ret;
}

/*
 * Common entry point for sector-based reads and writes
 *
 * Builds an on-stack QEDAIOCB for the request and runs it to completion in
 * the current coroutine.
 */
static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
                                       QEMUIOVector *qiov, int nb_sectors,
                                       int flags)
{
    uint64_t start = (uint64_t) sector_num * BDRV_SECTOR_SIZE;
    QEDAIOCB acb = {
        .bs      = bs,
        .qiov    = qiov,
        .flags   = flags,
        .cur_pos = start,
        .end_pos = start + (uint64_t) nb_sectors * BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init(&acb.cur_qiov, qiov->niov);

    trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);

    /* Start request */
    return qed_aio_next_io(&acb);
}

1352 1353 1354
/* Read nb_sectors starting at sector_num into qiov (no special flags) */
static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *qiov)
{
    return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
}

1359 1360 1361
/* Write nb_sectors starting at sector_num from qiov */
static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
                                           int64_t sector_num, int nb_sectors,
                                           QEMUIOVector *qiov)
{
    return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
}

1366 1367
/*
 * Efficiently write zeroes by storing zero-cluster markers
 *
 * Only cluster-aligned requests are supported; unaligned requests fall back
 * to the generic path via -ENOTSUP.
 */
static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
                                                  int64_t offset,
                                                  int bytes,
                                                  BdrvRequestFlags flags)
{
    BDRVQEDState *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov = {
        /* Zero writes start without an I/O buffer.  If a buffer becomes
         * necessary then it will be allocated during request processing.
         */
        .iov_base = NULL,
        .iov_len  = bytes,
    };

    /* Fall back if the request is not aligned */
    if (qed_offset_into_cluster(s, offset) ||
        qed_offset_into_cluster(s, bytes)) {
        return -ENOTSUP;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
                          bytes >> BDRV_SECTOR_BITS,
                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
}

1393 1394
/*
 * Grow the virtual image size
 *
 * Only growing without preallocation is supported.  On failure the in-memory
 * header size is rolled back so state stays consistent with the on-disk
 * header.
 */
static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset,
                             PreallocMode prealloc, Error **errp)
{
    BDRVQEDState *s = bs->opaque;
    uint64_t prev_size;
    int ret;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
                                 s->header.table_size)) {
        error_setg(errp, "Invalid image size specified");
        return -EINVAL;
    }

    if ((uint64_t)offset < s->header.image_size) {
        error_setg(errp, "Shrinking images is currently not supported");
        return -ENOTSUP;
    }

    /* Update the header and persist it synchronously */
    prev_size = s->header.image_size;
    s->header.image_size = offset;
    ret = qed_write_header_sync(s);
    if (ret >= 0) {
        return ret;
    }

    /* Roll back the in-memory size so it matches what is on disk */
    s->header.image_size = prev_size;
    error_setg_errno(errp, -ret, "Failed to update the image size");
    return ret;
}

/* Return the virtual disk size in bytes, as recorded in the QED header */
static int64_t bdrv_qed_getlength(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;
    return s->header.image_size;
}

/* Fill in driver-specific image information; always succeeds */
static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQEDState *s = bs->opaque;

    /* Compound-literal assignment zeroes every field not named here */
    *bdi = (BlockDriverInfo) {
        .cluster_size                = s->header.cluster_size,
        .is_dirty                    = s->header.features & QED_F_NEED_CHECK,
        .unallocated_blocks_are_zero = true,
        .can_write_zeroes_with_unmap = true,
    };
    return 0;
}

/*
 * Rewrite the image header with a new backing filename and format
 *
 * @backing_file: New backing filename, or NULL to remove the backing file
 * @backing_fmt:  Backing format name; "raw" disables format probing
 *
 * Returns 0 on success or a negative error code; the in-memory header is
 * updated only after the on-disk header was written successfully.
 */
static int bdrv_qed_change_backing_file(BlockDriverState *bs,
                                        const char *backing_file,
                                        const char *backing_fmt)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader new_header, le_header;
    void *buffer;
    size_t buffer_len, backing_file_len;
    int ret;

    /* Refuse to set backing filename if unknown compat feature bits are
     * active.  If the image uses an unknown compat feature then we may not
     * know the layout of data following the header structure and cannot safely
     * add a new string.
     */
    if (backing_file && (s->header.compat_features &
                         ~QED_COMPAT_FEATURE_MASK)) {
        return -ENOTSUP;
    }

    memcpy(&new_header, &s->header, sizeof(new_header));

    new_header.features &= ~(QED_F_BACKING_FILE |
                             QED_F_BACKING_FORMAT_NO_PROBE);

    /* Adjust feature flags */
    if (backing_file) {
        new_header.features |= QED_F_BACKING_FILE;

        if (qed_fmt_is_raw(backing_fmt)) {
            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    /* Calculate new header size */
    backing_file_len = 0;

    if (backing_file) {
        backing_file_len = strlen(backing_file);
    }

    buffer_len = sizeof(new_header);
    new_header.backing_filename_offset = buffer_len;
    new_header.backing_filename_size = backing_file_len;
    buffer_len += backing_file_len;

    /* Make sure we can rewrite header without failing */
    if (buffer_len > new_header.header_size * new_header.cluster_size) {
        return -ENOSPC;
    }

    /* Prepare new header: little-endian struct followed by the filename */
    buffer = g_malloc(buffer_len);

    qed_header_cpu_to_le(&new_header, &le_header);
    memcpy(buffer, &le_header, sizeof(le_header));
    buffer_len = sizeof(le_header);

    if (backing_file) {
        memcpy(buffer + buffer_len, backing_file, backing_file_len);
        buffer_len += backing_file_len;
    }

    /* Write new header */
    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
    g_free(buffer);
    if (ret == 0) {
        memcpy(&s->header, &new_header, sizeof(new_header));
    }
    return ret;
}

1517
/*
 * Drop all cached state and re-open the image from scratch
 *
 * Used when the image may have been modified externally (e.g. after
 * migration).  Errors are reported through @errp.
 */
static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    BDRVQEDState *s = bs->opaque;
    Error *local_err = NULL;
    int ret;

    bdrv_qed_close(bs);

    bdrv_qed_init_state(bs);
    /* Take table_lock only when running in a coroutine; bdrv_qed_do_open
     * expects it held in that context.
     */
    if (qemu_in_coroutine()) {
        qemu_co_mutex_lock(&s->table_lock);
    }
    ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
    if (qemu_in_coroutine()) {
        qemu_co_mutex_unlock(&s->table_lock);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        error_prepend(errp, "Could not reopen qed layer: ");
        return;
    } else if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not reopen qed layer");
        return;
    }
}

1543 1544
/* Run the consistency checker; fix errors when any BdrvCheckMode bit is set */
static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
                          BdrvCheckMode fix)
{
    BDRVQEDState *s = bs->opaque;

    return qed_check(s, result, !!fix);
}

1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582
/* Options accepted when creating a new qed image (qemu-img create -o ...) */
static QemuOptsList qed_create_opts = {
    .name = "qed-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image"
        },
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Cluster size (in bytes)",
            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
        },
        {
            .name = BLOCK_OPT_TABLE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "L1/L2 table size (in clusters)"
        },
        { /* end of list */ }
    }
};

/* BlockDriver callback table registering the qed format with the block layer */
static BlockDriver bdrv_qed = {
    .format_name              = "qed",
    .instance_size            = sizeof(BDRVQEDState),
    .create_opts              = &qed_create_opts,
    .supports_backing         = true,

    .bdrv_probe               = bdrv_qed_probe,
    .bdrv_open                = bdrv_qed_open,
    .bdrv_close               = bdrv_qed_close,
    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
    .bdrv_child_perm          = bdrv_format_default_perms,
    .bdrv_create              = bdrv_qed_create,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
    .bdrv_co_readv            = bdrv_qed_co_readv,
    .bdrv_co_writev           = bdrv_qed_co_writev,
    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
    .bdrv_truncate            = bdrv_qed_truncate,
    .bdrv_getlength           = bdrv_qed_getlength,
    .bdrv_get_info            = bdrv_qed_get_info,
    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
    .bdrv_check               = bdrv_qed_check,
    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
    .bdrv_co_drain            = bdrv_qed_co_drain,
};

/* Register the qed driver at QEMU startup via the block_init constructor */
static void bdrv_qed_init(void)
{
    bdrv_register(&bdrv_qed);
}

block_init(bdrv_qed_init);