/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24

P
Peter Maydell 已提交
25
#include "qemu/osdep.h"
26

27
#include "block/qdict.h"
28
#include "sysemu/block-backend.h"
29
#include "qemu/main-loop.h"
30
#include "qemu/module.h"
31
#include "qcow2.h"
32
#include "qemu/error-report.h"
33
#include "qapi/error.h"
34
#include "qapi/qapi-events-block-core.h"
M
Markus Armbruster 已提交
35 36
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
K
Kevin Wolf 已提交
37
#include "trace.h"
38
#include "qemu/option_int.h"
39
#include "qemu/cutils.h"
40
#include "qemu/bswap.h"
41 42
#include "qapi/qobject-input-visitor.h"
#include "qapi/qapi-visit-block-core.h"
43
#include "crypto.h"
44
#include "block/aio_task.h"
B
bellard 已提交
45 46 47 48 49 50 51 52

/*
  Differences with QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters which have a reference count of one have the bit
    QCOW_OFLAG_COPIED to optimize write performance.
  - Size of compressed clusters is stored in sectors to reduce bit usage
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
    snapshots.
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/

62 63 64 65

/*
 * Fixed-size descriptor that precedes every header extension in the image
 * file.  qcow2_read_extensions() reads it raw and converts both fields from
 * big-endian before using them.
 */
typedef struct {
    uint32_t magic; /* one of the QCOW2_EXT_MAGIC_* values, or unknown */
    uint32_t len;   /* length in bytes of the payload following this header */
} QEMU_PACKED QCowExtension;
J
Jeff Cody 已提交
67

68 69
/* Header extension type identifiers handled by qcow2_read_extensions() */
#define QCOW2_EXT_MAGIC_END 0                     /* terminates the list */
#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA /* backing format name */
#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857  /* feature name table */
#define QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77  /* crypto header location */
#define QCOW2_EXT_MAGIC_BITMAPS 0x23852875        /* bitmap directory info */
#define QCOW2_EXT_MAGIC_DATA_FILE 0x44415441      /* data file name ("DATA") */
74

75 76 77 78 79
/* Forward declaration; the definition is not part of this section. */
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t file_cluster_offset,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset);
82

83
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
B
bellard 已提交
84 85
{
    const QCowHeader *cow_header = (const void *)buf;
86

B
bellard 已提交
87 88
    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
K
Kevin Wolf 已提交
89
        be32_to_cpu(cow_header->version) >= 2)
B
bellard 已提交
90 91 92 93 94
        return 100;
    else
        return 0;
}

95

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
/*
 * Read callback handed to the crypto layer: copy @buflen bytes starting at
 * @offset within the crypto header area of the image into @buf.
 *
 * @opaque is the BlockDriverState of the qcow2 image.  @block is unused.
 *
 * Returns the (non-negative) result of bdrv_pread() on success.  On failure
 * sets @errp and returns -1.
 */
static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
                                          uint8_t *buf, size_t buflen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    /*
     * Compare without computing offset + buflen, which could wrap around
     * and let an out-of-range request through.
     */
    if (offset > s->crypto_header.length ||
        buflen > s->crypto_header.length - offset) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pread(bs->file,
                     s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read encryption header");
        return -1;
    }
    return ret;
}


/*
 * Init callback handed to the crypto layer: reserve space in the image for
 * a crypto header of @headerlen bytes.
 *
 * Allocates clusters with qcow2_alloc_clusters(), records the resulting
 * offset and @headerlen in s->crypto_header, and zero-fills the part of the
 * allocated clusters that lies beyond the header.
 *
 * @opaque is the BlockDriverState of the qcow2 image.  @block is unused.
 *
 * Returns the (non-negative) result of bdrv_pwrite_zeroes() on success.  On
 * failure sets @errp and returns -1.
 */
static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    int64_t header_offset;
    int64_t clusterlen;
    int overlap;
    int ret;

    header_offset = qcow2_alloc_clusters(bs, headerlen);
    if (header_offset < 0) {
        error_setg_errno(errp, -header_offset,
                         "Cannot allocate cluster for LUKS header size %zu",
                         headerlen);
        return -1;
    }

    s->crypto_header.length = headerlen;
    s->crypto_header.offset = header_offset;

    /* Zero fill remaining space in cluster so it has predictable
     * content in case of future spec changes */
    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;

    /*
     * Run the overlap check outside of assert() so that the call itself
     * does not depend on assertions being compiled in.
     */
    overlap = qcow2_pre_write_overlap_check(bs, 0, header_offset, clusterlen,
                                            false);
    assert(overlap == 0);

    ret = bdrv_pwrite_zeroes(bs->file,
                             header_offset + headerlen,
                             clusterlen - headerlen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
        return -1;
    }

    return ret;
}


/*
 * Write callback handed to the crypto layer: store @buflen bytes from @buf
 * at @offset within the crypto header area of the image.
 *
 * @opaque is the BlockDriverState of the qcow2 image.  @block is unused.
 *
 * Returns the (non-negative) result of bdrv_pwrite() on success.  On failure
 * sets @errp and returns -1.
 */
static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
                                           const uint8_t *buf, size_t buflen,
                                           void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    /*
     * Compare without computing offset + buflen, which could wrap around
     * and let an out-of-range request through.
     */
    if (offset > s->crypto_header.length ||
        buflen > s->crypto_header.length - offset) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pwrite(bs->file,
                      s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return -1;
    }
    return ret;
}


177 178 179 180 181 182 183
/* 
 * read qcow2 extension and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 */
184
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
M
Max Reitz 已提交
185
                                 uint64_t end_offset, void **p_feature_table,
186 187
                                 int flags, bool *need_update_header,
                                 Error **errp)
188
{
189
    BDRVQcow2State *s = bs->opaque;
190 191
    QCowExtension ext;
    uint64_t offset;
192
    int ret;
193 194 195 196 197
    Qcow2BitmapHeaderExt bitmaps_ext;

    if (need_update_header != NULL) {
        *need_update_header = false;
    }
198 199

#ifdef DEBUG_EXT
200
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
201 202 203 204 205 206 207
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
208
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
209

D
Dong Xu Wang 已提交
210
        printf("attempting to read extended header in offset %lu\n", offset);
211 212
#endif

213
        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
M
Max Reitz 已提交
214 215 216
        if (ret < 0) {
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
                             "pread fail from offset %" PRIu64, offset);
217 218
            return 1;
        }
219 220
        ext.magic = be32_to_cpu(ext.magic);
        ext.len = be32_to_cpu(ext.len);
221 222 223 224
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
225
        if (offset > end_offset || ext.len > end_offset - offset) {
M
Max Reitz 已提交
226
            error_setg(errp, "Header extension too large");
227 228 229
            return -EINVAL;
        }

230
        switch (ext.magic) {
231
        case QCOW2_EXT_MAGIC_END:
232
            return 0;
233

234
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
235
            if (ext.len >= sizeof(bs->backing_format)) {
236 237 238
                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
                           " too large (>=%zu)", ext.len,
                           sizeof(bs->backing_format));
239 240
                return 2;
            }
241
            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
M
Max Reitz 已提交
242 243 244
            if (ret < 0) {
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
                                 "Could not read format name");
245
                return 3;
M
Max Reitz 已提交
246
            }
247
            bs->backing_format[ext.len] = '\0';
248
            s->image_backing_format = g_strdup(bs->backing_format);
249 250 251 252 253
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            break;

254 255 256
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
257
                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
258
                if (ret < 0) {
M
Max Reitz 已提交
259 260
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
                                     "Could not read table");
261 262 263 264 265 266 267
                    return ret;
                }

                *p_feature_table = feature_table;
            }
            break;

268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
            unsigned int cflags = 0;
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp, "CRYPTO header extension only "
                           "expected with LUKS encryption method");
                return -EINVAL;
            }
            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
                error_setg(errp, "CRYPTO header extension size %u, "
                           "but expected size %zu", ext.len,
                           sizeof(Qcow2CryptoHeaderExtension));
                return -EINVAL;
            }

            ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret,
                                 "Unable to read CRYPTO header extension");
                return ret;
            }
288 289
            s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
            s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
290 291 292 293 294 295 296 297 298 299 300

            if ((s->crypto_header.offset % s->cluster_size) != 0) {
                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
                           "not a multiple of cluster size '%u'",
                           s->crypto_header.offset, s->cluster_size);
                return -EINVAL;
            }

            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
301
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
302
                                           qcow2_crypto_hdr_read_func,
303
                                           bs, cflags, QCOW2_MAX_THREADS, errp);
304 305 306 307 308
            if (!s->crypto) {
                return -EINVAL;
            }
        }   break;

309 310 311 312 313 314 315 316
        case QCOW2_EXT_MAGIC_BITMAPS:
            if (ext.len != sizeof(bitmaps_ext)) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Invalid extension length");
                return -EINVAL;
            }

            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
317 318 319 320 321 322 323 324 325 326 327
                if (s->qcow_version < 3) {
                    /* Let's be a bit more specific */
                    warn_report("This qcow2 v2 image contains bitmaps, but "
                                "they may have been modified by a program "
                                "without persistent bitmap support; so now "
                                "they must all be considered inconsistent");
                } else {
                    warn_report("a program lacking bitmap support "
                                "modified this file, so all bitmaps are now "
                                "considered inconsistent");
                }
328 329
                error_printf("Some clusters may be leaked, "
                             "run 'qemu-img check -r' on the image "
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
                             "file to fix.");
                if (need_update_header != NULL) {
                    /* Updating is needed to drop invalid bitmap extension. */
                    *need_update_header = true;
                }
                break;
            }

            ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Could not read ext header");
                return ret;
            }

            if (bitmaps_ext.reserved32 != 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Reserved field is not zero");
                return -EINVAL;
            }

351 352 353 354 355
            bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
            bitmaps_ext.bitmap_directory_size =
                be64_to_cpu(bitmaps_ext.bitmap_directory_size);
            bitmaps_ext.bitmap_directory_offset =
                be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
356 357 358 359 360 361 362 363 364 365 366 367 368 369

            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
                error_setg(errp,
                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
                           "exceeding the QEMU supported maximum of %d",
                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
                return -EINVAL;
            }

            if (bitmaps_ext.nb_bitmaps == 0) {
                error_setg(errp, "found bitmaps extension with zero bitmaps");
                return -EINVAL;
            }

A
Alberto Garcia 已提交
370
            if (offset_into_cluster(s, bitmaps_ext.bitmap_directory_offset)) {
371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
                error_setg(errp, "bitmaps_ext: "
                                 "invalid bitmap directory offset");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_size >
                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
                error_setg(errp, "bitmaps_ext: "
                                 "bitmap directory size (%" PRIu64 ") exceeds "
                                 "the maximum supported size (%d)",
                                 bitmaps_ext.bitmap_directory_size,
                                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
                return -EINVAL;
            }

            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
            s->bitmap_directory_offset =
                    bitmaps_ext.bitmap_directory_offset;
            s->bitmap_directory_size =
                    bitmaps_ext.bitmap_directory_size;

#ifdef DEBUG_EXT
            printf("Qcow2: Got bitmaps extension: "
                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
                   s->bitmap_directory_offset, s->nb_bitmaps);
#endif
            break;

399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
        case QCOW2_EXT_MAGIC_DATA_FILE:
        {
            s->image_data_file = g_malloc0(ext.len + 1);
            ret = bdrv_pread(bs->file, offset, s->image_data_file, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret,
                                 "ERROR: Could not read data file name");
                return ret;
            }
#ifdef DEBUG_EXT
            printf("Qcow2: Got external data file %s\n", s->image_data_file);
#endif
            break;
        }

414
        default:
415
            /* unknown magic - save it in case we need to rewrite the header */
416 417
            /* If you add a new feature, make sure to also update the fast
             * path of qcow2_make_empty() to deal with it. */
418 419 420 421 422 423 424 425
            {
                Qcow2UnknownHeaderExtension *uext;

                uext = g_malloc0(sizeof(*uext)  + ext.len);
                uext->magic = ext.magic;
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

426
                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
427
                if (ret < 0) {
M
Max Reitz 已提交
428 429
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
                                     "Could not read data");
430 431 432
                    return ret;
                }
            }
433 434
            break;
        }
435 436

        offset += ((ext.len + 7) & ~7);
437 438 439 440 441
    }

    return 0;
}

442 443
/*
 * Unlink and free every entry of s->unknown_header_ext, leaving the list
 * empty.
 */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2UnknownHeaderExtension *uext, *next;

    /* The _SAFE variant is needed because entries are removed while walking */
    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
        QLIST_REMOVE(uext, next);
        g_free(uext);
    }
}
452

453 454
/*
 * Set @errp to a message listing the unsupported incompatible features
 * whose bits are set in @mask.
 *
 * @table may be NULL; otherwise it is walked until an entry with an empty
 * name.  Bits that have a matching QCOW2_FEAT_TYPE_INCOMPATIBLE entry are
 * reported by name (at most 46 characters each); whatever remains of @mask
 * afterwards is reported as a single hexadecimal value.
 */
static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
                                       uint64_t mask)
{
    g_autoptr(GString) features = g_string_sized_new(60);

    while (table && table->name[0] != '\0') {
        /*
         * Ignore entries whose bit number does not fit the 64-bit mask:
         * shifting by 64 or more is undefined behaviour.
         */
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE && table->bit < 64) {
            if (mask & (1ULL << table->bit)) {
                if (features->len > 0) {
                    g_string_append(features, ", ");
                }
                g_string_append_printf(features, "%.46s", table->name);
                /* Named bits are cleared so they are not reported twice */
                mask &= ~(1ULL << table->bit);
            }
        }
        table++;
    }

    if (mask) {
        if (features->len > 0) {
            g_string_append(features, ", ");
        }
        g_string_append_printf(features,
                               "Unknown incompatible feature: %" PRIx64, mask);
    }

    error_setg(errp, "Unsupported qcow2 feature(s): %s", features->str);
}

482 483 484 485 486 487 488
/*
 * Sets the dirty bit and flushes afterwards if necessary.
 *
 * The incompatible_features bit is only set if the image file header was
 * updated successfully.  Therefore it is not required to check the return
 * value of this function.
 */
489
int qcow2_mark_dirty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t val;
    int ret;

    assert(s->qcow_version >= 3);

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        return 0; /* already dirty */
    }

    /* Write only the incompatible_features field, in big-endian form */
    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
                      &val, sizeof(val));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_flush(bs->file->bs);
    if (ret < 0) {
        return ret;
    }

    /* Only treat image as dirty if the header was updated successfully */
    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
    return 0;
}

S
Stefan Hajnoczi 已提交
517 518 519 520 521 522 523
/*
 * Clears the dirty bit and flushes before if necessary.  Only call this
 * function when there are no pending requests, it does not guard against
 * concurrent requests dirtying the image.
 */
static int qcow2_mark_clean(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        int ret;

        /*
         * The in-memory bit is cleared before the caches are flushed; it
         * stays cleared even if the flush below fails.
         */
        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;

        ret = qcow2_flush_caches(bs);
        if (ret < 0) {
            return ret;
        }

        return qcow2_update_header(bs);
    }
    return 0;
}

M
Max Reitz 已提交
541 542 543 544 545
/*
 * Marks the image as corrupt.
 */
int qcow2_mark_corrupt(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    /* Set the bit in memory, then return qcow2_update_header()'s result */
    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
    return qcow2_update_header(bs);
}

/*
 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
 * before if necessary.
 */
int qcow2_mark_consistent(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* The corrupt bit is only cleared once the caches were flushed */
        int ret = qcow2_flush_caches(bs);
        if (ret < 0) {
            return ret;
        }

        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
        return qcow2_update_header(bs);
    }
    return 0;
}

572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
/*
 * Accumulate the counters of one check result into another.
 *
 * All error/leak counters of @src are added to @out.  When
 * @set_allocation_info is true, @out additionally takes over @src's
 * image_end_offset and bfi fields (these are copied, not summed).
 */
static void qcow2_add_check_result(BdrvCheckResult *out,
                                   const BdrvCheckResult *src,
                                   bool set_allocation_info)
{
    /* Problems found */
    out->corruptions = out->corruptions + src->corruptions;
    out->leaks = out->leaks + src->leaks;
    out->check_errors = out->check_errors + src->check_errors;

    /* Problems repaired */
    out->corruptions_fixed = out->corruptions_fixed + src->corruptions_fixed;
    out->leaks_fixed = out->leaks_fixed + src->leaks_fixed;

    if (!set_allocation_info) {
        return;
    }

    out->image_end_offset = src->image_end_offset;
    out->bfi = src->bfi;
}

588 589 590
/*
 * Run the image consistency check: snapshot table read check, refcount
 * check, then snapshot table fix-up, accumulating the findings in @result
 * (which is zeroed first).
 *
 * The allocation info in @result is taken from the refcount check.  If @fix
 * is non-zero and no check errors or corruptions were counted, the image is
 * additionally marked clean and consistent.
 *
 * Returns the first negative value returned by a step, otherwise the result
 * of the last step executed.  Judging by the name and by qcow2_co_check(),
 * the caller is expected to hold s->lock.
 */
static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
                                              BdrvCheckResult *result,
                                              BdrvCheckMode fix)
{
    BdrvCheckResult snapshot_res = {};
    BdrvCheckResult refcount_res = {};
    int ret;

    memset(result, 0, sizeof(*result));

    ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
    if (ret < 0) {
        qcow2_add_check_result(result, &snapshot_res, false);
        return ret;
    }

    ret = qcow2_check_refcounts(bs, &refcount_res, fix);
    qcow2_add_check_result(result, &refcount_res, true);
    if (ret < 0) {
        /* Still report what the snapshot table check found so far */
        qcow2_add_check_result(result, &snapshot_res, false);
        return ret;
    }

    ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
    qcow2_add_check_result(result, &snapshot_res, false);
    if (ret < 0) {
        return ret;
    }

    if (fix && result->check_errors == 0 && result->corruptions == 0) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            return ret;
        }
        return qcow2_mark_consistent(bs);
    }
    return ret;
}

627 628 629 630 631 632 633 634 635 636 637 638 639
/*
 * Wrapper around qcow2_co_check_locked() that holds s->lock for the
 * duration of the check and passes its return value through.
 */
static int coroutine_fn qcow2_co_check(BlockDriverState *bs,
                                       BdrvCheckResult *result,
                                       BdrvCheckMode fix)
{
    BDRVQcow2State *state = bs->opaque;
    int status;

    qemu_co_mutex_lock(&state->lock);
    status = qcow2_co_check_locked(bs, result, fix);
    qemu_co_mutex_unlock(&state->lock);

    return status;
}

640 641 642 643
/*
 * Validate the location and size of a table described by header fields.
 *
 * @offset: start of the table
 * @entries: number of entries in the table
 * @entry_len: size of one entry in bytes (must be non-zero; it is used as
 *             a divisor)
 * @max_size_bytes: largest table size accepted, in bytes
 * @table_name: name used in error messages
 *
 * Returns 0 if the table is acceptable.  Returns -EFBIG if it has more
 * entries than fit in @max_size_bytes, or -EINVAL if @offset is not
 * cluster-aligned or the end of the table would exceed INT64_MAX.  @errp is
 * set on failure.
 */
int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
                         uint64_t entries, size_t entry_len,
                         int64_t max_size_bytes, const char *table_name,
                         Error **errp)
{
    BDRVQcow2State *s = bs->opaque;

    /* Checked by division so that entries * entry_len cannot overflow below */
    if (entries > max_size_bytes / entry_len) {
        error_setg(errp, "%s too large", table_name);
        return -EFBIG;
    }

    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
     * because values will be passed to qemu functions taking int64_t. */
    if ((INT64_MAX - entries * entry_len < offset) ||
        (offset_into_cluster(s, offset) != 0)) {
        error_setg(errp, "%s offset invalid", table_name);
        return -EINVAL;
    }

    return 0;
}

663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
/*
 * NULL-terminated list of qcow2 runtime option names.
 * NOTE(review): judging by the name, these are presumably the options whose
 * values may be changed after the image is open; the user of this table is
 * not visible here, so confirm against it.
 */
static const char *const mutable_opts[] = {
    QCOW2_OPT_LAZY_REFCOUNTS,
    QCOW2_OPT_DISCARD_REQUEST,
    QCOW2_OPT_DISCARD_SNAPSHOT,
    QCOW2_OPT_DISCARD_OTHER,
    QCOW2_OPT_OVERLAP,
    QCOW2_OPT_OVERLAP_TEMPLATE,
    QCOW2_OPT_OVERLAP_MAIN_HEADER,
    QCOW2_OPT_OVERLAP_ACTIVE_L1,
    QCOW2_OPT_OVERLAP_ACTIVE_L2,
    QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    QCOW2_OPT_OVERLAP_INACTIVE_L1,
    QCOW2_OPT_OVERLAP_INACTIVE_L2,
    QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
    QCOW2_OPT_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
    QCOW2_OPT_REFCOUNT_CACHE_SIZE,
    QCOW2_OPT_CACHE_CLEAN_INTERVAL,
    NULL /* end of list */
};

687 688 689 690 691
static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    .desc = {
        {
692
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
693 694 695
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
        },
696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711
        {
            .name = QCOW2_OPT_DISCARD_REQUEST,
            .type = QEMU_OPT_BOOL,
            .help = "Pass guest discard requests to the layer below",
        },
        {
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when snapshot related space "
                    "is freed",
        },
        {
            .name = QCOW2_OPT_DISCARD_OTHER,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when other clusters are freed",
        },
M
Max Reitz 已提交
712 713 714 715 716 717
        {
            .name = QCOW2_OPT_OVERLAP,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
718 719 720 721 722 723
        {
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
M
Max Reitz 已提交
724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the main qcow2 header",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the active L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an active L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the refcount table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into a refcount block",
        },
        {
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the snapshot table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
764 765 766 767 768
        {
            .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the bitmap directory",
        },
769 770 771 772 773 774 775 776 777 778 779
        {
            .name = QCOW2_OPT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
                    "cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum L2 table cache size",
        },
780 781 782 783 784
        {
            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Size of each entry in the L2 cache",
        },
785 786 787 788 789
        {
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum refcount block cache size",
        },
790 791 792 793 794
        {
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
            .type = QEMU_OPT_NUMBER,
            .help = "Clean unused cache entries after this time (in seconds)",
        },
795 796
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
797 798 799 800
        { /* end of list */ }
    },
};

801
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
802 803 804 805 806 807 808 809 810
    [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
811 812
};

813 814 815
static void cache_clean_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
816
    BDRVQcow2State *s = bs->opaque;
817 818
    qcow2_cache_clean_unused(s->l2_table_cache);
    qcow2_cache_clean_unused(s->refcount_block_cache);
819 820 821 822 823 824
    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
              (int64_t) s->cache_clean_interval * 1000);
}

/*
 * Create the cache-clean timer in @context and arm it for the first time.
 * Nothing is created when the configured interval is not positive.
 */
static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t first_run_ms;

    if (!(s->cache_clean_interval > 0)) {
        return;
    }

    s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
                                         SCALE_MS, cache_clean_timer_cb, bs);

    /* The interval is kept in seconds, the timer runs in milliseconds */
    first_run_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
                   (int64_t) s->cache_clean_interval * 1000;
    timer_mod(s->cache_clean_timer, first_run_ms);
}

/*
 * Stop and free the cache-clean timer, if one exists, and clear the
 * pointer so that a later call is a no-op.
 */
static void cache_clean_timer_del(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (!s->cache_clean_timer) {
        return;
    }

    timer_del(s->cache_clean_timer);
    timer_free(s->cache_clean_timer);
    s->cache_clean_timer = NULL;
}

/*
 * Tear down the cache-clean timer of @bs (see cache_clean_timer_del()).
 * NOTE(review): presumably registered as the driver's AioContext detach
 * callback; the registration is not visible in this part of the file.
 */
static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}

/*
 * Create the cache-clean timer of @bs in @new_context (see
 * cache_clean_timer_init(); no timer is created if the interval is 0).
 * NOTE(review): presumably registered as the driver's AioContext attach
 * callback; the registration is not visible in this part of the file.
 */
static void qcow2_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    cache_clean_timer_init(bs, new_context);
}

M
Max Reitz 已提交
856 857
/*
 * Derive the metadata cache sizes from the runtime options in @opts.
 *
 * @l2_cache_size:       set to the L2 cache size in bytes
 * @l2_cache_entry_size: set to the size in bytes of one L2 cache entry
 * @refcount_cache_size: set to the refcount block cache size in bytes
 * @errp:                set when the options conflict or the entry size
 *                       is invalid; the output values must not be used then
 *
 * The L2 cache never gets more than is needed to map the whole virtual
 * disk.  When only the combined cache size is given, the L2 cache is
 * served first and the refcount cache receives the remainder.
 *
 * Minimum sizes are not enforced here; qcow2_update_options_prepare()
 * raises both caches to their minimum after converting them to entries.
 */
static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                             uint64_t *l2_cache_size,
                             uint64_t *l2_cache_entry_size,
                             uint64_t *refcount_cache_size, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
    uint64_t disk_bytes = bs->total_sectors * BDRV_SECTOR_SIZE;
    uint64_t l2_entries_needed = DIV_ROUND_UP(disk_bytes, s->cluster_size);
    /* Every L2 table occupies exactly one cluster, hence the largest
     * useful L2 cache is a whole number of clusters. */
    uint64_t max_l2_cache = ROUND_UP(l2_entries_needed * sizeof(uint64_t),
                                     s->cluster_size);
    uint64_t total_budget, l2_limit;
    bool have_total, have_l2, have_refcount, have_entry_size;
    bool entry_size_ok;

    /* Which of the options did the user actually specify? */
    have_total = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
    have_l2 = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
    have_refcount = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    have_entry_size = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE);

    /* Fetch their values, falling back to the defaults */
    total_budget = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
    l2_limit = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
                                 DEFAULT_L2_CACHE_MAX_SIZE);
    *refcount_cache_size = qemu_opt_get_size(opts,
                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
    *l2_cache_entry_size = qemu_opt_get_size(opts,
                                             QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
                                             s->cluster_size);

    *l2_cache_size = MIN(max_l2_cache, l2_limit);

    if (have_total) {
        /* Reject contradictory combinations first */
        if (have_l2 && have_refcount) {
            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
                       "at the same time");
            return;
        }
        if (have_l2 && l2_limit > total_budget) {
            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        }
        if (*refcount_cache_size > total_budget) {
            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        }

        /* Split the combined budget between the two caches */
        if (have_l2) {
            *refcount_cache_size = total_budget - *l2_cache_size;
        } else if (have_refcount) {
            *l2_cache_size = total_budget - *refcount_cache_size;
        } else if (total_budget >= max_l2_cache + min_refcount_cache) {
            /* Enough for a full L2 cache; the refcount cache gets the
             * rest */
            *l2_cache_size = max_l2_cache;
            *refcount_cache_size = total_budget - *l2_cache_size;
        } else {
            /* Too small for that: keep the refcount cache at (up to) its
             * minimum and hand everything else to the L2 cache */
            *refcount_cache_size = MIN(total_budget, min_refcount_cache);
            *l2_cache_size = total_budget - *refcount_cache_size;
        }
    }

    /*
     * An L2 cache that cannot cover the whole disk works better with
     * small entries, because loads and evictions become cheaper.  Unless
     * the user chose an entry size, default to 4KB entries in that case.
     */
    if (!have_entry_size && *l2_cache_size < max_l2_cache) {
        *l2_cache_entry_size = MIN(s->cluster_size, 4096);
    }

    entry_size_ok = *l2_cache_entry_size >= (1 << MIN_CLUSTER_BITS) &&
                    *l2_cache_entry_size <= s->cluster_size &&
                    is_power_of_2(*l2_cache_entry_size);
    if (!entry_size_ok) {
        error_setg(errp, "L2 cache entry size must be a power of two "
                   "between %d and the cluster size (%d)",
                   1 << MIN_CLUSTER_BITS, s->cluster_size);
    }
}

946 947 948
typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache;
    Qcow2Cache *refcount_block_cache;
949
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
950 951 952 953
    bool use_lazy_refcounts;
    int overlap_check;
    bool discard_passthrough[QCOW2_DISCARD_MAX];
    uint64_t cache_clean_interval;
954
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
955 956 957 958 959 960
} Qcow2ReopenState;

/*
 * Parse and validate the qcow2 runtime options in @options and collect the
 * resulting new state in @r.
 *
 * The fields of BDRVQcow2State are only read here; they are assigned later
 * by qcow2_update_options_commit().  This function still has side effects
 * on the image: the currently active caches are flushed, and
 * qcow2_mark_clean() is called when lazy refcounts go from enabled to
 * disabled.
 *
 * @flags is consulted for BDRV_O_UNMAP, which provides the default of the
 * request-discard passthrough option.
 *
 * Returns 0 on success and a negative errno value on failure.  On failure
 * @r may already hold newly created caches or crypto options; the caller
 * releases them with qcow2_update_options_abort().
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
961 962
{
    BDRVQcow2State *s = bs->opaque;
963
    QemuOpts *opts = NULL;
964 965
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
966
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
967
    int i;
968 969
    const char *encryptfmt;
    QDict *encryptopts = NULL;
970
    Error *local_err = NULL;
971 972
    int ret;

973 974 975
    /* The "encrypt."-prefixed options are collected in their own QDict and
     * handled by the encryption switch further down */
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

976 977 978 979 980 981 982 983 984
    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
985 986
    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
                     &refcount_cache_size, &local_err);
987 988 989 990 991 992
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

993
    /* From here on l2_cache_size counts cache entries instead of bytes */
    l2_cache_size /= l2_cache_entry_size;
994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* Likewise, refcount_cache_size now counts clusters instead of bytes */
    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030
    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

1031 1032 1033 1034 1035
    /* Each L2 entry is a uint64_t, so this is the number of L2 entries in
     * one cache entry */
    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
                                           l2_cache_entry_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
                                                 s->cluster_size);
1036
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
1037 1038 1039 1040 1041 1042
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
1043
    r->cache_clean_interval =
1044
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
1045
                            DEFAULT_CACHE_CLEAN_INTERVAL);
1046 1047 1048 1049 1050 1051 1052 1053
#ifndef CONFIG_LINUX
    /* Without CONFIG_LINUX any non-zero interval is rejected */
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
1054
    if (r->cache_clean_interval > UINT_MAX) {
1055 1056 1057 1058 1059
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

1060
    /* lazy-refcounts; flush if going from enabled to disabled */
1061
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
1062
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
1063
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
1064 1065 1066 1067 1068
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }
1069

1070 1071 1072 1073 1074 1075 1076 1077
    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

1078
    /* Overlap check options */
1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
    /* The same setting can arrive under two option names; if both are
     * given they have to agree */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    if (!opt_overlap_check) {
        /* Fall back to the template option, then to "cached" */
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

1110
    r->overlap_check = 0;
1111 1112 1113
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
1114
        r->overlap_check |=
1115 1116 1117 1118
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

1119 1120 1121
    /* Discard passthrough: NEVER and ALWAYS are fixed, the other types are
     * configurable */
    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
1122 1123
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
1124
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
1125
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
1126
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
1127 1128
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
    /* The encryption format named in the options, if any, must match the
     * method recorded in the image header */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
1147 1148
        /* Note: the format handed to the crypto layer is "qcow", not "aes" */
        qdict_put_str(encryptopts, "format", "qcow");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1149 1150
        break;

1151 1152 1153 1154 1155 1156 1157 1158
    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
1159 1160
        qdict_put_str(encryptopts, "format", "luks");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1161 1162
        break;

1163 1164 1165 1166 1167 1168 1169 1170 1171 1172
    default:
        /* Only reports the error; r->crypto_opts is not assigned here, so
         * the check after the switch is what fails the call (assuming the
         * caller passed in a zeroed @r, as qcow2_update_options() does) */
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        break;
    }
    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
        ret = -EINVAL;
        goto fail;
    }

1173 1174
    ret = 0;
fail:
1175
    /* Common exit path, reached on success as well: release the temporary
     * option containers */
    qobject_unref(encryptopts);
1176 1177
    qemu_opts_del(opts);
    opts = NULL;
1178 1179 1180 1181 1182 1183 1184 1185 1186
    return ret;
}

/*
 * Make the state prepared in @r the active configuration of @bs.
 *
 * The previously active caches and crypto options are released and the
 * objects held by @r are taken over by the driver state.  The cache-clean
 * timer is recreated only if its interval changed.
 */
static void qcow2_update_options_commit(BlockDriverState *bs,
                                        Qcow2ReopenState *r)
{
    BDRVQcow2State *s = bs->opaque;
    QCryptoBlockOpenOptions *old_crypto_opts;
    int kind;

    /* Drop the old metadata caches before switching to the new ones */
    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    s->l2_table_cache = r->l2_table_cache;
    s->refcount_block_cache = r->refcount_block_cache;
    s->l2_slice_size = r->l2_slice_size;

    s->use_lazy_refcounts = r->use_lazy_refcounts;
    s->overlap_check = r->overlap_check;

    for (kind = 0; kind < QCOW2_DISCARD_MAX; kind++) {
        s->discard_passthrough[kind] = r->discard_passthrough[kind];
    }

    /* The timer captures the interval when armed, so recreate it on change */
    if (s->cache_clean_interval != r->cache_clean_interval) {
        cache_clean_timer_del(bs);
        s->cache_clean_interval = r->cache_clean_interval;
        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
    }

    old_crypto_opts = s->crypto_opts;
    s->crypto_opts = r->crypto_opts;
    qapi_free_QCryptoBlockOpenOptions(old_crypto_opts);
}

/*
 * Release everything that qcow2_update_options_prepare() stored in @r.
 * Used when the new options are not going to be committed; @bs itself is
 * not touched.
 */
static void qcow2_update_options_abort(BlockDriverState *bs,
                                       Qcow2ReopenState *r)
{
    Qcow2Cache *new_l2_cache = r->l2_table_cache;
    Qcow2Cache *new_refcount_cache = r->refcount_block_cache;

    if (new_l2_cache != NULL) {
        qcow2_cache_destroy(new_l2_cache);
    }
    if (new_refcount_cache != NULL) {
        qcow2_cache_destroy(new_refcount_cache);
    }

    qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
}

/*
 * Apply the runtime options in @options to @bs in one step: prepare the
 * new state, then commit it on success or discard it on failure.
 *
 * Returns the result of qcow2_update_options_prepare(): 0 on success, a
 * negative errno value on failure.
 */
static int qcow2_update_options(BlockDriverState *bs, QDict *options,
                                int flags, Error **errp)
{
    Qcow2ReopenState r = {};
    int ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);

    if (ret < 0) {
        qcow2_update_options_abort(bs, &r);
        return ret;
    }

    qcow2_update_options_commit(bs, &r);
    return ret;
}

1242 1243 1244
/* Called with s->lock held.  */
static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
                                      int flags, Error **errp)
B
bellard 已提交
1245
{
1246
    BDRVQcow2State *s = bs->opaque;
1247 1248
    unsigned int len, i;
    int ret = 0;
B
bellard 已提交
1249
    QCowHeader header;
1250
    Error *local_err = NULL;
1251
    uint64_t ext_end;
1252
    uint64_t l1_vm_state_index;
1253
    bool update_header = false;
B
bellard 已提交
1254

1255
    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
1256
    if (ret < 0) {
M
Max Reitz 已提交
1257
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
B
bellard 已提交
1258
        goto fail;
1259
    }
1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
    header.magic = be32_to_cpu(header.magic);
    header.version = be32_to_cpu(header.version);
    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
    header.backing_file_size = be32_to_cpu(header.backing_file_size);
    header.size = be64_to_cpu(header.size);
    header.cluster_bits = be32_to_cpu(header.cluster_bits);
    header.crypt_method = be32_to_cpu(header.crypt_method);
    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
    header.l1_size = be32_to_cpu(header.l1_size);
    header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
    header.refcount_table_clusters =
        be32_to_cpu(header.refcount_table_clusters);
    header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
    header.nb_snapshots = be32_to_cpu(header.nb_snapshots);
1274

K
Kevin Wolf 已提交
1275
    if (header.magic != QCOW_MAGIC) {
M
Max Reitz 已提交
1276
        error_setg(errp, "Image is not in qcow2 format");
P
Paolo Bonzini 已提交
1277
        ret = -EINVAL;
B
bellard 已提交
1278
        goto fail;
1279
    }
K
Kevin Wolf 已提交
1280
    if (header.version < 2 || header.version > 3) {
1281
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
K
Kevin Wolf 已提交
1282 1283 1284 1285 1286 1287
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

1288 1289 1290
    /* Initialise cluster size */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
1291 1292
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
                   header.cluster_bits);
1293 1294 1295 1296 1297 1298 1299
        ret = -EINVAL;
        goto fail;
    }

    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;

K
Kevin Wolf 已提交
1300 1301 1302 1303 1304 1305 1306 1307
    /* Initialise version 3 header fields */
    if (header.version == 2) {
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
1308 1309 1310 1311 1312 1313
        header.incompatible_features =
            be64_to_cpu(header.incompatible_features);
        header.compatible_features = be64_to_cpu(header.compatible_features);
        header.autoclear_features = be64_to_cpu(header.autoclear_features);
        header.refcount_order = be32_to_cpu(header.refcount_order);
        header.header_length = be32_to_cpu(header.header_length);
1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325

        if (header.header_length < 104) {
            error_setg(errp, "qcow2 header too short");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (header.header_length > s->cluster_size) {
        error_setg(errp, "qcow2 header exceeds cluster size");
        ret = -EINVAL;
        goto fail;
K
Kevin Wolf 已提交
1326 1327 1328 1329 1330
    }

    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
1331
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
K
Kevin Wolf 已提交
1332 1333
                         s->unknown_header_fields_size);
        if (ret < 0) {
M
Max Reitz 已提交
1334 1335
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
                             "fields");
K
Kevin Wolf 已提交
1336 1337 1338 1339
            goto fail;
        }
    }

1340 1341 1342 1343 1344 1345
    if (header.backing_file_offset > s->cluster_size) {
        error_setg(errp, "Invalid backing file offset");
        ret = -EINVAL;
        goto fail;
    }

1346 1347 1348 1349 1350 1351
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

K
Kevin Wolf 已提交
1352 1353 1354 1355 1356
    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

S
Stefan Hajnoczi 已提交
1357
    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
1358 1359
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
1360
                              &feature_table, flags, NULL, NULL);
1361
        report_unsupported_feature(errp, feature_table,
S
Stefan Hajnoczi 已提交
1362 1363
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
K
Kevin Wolf 已提交
1364
        ret = -ENOTSUP;
1365
        g_free(feature_table);
K
Kevin Wolf 已提交
1366 1367 1368
        goto fail;
    }

M
Max Reitz 已提交
1369 1370 1371 1372
    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* Corrupt images may not be written to unless they are being repaired
         */
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
M
Max Reitz 已提交
1373 1374
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
                       "read/write");
M
Max Reitz 已提交
1375 1376 1377 1378 1379
            ret = -EACCES;
            goto fail;
        }
    }

K
Kevin Wolf 已提交
1380
    /* Check support for various header values */
1381 1382 1383 1384
    if (header.refcount_order > 6) {
        error_setg(errp, "Reference count entry width too large; may not "
                   "exceed 64 bits");
        ret = -EINVAL;
K
Kevin Wolf 已提交
1385 1386
        goto fail;
    }
1387
    s->refcount_order = header.refcount_order;
1388 1389 1390
    s->refcount_bits = 1 << s->refcount_order;
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;
K
Kevin Wolf 已提交
1391

B
bellard 已提交
1392
    s->crypt_method_header = header.crypt_method;
1393
    if (s->crypt_method_header) {
1394 1395
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
1396 1397 1398 1399 1400 1401 1402 1403 1404 1405
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow2 images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow2, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
1406 1407
        }

1408 1409 1410 1411 1412 1413 1414 1415 1416
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            s->crypt_physical_offset = false;
        } else {
            /* Assuming LUKS and any future crypt methods we
             * add will all use physical offsets, due to the
             * fact that the alternative is insecure...  */
            s->crypt_physical_offset = true;
        }

1417
        bs->encrypted = true;
1418
    }
1419

B
bellard 已提交
1420 1421
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
1422 1423 1424
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;
1425
    bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
B
bellard 已提交
1426 1427 1428
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
1429

B
bellard 已提交
1430
    s->refcount_table_offset = header.refcount_table_offset;
1431
    s->refcount_table_size =
B
bellard 已提交
1432 1433
        header.refcount_table_clusters << (s->cluster_bits - 3);

1434 1435 1436 1437 1438 1439
    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
        error_setg(errp, "Image does not contain a reference count table");
        ret = -EINVAL;
        goto fail;
    }

1440 1441 1442 1443
    ret = qcow2_validate_table(bs, s->refcount_table_offset,
                               header.refcount_table_clusters,
                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
                               "Reference count table", errp);
1444 1445 1446 1447
    if (ret < 0) {
        goto fail;
    }

1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463
    if (!(flags & BDRV_O_CHECK)) {
        /*
         * The total size in bytes of the snapshot table is checked in
         * qcow2_read_snapshots() because the size of each snapshot is
         * variable and we don't know it yet.
         * Here we only check the offset and number of snapshots.
         */
        ret = qcow2_validate_table(bs, header.snapshots_offset,
                                   header.nb_snapshots,
                                   sizeof(QCowSnapshotHeader),
                                   sizeof(QCowSnapshotHeader) *
                                       QCOW_MAX_SNAPSHOTS,
                                   "Snapshot table", errp);
        if (ret < 0) {
            goto fail;
        }
1464 1465
    }

B
bellard 已提交
1466
    /* read the level 1 table */
1467 1468 1469 1470
    ret = qcow2_validate_table(bs, header.l1_table_offset,
                               header.l1_size, sizeof(uint64_t),
                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
    if (ret < 0) {
1471 1472
        goto fail;
    }
B
bellard 已提交
1473
    s->l1_size = header.l1_size;
1474
    s->l1_table_offset = header.l1_table_offset;
1475 1476 1477

    l1_vm_state_index = size_to_l1(s, header.size);
    if (l1_vm_state_index > INT_MAX) {
M
Max Reitz 已提交
1478
        error_setg(errp, "Image is too big");
1479 1480 1481 1482 1483
        ret = -EFBIG;
        goto fail;
    }
    s->l1_vm_state_index = l1_vm_state_index;

B
bellard 已提交
1484 1485
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
1486
    if (s->l1_size < s->l1_vm_state_index) {
M
Max Reitz 已提交
1487
        error_setg(errp, "L1 table is too small");
1488
        ret = -EINVAL;
B
bellard 已提交
1489
        goto fail;
1490
    }
1491

1492
    if (s->l1_size > 0) {
K
Kevin Wolf 已提交
1493
        s->l1_table = qemu_try_blockalign(bs->file->bs,
1494
                                          s->l1_size * sizeof(uint64_t));
1495 1496 1497 1498 1499
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
            goto fail;
        }
1500
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
1501 1502
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
M
Max Reitz 已提交
1503
            error_setg_errno(errp, -ret, "Could not read L1 table");
1504
            goto fail;
1505
        }
1506
        for(i = 0;i < s->l1_size; i++) {
1507
            s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
1508
        }
B
bellard 已提交
1509
    }
K
Kevin Wolf 已提交
1510

1511 1512
    /* Parse driver-specific options */
    ret = qcow2_update_options(bs, options, flags, errp);
1513 1514 1515 1516
    if (ret < 0) {
        goto fail;
    }

1517
    s->flags = flags;
1518

1519 1520
    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
M
Max Reitz 已提交
1521
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
B
bellard 已提交
1522
        goto fail;
1523
    }
B
bellard 已提交
1524

B
Blue Swirl 已提交
1525
    QLIST_INIT(&s->cluster_allocs);
K
Kevin Wolf 已提交
1526
    QTAILQ_INIT(&s->discards);
1527

1528
    /* read qcow2 extensions */
M
Max Reitz 已提交
1529
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
1530
                              flags, &update_header, &local_err)) {
M
Max Reitz 已提交
1531
        error_propagate(errp, local_err);
1532
        ret = -EINVAL;
1533
        goto fail;
1534
    }
1535

1536 1537 1538 1539 1540 1541 1542 1543 1544 1545
    /* Open external data file */
    s->data_file = bdrv_open_child(NULL, options, "data-file", bs, &child_file,
                                   true, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
1546 1547 1548 1549 1550 1551 1552 1553 1554
        if (!s->data_file && s->image_data_file) {
            s->data_file = bdrv_open_child(s->image_data_file, options,
                                           "data-file", bs, &child_file,
                                           false, errp);
            if (!s->data_file) {
                ret = -EINVAL;
                goto fail;
            }
        }
1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565
        if (!s->data_file) {
            error_setg(errp, "'data-file' is required for this image");
            ret = -EINVAL;
            goto fail;
        }
    } else {
        if (s->data_file) {
            error_setg(errp, "'data-file' can only be set for images with an "
                             "external data file");
            ret = -EINVAL;
            goto fail;
1566 1567 1568 1569 1570 1571 1572 1573
        }

        s->data_file = bs->file;

        if (data_file_is_raw(bs)) {
            error_setg(errp, "data-file-raw requires a data file");
            ret = -EINVAL;
            goto fail;
1574 1575
        }
    }
1576

1577 1578 1579 1580 1581 1582 1583 1584 1585 1586
    /* qcow2_read_extension may have set up the crypto context
     * if the crypt method needs a header region, some methods
     * don't need header extensions, so must check here
     */
    if (s->crypt_method_header && !s->crypto) {
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            unsigned int cflags = 0;
            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
1587
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
1588 1589
                                           NULL, NULL, cflags,
                                           QCOW2_MAX_THREADS, errp);
1590 1591 1592 1593 1594 1595 1596
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else if (!(flags & BDRV_O_NO_IO)) {
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
                       s->crypt_method_header);
1597 1598 1599 1600 1601
            ret = -EINVAL;
            goto fail;
        }
    }

B
bellard 已提交
1602 1603 1604
    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
1605
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
1606
            len >= sizeof(bs->backing_file)) {
1607 1608 1609
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
1610
        }
1611
        ret = bdrv_pread(bs->file, header.backing_file_offset,
M
Max Reitz 已提交
1612
                         bs->auto_backing_file, len);
1613
        if (ret < 0) {
M
Max Reitz 已提交
1614
            error_setg_errno(errp, -ret, "Could not read backing file name");
B
bellard 已提交
1615
            goto fail;
1616
        }
M
Max Reitz 已提交
1617 1618 1619 1620
        bs->auto_backing_file[len] = '\0';
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
        s->image_backing_file = g_strdup(bs->auto_backing_file);
B
bellard 已提交
1621
    }
1622

1623 1624 1625 1626 1627 1628 1629 1630
    /*
     * Internal snapshots; skip reading them in check mode, because
     * we do not need them then, and we do not want to abort because
     * of a broken table.
     */
    if (!(flags & BDRV_O_CHECK)) {
        s->snapshots_offset = header.snapshots_offset;
        s->nb_snapshots = header.nb_snapshots;
1631

1632 1633 1634 1635
        ret = qcow2_read_snapshots(bs, errp);
        if (ret < 0) {
            goto fail;
        }
1636
    }
B
bellard 已提交
1637

1638
    /* Clear unknown autoclear feature bits */
1639
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
1640 1641 1642
    update_header =
        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
    if (update_header) {
1643
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
1644 1645
    }

1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
    /* == Handle persistent dirty bitmaps ==
     *
     * We want load dirty bitmaps in three cases:
     *
     * 1. Normal open of the disk in active mode, not related to invalidation
     *    after migration.
     *
     * 2. Invalidation of the target vm after pre-copy phase of migration, if
     *    bitmaps are _not_ migrating through migration channel, i.e.
     *    'dirty-bitmaps' capability is disabled.
     *
     * 3. Invalidation of source vm after failed or canceled migration.
     *    This is a very interesting case. There are two possible types of
     *    bitmaps:
     *
     *    A. Stored on inactivation and removed. They should be loaded from the
     *       image.
     *
     *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
     *       the migration channel (with dirty-bitmaps capability).
     *
     *    On the other hand, there are two possible sub-cases:
     *
     *    3.1 disk was changed by somebody else while were inactive. In this
     *        case all in-RAM dirty bitmaps (both persistent and not) are
     *        definitely invalid. And we don't have any method to determine
     *        this.
     *
     *        Simple and safe thing is to just drop all the bitmaps of type B on
     *        inactivation. But in this case we lose bitmaps in valid 4.2 case.
     *
     *        On the other hand, resuming source vm, if disk was already changed
     *        is a bad thing anyway: not only bitmaps, the whole vm state is
     *        out of sync with disk.
     *
     *        This means, that user or management tool, who for some reason
     *        decided to resume source vm, after disk was already changed by
     *        target vm, should at least drop all dirty bitmaps by hand.
     *
     *        So, we can ignore this case for now, but TODO: "generation"
     *        extension for qcow2, to determine, that image was changed after
     *        last inactivation. And if it is changed, we will drop (or at least
     *        mark as 'invalid' all the bitmaps of type B, both persistent
     *        and not).
     *
     *    3.2 disk was _not_ changed while were inactive. Bitmaps may be saved
     *        to disk ('dirty-bitmaps' capability disabled), or not saved
     *        ('dirty-bitmaps' capability enabled), but we don't need to care
     *        of: let's load bitmaps as always: stored bitmaps will be loaded,
     *        and not stored has flag IN_USE=1 in the image and will be skipped
     *        on loading.
     *
     * One remaining possible case when we don't want load bitmaps:
     *
     * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
     *    will be loaded on invalidation, no needs try loading them before)
     */

    if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
        /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
        bool header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);
1707 1708 1709 1710 1711
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
1712 1713

        update_header = update_header && !header_updated;
1714 1715 1716
    }

    if (update_header) {
1717 1718
        ret = qcow2_update_header(bs);
        if (ret < 0) {
M
Max Reitz 已提交
1719
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
1720 1721 1722 1723
            goto fail;
        }
    }

1724 1725
    bs->supported_zero_flags = header.version >= 3 ?
                               BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
K
Kevin Wolf 已提交
1726

S
Stefan Hajnoczi 已提交
1727
    /* Repair image if dirty */
1728
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
1729
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
S
Stefan Hajnoczi 已提交
1730 1731
        BdrvCheckResult result = {0};

1732 1733
        ret = qcow2_co_check_locked(bs, &result,
                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
M
Max Reitz 已提交
1734 1735 1736 1737
        if (ret < 0 || result.check_errors) {
            if (ret >= 0) {
                ret = -EIO;
            }
M
Max Reitz 已提交
1738
            error_setg_errno(errp, -ret, "Could not repair dirty image");
S
Stefan Hajnoczi 已提交
1739 1740 1741 1742
            goto fail;
        }
    }

B
bellard 已提交
1743
#ifdef DEBUG_ALLOC
P
Philipp Hahn 已提交
1744 1745
    {
        BdrvCheckResult result = {0};
1746
        qcow2_check_refcounts(bs, &result, 0);
P
Philipp Hahn 已提交
1747
    }
B
bellard 已提交
1748
#endif
1749

1750
    qemu_co_queue_init(&s->thread_task_queue);
1751

1752
    return ret;
B
bellard 已提交
1753 1754

 fail:
1755
    g_free(s->image_data_file);
1756 1757 1758
    if (has_data_file(bs)) {
        bdrv_unref_child(bs, s->data_file);
    }
K
Kevin Wolf 已提交
1759
    g_free(s->unknown_header_fields);
1760
    cleanup_unknown_header_ext(bs);
K
Kevin Wolf 已提交
1761 1762
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
1763
    qemu_vfree(s->l1_table);
1764 1765
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
1766
    cache_clean_timer_del(bs);
K
Kevin Wolf 已提交
1767
    if (s->l2_table_cache) {
1768
        qcow2_cache_destroy(s->l2_table_cache);
K
Kevin Wolf 已提交
1769
    }
1770
    if (s->refcount_block_cache) {
1771
        qcow2_cache_destroy(s->refcount_block_cache);
1772
    }
1773 1774
    qcrypto_block_free(s->crypto);
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1775
    return ret;
B
bellard 已提交
1776 1777
}

1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795
/*
 * Argument/result bundle handed to qcow2_open_entry(), which forwards the
 * first four fields to qcow2_do_open() and stores its return value in @ret.
 */
typedef struct QCow2OpenCo {
    BlockDriverState *bs;
    QDict *options;
    int flags;
    Error **errp;
    int ret; /* -EINPROGRESS until qcow2_open_entry() stores the result */
} QCow2OpenCo;

/*
 * Coroutine entry point for opening an image.
 *
 * @opaque is a QCow2OpenCo; qcow2_do_open() is run with s->lock held and
 * its return value is written back into the QCow2OpenCo.
 */
static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *args = opaque;
    BlockDriverState *bs = args->bs;
    BDRVQcow2State *state = bs->opaque;

    qemu_co_mutex_lock(&state->lock);
    /* Publish the result while the lock is still held */
    args->ret = qcow2_do_open(bs, args->options, args->flags, args->errp);
    qemu_co_mutex_unlock(&state->lock);
}

1796 1797 1798
/*
 * Open a qcow2 image.
 *
 * Attaches the "file" child, initialises s->lock and then runs
 * qcow2_do_open() through qcow2_open_entry(): directly when already in
 * coroutine context, otherwise in a newly created coroutine that is polled
 * until it has stored its result.
 *
 * Returns -EINVAL if the "file" child could not be opened (@errp is passed
 * to bdrv_open_child()), otherwise the value produced by qcow2_do_open().
 */
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2OpenCo qoc = {
        .bs = bs,
        .options = options,
        .flags = flags,
        .errp = errp,
        .ret = -EINPROGRESS
    };

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

    if (qemu_in_coroutine()) {
        /* From bdrv_co_create.  */
        qcow2_open_entry(&qoc);
    } else {
        assert(qemu_get_current_aio_context() == qemu_get_aio_context());
        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
        /* qoc.ret stays -EINPROGRESS until the coroutine has finished */
        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
    }
    return qoc.ret;
}

1828
/*
 * Fill in the block limits of @bs.
 *
 * For encrypted images the request alignment is the sector size reported
 * by the crypto layer; write-zeroes and discard alignment are always one
 * cluster.  @errp is not used.
 */
static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;

    if (bs->encrypted) {
        /* Encryption works on a sector granularity */
        bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
    }
    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
    bs->bl.pdiscard_alignment = s->cluster_size;
}

J
Jeff Cody 已提交
1840 1841 1842
/*
 * Prepare a reopen of a qcow2 node.
 *
 * Allocates a Qcow2ReopenState, stores it in state->opaque and lets
 * qcow2_update_options_prepare() fill it from the new options.  When the
 * node is going to be read-only, persistent bitmaps are switched to
 * read-only, pending data is flushed and the image is marked clean.
 *
 * Returns 0 on success.  On failure the prepared options are aborted, the
 * Qcow2ReopenState is freed and a negative errno is returned.
 */
static int qcow2_reopen_prepare(BDRVReopenState *state,
                                BlockReopenQueue *queue, Error **errp)
{
    Qcow2ReopenState *r;
    int ret;

    r = g_new0(Qcow2ReopenState, 1);
    state->opaque = r;

    ret = qcow2_update_options_prepare(state->bs, r, state->options,
                                       state->flags, errp);
    if (ret < 0) {
        goto fail;
    }

    /* We need to write out any unwritten data if we reopen read-only. */
    if ((state->flags & BDRV_O_RDWR) == 0) {
        ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
        if (ret < 0) {
            goto fail;
        }

        ret = bdrv_flush(state->bs);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_mark_clean(state->bs);
        if (ret < 0) {
            goto fail;
        }
    }

    return 0;

fail:
    qcow2_update_options_abort(state->bs, r);
    g_free(r);
    /* Do not leave a dangling pointer to the freed state behind */
    state->opaque = NULL;
    return ret;
}

/*
 * Commit a prepared reopen: apply the prepared options, try to make the
 * dirty bitmaps writable again when the node is now read-write, and free
 * the Qcow2ReopenState stored in state->opaque.
 */
static void qcow2_reopen_commit(BDRVReopenState *state)
{
    qcow2_update_options_commit(state->bs, state->opaque);
    if (state->flags & BDRV_O_RDWR) {
        Error *local_err = NULL;

        if (qcow2_reopen_bitmaps_rw(state->bs, &local_err) < 0) {
            /*
             * This is not fatal, bitmaps just left read-only, so all following
             * writes will fail. User can remove read-only bitmaps to unblock
             * writes or retry reopen.
             */
            error_reportf_err(local_err,
                              "%s: Failed to make dirty bitmaps writable: ",
                              bdrv_get_node_name(state->bs));
        }
    }
    g_free(state->opaque);
}

/*
 * Abort a prepared reopen: discard the prepared options and free the
 * Qcow2ReopenState stored in state->opaque.
 */
static void qcow2_reopen_abort(BDRVReopenState *state)
{
    qcow2_update_options_abort(state->bs, state->opaque);
    g_free(state->opaque);
}

1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
/*
 * Merge @old_options into @options for a reopen.
 *
 * Keys already present in @options win (qdict_join() is called with
 * overwrite == false).  Before merging, old options that are superseded by
 * a new, broader option are removed from @old_options:
 *  - a new overlap-check setting or template drops every old overlap option;
 *  - a new total cache size drops the old L2 and refcount cache sizes.
 * After merging, an old total cache size is dropped if it would now
 * coexist with both individual cache sizes.
 */
static void qcow2_join_options(QDict *options, QDict *old_options)
{
    bool has_new_overlap_template =
        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
    bool has_new_total_cache_size =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
    bool has_all_cache_options;

    /* New overlap template overrides all old overlap options */
    if (has_new_overlap_template) {
        qdict_del(old_options, QCOW2_OPT_OVERLAP);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
    }

    /* New total cache size overrides all old options */
    if (has_new_total_cache_size) {
        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    }

    qdict_join(options, old_options, false);

    /*
     * If after merging all cache size options are set, an old total size is
     * overwritten. Do keep all options, however, if all three are new. The
     * resulting error message is what we want to happen.
     *
     * All three keys must be present for this to apply; an old total size
     * that stands alone (or next to only one individual size) is kept.
     */
    has_all_cache_options =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    if (has_all_cache_options && !has_new_total_cache_size) {
        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
    }
}

1953 1954 1955 1956 1957
/*
 * Report the allocation status of the range starting at @offset.
 *
 * At most MIN(INT_MAX, @count) bytes are examined; the number of bytes the
 * returned status applies to is stored in *@pnum.  For unencrypted normal
 * and allocated-zero clusters, *@map and *@file are set to the host offset
 * and the data file node and BDRV_BLOCK_OFFSET_VALID is reported.
 * @want_zero is not used by this implementation.
 *
 * Returns a combination of BDRV_BLOCK_* flags, or a negative errno if the
 * cluster lookup fails.
 */
static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
                                              bool want_zero,
                                              int64_t offset, int64_t count,
                                              int64_t *pnum, int64_t *map,
                                              BlockDriverState **file)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
    unsigned int bytes;
    int ret, status = 0;

    qemu_co_mutex_lock(&s->lock);

    /* Detect metadata preallocation once and cache the answer */
    if (!s->metadata_preallocation_checked) {
        ret = qcow2_detect_metadata_preallocation(bs);
        s->metadata_preallocation = (ret == 1);
        s->metadata_preallocation_checked = true;
    }

    bytes = MIN(INT_MAX, count);
    ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

    *pnum = bytes;

    /* From here on, ret holds the cluster type of the range */
    if ((ret == QCOW2_CLUSTER_NORMAL || ret == QCOW2_CLUSTER_ZERO_ALLOC) &&
        !s->crypto) {
        *map = cluster_offset | offset_into_cluster(s, offset);
        *file = s->data_file->bs;
        status |= BDRV_BLOCK_OFFSET_VALID;
    }
    if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
        status |= BDRV_BLOCK_ZERO;
    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
        status |= BDRV_BLOCK_DATA;
    }
    if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) &&
        (status & BDRV_BLOCK_OFFSET_VALID))
    {
        status |= BDRV_BLOCK_RECURSE;
    }
    return status;
}

F
Fam Zheng 已提交
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
/*
 * Finish every QCowL2Meta in the list *@pl2meta.
 *
 * With @link_l2 set, each allocation is linked into the L2 table; otherwise
 * it is aborted.  Each processed entry is removed from the in-flight list
 * (if it covers any clusters), its dependent requests are restarted and the
 * entry is freed.
 *
 * On return *@pl2meta points at the first entry that was not processed:
 * NULL on success, or the entry whose linking failed.  Returns 0 on success
 * or the non-zero result of qcow2_alloc_cluster_link_l2().
 */
static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
                                            QCowL2Meta **pl2meta,
                                            bool link_l2)
{
    int ret = 0;
    QCowL2Meta *l2meta = *pl2meta;

    while (l2meta != NULL) {
        QCowL2Meta *next;

        if (link_l2) {
            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
            if (ret) {
                goto out;
            }
        } else {
            qcow2_alloc_cluster_abort(bs, l2meta);
        }

        /* Take the request off the list of running requests */
        if (l2meta->nb_clusters != 0) {
            QLIST_REMOVE(l2meta, next_in_flight);
        }

        qemu_co_queue_restart_all(&l2meta->dependent_requests);

        next = l2meta->next;
        g_free(l2meta);
        l2meta = next;
    }
out:
    *pl2meta = l2meta;
    return ret;
}

2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087
/*
 * Read @bytes of an encrypted cluster range into @qiov at @qiov_offset.
 *
 * The ciphertext is read from the data file into a bounce buffer, decrypted
 * there and only then copied into the caller's vector.
 *
 * Returns -ENOMEM if the bounce buffer cannot be allocated, the negative
 * result of the read if that fails, -EIO if decryption fails, and otherwise
 * the (non-negative) result of the read.
 */
static coroutine_fn int
qcow2_co_preadv_encrypted(BlockDriverState *bs,
                           uint64_t file_cluster_offset,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           uint64_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t host_offset;
    uint8_t *bounce;
    int ret;

    assert(bs->encrypted && s->crypto);
    assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);

    /*
     * For encrypted images, read everything into a temporary
     * contiguous buffer on which the AES functions can work.
     * Also, decryption in a separate buffer is better as it
     * prevents the guest from learning information about the
     * encrypted nature of the virtual disk.
     */
    bounce = qemu_try_blockalign(s->data_file->bs, bytes);
    if (!bounce) {
        return -ENOMEM;
    }

    host_offset = file_cluster_offset + offset_into_cluster(s, offset);

    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
    ret = bdrv_co_pread(s->data_file, host_offset, bytes, bounce, 0);
    if (ret >= 0) {
        assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

        if (qcow2_co_decrypt(bs, host_offset, offset, bounce, bytes) < 0) {
            ret = -EIO;
        } else {
            qemu_iovec_from_buf(qiov, qiov_offset, bounce, bytes);
        }
    }

    qemu_vfree(bounce);

    return ret;
}

2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141
/*
 * One unit of qcow2 I/O work, filled in by qcow2_add_task() and either run
 * directly or handed to an AioTaskPool.  The embedded AioTask is what the
 * entry functions receive; they recover this structure with container_of().
 */
typedef struct Qcow2AioTask {
    AioTask task;

    BlockDriverState *bs;
    QCow2ClusterType cluster_type; /* only for read */
    uint64_t file_cluster_offset;
    uint64_t offset;
    uint64_t bytes;
    QEMUIOVector *qiov;
    uint64_t qiov_offset;
    QCowL2Meta *l2meta; /* only for write */
} Qcow2AioTask;

static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task);

/*
 * Describe one I/O request as a Qcow2AioTask and run it through @func.
 *
 * Without a @pool the task lives on the stack, @func is called right away
 * and its return value is returned.  With a @pool the task is heap
 * allocated and started via aio_task_pool_start_task(); 0 is returned.
 */
static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
                                       AioTaskPool *pool,
                                       AioTaskFunc func,
                                       QCow2ClusterType cluster_type,
                                       uint64_t file_cluster_offset,
                                       uint64_t offset,
                                       uint64_t bytes,
                                       QEMUIOVector *qiov,
                                       size_t qiov_offset,
                                       QCowL2Meta *l2meta)
{
    Qcow2AioTask on_stack;
    Qcow2AioTask *t;
    const char *direction;

    if (pool) {
        t = g_new(Qcow2AioTask, 1);
    } else {
        t = &on_stack;
    }

    /* The compound literal also zeroes every field not named here */
    *t = (Qcow2AioTask) {
        .task.func = func,
        .bs = bs,
        .cluster_type = cluster_type,
        .file_cluster_offset = file_cluster_offset,
        .offset = offset,
        .bytes = bytes,
        .qiov = qiov,
        .qiov_offset = qiov_offset,
        .l2meta = l2meta,
    };

    if (func == qcow2_co_preadv_task_entry) {
        direction = "read";
    } else {
        direction = "write";
    }
    trace_qcow2_add_task(qemu_coroutine_self(), bs, pool, direction,
                         cluster_type, file_cluster_offset, offset, bytes,
                         qiov, qiov_offset);

    if (pool) {
        aio_task_pool_start_task(pool, &t->task);
        return 0;
    }

    return func(&t->task);
}

2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169
/*
 * Read one chunk whose cluster type has already been determined.
 *
 * Unallocated chunks are read from the backing file, compressed chunks go
 * through qcow2_co_preadv_compressed(), and normal chunks are read from the
 * data file (via qcow2_co_preadv_encrypted() for encrypted images).  Zero
 * clusters must never reach this function.
 *
 * Returns the result of the read helper that was called.
 */
static coroutine_fn int qcow2_co_preadv_task(BlockDriverState *bs,
                                             QCow2ClusterType cluster_type,
                                             uint64_t file_cluster_offset,
                                             uint64_t offset, uint64_t bytes,
                                             QEMUIOVector *qiov,
                                             size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster = offset_into_cluster(s, offset);

    switch (cluster_type) {
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        /* Both zero types are handled in qcow2_co_preadv_part */
        g_assert_not_reached();

    case QCOW2_CLUSTER_UNALLOCATED:
        assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */

        BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        return bdrv_co_preadv_part(bs->backing, offset, bytes,
                                   qiov, qiov_offset, 0);

    case QCOW2_CLUSTER_COMPRESSED:
        return qcow2_co_preadv_compressed(bs, file_cluster_offset,
                                          offset, bytes, qiov, qiov_offset);

    case QCOW2_CLUSTER_NORMAL:
        /* The host offset of a normal cluster is cluster aligned */
        assert(offset_into_cluster(s, file_cluster_offset) == 0);
        if (bs->encrypted) {
            return qcow2_co_preadv_encrypted(bs, file_cluster_offset,
                                             offset, bytes, qiov, qiov_offset);
        }

        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
        return bdrv_co_preadv_part(s->data_file,
                                   file_cluster_offset + offset_in_cluster,
                                   bytes, qiov, qiov_offset, 0);

    default:
        g_assert_not_reached();
    }

    g_assert_not_reached();
}

2188 2189 2190 2191 2192 2193 2194 2195 2196 2197
/*
 * AioTask entry point for reads: unpack the enclosing Qcow2AioTask and
 * forward its fields to qcow2_co_preadv_task().  Read tasks carry no l2meta.
 */
static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task)
{
    Qcow2AioTask *rd = container_of(task, Qcow2AioTask, task);

    assert(!rd->l2meta);

    return qcow2_co_preadv_task(rd->bs,
                                rd->cluster_type,
                                rd->file_cluster_offset,
                                rd->offset,
                                rd->bytes,
                                rd->qiov,
                                rd->qiov_offset);
}

2198 2199 2200 2201
/*
 * Read @bytes starting at guest @offset into @qiov at @qiov_offset.
 *
 * The request is split into chunks as reported by
 * qcow2_get_cluster_offset().  Chunks that read as zeroes (zero clusters,
 * or unallocated clusters without a backing file) are filled in directly;
 * every other chunk becomes a task run through qcow2_add_task().  A task
 * pool is only created when the request needs more than one chunk.
 * @flags is not used by this implementation.
 *
 * Returns 0 on success or a negative errno from the cluster lookup, from
 * qcow2_add_task() or from the task pool.
 */
static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
                                             uint64_t offset, uint64_t bytes,
                                             QEMUIOVector *qiov,
                                             size_t qiov_offset, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int ret = 0;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset = 0;
    AioTaskPool *aio = NULL;

    while (bytes != 0 && aio_task_pool_status(aio) == 0) {
        QCow2ClusterType type;

        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        if (s->crypto) {
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        qemu_co_mutex_lock(&s->lock);
        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
        qemu_co_mutex_unlock(&s->lock);
        if (ret < 0) {
            goto out;
        }

        /*
         * A non-negative result is the cluster type, not a status.  Keep it
         * separately so that a trailing zero cluster neither leaks a
         * positive value to the caller nor hides the pool status below.
         */
        type = ret;
        ret = 0;

        if (type == QCOW2_CLUSTER_ZERO_PLAIN ||
            type == QCOW2_CLUSTER_ZERO_ALLOC ||
            (type == QCOW2_CLUSTER_UNALLOCATED && !bs->backing))
        {
            qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
        } else {
            if (!aio && cur_bytes != bytes) {
                aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
            }
            ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
                                 cluster_offset, offset, cur_bytes,
                                 qiov, qiov_offset, NULL);
            if (ret < 0) {
                goto out;
            }
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        qiov_offset += cur_bytes;
    }

out:
    if (aio) {
        /* Wait for every started task before reporting a result */
        aio_task_pool_wait_all(aio);
        if (ret == 0) {
            ret = aio_task_pool_status(aio);
        }
        g_free(aio);
    }

    return ret;
}

2258 2259 2260
/* Check if it's possible to merge a write request with the writing of
 * the data from the COW regions */
static bool merge_cow(uint64_t offset, unsigned bytes,
2261 2262
                      QEMUIOVector *qiov, size_t qiov_offset,
                      QCowL2Meta *l2meta)
2263 2264 2265 2266 2267 2268 2269 2270 2271
{
    QCowL2Meta *m;

    for (m = l2meta; m != NULL; m = m->next) {
        /* If both COW regions are empty then there's nothing to merge */
        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
            continue;
        }

2272 2273 2274 2275 2276
        /* If COW regions are handled already, skip this too */
        if (m->skip_cow) {
            continue;
        }

2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290
        /* The data (middle) region must be immediately after the
         * start region */
        if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
            continue;
        }

        /* The end region must be immediately after the data (middle)
         * region */
        if (m->offset + m->cow_end.offset != offset + bytes) {
            continue;
        }

        /* Make sure that adding both COW regions to the QEMUIOVector
         * does not exceed IOV_MAX */
2291
        if (qemu_iovec_subvec_niov(qiov, qiov_offset, bytes) > IOV_MAX - 2) {
2292 2293 2294
            continue;
        }

2295 2296
        m->data_qiov = qiov;
        m->data_qiov_offset = qiov_offset;
2297 2298 2299 2300 2301 2302
        return true;
    }

    return false;
}

2303 2304 2305 2306
/*
 * Return true if @bytes is 0, or if bdrv_is_allocated_above() returns 0 for
 * the range starting at @offset and that answer covers all @bytes.  Any
 * non-zero result of the query (including a negative one) yields false.
 */
static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    int64_t nr;

    return !bytes ||
        (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) &&
         nr == bytes);
}

/*
 * Return true if both COW regions of @m are reported as unallocated.
 *
 * This check is designed for optimization shortcut so it must be
 * efficient.
 * Instead of is_zero(), use is_unallocated() as it is faster (but not
 * as accurate and can result in false negatives).
 */
static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    /* The tail region is only examined once the head region has passed */
    if (!is_unallocated(bs, m->offset + m->cow_start.offset,
                        m->cow_start.nb_bytes)) {
        return false;
    }

    return is_unallocated(bs, m->offset + m->cow_end.offset,
                          m->cow_end.nb_bytes);
}

/*
 * Try to replace the COW writes of the allocations in @l2meta by zeroing
 * the newly allocated clusters in one request.
 *
 * Nothing is done when the data file does not advertise
 * BDRV_REQ_NO_FALLBACK for write-zeroes or when the image is encrypted.
 * Entries without COW regions, or whose COW regions are not reported as
 * unallocated, are left alone.  An entry that was zeroed successfully gets
 * skip_cow set.  -ENOTSUP and -EAGAIN from the zeroing request are
 * tolerated: the entry is simply left for regular COW.
 *
 * Returns 0 on success or a negative errno from the overlap check or the
 * write-zeroes request.
 */
static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *cur;

    if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK) ||
        bs->encrypted) {
        return 0;
    }

    for (cur = l2meta; cur != NULL; cur = cur->next) {
        int ret;
        bool has_cow = cur->cow_start.nb_bytes || cur->cow_end.nb_bytes;

        if (!has_cow || !is_zero_cow(bs, cur)) {
            continue;
        }

        /*
         * instead of writing zero COW buffers,
         * efficiently zero out the whole clusters
         */

        ret = qcow2_pre_write_overlap_check(bs, 0, cur->alloc_offset,
                                            cur->nb_clusters * s->cluster_size,
                                            true);
        if (ret < 0) {
            return ret;
        }

        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
        ret = bdrv_co_pwrite_zeroes(s->data_file, cur->alloc_offset,
                                    cur->nb_clusters * s->cluster_size,
                                    BDRV_REQ_NO_FALLBACK);
        if (ret == -ENOTSUP || ret == -EAGAIN) {
            /* Efficient zeroing is unavailable here; fall back to COW */
            continue;
        }
        if (ret < 0) {
            return ret;
        }

        trace_qcow2_skip_cow(qemu_coroutine_self(), cur->offset,
                             cur->nb_clusters);
        cur->skip_cow = true;
    }

    return 0;
}

2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459
/*
 * qcow2_co_pwritev_task
 *
 * Writes @bytes bytes of guest data taken from @qiov (starting at
 * @qiov_offset) to the data file at @file_cluster_offset plus the
 * in-cluster part of the guest @offset. For encrypted images the data is
 * first copied into a bounce buffer and encrypted there.
 *
 * Called with s->lock unlocked; the lock is taken internally for the
 * l2meta handling and released again before returning.
 *
 * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. The caller
 *           must not use it in any way after the qcow2_co_pwritev_task() call.
 *
 * Returns a negative errno if any step fails, otherwise the return value of
 * qcow2_handle_l2meta().
 */
static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
                                              uint64_t file_cluster_offset,
                                              uint64_t offset, uint64_t bytes,
                                              QEMUIOVector *qiov,
                                              uint64_t qiov_offset,
                                              QCowL2Meta *l2meta)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;
    void *crypt_buf = NULL;
    int offset_in_cluster = offset_into_cluster(s, offset);
    QEMUIOVector encrypted_qiov;

    if (bs->encrypted) {
        assert(s->crypto);
        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        /* Bounce buffer: the payload is copied here and encrypted in it */
        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
        if (crypt_buf == NULL) {
            ret = -ENOMEM;
            goto out_unlocked;
        }
        qemu_iovec_to_buf(qiov, qiov_offset, crypt_buf, bytes);

        if (qcow2_co_encrypt(bs, file_cluster_offset + offset_in_cluster,
                             offset, crypt_buf, bytes) < 0)
        {
            ret = -EIO;
            goto out_unlocked;
        }

        /* From here on, write the bounce buffer instead of the caller's qiov */
        qemu_iovec_init_buf(&encrypted_qiov, crypt_buf, bytes);
        qiov = &encrypted_qiov;
        qiov_offset = 0;
    }

    /* Try to efficiently initialize the physical space with zeroes */
    ret = handle_alloc_space(bs, l2meta);
    if (ret < 0) {
        goto out_unlocked;
    }

    /*
     * If we need to do COW, check if it's possible to merge the
     * writing of the guest data together with that of the COW regions.
     * If it's not possible (or not necessary) then write the
     * guest data now.
     */
    if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(),
                                file_cluster_offset + offset_in_cluster);
        ret = bdrv_co_pwritev_part(s->data_file,
                                   file_cluster_offset + offset_in_cluster,
                                   bytes, qiov, qiov_offset, 0);
        if (ret < 0) {
            goto out_unlocked;
        }
    }

    /* The l2meta handling below runs with s->lock held */
    qemu_co_mutex_lock(&s->lock);

    /*
     * Success path: process l2meta with the third argument set to true.
     * NOTE(review): presumably this links the new clusters into the L2
     * tables - confirm against qcow2_handle_l2meta().
     */
    ret = qcow2_handle_l2meta(bs, &l2meta, true);
    goto out_locked;

out_unlocked:
    /* An earlier step failed before the lock was taken; take it for cleanup */
    qemu_co_mutex_lock(&s->lock);

out_locked:
    /*
     * Process whatever is still referenced by l2meta with the third
     * argument set to false; the return value is deliberately ignored so
     * that ret keeps the result of the step that got us here.
     */
    qcow2_handle_l2meta(bs, &l2meta, false);
    qemu_co_mutex_unlock(&s->lock);

    qemu_vfree(crypt_buf);

    return ret;
}

2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470
/*
 * AioTask entry point for one write sub-request: recovers the enclosing
 * Qcow2AioTask from @task and forwards its fields to
 * qcow2_co_pwritev_task(), whose return value is passed through.
 */
static coroutine_fn int qcow2_co_pwritev_task_entry(AioTask *task)
{
    Qcow2AioTask *req = container_of(task, Qcow2AioTask, task);
    int ret;

    /* Write tasks are expected to carry no cluster type */
    assert(!req->cluster_type);

    ret = qcow2_co_pwritev_task(req->bs, req->file_cluster_offset,
                                req->offset, req->bytes,
                                req->qiov, req->qiov_offset,
                                req->l2meta);
    return ret;
}

2471 2472 2473
/*
 * Write @bytes bytes from @qiov (starting at @qiov_offset) to guest offset
 * @offset.
 *
 * The request is split into chunks. For each chunk the clusters are
 * allocated and overlap-checked under s->lock, then the actual write is
 * handed to qcow2_add_task() with qcow2_co_pwritev_task_entry() as the
 * worker. A task pool is only created when the request needs more than one
 * chunk.
 *
 * Returns 0 on success or a negative errno. If all chunks were queued
 * successfully, the status of the task pool (if any) becomes the result.
 */
static coroutine_fn int qcow2_co_pwritev_part(
        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
        QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QCowL2Meta *l2meta = NULL;
    AioTaskPool *aio = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    /* Stop queueing new chunks as soon as the task pool reports an error */
    while (bytes != 0 && aio_task_pool_status(aio) == 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            /* Keep the chunk within what the task asserts for encryption */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        qemu_co_mutex_lock(&s->lock);

        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto out_locked;
        }

        assert(offset_into_cluster(s, cluster_offset) == 0);

        ret = qcow2_pre_write_overlap_check(bs, 0,
                                            cluster_offset + offset_in_cluster,
                                            cur_bytes, true);
        if (ret < 0) {
            goto out_locked;
        }

        qemu_co_mutex_unlock(&s->lock);

        /* A pool is only needed if this chunk does not cover the rest */
        if (!aio && cur_bytes != bytes) {
            aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
        }
        ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0,
                             cluster_offset, offset, cur_bytes,
                             qiov, qiov_offset, l2meta);
        l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */
        if (ret < 0) {
            goto fail_nometa;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        qiov_offset += cur_bytes;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    ret = 0;

    qemu_co_mutex_lock(&s->lock);

out_locked:
    /* Reached with s->lock held; handles an l2meta that was not handed off */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

fail_nometa:
    if (aio) {
        /* Wait for all queued chunks before reporting the final status */
        aio_task_pool_wait_all(aio);
        if (ret == 0) {
            ret = aio_task_pool_status(aio);
        }
        g_free(aio);
    }

    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

K
Kevin Wolf 已提交
2556 2557 2558 2559
/*
 * Store the persistent dirty bitmaps and flush the L2 table and refcount
 * block caches.
 *
 * Every step is attempted even if an earlier one failed; each failure is
 * reported and the error of the last failing step is returned. The image is
 * only marked clean when all steps succeeded.
 *
 * Returns 0 on success, a negative errno otherwise.
 */
static int qcow2_inactivate(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, result = 0;
    Error *local_err = NULL;

    qcow2_store_persistent_dirty_bitmaps(bs, true, &local_err);
    if (local_err != NULL) {
        result = -EINVAL;
        error_reportf_err(local_err, "Lost persistent bitmaps during "
                          "inactivation of node '%s': ",
                          bdrv_get_device_or_node_name(bs));
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret) {
        result = ret;
        error_report("Failed to flush the L2 table cache: %s",
                     strerror(-ret));
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret) {
        result = ret;
        error_report("Failed to flush the refcount block cache: %s",
                     strerror(-ret));
    }

    if (result == 0) {
        qcow2_mark_clean(bs);
    }

    return result;
}

2591
/*
 * Release everything held in the driver state of @bs: the L1 table, the
 * metadata caches, the crypto context, saved header fields and extensions,
 * the stored file names, the external data file child, refcount state and
 * snapshots. Unless the node is already inactive, qcow2_inactivate() runs
 * first.
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    cache_clean_timer_del(bs);
    qcow2_cache_destroy(s->l2_table_cache);
    qcow2_cache_destroy(s->refcount_block_cache);

    qcrypto_block_free(s->crypto);
    s->crypto = NULL;

    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_data_file);
    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    if (has_data_file(bs)) {
        bdrv_unref_child(bs, s->data_file);
    }

    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}

2624 2625
/*
 * Drop all cached state of @bs and re-read it from the image by closing and
 * reopening the qcow2 layer with BDRV_O_INACTIVE cleared.
 *
 * The crypto context is detached before qcow2_close() and put back after a
 * successful reopen. On failure an error is set in @errp, bs->drv is set to
 * NULL and the detached crypto context is freed.
 */
static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
                                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int flags = s->flags;
    QCryptoBlock *crypto = NULL;
    QDict *options;
    Error *local_err = NULL;
    int ret;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    /* Detach the crypto context so that qcow2_close() does not free it */
    crypto = s->crypto;
    s->crypto = NULL;

    qcow2_close(bs);

    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

    flags &= ~BDRV_O_INACTIVE;
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_do_open(bs, options, flags, &local_err);
    qemu_co_mutex_unlock(&s->lock);
    qobject_unref(options);
    if (local_err) {
        error_propagate_prepend(errp, local_err,
                                "Could not reopen qcow2 layer: ");
        goto fail;
    } else if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
        goto fail;
    }

    s->crypto = crypto;
    return;

fail:
    /*
     * The detached crypto context is only referenced by the local variable
     * at this point; free it instead of leaking it.
     */
    qcrypto_block_free(crypto);
    bs->drv = NULL;
}

K
Kevin Wolf 已提交
2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679
/*
 * Serialise one header extension into @buf.
 *
 * @buf:    destination; receives a QCowExtension header followed by the
 *          payload, padded to a multiple of 8 bytes
 * @magic:  extension type, stored big-endian
 * @s:      payload (may be NULL if @len is 0)
 * @len:    payload length in bytes, stored big-endian
 * @buflen: space available in @buf
 *
 * Returns the number of bytes written. If @buf is too small, -ENOSPC is
 * returned through the size_t return type; callers store the result in an
 * int and test for a negative value.
 */
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
    size_t len, size_t buflen)
{
    QCowExtension *ext = (QCowExtension *) buf;
    size_t padded_len = (len + 7) & ~(size_t)7;
    size_t ext_len = sizeof(QCowExtension) + padded_len;
    char *payload = buf + sizeof(QCowExtension);

    if (buflen < ext_len) {
        return -ENOSPC;
    }

    *ext = (QCowExtension) {
        .magic  = cpu_to_be32(magic),
        .len    = cpu_to_be32(len),
    };

    if (len) {
        memcpy(payload, s, len);
    }

    /* Zero the alignment padding rather than relying on a pre-zeroed buffer */
    memset(payload + len, 0, padded_len - len);

    return ext_len;
}

K
Kevin Wolf 已提交
2688
/*
K
Kevin Wolf 已提交
2689 2690 2691 2692
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
K
Kevin Wolf 已提交
2693 2694 2695
 *
 * Returns 0 on success, -errno in error cases.
 */
K
Kevin Wolf 已提交
2696
int qcow2_update_header(BlockDriverState *bs)
K
Kevin Wolf 已提交
2697
{
2698
    BDRVQcow2State *s = bs->opaque;
K
Kevin Wolf 已提交
2699 2700 2701
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
K
Kevin Wolf 已提交
2702
    int ret;
K
Kevin Wolf 已提交
2703 2704
    uint64_t total_size;
    uint32_t refcount_table_clusters;
K
Kevin Wolf 已提交
2705
    size_t header_length;
2706
    Qcow2UnknownHeaderExtension *uext;
K
Kevin Wolf 已提交
2707

K
Kevin Wolf 已提交
2708
    buf = qemu_blockalign(bs, buflen);
K
Kevin Wolf 已提交
2709

K
Kevin Wolf 已提交
2710 2711
    /* Header structure */
    header = (QCowHeader*) buf;
K
Kevin Wolf 已提交
2712

K
Kevin Wolf 已提交
2713 2714 2715
    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
K
Kevin Wolf 已提交
2716 2717
    }

K
Kevin Wolf 已提交
2718
    header_length = sizeof(*header) + s->unknown_header_fields_size;
K
Kevin Wolf 已提交
2719 2720 2721 2722
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
K
Kevin Wolf 已提交
2723
        /* Version 2 fields */
K
Kevin Wolf 已提交
2724
        .magic                  = cpu_to_be32(QCOW_MAGIC),
K
Kevin Wolf 已提交
2725
        .version                = cpu_to_be32(s->qcow_version),
K
Kevin Wolf 已提交
2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
K
Kevin Wolf 已提交
2737 2738 2739 2740 2741

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
2742
        .refcount_order         = cpu_to_be32(s->refcount_order),
K
Kevin Wolf 已提交
2743
        .header_length          = cpu_to_be32(header_length),
K
Kevin Wolf 已提交
2744
    };
K
Kevin Wolf 已提交
2745

K
Kevin Wolf 已提交
2746 2747 2748 2749 2750 2751 2752 2753 2754
    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
2755 2756
        ret = -EINVAL;
        goto fail;
K
Kevin Wolf 已提交
2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773
    }

    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }
K
Kevin Wolf 已提交
2774

K
Kevin Wolf 已提交
2775
    /* Backing file format header extension */
2776
    if (s->image_backing_format) {
K
Kevin Wolf 已提交
2777
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
2778 2779
                             s->image_backing_format,
                             strlen(s->image_backing_format),
K
Kevin Wolf 已提交
2780 2781 2782
                             buflen);
        if (ret < 0) {
            goto fail;
K
Kevin Wolf 已提交
2783 2784
        }

K
Kevin Wolf 已提交
2785 2786
        buf += ret;
        buflen -= ret;
K
Kevin Wolf 已提交
2787 2788
    }

2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801
    /* External data file header extension */
    if (has_data_file(bs) && s->image_data_file) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_DATA_FILE,
                             s->image_data_file, strlen(s->image_data_file),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

2802 2803
    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
2804 2805
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
2806 2807 2808
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
2809 2810
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
2811 2812 2813 2814 2815 2816 2817
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2818
    /* Feature table */
2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
2831 2832 2833 2834 2835
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DATA_FILE_BITNR,
                .name = "external data file",
            },
2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
2850 2851
    }

2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870
    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881
    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

K
Kevin Wolf 已提交
2882 2883
    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
K
Kevin Wolf 已提交
2884 2885 2886 2887
    if (ret < 0) {
        goto fail;
    }

K
Kevin Wolf 已提交
2888 2889
    buf += ret;
    buflen -= ret;
K
Kevin Wolf 已提交
2890

K
Kevin Wolf 已提交
2891
    /* Backing file name */
2892 2893
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);
K
Kevin Wolf 已提交
2894 2895 2896 2897 2898 2899

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

2900
        /* Using strncpy is ok here, since buf is not NUL-terminated. */
2901
        strncpy(buf, s->image_backing_file, buflen);
K
Kevin Wolf 已提交
2902 2903 2904

        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
K
Kevin Wolf 已提交
2905 2906
    }

K
Kevin Wolf 已提交
2907
    /* Write the new header */
2908
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
K
Kevin Wolf 已提交
2909 2910 2911 2912 2913 2914
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
K
Kevin Wolf 已提交
2915
    qemu_vfree(header);
K
Kevin Wolf 已提交
2916 2917 2918 2919 2920 2921
    return ret;
}

/*
 * Record a new backing file name and format for @bs, both in the
 * BlockDriverState strings and in the qcow2 state, then rewrite the image
 * header. Passing NULL for @backing_file or @backing_fmt clears the
 * respective setting.
 *
 * Returns -EINVAL if a backing file is requested for an image with a raw
 * external data file or if the name is longer than 1023 bytes; otherwise
 * the result of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BDRVQcow2State *s = bs->opaque;

    /* Adding a backing file means that the external data file alone won't be
     * enough to make sense of the content */
    if (backing_file && data_file_is_raw(bs)) {
        return -EINVAL;
    }

    if (backing_file && strlen(backing_file) > 1023) {
        return -EINVAL;
    }

    pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
            backing_file ?: "");
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");

    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
    s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;

    return qcow2_update_header(bs);
}

2948 2949 2950 2951 2952 2953 2954 2955 2956 2957
/*
 * Translate an encryption format name into the matching QCOW_CRYPT_*
 * constant: "luks" and "aes" are recognised, anything else yields -EINVAL.
 */
static int qcow2_crypt_method_from_format(const char *encryptfmt)
{
    if (strcmp(encryptfmt, "luks") == 0) {
        return QCOW_CRYPT_LUKS;
    }

    if (strcmp(encryptfmt, "aes") == 0) {
        return QCOW_CRYPT_AES;
    }

    return -EINVAL;
}
2958

2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976
/*
 * Set up encryption for @bs according to @cryptoopts: record the crypt
 * method for the header, create the crypto block (using the qcow2 crypto
 * header init/write callbacks) and rewrite the qcow2 header.
 *
 * The crypto block created here is freed again before returning.
 *
 * Returns 0 on success; -EINVAL for an unsupported format or if the crypto
 * block cannot be created; the error of qcow2_update_header() otherwise.
 * @errp is set on every failure.
 */
static int qcow2_set_up_encryption(BlockDriverState *bs,
                                   QCryptoBlockCreateOptions *cryptoopts,
                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCryptoBlock *crypto = NULL;
    int fmt, ret;

    switch (cryptoopts->format) {
    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
        fmt = QCOW_CRYPT_LUKS;
        break;
    case Q_CRYPTO_BLOCK_FORMAT_QCOW:
        fmt = QCOW_CRYPT_AES;
        break;
    default:
        error_setg(errp, "Crypto format not supported in qcow2");
        return -EINVAL;
    }

    s->crypt_method_header = fmt;

    crypto = qcrypto_block_create(cryptoopts, "encrypt.",
                                  qcow2_crypto_hdr_init_func,
                                  qcow2_crypto_hdr_write_func,
                                  bs, errp);
    if (!crypto) {
        return -EINVAL;
    }

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        goto out;
    }

    ret = 0;
 out:
    qcrypto_block_free(crypto);
    return ret;
}

3001 3002 3003 3004 3005 3006 3007
/**
 * Preallocates metadata structures for data clusters between @offset (in the
 * guest disk) and @new_length (which is thus generally the new guest disk
 * size).
 *
 * Returns: 0 on success, -errno on failure.
 */
3008
static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
3009 3010
                                       uint64_t new_length, PreallocMode mode,
                                       Error **errp)
K
Kevin Wolf 已提交
3011
{
3012
    BDRVQcow2State *s = bs->opaque;
K
Kevin Wolf 已提交
3013
    uint64_t bytes;
K
Kevin Wolf 已提交
3014
    uint64_t host_offset = 0;
3015
    int64_t file_length;
K
Kevin Wolf 已提交
3016
    unsigned int cur_bytes;
3017
    int ret;
3018
    QCowL2Meta *meta;
K
Kevin Wolf 已提交
3019

3020 3021
    assert(offset <= new_length);
    bytes = new_length - offset;
K
Kevin Wolf 已提交
3022

K
Kevin Wolf 已提交
3023
    while (bytes) {
3024
        cur_bytes = MIN(bytes, QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size));
K
Kevin Wolf 已提交
3025
        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
K
Kevin Wolf 已提交
3026
                                         &host_offset, &meta);
3027
        if (ret < 0) {
K
Kevin Wolf 已提交
3028
            error_setg_errno(errp, -ret, "Allocating clusters failed");
3029
            return ret;
K
Kevin Wolf 已提交
3030 3031
        }

3032 3033 3034
        while (meta) {
            QCowL2Meta *next = meta->next;

H
Hu Tao 已提交
3035 3036
            ret = qcow2_alloc_cluster_link_l2(bs, meta);
            if (ret < 0) {
K
Kevin Wolf 已提交
3037
                error_setg_errno(errp, -ret, "Mapping clusters failed");
H
Hu Tao 已提交
3038 3039
                qcow2_free_any_clusters(bs, meta->alloc_offset,
                                        meta->nb_clusters, QCOW2_DISCARD_NEVER);
3040
                return ret;
H
Hu Tao 已提交
3041 3042 3043 3044
            }

            /* There are no dependent requests, but we need to remove our
             * request from the list of in-flight requests */
3045
            QLIST_REMOVE(meta, next_in_flight);
3046 3047 3048

            g_free(meta);
            meta = next;
3049
        }
3050

K
Kevin Wolf 已提交
3051 3052
        /* TODO Preallocate data if requested */

K
Kevin Wolf 已提交
3053 3054
        bytes -= cur_bytes;
        offset += cur_bytes;
K
Kevin Wolf 已提交
3055 3056 3057 3058 3059 3060 3061
    }

    /*
     * It is expected that the image file is large enough to actually contain
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
3062 3063 3064 3065 3066 3067 3068 3069 3070 3071
    file_length = bdrv_getlength(s->data_file->bs);
    if (file_length < 0) {
        error_setg_errno(errp, -file_length, "Could not get file size");
        return file_length;
    }

    if (host_offset + cur_bytes > file_length) {
        if (mode == PREALLOC_MODE_METADATA) {
            mode = PREALLOC_MODE_OFF;
        }
3072 3073
        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
                               mode, errp);
3074
        if (ret < 0) {
3075
            return ret;
3076
        }
K
Kevin Wolf 已提交
3077 3078
    }

3079
    return 0;
K
Kevin Wolf 已提交
3080 3081
}

3082 3083 3084 3085
/* qcow2_refcount_metadata_size:
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
 * @cluster_size: size of a cluster, in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
 *                     needs to be
 * @refblock_count: if not NULL, receives the number of refcount block
 *                  clusters that were calculated
 *
 * Returns: Number of bytes required for refcount blocks and table metadata.
 */
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count)
{
    /*
     * Every host cluster is reference-counted, including metadata (even
     * refcount metadata is recursively included).
     *
     * An accurate formula for the size of refcount metadata size is difficult
     * to derive.  An easier method of calculation is finding the fixed point
     * where no further refcount blocks or table clusters are required to
     * reference count every cluster.
     */
    int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t);
    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
    int64_t table = 0;  /* number of refcount table clusters */
    int64_t blocks = 0; /* number of refcount block clusters */
    int64_t last;
    int64_t n = 0;

    do {
        last = n;
        blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
        table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
        n = clusters + blocks + table;

        /*
         * Once the fixed point is reached, optionally add half of the table
         * clusters to the input and iterate once more (done at most once).
         */
        if (n == last && generous_increase) {
            clusters += DIV_ROUND_UP(table, 2);
            n = 0; /* force another loop */
            generous_increase = false;
        }
    } while (n != last);

    if (refblock_count) {
        *refblock_count = blocks;
    }

    return (blocks + table) * cluster_size;
}

3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * Adds up one header cluster, the L2 tables, the L1 table and the refcount
 * metadata for an image of @total_size rounded up to a whole number of
 * clusters.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    int64_t meta_size = 0;
    uint64_t nl1e, nl2e;
    int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);

    /* header: 1 cluster */
    meta_size += cluster_size;

    /* total size of L2 tables */
    nl2e = aligned_total_size / cluster_size;
    nl2e = ROUND_UP(nl2e, cluster_size / sizeof(uint64_t));
    meta_size += nl2e * sizeof(uint64_t);

    /* total size of L1 tables */
    nl1e = nl2e * sizeof(uint64_t) / cluster_size;
    nl1e = ROUND_UP(nl1e, cluster_size / sizeof(uint64_t));
    meta_size += nl1e * sizeof(uint64_t);

    /* total size of refcount table and blocks */
    meta_size += qcow2_refcount_metadata_size(
            (meta_size + aligned_total_size) / cluster_size,
            cluster_size, refcount_order, false, NULL);

    return meta_size + aligned_total_size;
}

3169
/*
 * Check that @cluster_size is a power of two whose exponent lies between
 * MIN_CLUSTER_BITS and MAX_CLUSTER_BITS.
 *
 * Returns true if it is; otherwise sets @errp and returns false.
 */
static bool validate_cluster_size(size_t cluster_size, Error **errp)
{
    int cluster_bits = ctz32(cluster_size);
    /*
     * The range check comes first, so the shift below is only evaluated for
     * an exponent within [MIN_CLUSTER_BITS, MAX_CLUSTER_BITS].
     */
    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
        (1 << cluster_bits) != cluster_size)
    {
        error_setg(errp, "Cluster size must be a power of two between %d and "
                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
        return false;
    }
    return true;
}

/*
 * Fetch the cluster size option from @opts through qemu_opt_get_size_del(),
 * with DEFAULT_CLUSTER_SIZE as the default, and validate it.
 *
 * Returns the cluster size, or 0 with @errp set if it is invalid.
 */
static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
{
    size_t cluster_size;

    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                                         DEFAULT_CLUSTER_SIZE);
    if (!validate_cluster_size(cluster_size, errp)) {
        return 0;
    }
    return cluster_size;
}

/*
 * Fetch the compatibility level option from @opts through
 * qemu_opt_get_del() and map it to a qcow2 version number: "0.10" gives 2,
 * "1.1" or an absent option gives 3.
 *
 * Returns the version, or -EINVAL with @errp set for any other value.
 */
static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
{
    char *compat = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
    int version;

    if (compat == NULL || strcmp(compat, "1.1") == 0) {
        /* version 3 is also the default when the option is not given */
        version = 3;
    } else if (strcmp(compat, "0.10") == 0) {
        version = 2;
    } else {
        error_setg(errp, "Invalid compatibility level: '%s'", compat);
        version = -EINVAL;
    }

    g_free(compat);
    return version;
}

/*
 * Fetch the refcount width option from @opts through
 * qemu_opt_get_number_del() (default 16) and validate it: it must be a
 * power of two no larger than 64, and it must be 16 when @version is below
 * 3.
 *
 * Returns the refcount width in bits, or 0 with @errp set if it is invalid.
 */
static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
                                                Error **errp)
{
    uint64_t refcount_bits;

    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
        return 0;
    }

    if (version < 3 && refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
                   "compatibility level 1.1 or above (use compat=1.1 or "
                   "greater)");
        return 0;
    }

    return refcount_bits;
}

3236
static int coroutine_fn
3237
qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
3238
{
3239
    BlockdevCreateOptionsQcow2 *qcow2_opts;
3240 3241
    QDict *options;

K
Kevin Wolf 已提交
3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253
    /*
     * Open the image file and write a minimal qcow2 header.
     *
     * We keep things simple and start with a zero-sized image. We also
     * do without refcount blocks or a L1 table for now. We'll fix the
     * inconsistency later.
     *
     * We do need a refcount table because growing the refcount table means
     * allocating two new refcount blocks - the seconds of which would be at
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
3254 3255
    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;
3256
    BlockDriverState *data_bs = NULL;
3257
    QCowHeader *header;
3258 3259 3260
    size_t cluster_size;
    int version;
    int refcount_order;
3261
    uint64_t* refcount_table;
M
Max Reitz 已提交
3262
    Error *local_err = NULL;
K
Kevin Wolf 已提交
3263 3264
    int ret;

3265 3266 3267
    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
    qcow2_opts = &create_options->u.qcow2;

3268 3269 3270 3271 3272 3273
    bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
    if (bs == NULL) {
        return -EIO;
    }

    /* Validate options and set default values */
3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301
    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of 512 bytes");
        ret = -EINVAL;
        goto out;
    }

    if (qcow2_opts->has_version) {
        switch (qcow2_opts->version) {
        case BLOCKDEV_QCOW2_VERSION_V2:
            version = 2;
            break;
        case BLOCKDEV_QCOW2_VERSION_V3:
            version = 3;
            break;
        default:
            g_assert_not_reached();
        }
    } else {
        version = 3;
    }

    if (qcow2_opts->has_cluster_size) {
        cluster_size = qcow2_opts->cluster_size;
    } else {
        cluster_size = DEFAULT_CLUSTER_SIZE;
    }

    if (!validate_cluster_size(cluster_size, errp)) {
3302 3303
        ret = -EINVAL;
        goto out;
3304 3305 3306 3307 3308 3309 3310 3311 3312 3313
    }

    if (!qcow2_opts->has_preallocation) {
        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
    }
    if (qcow2_opts->has_backing_file &&
        qcow2_opts->preallocation != PREALLOC_MODE_OFF)
    {
        error_setg(errp, "Backing file and preallocation cannot be used at "
                   "the same time");
3314 3315
        ret = -EINVAL;
        goto out;
3316 3317 3318
    }
    if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) {
        error_setg(errp, "Backing format cannot be used without backing file");
3319 3320
        ret = -EINVAL;
        goto out;
3321 3322 3323 3324 3325 3326 3327
    }

    if (!qcow2_opts->has_lazy_refcounts) {
        qcow2_opts->lazy_refcounts = false;
    }
    if (version < 3 && qcow2_opts->lazy_refcounts) {
        error_setg(errp, "Lazy refcounts only supported with compatibility "
3328
                   "level 1.1 and above (use version=v3 or greater)");
3329 3330
        ret = -EINVAL;
        goto out;
3331 3332 3333 3334 3335 3336 3337 3338 3339 3340
    }

    if (!qcow2_opts->has_refcount_bits) {
        qcow2_opts->refcount_bits = 16;
    }
    if (qcow2_opts->refcount_bits > 64 ||
        !is_power_of_2(qcow2_opts->refcount_bits))
    {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
3341 3342
        ret = -EINVAL;
        goto out;
3343 3344 3345
    }
    if (version < 3 && qcow2_opts->refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
3346
                   "compatibility level 1.1 or above (use version=v3 or "
3347
                   "greater)");
3348 3349
        ret = -EINVAL;
        goto out;
3350 3351 3352
    }
    refcount_order = ctz32(qcow2_opts->refcount_bits);

3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364
    if (qcow2_opts->data_file_raw && !qcow2_opts->data_file) {
        error_setg(errp, "data-file-raw requires data-file");
        ret = -EINVAL;
        goto out;
    }
    if (qcow2_opts->data_file_raw && qcow2_opts->has_backing_file) {
        error_setg(errp, "Backing file and data-file-raw cannot be used at "
                   "the same time");
        ret = -EINVAL;
        goto out;
    }

3365 3366 3367 3368 3369 3370 3371 3372 3373
    if (qcow2_opts->data_file) {
        if (version < 3) {
            error_setg(errp, "External data files are only supported with "
                       "compatibility level 1.1 and above (use version=v3 or "
                       "greater)");
            ret = -EINVAL;
            goto out;
        }
        data_bs = bdrv_open_blockdev_ref(qcow2_opts->data_file, errp);
3374
        if (data_bs == NULL) {
3375 3376 3377 3378
            ret = -EIO;
            goto out;
        }
    }
3379 3380

    /* Create BlockBackend to write to the image */
K
Kevin Wolf 已提交
3381 3382
    blk = blk_new(bdrv_get_aio_context(bs),
                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
3383
    ret = blk_insert_bs(blk, bs, errp);
K
Kevin Wolf 已提交
3384
    if (ret < 0) {
3385
        goto out;
K
Kevin Wolf 已提交
3386
    }
3387 3388
    blk_set_allow_write_beyond_eof(blk, true);

K
Kevin Wolf 已提交
3389
    /* Write the header */
3390 3391 3392 3393 3394
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    header = g_malloc0(cluster_size);
    *header = (QCowHeader) {
        .magic                      = cpu_to_be32(QCOW_MAGIC),
        .version                    = cpu_to_be32(version),
3395
        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
3396 3397 3398 3399 3400
        .size                       = cpu_to_be64(0),
        .l1_table_offset            = cpu_to_be64(0),
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
3401
        .refcount_order             = cpu_to_be32(refcount_order),
3402 3403
        .header_length              = cpu_to_be32(sizeof(*header)),
    };
K
Kevin Wolf 已提交
3404

3405 3406
    /* We'll update this to correct value later */
    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
K
Kevin Wolf 已提交
3407

3408
    if (qcow2_opts->lazy_refcounts) {
3409
        header->compatible_features |=
3410 3411
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }
3412 3413 3414 3415
    if (data_bs) {
        header->incompatible_features |=
            cpu_to_be64(QCOW2_INCOMPAT_DATA_FILE);
    }
3416 3417 3418 3419
    if (qcow2_opts->data_file_raw) {
        header->autoclear_features |=
            cpu_to_be64(QCOW2_AUTOCLEAR_DATA_FILE_RAW);
    }
3420

3421
    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
3422
    g_free(header);
K
Kevin Wolf 已提交
3423
    if (ret < 0) {
M
Max Reitz 已提交
3424
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
K
Kevin Wolf 已提交
3425 3426 3427
        goto out;
    }

3428 3429 3430
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
3431
    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
3432
    g_free(refcount_table);
K
Kevin Wolf 已提交
3433 3434

    if (ret < 0) {
M
Max Reitz 已提交
3435
        error_setg_errno(errp, -ret, "Could not write refcount table");
K
Kevin Wolf 已提交
3436 3437 3438
        goto out;
    }

3439 3440
    blk_unref(blk);
    blk = NULL;
K
Kevin Wolf 已提交
3441 3442 3443 3444 3445 3446

    /*
     * And now open the image and make it consistent first (i.e. increase the
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
3447
    options = qdict_new();
3448
    qdict_put_str(options, "driver", "qcow2");
3449
    qdict_put_str(options, "file", bs->node_name);
3450 3451 3452
    if (data_bs) {
        qdict_put_str(options, "data-file", data_bs->node_name);
    }
3453
    blk = blk_new_open(NULL, NULL, options,
3454 3455
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
                       &local_err);
3456
    if (blk == NULL) {
M
Max Reitz 已提交
3457
        error_propagate(errp, local_err);
3458
        ret = -EIO;
K
Kevin Wolf 已提交
3459 3460 3461
        goto out;
    }

3462
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
K
Kevin Wolf 已提交
3463
    if (ret < 0) {
M
Max Reitz 已提交
3464 3465
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
K
Kevin Wolf 已提交
3466 3467 3468 3469 3470 3471 3472
        goto out;

    } else if (ret != 0) {
        error_report("Huh, first cluster in empty image is already in use?");
        abort();
    }

3473 3474 3475 3476 3477 3478
    /* Set the external data file if necessary */
    if (data_bs) {
        BDRVQcow2State *s = blk_bs(blk)->opaque;
        s->image_data_file = g_strdup(data_bs->filename);
    }

3479
    /* Create a full header (including things like feature table) */
3480
    ret = qcow2_update_header(blk_bs(blk));
3481 3482 3483 3484 3485
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

K
Kevin Wolf 已提交
3486
    /* Okay, now that we have a valid image, let's give it the right size */
3487 3488
    ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
                       errp);
K
Kevin Wolf 已提交
3489
    if (ret < 0) {
3490
        error_prepend(errp, "Could not resize image: ");
K
Kevin Wolf 已提交
3491 3492 3493 3494
        goto out;
    }

    /* Want a backing file? There you go.*/
3495 3496 3497 3498 3499 3500 3501 3502 3503
    if (qcow2_opts->has_backing_file) {
        const char *backing_format = NULL;

        if (qcow2_opts->has_backing_fmt) {
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
        }

        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
                                       backing_format);
K
Kevin Wolf 已提交
3504
        if (ret < 0) {
M
Max Reitz 已提交
3505
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
3506 3507
                             "with format '%s'", qcow2_opts->backing_file,
                             backing_format);
K
Kevin Wolf 已提交
3508 3509 3510 3511
            goto out;
        }
    }

3512
    /* Want encryption? There you go. */
3513 3514
    if (qcow2_opts->has_encrypt) {
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
3515 3516 3517 3518 3519
        if (ret < 0) {
            goto out;
        }
    }

3520 3521
    blk_unref(blk);
    blk = NULL;
M
Max Reitz 已提交
3522

3523 3524 3525 3526 3527 3528
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
     * have to setup decryption context. We're not doing any I/O on the top
     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
     * not have effect.
     */
3529
    options = qdict_new();
3530
    qdict_put_str(options, "driver", "qcow2");
3531
    qdict_put_str(options, "file", bs->node_name);
3532 3533 3534
    if (data_bs) {
        qdict_put_str(options, "data-file", data_bs->node_name);
    }
3535
    blk = blk_new_open(NULL, NULL, options,
3536 3537
                       BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
                       &local_err);
3538
    if (blk == NULL) {
M
Max Reitz 已提交
3539
        error_propagate(errp, local_err);
3540
        ret = -EIO;
M
Max Reitz 已提交
3541 3542 3543
        goto out;
    }

K
Kevin Wolf 已提交
3544 3545
    ret = 0;
out:
3546 3547
    blk_unref(blk);
    bdrv_unref(bs);
3548
    bdrv_unref(data_bs);
K
Kevin Wolf 已提交
3549 3550
    return ret;
}
K
Kevin Wolf 已提交
3551

3552 3553
static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts,
                                             Error **errp)
K
Kevin Wolf 已提交
3554
{
3555
    BlockdevCreateOptions *create_options = NULL;
3556
    QDict *qdict;
3557
    Visitor *v;
3558
    BlockDriverState *bs = NULL;
3559
    BlockDriverState *data_bs = NULL;
M
Max Reitz 已提交
3560
    Error *local_err = NULL;
3561
    const char *val;
M
Max Reitz 已提交
3562
    int ret;
K
Kevin Wolf 已提交
3563

3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601
    /* Only the keyval visitor supports the dotted syntax needed for
     * encryption, so go through a QDict before getting a QAPI type. Ignore
     * options meant for the protocol layer so that the visitor doesn't
     * complain. */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
                                        true);

    /* Handle encryption options */
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
    if (val && !strcmp(val, "on")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
    } else if (val && !strcmp(val, "off")) {
        qdict_del(qdict, BLOCK_OPT_ENCRYPT);
    }

    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
    if (val && !strcmp(val, "aes")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
    }

    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
     * version=v2/v3 below. */
    val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
    if (val && !strcmp(val, "0.10")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
    } else if (val && !strcmp(val, "1.1")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
    }

    /* Change legacy command line options into QMP ones */
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
3602
        { BLOCK_OPT_DATA_FILE_RAW,      "data-file-raw" },
3603 3604 3605 3606
        { NULL, NULL },
    };

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
3607 3608 3609
        ret = -EINVAL;
        goto finish;
    }
3610

3611 3612 3613
    /* Create and open the file (protocol layer) */
    ret = bdrv_create_file(filename, opts, errp);
    if (ret < 0) {
3614 3615
        goto finish;
    }
3616 3617 3618 3619 3620

    bs = bdrv_open(filename, NULL, NULL,
                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (bs == NULL) {
        ret = -EIO;
3621 3622
        goto finish;
    }
3623

3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643
    /* Create and open an external data file (protocol layer) */
    val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
    if (val) {
        ret = bdrv_create_file(val, opts, errp);
        if (ret < 0) {
            goto finish;
        }

        data_bs = bdrv_open(val, NULL, NULL,
                            BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                            errp);
        if (data_bs == NULL) {
            ret = -EIO;
            goto finish;
        }

        qdict_del(qdict, BLOCK_OPT_DATA_FILE);
        qdict_put_str(qdict, "data-file", data_bs->node_name);
    }

3644 3645 3646 3647 3648
    /* Set 'driver' and 'node' options */
    qdict_put_str(qdict, "driver", "qcow2");
    qdict_put_str(qdict, "file", bs->node_name);

    /* Now get the QAPI type BlockdevCreateOptions */
3649 3650
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (!v) {
3651 3652 3653 3654
        ret = -EINVAL;
        goto finish;
    }

3655 3656
    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
    visit_free(v);
K
Kevin Wolf 已提交
3657

3658 3659
    if (local_err) {
        error_propagate(errp, local_err);
3660 3661 3662 3663
        ret = -EINVAL;
        goto finish;
    }

3664 3665 3666
    /* Silently round up size */
    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
                                            BDRV_SECTOR_SIZE);
3667 3668

    /* Create the qcow2 image (format layer) */
3669
    ret = qcow2_co_create(create_options, errp);
3670 3671 3672
    if (ret < 0) {
        goto finish;
    }
3673

3674
    ret = 0;
3675
finish:
3676
    qobject_unref(qdict);
3677
    bdrv_unref(bs);
3678
    bdrv_unref(data_bs);
3679
    qapi_free_BlockdevCreateOptions(create_options);
M
Max Reitz 已提交
3680
    return ret;
K
Kevin Wolf 已提交
3681 3682
}

3683

3684
/*
 * Check whether the range [@offset, @offset + @bytes) of @bs is reported
 * as reading zeroes by bdrv_block_status_above().
 *
 * The range is clamped to the image length first; an empty range counts as
 * zero.  Returns false if any part of the range is not reported as zero or
 * if querying the block status fails.
 */
static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    int64_t nr = 0;
    int res;

    /* Clamp to image length, before checking status of underlying sectors */
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
    }

    if (!bytes) {
        return true;
    }

    /*
     * A single status query may describe only a prefix of the range even
     * when the rest reads as zero too (nr < bytes).  Keep querying until
     * the whole range is covered, a non-zero extent is found, an error
     * occurs, or no progress is made.
     */
    do {
        res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
        offset += nr;
        bytes -= nr;
    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);

    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
}

3701
/*
 * Write zeroes to [@offset, @offset + @bytes) of @bs by zeroizing whole
 * clusters.
 *
 * A request that does not start and end on a cluster boundary (an end at
 * the image end counts as aligned) must fit into a single cluster.  It is
 * only handled if the rest of that cluster already reads as zero and the
 * cluster is unallocated or a zero cluster; the request is then widened to
 * the full cluster.  Otherwise -ENOTSUP is returned.
 *
 * Returns the result of qcow2_cluster_zeroize(), -ENOTSUP as described
 * above, or the negative error reported by qcow2_get_cluster_offset().
 */
static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;

    /* Unaligned parts at the start and at the end of the request */
    uint32_t head = offset % s->cluster_size;
    uint32_t tail = (offset + bytes) % s->cluster_size;

    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
    if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
        /* A request ending at the image end is treated as aligned there */
        tail = 0;
    }

    if (head || tail) {
        uint64_t off;
        unsigned int nr;

        assert(head + bytes <= s->cluster_size);

        /* check whether remainder of cluster already reads as zero */
        if (!(is_zero(bs, offset - head, head) &&
              is_zero(bs, offset + bytes,
                      tail ? s->cluster_size - tail : 0))) {
            return -ENOTSUP;
        }

        qemu_co_mutex_lock(&s->lock);
        /* We can have new write after previous check */
        offset = QEMU_ALIGN_DOWN(offset, s->cluster_size);
        bytes = s->cluster_size;
        nr = s->cluster_size;
        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
        if (ret != QCOW2_CLUSTER_UNALLOCATED &&
            ret != QCOW2_CLUSTER_ZERO_PLAIN &&
            ret != QCOW2_CLUSTER_ZERO_ALLOC) {
            qemu_co_mutex_unlock(&s->lock);
            /* Report a lookup failure as such instead of as -ENOTSUP */
            return ret < 0 ? ret : -ENOTSUP;
        }
    } else {
        qemu_co_mutex_lock(&s->lock);
    }

    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);

    /* Whatever is left can use real zero clusters */
    ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

3753
/*
 * Discard [@offset, @offset + @bytes) of @bs.
 *
 * Requests that are not cluster aligned are rejected with -ENOTSUP, with
 * one exception: a request that starts on a cluster boundary and ends
 * exactly at the image end is passed on.  Accepted requests are handed to
 * qcow2_cluster_discard() while holding s->lock, and its result is
 * returned.
 */
static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
                                          int64_t offset, int bytes)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;

    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
        assert(bytes < s->cluster_size);
        /* Ignore partial clusters, except for the special case of the
         * complete partial cluster at the end of an unaligned file */
        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
            offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
            return -ENOTSUP;
        }
    }

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
                                false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

F
Fam Zheng 已提交
3776 3777 3778 3779
/*
 * Copy @bytes bytes starting at @src_offset of this qcow2 image (@bs) to
 * @dst at @dst_offset.
 *
 * The range is processed piecewise.  For each piece the cluster mapping is
 * looked up and the copy is forwarded with bdrv_co_copy_range_from():
 *  - unallocated clusters are read from the backing file if there is one
 *    and it covers the offset, otherwise they are written as zeroes;
 *  - zero clusters are written as zeroes (BDRV_REQ_ZERO_WRITE);
 *  - normal clusters are copied from s->data_file;
 *  - compressed clusters are not supported (-ENOTSUP).
 *
 * s->lock is held except while the forwarded copy request is running.
 * Returns 0 on success or a negative errno value on failure.
 */
static int coroutine_fn
qcow2_co_copy_range_from(BlockDriverState *bs,
                         BdrvChild *src, uint64_t src_offset,
                         BdrvChild *dst, uint64_t dst_offset,
                         uint64_t bytes, BdrvRequestFlags read_flags,
                         BdrvRequestFlags write_flags)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    BdrvChild *child = NULL;
    BdrvRequestFlags cur_write_flags;

    assert(!bs->encrypted);
    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {
        uint64_t copy_offset = 0;
        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        cur_write_flags = write_flags;

        ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, &copy_offset);
        if (ret < 0) {
            goto out;
        }

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (bs->backing && bs->backing->bs) {
                int64_t backing_length = bdrv_getlength(bs->backing->bs);
                if (backing_length < 0) {
                    /*
                     * A negative length signals failure; it must not take
                     * part in the unsigned comparison and arithmetic below.
                     */
                    ret = backing_length;
                    goto out;
                }
                if (src_offset >= backing_length) {
                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
                } else {
                    child = bs->backing;
                    /* Do not read past the end of the backing file */
                    cur_bytes = MIN(cur_bytes, backing_length - src_offset);
                    copy_offset = src_offset;
                }
            } else {
                cur_write_flags |= BDRV_REQ_ZERO_WRITE;
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            cur_write_flags |= BDRV_REQ_ZERO_WRITE;
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            ret = -ENOTSUP;
            goto out;

        case QCOW2_CLUSTER_NORMAL:
            child = s->data_file;
            copy_offset += offset_into_cluster(s, src_offset);
            /* The resulting offset must be a multiple of 512 bytes */
            if ((copy_offset & 511) != 0) {
                ret = -EIO;
                goto out;
            }
            break;

        default:
            abort();
        }
        /* Drop the lock while the copy request is in flight */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_from(child,
                                      copy_offset,
                                      dst, dst_offset,
                                      cur_bytes, read_flags, cur_write_flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto out;
        }

        bytes -= cur_bytes;
        src_offset += cur_bytes;
        dst_offset += cur_bytes;
    }
    ret = 0;

out:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

/*
 * Copy @bytes bytes from @src at @src_offset into this qcow2 image (@bs)
 * at @dst_offset.
 *
 * The range is processed piecewise.  For each piece, clusters are
 * allocated with qcow2_alloc_cluster_offset(), the target area is checked
 * with qcow2_pre_write_overlap_check(), the data is copied into
 * s->data_file with bdrv_co_copy_range_to(), and the pending L2 metadata
 * is handled with qcow2_handle_l2meta().
 *
 * s->lock is held except while the forwarded copy request is running.
 * Returns 0 on success or a non-zero error value on failure.
 */
static int coroutine_fn
qcow2_co_copy_range_to(BlockDriverState *bs,
                       BdrvChild *src, uint64_t src_offset,
                       BdrvChild *dst, uint64_t dst_offset,
                       uint64_t bytes, BdrvRequestFlags read_flags,
                       BdrvRequestFlags write_flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QCowL2Meta *l2meta = NULL;

    assert(!bs->encrypted);

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        l2meta = NULL;

        offset_in_cluster = offset_into_cluster(s, dst_offset);
        cur_bytes = MIN(bytes, INT_MAX);

        /* TODO:
         * If src->bs == dst->bs, we could simply copy by incrementing
         * the refcnt, without copying user data.
         * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
        ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert(offset_into_cluster(s, cluster_offset) == 0);

        ret = qcow2_pre_write_overlap_check(bs, 0,
                cluster_offset + offset_in_cluster, cur_bytes, true);
        if (ret < 0) {
            goto fail;
        }

        /* Drop the lock while the copy request is in flight */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_to(src, src_offset,
                                    s->data_file,
                                    cluster_offset + offset_in_cluster,
                                    cur_bytes, read_flags, write_flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= cur_bytes;
        src_offset += cur_bytes;
        dst_offset += cur_bytes;
    }
    ret = 0;

fail:
    /* Reached on success too; handles whatever l2meta is still pending */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

3935
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
3936 3937
                                          bool exact, PreallocMode prealloc,
                                          Error **errp)
3938
{
3939
    BDRVQcow2State *s = bs->opaque;
3940
    uint64_t old_length;
3941 3942
    int64_t new_l1_size;
    int ret;
3943
    QDict *options;
3944

3945 3946 3947
    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
    {
3948
        error_setg(errp, "Unsupported preallocation mode '%s'",
3949
                   PreallocMode_str(prealloc));
3950 3951 3952
        return -ENOTSUP;
    }

3953
    if (offset & 511) {
3954
        error_setg(errp, "The new size must be a multiple of 512");
3955 3956 3957
        return -EINVAL;
    }

3958 3959
    qemu_co_mutex_lock(&s->lock);

3960 3961
    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
3962
        error_setg(errp, "Can't resize an image which has snapshots");
3963 3964
        ret = -ENOTSUP;
        goto fail;
3965 3966
    }

3967
    /* cannot proceed if image has bitmaps */
3968
    if (qcow2_truncate_bitmaps_check(bs, errp)) {
3969 3970
        ret = -ENOTSUP;
        goto fail;
3971 3972
    }

3973
    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
P
Pavel Butsykin 已提交
3974
    new_l1_size = size_to_l1(s, offset);
3975 3976

    if (offset < old_length) {
3977
        int64_t last_cluster, old_file_size;
P
Pavel Butsykin 已提交
3978 3979 3980
        if (prealloc != PREALLOC_MODE_OFF) {
            error_setg(errp,
                       "Preallocation can't be used for shrinking an image");
3981 3982
            ret = -EINVAL;
            goto fail;
P
Pavel Butsykin 已提交
3983
        }
3984

P
Pavel Butsykin 已提交
3985 3986 3987 3988 3989 3990
        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
                                    old_length - ROUND_UP(offset,
                                                          s->cluster_size),
                                    QCOW2_DISCARD_ALWAYS, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
3991
            goto fail;
P
Pavel Butsykin 已提交
3992 3993 3994 3995 3996 3997
        }

        ret = qcow2_shrink_l1_table(bs, new_l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to reduce the number of L2 tables");
3998
            goto fail;
P
Pavel Butsykin 已提交
3999 4000 4001 4002 4003 4004
        }

        ret = qcow2_shrink_reftable(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to discard unused refblocks");
4005
            goto fail;
P
Pavel Butsykin 已提交
4006
        }
4007 4008 4009 4010 4011

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
4012 4013
            ret = old_file_size;
            goto fail;
4014 4015 4016 4017 4018
        }
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster < 0) {
            error_setg_errno(errp, -last_cluster,
                             "Failed to find the last cluster");
4019 4020
            ret = last_cluster;
            goto fail;
4021 4022
        }
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
4023 4024
            Error *local_err = NULL;

M
Max Reitz 已提交
4025 4026 4027 4028 4029 4030 4031
            /*
             * Do not pass @exact here: It will not help the user if
             * we get an error here just because they wanted to shrink
             * their qcow2 image (on a block device) with qemu-img.
             * (And on the qcow2 layer, the @exact requirement is
             * always fulfilled, so there is no need to pass it on.)
             */
4032
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
4033
                             false, PREALLOC_MODE_OFF, &local_err);
4034 4035 4036
            if (local_err) {
                warn_reportf_err(local_err,
                                 "Failed to truncate the tail of the image: ");
4037 4038
            }
        }
P
Pavel Butsykin 已提交
4039 4040 4041 4042
    } else {
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
4043
            goto fail;
P
Pavel Butsykin 已提交
4044
        }
4045 4046
    }

4047 4048
    switch (prealloc) {
    case PREALLOC_MODE_OFF:
4049
        if (has_data_file(bs)) {
M
Max Reitz 已提交
4050 4051 4052 4053 4054 4055
            /*
             * If the caller wants an exact resize, the external data
             * file should be resized to the exact target size, too,
             * so we pass @exact here.
             */
            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
4056 4057 4058 4059
            if (ret < 0) {
                goto fail;
            }
        }
4060 4061 4062
        break;

    case PREALLOC_MODE_METADATA:
4063
        ret = preallocate_co(bs, old_length, offset, prealloc, errp);
4064
        if (ret < 0) {
4065
            goto fail;
4066 4067 4068
        }
        break;

4069 4070 4071 4072 4073 4074 4075 4076
    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;

K
Kevin Wolf 已提交
4077 4078 4079
        /* With a data file, preallocation means just allocating the metadata
         * and forwarding the truncate request to the data file */
        if (has_data_file(bs)) {
4080
            ret = preallocate_co(bs, old_length, offset, prealloc, errp);
K
Kevin Wolf 已提交
4081 4082 4083 4084 4085 4086
            if (ret < 0) {
                goto fail;
            }
            break;
        }

4087 4088 4089 4090
        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
4091 4092
            ret = old_file_size;
            goto fail;
4093
        }
4094
        old_file_size = ROUND_UP(old_file_size, s->cluster_size);
4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121

        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
                                            s->cluster_size);

        /* This is an overestimation; we will not actually allocate space for
         * these in the file but just make sure the new refcount structures are
         * able to cover them so we will not have to allocate new refblocks
         * while entering the data blocks in the potentially new L2 tables.
         * (We do not actually care where the L2 tables are placed. Maybe they
         *  are already allocated or they can be placed somewhere before
         *  @old_file_size. It does not matter because they will be fully
         *  allocated automatically, so they do not need to be covered by the
         *  preallocation. All that matters is that we will not have to allocate
         *  new refcount structures for them.) */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / sizeof(uint64_t));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
4122 4123
            ret = allocation_start;
            goto fail;
4124 4125 4126 4127 4128 4129 4130
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
4131 4132
            ret = clusters_allocated;
            goto fail;
4133 4134 4135 4136 4137 4138 4139
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
M
Max Reitz 已提交
4140
        /* Image file grows, so @exact does not matter */
4141
        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
4142 4143 4144 4145 4146
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
4147
            goto fail;
4148 4149 4150 4151 4152 4153
        }

        /* Create the necessary L2 entries */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
4154 4155 4156
            int64_t nb_clusters = MIN(
                nb_new_data_clusters,
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169
            QCowL2Meta allocation = {
                .offset       = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters  = nb_clusters,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
4170
                goto fail;
4171 4172 4173 4174 4175 4176 4177 4178 4179
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

4180 4181 4182 4183 4184 4185
    default:
        g_assert_not_reached();
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
4186
        ret = qcow2_write_caches(bs);
4187 4188 4189
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
4190
            goto fail;
4191 4192 4193
        }
    }

4194 4195
    bs->total_sectors = offset / BDRV_SECTOR_SIZE;

4196 4197
    /* write updated header.size */
    offset = cpu_to_be64(offset);
4198
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
4199
                           &offset, sizeof(uint64_t));
4200
    if (ret < 0) {
4201
        error_setg_errno(errp, -ret, "Failed to update the image size");
4202
        goto fail;
4203 4204 4205
    }

    s->l1_vm_state_index = new_l1_size;
4206 4207 4208 4209 4210 4211 4212 4213

    /* Update cache sizes */
    options = qdict_clone_shallow(bs->options);
    ret = qcow2_update_options(bs, options, s->flags, errp);
    qobject_unref(options);
    if (ret < 0) {
        goto fail;
    }
4214 4215 4216 4217
    ret = 0;
fail:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
4218 4219
}

4220
static coroutine_fn int
4221
qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
4222 4223
                                 uint64_t offset, uint64_t bytes,
                                 QEMUIOVector *qiov, size_t qiov_offset)
B
Blue Swirl 已提交
4224
{
4225
    BDRVQcow2State *s = bs->opaque;
4226
    int ret;
4227
    ssize_t out_len;
4228
    uint8_t *buf, *out_buf;
4229
    uint64_t cluster_offset;
B
Blue Swirl 已提交
4230

4231 4232
    assert(bytes == s->cluster_size || (bytes < s->cluster_size &&
           (offset + bytes == bs->total_sectors << BDRV_SECTOR_BITS)));
4233

4234
    buf = qemu_blockalign(bs, s->cluster_size);
4235
    if (bytes < s->cluster_size) {
4236 4237
        /* Zero-pad last write if image size is not cluster aligned */
        memset(buf + bytes, 0, s->cluster_size - bytes);
4238
    }
4239
    qemu_iovec_to_buf(qiov, qiov_offset, buf, bytes);
B
Blue Swirl 已提交
4240

4241
    out_buf = g_malloc(s->cluster_size);
B
Blue Swirl 已提交
4242

4243 4244
    out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1,
                                buf, s->cluster_size);
4245
    if (out_len == -ENOMEM) {
B
Blue Swirl 已提交
4246
        /* could not compress: write normal cluster */
4247
        ret = qcow2_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 0);
4248 4249 4250
        if (ret < 0) {
            goto fail;
        }
4251
        goto success;
4252 4253 4254
    } else if (out_len < 0) {
        ret = -EINVAL;
        goto fail;
4255
    }
4256

4257
    qemu_co_mutex_lock(&s->lock);
4258 4259 4260
    ret = qcow2_alloc_compressed_cluster_offset(bs, offset, out_len,
                                                &cluster_offset);
    if (ret < 0) {
4261 4262 4263
        qemu_co_mutex_unlock(&s->lock);
        goto fail;
    }
4264

K
Kevin Wolf 已提交
4265
    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
4266 4267 4268
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        goto fail;
B
Blue Swirl 已提交
4269 4270
    }

K
Kevin Wolf 已提交
4271
    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
4272
    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
4273 4274 4275 4276
    if (ret < 0) {
        goto fail;
    }
success:
4277 4278
    ret = 0;
fail:
4279
    qemu_vfree(buf);
4280
    g_free(out_buf);
4281
    return ret;
B
Blue Swirl 已提交
4282 4283
}

4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354
/*
 * AioTask entry point for compressed writes: unpack the Qcow2AioTask that
 * embeds @task and hand its fields to qcow2_co_pwritev_compressed_task().
 * Such a task must carry neither a cluster type nor an l2meta.
 */
static coroutine_fn int qcow2_co_pwritev_compressed_task_entry(AioTask *task)
{
    Qcow2AioTask *req = container_of(task, Qcow2AioTask, task);

    assert(!(req->cluster_type || req->l2meta));

    return qcow2_co_pwritev_compressed_task(req->bs, req->offset, req->bytes,
                                            req->qiov, req->qiov_offset);
}

/*
 * Write @bytes of guest data at @offset in compressed form, one cluster
 * per task.
 *
 * - Images with an external data file are refused with -ENOTSUP.
 * - A zero-length request issues a truncate of bs->file to the length
 *   bdrv_getlength() currently reports.
 * - @offset has to be cluster aligned, otherwise -EINVAL is returned.
 *
 * A task pool is created only when the request spans more than one chunk.
 *
 * XXX: put compressed sectors first, then all the cluster aligned
 * tables to avoid losing bytes in alignment
 */
static coroutine_fn int
qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
                                 uint64_t offset, uint64_t bytes,
                                 QEMUIOVector *qiov, size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    AioTaskPool *pool = NULL;
    int ret = 0;

    if (has_data_file(bs)) {
        return -ENOTSUP;
    }

    if (bytes == 0) {
        /*
         * Historically meant to ease reading with sector based I/Os;
         * the file is truncated to its own current length.
         */
        int64_t file_len = bdrv_getlength(bs->file->bs);

        if (file_len < 0) {
            return file_len;
        }
        return bdrv_co_truncate(bs->file, file_len, false, PREALLOC_MODE_OFF,
                                NULL);
    }

    if (offset_into_cluster(s, offset)) {
        return -EINVAL;
    }

    /* Stop queueing as soon as the pool reports a failed task */
    while (bytes && aio_task_pool_status(pool) == 0) {
        uint64_t chunk = MIN(bytes, s->cluster_size);

        if (!pool && bytes > chunk) {
            /* more than one chunk to go: run them through a pool */
            pool = aio_task_pool_new(QCOW2_MAX_WORKERS);
        }

        ret = qcow2_add_task(bs, pool, qcow2_co_pwritev_compressed_task_entry,
                             0, 0, offset, chunk, qiov, qiov_offset, NULL);
        if (ret < 0) {
            break;
        }

        bytes -= chunk;
        offset += chunk;
        qiov_offset += chunk;
    }

    if (!pool) {
        return ret;
    }

    aio_task_pool_wait_all(pool);
    if (ret == 0) {
        ret = aio_task_pool_status(pool);
    }
    g_free(pool);

    return ret;
}

4355 4356 4357 4358 4359
/*
 * Read from a compressed cluster.
 *
 * @file_cluster_offset is the compressed cluster descriptor: masking it
 * with s->cluster_offset_mask gives the host offset of the compressed
 * data, and the field selected by s->csize_shift / s->csize_mask gives the
 * number of compressed sectors minus one.
 *
 * The whole cluster is decompressed into a bounce buffer, then @bytes
 * bytes starting at the in-cluster position of the guest @offset are
 * copied into @qiov at @qiov_offset.
 *
 * Returns the non-negative result of bdrv_co_pread() on success, -ENOMEM
 * if the compressed buffer cannot be allocated, -EIO if decompression
 * fails, or the negative errno of the failed read.
 */
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t file_cluster_offset,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int skip = offset_into_cluster(s, offset);
    uint8_t *packed, *plain;
    uint64_t coffset;
    int nb_csectors, csize;
    int ret = 0;

    coffset = file_cluster_offset & s->cluster_offset_mask;
    nb_csectors = ((file_cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    /* the data need not start on a compressed-sector boundary */
    csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
        (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK);

    packed = g_try_malloc(csize);
    if (!packed) {
        return -ENOMEM;
    }

    plain = qemu_blockalign(bs, s->cluster_size);

    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_co_pread(bs->file, coffset, csize, packed, 0);
    if (ret >= 0) {
        if (qcow2_co_decompress(bs, plain, s->cluster_size,
                                packed, csize) < 0) {
            ret = -EIO;
        } else {
            qemu_iovec_from_buf(qiov, qiov_offset, plain + skip, bytes);
        }
    }

    qemu_vfree(plain);
    g_free(packed);

    return ret;
}

M
Max Reitz 已提交
4401 4402
/*
 * Reset the image to a minimal, empty layout:
 *
 *   cluster 0       image header (kept)
 *   cluster 1       refcount table
 *   cluster 2       first refcount block
 *   cluster 3...    L1 table
 *
 * The image is marked dirty first because the refcounts are inconsistent
 * for most of the procedure.  Once the on-disk structures may have been
 * damaged, every failure ejects the BDS by clearing bs->drv.
 *
 * Returns 0 on success, a negative errno otherwise.
 */
static int make_completely_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Error *local_err = NULL;
    uint64_t *fresh_reftable;
    uint64_t be_refblock_entry, l1_bytes;
    int64_t first_alloc;
    int l1_clusters, ret;
    /*
     * The three header fields that follow each other starting at
     * l1_table_offset, already in on-disk byte order: an empty reftable
     * (one cluster) directly after the image header and an empty L1 table
     * three clusters after the header; the cluster in between becomes the
     * first refblock.
     */
    struct {
        uint64_t l1_offset;
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED header_update = {
        .l1_offset         = cpu_to_be64(3 * s->cluster_size),
        .reftable_offset   = cpu_to_be64(s->cluster_size),
        .reftable_clusters = cpu_to_be32(1),
    };

    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
    if (ret < 0) {
        return ret;
    }

    /* From here on the refcounts are going to be thoroughly wrong */
    ret = qcow2_mark_dirty(bs);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);

    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
    l1_bytes = (uint64_t)s->l1_size * sizeof(uint64_t);

    /*
     * Once this has run, neither the in-memory nor the on-disk refcount
     * information describes the actual references any more.
     */
    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
                             l1_clusters * s->cluster_size, 0);
    if (ret < 0) {
        goto broken_refcounts;
    }
    memset(s->l1_table, 0, l1_bytes);

    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);

    /*
     * Zero the clusters right behind the header that will hold the
     * refcount table, one refcount block and the L1 table.  This may
     * clobber parts of the existing refcount and L1 tables, which is
     * acceptable: the dirty flag is set, complete data loss is the goal
     * and partial data loss is therefore fine too.
     */
    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
                             (2 + l1_clusters) * s->cluster_size, 0);
    /*
     * Even a failed call may have overwritten on-disk refcount
     * structures; the in-memory refcount information then probably
     * differs from the disk, which makes the BDS unusable.
     */
    if (ret < 0) {
        goto broken_refcounts;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);

    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
                           &header_update, sizeof(header_update));
    if (ret < 0) {
        goto broken_refcounts;
    }

    s->l1_table_offset = 3 * s->cluster_size;

    fresh_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
    if (!fresh_reftable) {
        ret = -ENOMEM;
        goto broken_refcounts;
    }

    s->refcount_table_offset = s->cluster_size;
    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
    s->max_refcount_table_index = 0;

    /* The state takes ownership of the new, all-zero table */
    g_free(s->refcount_table);
    s->refcount_table = fresh_reftable;

    /*
     * In-memory and on-disk refcount information agree again (empty
     * reftable, no refblocks, empty refblock cache).  However, some
     * clusters (e.g. the image header) are now referenced without being
     * refcounted, whereas the regular qcow2 code assumes the in-memory
     * information to be correct at all times.
     */

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Enter the first refblock into the reftable */
    be_refblock_entry = cpu_to_be64(2 * s->cluster_size);
    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
                           &be_refblock_entry, sizeof(be_refblock_entry));
    if (ret < 0) {
        goto broken_refcounts;
    }
    s->refcount_table[0] = 2 * s->cluster_size;

    /* Account for header, reftable, refblock and L1 table */
    s->free_cluster_index = 0;
    assert(3 + l1_clusters <= s->refcount_block_size);
    first_alloc = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_bytes);
    if (first_alloc < 0) {
        ret = first_alloc;
        goto broken_refcounts;
    }
    if (first_alloc > 0) {
        error_report("First cluster in emptied image is in use");
        abort();
    }

    /*
     * At last the in-memory information matches the on-disk structures
     * and is correct.
     */
    ret = qcow2_mark_clean(bs);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
                        PREALLOC_MODE_OFF, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        return ret;
    }

    return 0;

broken_refcounts:
    /*
     * The BDS is unusable at this point.  Making it usable again would
     * take qcow2_refcount_close(), qcow2_refcount_init(),
     * qcow2_check_refcounts(), qcow2_refcount_close() and
     * qcow2_refcount_init() once more.  Since the functions that may have
     * brought us here are used by that sequence as well, it would very
     * likely fail too.  Therefore, just eject the BDS.
     */
    bs->drv = NULL;
    return ret;
}

M
Max Reitz 已提交
4552 4553
/*
 * Drop all guest data from the image.
 *
 * If the image qualifies, the quick make_completely_empty() path is
 * taken; otherwise every active cluster is discarded in chunks of at most
 * INT_MAX bytes rounded down to a cluster boundary.
 *
 * Returns a negative errno on failure, otherwise the result of the chosen
 * path.
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int max_chunk = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
    int l1_clusters = DIV_ROUND_UP(s->l1_size,
                                   s->cluster_size / sizeof(uint64_t));
    uint64_t pos, image_end;
    int ret = 0;
    /*
     * make_completely_empty() only works for qcow2 v3 images (it needs
     * the dirty flag) and only while no feature reserves extra clusters
     * (snapshots, a LUKS header, persistent bitmaps), since it wipes the
     * image entirely.  In addition, the L1 table plus three more clusters
     * (image header, refcount table, one refcount block) must fit into a
     * single refcount block.  It resets the image file only, so it cannot
     * be used with an external data file.
     */
    bool can_reset_layout =
        s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
        3 + l1_clusters <= s->refcount_block_size &&
        s->crypt_method_header != QCOW_CRYPT_LUKS &&
        !has_data_file(bs);

    if (can_reset_layout) {
        return make_completely_empty(bs);
    }

    /* Fallback: discard every active cluster.  Slow, but always works. */
    image_end = bs->total_sectors * BDRV_SECTOR_SIZE;
    pos = 0;
    while (pos < image_end) {
        /*
         * This function is generally used after committing an external
         * snapshot, so QCOW2_DISCARD_SNAPSHOT seems appropriate.  The
         * default action for this kind of discard is to pass it on, which
         * ideally results in an actually smaller image file, as is
         * probably desired.
         */
        ret = qcow2_cluster_discard(bs, pos, MIN(max_chunk, image_end - pos),
                                    QCOW2_DISCARD_SNAPSHOT, true);
        if (ret < 0) {
            return ret;
        }
        pos += max_chunk;
    }

    return ret;
}

4596
/*
 * Write the qcow2 metadata caches out via qcow2_write_caches(), holding
 * s->lock for the duration of the call.
 *
 * Returns whatever qcow2_write_caches() returns.
 */
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int status;

    qemu_co_mutex_lock(&s->lock);
    status = qcow2_write_caches(bs);
    qemu_co_mutex_unlock(&s->lock);

    return status;
}

4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661
/*
 * Header-init callback used while measuring: records @headerlen in the
 * size_t that @opaque points to and reports success.  @block and @errp
 * are unused.
 */
static ssize_t qcow2_measure_crypto_hdr_init_func(QCryptoBlock *block,
        size_t headerlen, void *opaque, Error **errp)
{
    /* Stash away the payload size for the caller */
    *(size_t *)opaque = headerlen;

    return 0;
}

/*
 * Header-write callback used while measuring: nothing is written to any
 * image, the data is dropped and the full @buflen is reported as written.
 */
static ssize_t qcow2_measure_crypto_hdr_write_func(QCryptoBlock *block,
        size_t offset, const uint8_t *buf, size_t buflen,
        void *opaque, Error **errp)
{
    return buflen;
}

/*
 * Determine the number of bytes for the LUKS payload.
 *
 * The "encrypt." options found in @opts are used to run a fake LUKS
 * creation whose callbacks store the header length in *@len and discard
 * the header data.
 *
 * Returns true on success; false with @errp set by the failing call
 * otherwise.
 */
static bool qcow2_measure_luks_headerlen(QemuOpts *opts, size_t *len,
                                         Error **errp)
{
    QDict *all_opts = qemu_opts_to_qdict(opts, NULL);
    QDict *encrypt_opts;
    QCryptoBlockCreateOptions *create_opts;
    QCryptoBlock *block;

    /* Keep only the "encrypt." options, in a qdict of their own */
    qdict_extract_subqdict(all_opts, &encrypt_opts, "encrypt.");
    qobject_unref(all_opts);

    /* Turn them into a QCryptoBlockCreateOptions object for LUKS */
    qdict_put_str(encrypt_opts, "format", "luks");
    create_opts = block_crypto_create_opts_init(encrypt_opts, errp);
    qobject_unref(encrypt_opts);
    if (!create_opts) {
        return false;
    }

    /* Fake LUKS creation in order to learn the payload size */
    block = qcrypto_block_create(create_opts, "encrypt.",
                                 qcow2_measure_crypto_hdr_init_func,
                                 qcow2_measure_crypto_hdr_write_func,
                                 len, errp);
    qapi_free_QCryptoBlockCreateOptions(create_opts);
    if (!block) {
        return false;
    }

    qcrypto_block_free(block);
    return true;
}

4662 4663 4664 4665 4666 4667 4668 4669 4670
/*
 * Calculate the file size needed for a new qcow2 image.
 *
 * @opts supplies the creation options (cluster size, version, refcount
 * width, preallocation mode, backing file, encryption format and size).
 * If @in_bs is non-NULL, its length replaces the size option and its block
 * status is walked to work out how many data clusters actually have to be
 * written.
 *
 * Returns a newly allocated BlockMeasureInfo whose fully_allocated field is
 * the size with every cluster allocated and whose required field leaves out
 * the data clusters that need not be written.  On failure NULL is returned
 * and @errp is set.
 */
static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
                                       Error **errp)
{
    Error *local_err = NULL;
    BlockMeasureInfo *info;
    uint64_t required = 0; /* bytes that contribute to required size */
    uint64_t virtual_size; /* disk size as seen by guest */
    uint64_t refcount_bits;
    uint64_t l2_tables;
    uint64_t luks_payload_size = 0;
    size_t cluster_size;
    int version;
    char *optstr;
    PreallocMode prealloc;
    bool has_backing_file;
    bool has_luks;

    /* Parse image creation options */
    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    version = qcow2_opt_get_version_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
    if (local_err) {
        goto err;
    }

    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
                               PREALLOC_MODE_OFF, &local_err);
    g_free(optstr);
    if (local_err) {
        goto err;
    }

    /* Only the presence of a backing file matters, not its name */
    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    has_backing_file = !!optstr;
    g_free(optstr);

    optstr = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
    has_luks = optstr && strcmp(optstr, "luks") == 0;
    g_free(optstr);

    if (has_luks) {
        size_t headerlen;

        if (!qcow2_measure_luks_headerlen(opts, &headerlen, &local_err)) {
            goto err;
        }

        /* The LUKS payload is accounted for in whole clusters */
        luks_payload_size = ROUND_UP(headerlen, cluster_size);
    }

    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
    virtual_size = ROUND_UP(virtual_size, cluster_size);

    /* Check that virtual disk size is valid */
    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
                             cluster_size / sizeof(uint64_t));
    if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
        error_setg(&local_err, "The image size is too large "
                               "(try using a larger cluster size)");
        goto err;
    }

    /* Account for input image */
    if (in_bs) {
        int64_t ssize = bdrv_getlength(in_bs);
        if (ssize < 0) {
            error_setg_errno(&local_err, -ssize,
                             "Unable to get image virtual_size");
            goto err;
        }

        virtual_size = ROUND_UP(ssize, cluster_size);

        if (has_backing_file) {
            /* We don't know how much of the backing chain is shared by the
             * input image and the new image file.  In the worst case the new
             * image's backing file has nothing in common with the input
             * image.  Be conservative and assume all clusters need to be
             * written.
             */
            required = virtual_size;
        } else {
            int64_t offset;
            int64_t pnum = 0;

            for (offset = 0; offset < ssize; offset += pnum) {
                int ret;

                ret = bdrv_block_status_above(in_bs, NULL, offset,
                                              ssize - offset, &pnum, NULL,
                                              NULL);
                if (ret < 0) {
                    error_setg_errno(&local_err, -ret,
                                     "Unable to get block status");
                    goto err;
                }

                if (ret & BDRV_BLOCK_ZERO) {
                    /* Skip zero regions (safe with no backing file) */
                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
                    /* Extend pnum to end of cluster for next iteration */
                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;

                    /* Count clusters we've seen */
                    required += offset % cluster_size + pnum;
                }
            }
        }
    }

    /* Take into account preallocation.  Nothing special is needed for
     * PREALLOC_MODE_METADATA since metadata is always counted.
     */
    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
        required = virtual_size;
    }

    info = g_new(BlockMeasureInfo, 1);
    info->fully_allocated =
        qcow2_calc_prealloc_size(virtual_size, cluster_size,
                                 ctz32(refcount_bits)) + luks_payload_size;

    /* Remove data clusters that are not required.  This overestimates the
     * required size because metadata needed for the fully allocated file is
     * still counted.
     */
    info->required = info->fully_allocated - virtual_size + required;
    return info;

err:
    error_propagate(errp, local_err);
    return NULL;
}

4805
/*
 * Fill @bdi with driver-level information about the image: unallocated
 * blocks are reported as zero, and the cluster size and VM state offset
 * are taken from the qcow2 state.  Always returns 0.
 */
static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcow2State *s = bs->opaque;

    bdi->unallocated_blocks_are_zero = true;
    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = qcow2_vm_state_offset(s);
    return 0;
}

4814 4815
/*
 * Build the qcow2-specific part of the image information.
 *
 * Returns a newly allocated ImageInfoSpecific of kind QCOW2 describing the
 * compat level, refcount width and, for version 3 images, lazy refcounts,
 * the corrupt flag, bitmaps and external data file settings.  If the image
 * is encrypted, the encryption format (and LUKS details) are included too.
 *
 * On failure NULL is returned and @errp is set; everything allocated up to
 * that point is released.
 */
static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs,
                                                  Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    ImageInfoSpecific *spec_info;
    QCryptoBlockInfo *encrypt_info = NULL;
    Error *local_err = NULL;

    if (s->crypto != NULL) {
        encrypt_info = qcrypto_block_get_info(s->crypto, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return NULL;
        }
    }

    spec_info = g_new(ImageInfoSpecific, 1);
    *spec_info = (ImageInfoSpecific){
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
        .u.qcow2.data = g_new0(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
        Qcow2BitmapInfoList *bitmaps;
        bitmaps = qcow2_get_bitmap_info_list(bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            qapi_free_ImageInfoSpecific(spec_info);
            /* encrypt_info was fetched above and is still owned by us;
             * it is NULL for unencrypted images */
            qapi_free_QCryptoBlockInfo(encrypt_info);
            return NULL;
        }
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
            .corrupt            = s->incompatible_features &
                                  QCOW2_INCOMPAT_CORRUPT,
            .has_corrupt        = true,
            .refcount_bits      = s->refcount_bits,
            .has_bitmaps        = !!bitmaps,
            .bitmaps            = bitmaps,
            .has_data_file      = !!s->image_data_file,
            .data_file          = g_strdup(s->image_data_file),
            .has_data_file_raw  = has_data_file(bs),
            .data_file_raw      = data_file_is_raw(bs),
        };
    } else {
        /* if this assertion fails, this probably means a new version was
         * added without having it covered here */
        assert(false);
    }

    if (encrypt_info) {
        ImageInfoSpecificQCow2Encryption *qencrypt =
            g_new(ImageInfoSpecificQCow2Encryption, 1);
        switch (encrypt_info->format) {
        case Q_CRYPTO_BLOCK_FORMAT_QCOW:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
            break;
        case Q_CRYPTO_BLOCK_FORMAT_LUKS:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
            qencrypt->u.luks = encrypt_info->u.luks;
            break;
        default:
            abort();
        }
        /* Since we did shallow copy above, erase any pointers
         * in the original info */
        memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
        qapi_free_QCryptoBlockInfo(encrypt_info);

        spec_info->u.qcow2.data->has_encrypt = true;
        spec_info->u.qcow2.data->encrypt = qencrypt;
    }

    return spec_info;
}

M
Max Reitz 已提交
4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922
/*
 * Report whether a freshly created image reads back as zeroes.
 *
 * Returns 1 for images that are not preallocated, 0 for preallocated
 * encrypted images, and otherwise whatever the data file's node reports.
 */
static int qcow2_has_zero_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    bool has_l2_tables;

    /* s->lock is only taken when running inside a coroutine */
    if (qemu_in_coroutine()) {
        qemu_co_mutex_lock(&s->lock);
    }

    /*
     * Preallocated images have every L2 table allocated while
     * non-preallocated ones have none, so looking at the first L1 entry
     * is sufficient to tell them apart.
     */
    has_l2_tables = s->l1_size > 0 && s->l1_table[0] != 0;

    if (qemu_in_coroutine()) {
        qemu_co_mutex_unlock(&s->lock);
    }

    if (!has_l2_tables) {
        return 1;
    }
    if (bs->encrypted) {
        return 0;
    }
    return bdrv_has_zero_init(s->data_file->bs);
}

4923 4924
/*
 * Write VM state data from @qiov into the image.
 *
 * @pos is relative to the start of the VM state area; the write goes
 * through the driver's own bdrv_co_pwritev_part() at
 * qcow2_vm_state_offset() + @pos and that call's result is returned.
 */
static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
    return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos,
                                         qiov->size, qiov, 0, 0);
}

4933 4934
/*
 * Read VM state data from the image into @qiov.
 *
 * @pos is relative to the start of the VM state area; the read goes
 * through the driver's own bdrv_co_preadv_part() at
 * qcow2_vm_state_offset() + @pos and that call's result is returned.
 */
static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
    return bs->drv->bdrv_co_preadv_part(bs, qcow2_vm_state_offset(s) + pos,
                                        qiov->size, qiov, 0, 0);
}

M
Max Reitz 已提交
4943 4944 4945 4946
/*
 * Downgrades an image's version. To achieve this, any incompatible features
 * have to be removed.
 */
4947
static int qcow2_downgrade(BlockDriverState *bs, int target_version,
4948 4949
                           BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
                           Error **errp)
M
Max Reitz 已提交
4950
{
4951
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
4952 4953 4954
    int current_version = s->qcow_version;
    int ret;

4955 4956 4957 4958 4959
    /* This is qcow2_downgrade(), not qcow2_upgrade() */
    assert(target_version < current_version);

    /* There are no other versions (now) that you can downgrade to */
    assert(target_version == 2);
M
Max Reitz 已提交
4960 4961

    if (s->refcount_order != 4) {
4962
        error_setg(errp, "compat=0.10 requires refcount_bits=16");
M
Max Reitz 已提交
4963 4964 4965
        return -ENOTSUP;
    }

K
Kevin Wolf 已提交
4966 4967 4968 4969 4970
    if (has_data_file(bs)) {
        error_setg(errp, "Cannot downgrade an image with a data file");
        return -ENOTSUP;
    }

M
Max Reitz 已提交
4971 4972 4973 4974
    /* clear incompatible features */
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
4975
            error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
4976 4977 4978 4979 4980 4981 4982 4983 4984
            return ret;
        }
    }

    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
     * the first place; if that happens nonetheless, returning -ENOTSUP is the
     * best thing to do anyway */

    if (s->incompatible_features) {
4985 4986
        error_setg(errp, "Cannot downgrade an image with incompatible features "
                   "%#" PRIx64 " set", s->incompatible_features);
M
Max Reitz 已提交
4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997
        return -ENOTSUP;
    }

    /* since we can ignore compatible features, we can set them to 0 as well */
    s->compatible_features = 0;
    /* if lazy refcounts have been used, they have already been fixed through
     * clearing the dirty flag */

    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

4998
    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
M
Max Reitz 已提交
4999
    if (ret < 0) {
5000
        error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
M
Max Reitz 已提交
5001 5002 5003 5004 5005 5006 5007
        return ret;
    }

    s->qcow_version = target_version;
    ret = qcow2_update_header(bs);
    if (ret < 0) {
        s->qcow_version = current_version;
5008
        error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
5009 5010 5011 5012 5013
        return ret;
    }
    return 0;
}

5014 5015 5016 5017 5018 5019 5020 5021 5022 5023
/*
 * Upgrades an image's version.  While newer versions encompass all
 * features of older versions, some things may have to be presented
 * differently.
 *
 * @target_version must be 3 and higher than the current version (asserted).
 * Progress is reported through @status_cb in three steps (0, 1 and 2 out
 * of 2).
 *
 * Returns 0 on success, or a negative errno with @errp set if rewriting the
 * snapshot table or the image header fails.
 */
static int qcow2_upgrade(BlockDriverState *bs, int target_version,
                         BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
                         Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    bool need_snapshot_update;
    int current_version = s->qcow_version;
    int i;
    int ret;

    /* This is qcow2_upgrade(), not qcow2_downgrade() */
    assert(target_version > current_version);

    /* There are no other versions (yet) that you can upgrade to */
    assert(target_version == 3);

    status_cb(bs, 0, 2, cb_opaque);

    /*
     * In v2, snapshots do not need to have extra data.  v3 requires
     * the 64-bit VM state size and the virtual disk size to be
     * present.
     * qcow2_write_snapshots() will always write the list in the
     * v3-compliant format.
     */
    need_snapshot_update = false;
    for (i = 0; i < s->nb_snapshots; i++) {
        if (s->snapshots[i].extra_data_size <
            sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
            sizeof_field(QCowSnapshotExtraData, disk_size))
        {
            need_snapshot_update = true;
            break;
        }
    }
    if (need_snapshot_update) {
        ret = qcow2_write_snapshots(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to update the snapshot table");
            return ret;
        }
    }
    status_cb(bs, 1, 2, cb_opaque);

    s->qcow_version = target_version;
    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* The header still carries the old version; keep the state in sync */
        s->qcow_version = current_version;
        error_setg_errno(errp, -ret, "Failed to update the image header");
        return ret;
    }
    status_cb(bs, 2, 2, cb_opaque);

    return 0;
}

5075 5076 5077 5078 5079 5080
/* Identifies which amend operation is currently reporting progress */
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    QCOW2_UPGRADING,
    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;

/*
 * State passed as the opaque pointer to qcow2_amend_helper_cb(), which
 * combines the progress reports of the individual amend operations into a
 * single report for the original status callback.
 */
typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */

    /* Callback (and its opaque pointer) that receives the combined progress */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    /* Operation that is about to run or is running right now */
    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    /* Operation seen on the previous CB invocation, and the work size it
     * reported then */
    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;

/*
 * Status callback handed to the individual amend operations.
 *
 * @opaque is a Qcow2AmendHelperCBInfo.  The per-operation progress
 * (@operation_offset out of @operation_work_size) is converted into overall
 * progress across all operations and forwarded to the original status
 * callback stored in that structure.
 */
static void qcow2_amend_helper_cb(BlockDriverState *bs,
                                  int64_t operation_offset,
                                  int64_t operation_work_size, void *opaque)
{
    Qcow2AmendHelperCBInfo *state = opaque;
    bool operation_changed = state->current_operation != state->last_operation;
    int ops_pending;
    int ops_covered;
    int64_t work_so_far;
    int64_t work_projected;

    if (operation_changed) {
        /* Unless this is the very first invocation, the previous operation
         * has just finished: fold its work into the completed totals */
        if (state->last_operation != QCOW2_NO_OPERATION) {
            state->offset_completed += state->last_work_size;
            state->operations_completed++;
        }

        state->last_operation = state->current_operation;
    }

    assert(state->total_operations > 0);
    assert(state->operations_completed < state->total_operations);

    state->last_work_size = operation_work_size;

    /* Work size of every operation up to and including the current one */
    work_so_far = state->offset_completed + operation_work_size;

    /* Operations that have not started yet, and operations for which a work
     * size is known (the completed ones plus the current one) */
    ops_pending = state->total_operations - state->operations_completed - 1;
    ops_covered = state->operations_completed + 1;

    /* Assume the pending operations need, on average, as much work as the
     * covered ones did */
    work_projected = work_so_far * ops_pending / ops_covered;

    state->original_status_cb(bs, state->offset_completed + operation_offset,
                              work_so_far + work_projected,
                              state->original_cb_opaque);
}

5146
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
5147
                               BlockDriverAmendStatusCB *status_cb,
5148 5149
                               void *cb_opaque,
                               Error **errp)
M
Max Reitz 已提交
5150
{
5151
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
5152 5153
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
5154
    const char *backing_file = NULL, *backing_format = NULL, *data_file = NULL;
M
Max Reitz 已提交
5155
    bool lazy_refcounts = s->use_lazy_refcounts;
5156
    bool data_file_raw = data_file_is_raw(bs);
5157 5158 5159
    const char *compat = NULL;
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
5160
    int encformat;
5161
    int refcount_bits = s->refcount_bits;
M
Max Reitz 已提交
5162
    int ret;
5163
    QemuOptDesc *desc = opts->list->desc;
5164
    Qcow2AmendHelperCBInfo helper_cb_info;
M
Max Reitz 已提交
5165

5166 5167
    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
M
Max Reitz 已提交
5168
            /* only change explicitly defined options */
5169
            desc++;
M
Max Reitz 已提交
5170 5171 5172
            continue;
        }

5173 5174
        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
5175
            if (!compat) {
M
Max Reitz 已提交
5176
                /* preserve default */
5177
            } else if (!strcmp(compat, "0.10") || !strcmp(compat, "v2")) {
M
Max Reitz 已提交
5178
                new_version = 2;
5179
            } else if (!strcmp(compat, "1.1") || !strcmp(compat, "v3")) {
M
Max Reitz 已提交
5180 5181
                new_version = 3;
            } else {
5182
                error_setg(errp, "Unknown compatibility level %s", compat);
M
Max Reitz 已提交
5183 5184
                return -EINVAL;
            }
5185
        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
5186
            error_setg(errp, "Cannot change preallocation mode");
M
Max Reitz 已提交
5187
            return -ENOTSUP;
5188 5189 5190 5191 5192 5193 5194 5195
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
5196
                                        !!s->crypto);
5197

5198
            if (encrypt != !!s->crypto) {
5199 5200
                error_setg(errp,
                           "Changing the encryption flag is not supported");
M
Max Reitz 已提交
5201 5202
                return -ENOTSUP;
            }
5203 5204 5205 5206 5207
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
            encformat = qcow2_crypt_method_from_format(
                qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));

            if (encformat != s->crypt_method_header) {
5208 5209
                error_setg(errp,
                           "Changing the encryption format is not supported");
5210 5211
                return -ENOTSUP;
            }
5212
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
5213 5214
            error_setg(errp,
                       "Changing the encryption parameters is not supported");
5215
            return -ENOTSUP;
5216 5217
        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
5218 5219
                                             cluster_size);
            if (cluster_size != s->cluster_size) {
5220
                error_setg(errp, "Changing the cluster size is not supported");
M
Max Reitz 已提交
5221 5222
                return -ENOTSUP;
            }
5223 5224
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
5225
                                               lazy_refcounts);
5226
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
5227 5228 5229 5230 5231 5232
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
5233 5234
                error_setg(errp, "Refcount width must be a power of two and "
                           "may not exceed 64 bits");
5235 5236
                return -EINVAL;
            }
5237 5238 5239 5240 5241 5242 5243
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE)) {
            data_file = qemu_opt_get(opts, BLOCK_OPT_DATA_FILE);
            if (data_file && !has_data_file(bs)) {
                error_setg(errp, "data-file can only be set for images that "
                                 "use an external data file");
                return -EINVAL;
            }
5244 5245 5246 5247 5248 5249 5250 5251
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE_RAW)) {
            data_file_raw = qemu_opt_get_bool(opts, BLOCK_OPT_DATA_FILE_RAW,
                                              data_file_raw);
            if (data_file_raw && !data_file_is_raw(bs)) {
                error_setg(errp, "data-file-raw cannot be set on existing "
                                 "images");
                return -EINVAL;
            }
M
Max Reitz 已提交
5252
        } else {
5253
            /* if this point is reached, this probably means a new option was
M
Max Reitz 已提交
5254
             * added without having it covered here */
5255
            abort();
M
Max Reitz 已提交
5256
        }
5257 5258

        desc++;
M
Max Reitz 已提交
5259 5260
    }

5261 5262 5263
    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
5264
        .total_operations = (new_version != old_version)
5265
                          + (s->refcount_bits != refcount_bits)
5266 5267
    };

5268 5269
    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
5270 5271 5272
        helper_cb_info.current_operation = QCOW2_UPGRADING;
        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
                            &helper_cb_info, errp);
5273 5274
        if (ret < 0) {
            return ret;
M
Max Reitz 已提交
5275 5276 5277
        }
    }

5278 5279 5280 5281
    if (s->refcount_bits != refcount_bits) {
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
5282 5283 5284
            error_setg(errp, "Refcount widths other than 16 bits require "
                       "compatibility level 1.1 or above (use compat=1.1 or "
                       "greater)");
5285 5286 5287 5288 5289 5290
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
5291
                                          &helper_cb_info, errp);
5292 5293 5294 5295 5296
        if (ret < 0) {
            return ret;
        }
    }

5297 5298 5299 5300 5301 5302 5303
    /* data-file-raw blocks backing files, so clear it first if requested */
    if (data_file_raw) {
        s->autoclear_features |= QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    } else {
        s->autoclear_features &= ~QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    }

5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314
    if (data_file) {
        g_free(s->image_data_file);
        s->image_data_file = *data_file ? g_strdup(data_file) : NULL;
    }

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image header");
        return ret;
    }

M
Max Reitz 已提交
5315
    if (backing_file || backing_format) {
5316 5317 5318
        ret = qcow2_change_backing_file(bs,
                    backing_file ?: s->image_backing_file,
                    backing_format ?: s->image_backing_format);
M
Max Reitz 已提交
5319
        if (ret < 0) {
5320
            error_setg_errno(errp, -ret, "Failed to change the backing file");
M
Max Reitz 已提交
5321 5322 5323 5324 5325 5326
            return ret;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
5327
            if (new_version < 3) {
5328 5329 5330
                error_setg(errp, "Lazy refcounts only supported with "
                           "compatibility level 1.1 and above (use compat=1.1 "
                           "or greater)");
M
Max Reitz 已提交
5331 5332 5333 5334 5335 5336
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
5337
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
5338 5339 5340 5341 5342 5343 5344
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
5345
                error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
5346 5347 5348 5349 5350 5351 5352
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
5353
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
5354 5355 5356 5357 5358 5359 5360
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    if (new_size) {
K
Kevin Wolf 已提交
5361 5362
        BlockBackend *blk = blk_new(bdrv_get_aio_context(bs),
                                    BLK_PERM_RESIZE, BLK_PERM_ALL);
5363
        ret = blk_insert_bs(blk, bs, errp);
5364 5365 5366 5367 5368
        if (ret < 0) {
            blk_unref(blk);
            return ret;
        }

5369 5370 5371 5372 5373
        /*
         * Amending image options should ensure that the image has
         * exactly the given new values, so pass exact=true here.
         */
        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
5374
        blk_unref(blk);
M
Max Reitz 已提交
5375 5376 5377 5378 5379
        if (ret < 0) {
            return ret;
        }
    }

5380 5381
    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
5382 5383
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
5384
                              &helper_cb_info, errp);
5385 5386 5387 5388 5389
        if (ret < 0) {
            return ret;
        }
    }

M
Max Reitz 已提交
5390 5391 5392
    return 0;
}

M
Max Reitz 已提交
5393 5394 5395 5396 5397 5398 5399 5400 5401
/*
 * Report an image corruption: print a message to stderr and emit a
 * BLOCK_IMAGE_CORRUPTED event.  @message_format and the variadic arguments
 * are printf-style.
 *
 * If offset or size are negative, respectively, they will not be included in
 * the BLOCK_IMAGE_CORRUPTED event emitted.
 * fatal will be ignored for read-only BDS; corruptions found there will always
 * be considered non-fatal.
 *
 * A fatal corruption additionally marks the image corrupt and makes the BDS
 * unusable.  Once a corruption has been signaled, later non-fatal ones are
 * suppressed, as are fatal ones if the image is already marked corrupt.
 */
void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
                             int64_t size, const char *message_format, ...)
{
    BDRVQcow2State *s = bs->opaque;
    const char *node_name;
    char *message;
    va_list ap;

    fatal = fatal && bdrv_is_writable(bs);

    if (s->signaled_corruption &&
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
    {
        return;
    }

    va_start(ap, message_format);
    message = g_strdup_vprintf(message_format, ap);
    va_end(ap);

    if (fatal) {
        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
                "corruption events will be suppressed\n", message);
    } else {
        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
                "corruption events will be suppressed\n", message);
    }

    node_name = bdrv_get_node_name(bs);
    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
                                          *node_name != '\0', node_name,
                                          message, offset >= 0, offset,
                                          size >= 0, size,
                                          fatal);
    g_free(message);

    if (fatal) {
        qcow2_mark_corrupt(bs);
        bs->drv = NULL; /* make BDS unusable */
    }

    s->signaled_corruption = true;
}

5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454
/*
 * Options accepted when creating a qcow2 image.  The list is terminated by
 * an empty entry.
 */
static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
    .desc = {
        /* Size and format version */
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size",
        },
        {
            .name = BLOCK_OPT_COMPAT_LEVEL,
            .type = QEMU_OPT_STRING,
            .help = "Compatibility level (v2 [0.10] or v3 [1.1])",
        },

        /* Backing file */
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image",
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image",
        },

        /* External data file */
        {
            .name = BLOCK_OPT_DATA_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of an external data file",
        },
        {
            .name = BLOCK_OPT_DATA_FILE_RAW,
            .type = QEMU_OPT_BOOL,
            .help = "The external data file must stay valid as a raw image",
        },

        /* Encryption */
        {
            .name = BLOCK_OPT_ENCRYPT,
            .type = QEMU_OPT_BOOL,
            .help = "Encrypt the image with format 'aes'. (Deprecated "
                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
        },
        {
            .name = BLOCK_OPT_ENCRYPT_FORMAT,
            .type = QEMU_OPT_STRING,
            .help = "Encrypt the image, format choices: 'aes', 'luks'",
        },
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow AES key or LUKS passphrase"),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),

        /* Cluster size, preallocation and refcounts */
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "qcow2 cluster size",
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE),
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, metadata, "
                    "falloc, full)",
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
            .def_value_str = "off",
        },
        {
            .name = BLOCK_OPT_REFCOUNT_BITS,
            .type = QEMU_OPT_NUMBER,
            .help = "Width of a reference count entry in bits",
            .def_value_str = "16",
        },

        { /* end of list */ }
    }
};

5524 5525 5526 5527 5528 5529
/*
 * NULL-terminated list of runtime option names, installed as
 * bdrv_qcow2.strong_runtime_opts.  It currently holds only the "encrypt."
 * prefixed key-secret option.
 * NOTE(review): how the block layer treats "strong" options is defined
 * outside this file -- confirm before adding entries.
 */
static const char *const qcow2_strong_runtime_opts[] = {
    "encrypt." BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,

    NULL
};

5530
BlockDriver bdrv_qcow2 = {
5531
    .format_name        = "qcow2",
5532
    .instance_size      = sizeof(BDRVQcow2State),
5533 5534 5535
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
J
Jeff Cody 已提交
5536
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
5537 5538
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
5539
    .bdrv_join_options    = qcow2_join_options,
5540
    .bdrv_child_perm      = bdrv_format_default_perms,
5541
    .bdrv_co_create_opts  = qcow2_co_create_opts,
5542
    .bdrv_co_create       = qcow2_co_create,
M
Max Reitz 已提交
5543
    .bdrv_has_zero_init   = qcow2_has_zero_init,
5544
    .bdrv_has_zero_init_truncate = bdrv_has_zero_init_1,
5545
    .bdrv_co_block_status = qcow2_co_block_status,
5546

5547
    .bdrv_co_preadv_part    = qcow2_co_preadv_part,
5548
    .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
K
Kevin Wolf 已提交
5549
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
5550

5551
    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
5552
    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
F
Fam Zheng 已提交
5553 5554
    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
5555
    .bdrv_co_truncate       = qcow2_co_truncate,
5556
    .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
M
Max Reitz 已提交
5557
    .bdrv_make_empty        = qcow2_make_empty,
B
Blue Swirl 已提交
5558 5559 5560 5561 5562

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
5563
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
5564
    .bdrv_measure           = qcow2_measure,
5565
    .bdrv_get_info          = qcow2_get_info,
5566
    .bdrv_get_specific_info = qcow2_get_specific_info,
B
Blue Swirl 已提交
5567

5568 5569
    .bdrv_save_vmstate    = qcow2_save_vmstate,
    .bdrv_load_vmstate    = qcow2_load_vmstate,
B
Blue Swirl 已提交
5570

5571
    .supports_backing           = true,
B
Blue Swirl 已提交
5572 5573
    .bdrv_change_backing_file   = qcow2_change_backing_file,

5574
    .bdrv_refresh_limits        = qcow2_refresh_limits,
5575
    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
K
Kevin Wolf 已提交
5576
    .bdrv_inactivate            = qcow2_inactivate,
5577

5578
    .create_opts         = &qcow2_create_opts,
5579
    .strong_runtime_opts = qcow2_strong_runtime_opts,
5580
    .mutable_opts        = mutable_opts,
5581
    .bdrv_co_check       = qcow2_co_check,
C
Chunyan Liu 已提交
5582
    .bdrv_amend_options  = qcow2_amend_options,
5583 5584 5585

    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
5586

5587 5588 5589
    .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
    .bdrv_co_remove_persistent_dirty_bitmap =
            qcow2_co_remove_persistent_dirty_bitmap,
B
Blue Swirl 已提交
5590 5591
};

5592 5593 5594 5595 5596 5597
/* Hand the qcow2 driver description (bdrv_qcow2) to bdrv_register(). */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/*
 * NOTE(review): block_init() is defined outside this file; presumably it
 * arranges for bdrv_qcow2_init() to run during block module initialisation
 * -- confirm against its definition.
 */
block_init(bdrv_qcow2_init);