qcow2.c 154.5 KB
Newer Older
B
bellard 已提交
1 2
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24

P
Peter Maydell 已提交
25
#include "qemu/osdep.h"
26
#include "block/block_int.h"
27
#include "block/qdict.h"
28
#include "sysemu/block-backend.h"
29
#include "qemu/module.h"
B
bellard 已提交
30
#include <zlib.h>
31
#include "qcow2.h"
32
#include "qemu/error-report.h"
33
#include "qapi/error.h"
34
#include "qapi/qapi-events-block-core.h"
M
Markus Armbruster 已提交
35 36
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
K
Kevin Wolf 已提交
37
#include "trace.h"
38
#include "qemu/option_int.h"
39
#include "qemu/cutils.h"
40
#include "qemu/bswap.h"
41 42
#include "qapi/qobject-input-visitor.h"
#include "qapi/qapi-visit-block-core.h"
43
#include "crypto.h"
B
bellard 已提交
44 45 46 47 48 49 50 51

/*
  Differences with QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters which have a reference count of one have the bit
    QCOW_OFLAG_COPIED to optimize write performance.
  - Size of compressed clusters is stored in sectors to reduce bit usage
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
    snapshots.
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/

61 62 63 64

/*
 * Fixed-size header that precedes the payload of every qcow2 header
 * extension.  qcow2_read_extensions() reads it from the image file and
 * converts both fields from big-endian before using them.
 */
typedef struct {
    uint32_t magic;     /* extension type, one of QCOW2_EXT_MAGIC_* */
    uint32_t len;       /* payload length in bytes */
} QEMU_PACKED QCowExtension;
J
Jeff Cody 已提交
66

67 68
/* Magic numbers identifying the header extension types handled in this file */
#define QCOW2_EXT_MAGIC_END             0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT  0xE2792ACA
#define QCOW2_EXT_MAGIC_FEATURE_TABLE   0x6803f857
#define QCOW2_EXT_MAGIC_CRYPTO_HEADER   0x0537be77
#define QCOW2_EXT_MAGIC_BITMAPS         0x23852875
72

73
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
B
bellard 已提交
74 75
{
    const QCowHeader *cow_header = (const void *)buf;
76

B
bellard 已提交
77 78
    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
K
Kevin Wolf 已提交
79
        be32_to_cpu(cow_header->version) >= 2)
B
bellard 已提交
80 81 82 83 84
        return 100;
    else
        return 0;
}

85

86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
/*
 * Read callback handed to the crypto layer: reads @buflen bytes at @offset
 * within the encryption header area described by s->crypto_header.
 *
 * @opaque is the BlockDriverState of the qcow2 image.
 *
 * Returns the result of bdrv_pread() on success, or -1 with @errp set if
 * the request lies outside the header area or the read fails.
 */
static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
                                          uint8_t *buf, size_t buflen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    /* Written without "offset + buflen" so the check cannot wrap around */
    if (offset > s->crypto_header.length ||
        buflen > s->crypto_header.length - offset) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pread(bs->file,
                     s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read encryption header");
        return -1;
    }
    return ret;
}


/*
 * Init callback handed to the crypto layer: allocates clusters for an
 * encryption header of @headerlen bytes, records the location and length in
 * s->crypto_header and zero-fills the tail of the allocation.
 *
 * @opaque is the BlockDriverState of the qcow2 image.
 *
 * Returns the result of bdrv_pwrite_zeroes() on success, or -1 with @errp
 * set if the allocation or the zero fill fails.
 */
static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    int64_t hdr_offset;
    int64_t alloc_len;
    int64_t ret;

    hdr_offset = qcow2_alloc_clusters(bs, headerlen);
    if (hdr_offset < 0) {
        error_setg_errno(errp, -hdr_offset,
                         "Cannot allocate cluster for LUKS header size %zu",
                         headerlen);
        return -1;
    }

    s->crypto_header.length = headerlen;
    s->crypto_header.offset = hdr_offset;

    /*
     * The allocation is a whole number of clusters; zero the part behind
     * the header so it has predictable content in case of future spec
     * changes.
     */
    alloc_len = size_to_clusters(s, headerlen) * s->cluster_size;
    assert(qcow2_pre_write_overlap_check(bs, 0, hdr_offset, alloc_len) == 0);

    ret = bdrv_pwrite_zeroes(bs->file, hdr_offset + headerlen,
                             alloc_len - headerlen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
        return -1;
    }

    return ret;
}


/*
 * Write callback handed to the crypto layer: writes @buflen bytes from @buf
 * at @offset within the encryption header area described by
 * s->crypto_header.
 *
 * @opaque is the BlockDriverState of the qcow2 image.
 *
 * Returns the result of bdrv_pwrite() on success, or -1 with @errp set if
 * the request lies outside the header area or the write fails.
 */
static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
                                           const uint8_t *buf, size_t buflen,
                                           void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    /* Written without "offset + buflen" so the check cannot wrap around */
    if (offset > s->crypto_header.length ||
        buflen > s->crypto_header.length - offset) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pwrite(bs->file,
                      s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        /* This is the write path; the message used to say "read" */
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return -1;
    }
    return ret;
}


167 168 169 170 171 172 173
/* 
 * read qcow2 extension and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 */
174
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
M
Max Reitz 已提交
175
                                 uint64_t end_offset, void **p_feature_table,
176 177
                                 int flags, bool *need_update_header,
                                 Error **errp)
178
{
179
    BDRVQcow2State *s = bs->opaque;
180 181
    QCowExtension ext;
    uint64_t offset;
182
    int ret;
183 184 185 186 187
    Qcow2BitmapHeaderExt bitmaps_ext;

    if (need_update_header != NULL) {
        *need_update_header = false;
    }
188 189

#ifdef DEBUG_EXT
190
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
191 192 193 194 195 196 197
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
198
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
199

D
Dong Xu Wang 已提交
200
        printf("attempting to read extended header in offset %lu\n", offset);
201 202
#endif

203
        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
M
Max Reitz 已提交
204 205 206
        if (ret < 0) {
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
                             "pread fail from offset %" PRIu64, offset);
207 208 209 210 211 212 213 214
            return 1;
        }
        be32_to_cpus(&ext.magic);
        be32_to_cpus(&ext.len);
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
215
        if (offset > end_offset || ext.len > end_offset - offset) {
M
Max Reitz 已提交
216
            error_setg(errp, "Header extension too large");
217 218 219
            return -EINVAL;
        }

220
        switch (ext.magic) {
221
        case QCOW2_EXT_MAGIC_END:
222
            return 0;
223

224
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
225
            if (ext.len >= sizeof(bs->backing_format)) {
226 227 228
                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
                           " too large (>=%zu)", ext.len,
                           sizeof(bs->backing_format));
229 230
                return 2;
            }
231
            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
M
Max Reitz 已提交
232 233 234
            if (ret < 0) {
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
                                 "Could not read format name");
235
                return 3;
M
Max Reitz 已提交
236
            }
237
            bs->backing_format[ext.len] = '\0';
238
            s->image_backing_format = g_strdup(bs->backing_format);
239 240 241 242 243
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            break;

244 245 246
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
247
                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
248
                if (ret < 0) {
M
Max Reitz 已提交
249 250
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
                                     "Could not read table");
251 252 253 254 255 256 257
                    return ret;
                }

                *p_feature_table = feature_table;
            }
            break;

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
            unsigned int cflags = 0;
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp, "CRYPTO header extension only "
                           "expected with LUKS encryption method");
                return -EINVAL;
            }
            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
                error_setg(errp, "CRYPTO header extension size %u, "
                           "but expected size %zu", ext.len,
                           sizeof(Qcow2CryptoHeaderExtension));
                return -EINVAL;
            }

            ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret,
                                 "Unable to read CRYPTO header extension");
                return ret;
            }
            be64_to_cpus(&s->crypto_header.offset);
            be64_to_cpus(&s->crypto_header.length);

            if ((s->crypto_header.offset % s->cluster_size) != 0) {
                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
                           "not a multiple of cluster size '%u'",
                           s->crypto_header.offset, s->cluster_size);
                return -EINVAL;
            }

            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
291
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
292 293 294 295 296 297 298
                                           qcow2_crypto_hdr_read_func,
                                           bs, cflags, errp);
            if (!s->crypto) {
                return -EINVAL;
            }
        }   break;

299 300 301 302 303 304 305 306
        case QCOW2_EXT_MAGIC_BITMAPS:
            if (ext.len != sizeof(bitmaps_ext)) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Invalid extension length");
                return -EINVAL;
            }

            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
307 308 309 310 311 312 313 314 315 316 317
                if (s->qcow_version < 3) {
                    /* Let's be a bit more specific */
                    warn_report("This qcow2 v2 image contains bitmaps, but "
                                "they may have been modified by a program "
                                "without persistent bitmap support; so now "
                                "they must all be considered inconsistent");
                } else {
                    warn_report("a program lacking bitmap support "
                                "modified this file, so all bitmaps are now "
                                "considered inconsistent");
                }
318 319
                error_printf("Some clusters may be leaked, "
                             "run 'qemu-img check -r' on the image "
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
                             "file to fix.");
                if (need_update_header != NULL) {
                    /* Updating is needed to drop invalid bitmap extension. */
                    *need_update_header = true;
                }
                break;
            }

            ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Could not read ext header");
                return ret;
            }

            if (bitmaps_ext.reserved32 != 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Reserved field is not zero");
                return -EINVAL;
            }

            be32_to_cpus(&bitmaps_ext.nb_bitmaps);
            be64_to_cpus(&bitmaps_ext.bitmap_directory_size);
            be64_to_cpus(&bitmaps_ext.bitmap_directory_offset);

            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
                error_setg(errp,
                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
                           "exceeding the QEMU supported maximum of %d",
                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
                return -EINVAL;
            }

            if (bitmaps_ext.nb_bitmaps == 0) {
                error_setg(errp, "found bitmaps extension with zero bitmaps");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) {
                error_setg(errp, "bitmaps_ext: "
                                 "invalid bitmap directory offset");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_size >
                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
                error_setg(errp, "bitmaps_ext: "
                                 "bitmap directory size (%" PRIu64 ") exceeds "
                                 "the maximum supported size (%d)",
                                 bitmaps_ext.bitmap_directory_size,
                                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
                return -EINVAL;
            }

            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
            s->bitmap_directory_offset =
                    bitmaps_ext.bitmap_directory_offset;
            s->bitmap_directory_size =
                    bitmaps_ext.bitmap_directory_size;

#ifdef DEBUG_EXT
            printf("Qcow2: Got bitmaps extension: "
                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
                   s->bitmap_directory_offset, s->nb_bitmaps);
#endif
            break;

387
        default:
388
            /* unknown magic - save it in case we need to rewrite the header */
389 390
            /* If you add a new feature, make sure to also update the fast
             * path of qcow2_make_empty() to deal with it. */
391 392 393 394 395 396 397 398
            {
                Qcow2UnknownHeaderExtension *uext;

                uext = g_malloc0(sizeof(*uext)  + ext.len);
                uext->magic = ext.magic;
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

399
                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
400
                if (ret < 0) {
M
Max Reitz 已提交
401 402
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
                                     "Could not read data");
403 404 405
                    return ret;
                }
            }
406 407
            break;
        }
408 409

        offset += ((ext.len + 7) & ~7);
410 411 412 413 414
    }

    return 0;
}

415 416
/*
 * Unlink and free every entry of s->unknown_header_ext, leaving the list
 * empty.
 */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    while (!QLIST_EMPTY(&s->unknown_header_ext)) {
        Qcow2UnknownHeaderExtension *uext =
            QLIST_FIRST(&s->unknown_header_ext);

        QLIST_REMOVE(uext, next);
        g_free(uext);
    }
}
425

426 427
/*
 * Set @errp to a message listing the incompatible features in @mask.
 *
 * @table: feature name table terminated by an entry with an empty name, or
 *         NULL if no table is available
 * @mask:  bit mask of the features to report
 *
 * Bits that have a matching incompatible-feature entry in @table are listed
 * by name (at most 46 characters each); all remaining bits are reported
 * together as a hexadecimal mask.
 */
static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
                                       uint64_t mask)
{
    char *features = g_strdup("");
    char *prev;

    for (; table && table->name[0] != '\0'; table++) {
        uint64_t bit;

        if (table->type != QCOW2_FEAT_TYPE_INCOMPATIBLE) {
            continue;
        }

        bit = 1ULL << table->bit;
        if (!(mask & bit)) {
            continue;
        }

        /* Append the name, comma-separated from what is already there */
        prev = features;
        features = g_strdup_printf("%s%s%.46s", prev, *prev ? ", " : "",
                                   table->name);
        g_free(prev);
        mask &= ~bit;
    }

    if (mask) {
        /* Whatever is left has no name in the table */
        prev = features;
        features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64,
                                   prev, *prev ? ", " : "", mask);
        g_free(prev);
    }

    error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
    g_free(features);
}

456 457 458 459 460 461 462
/*
 * Sets the dirty bit and flushes afterwards if necessary.
 *
 * The incompatible_features bit is only set if the image file header was
 * updated successfully.  Therefore it is not required to check the return
 * value of this function.
 */
463
int qcow2_mark_dirty(BlockDriverState *bs)
464
{
465
    BDRVQcow2State *s = bs->opaque;
466 467 468 469 470 471 472 473 474 475
    uint64_t val;
    int ret;

    assert(s->qcow_version >= 3);

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        return 0; /* already dirty */
    }

    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
476
    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
477 478 479 480
                      &val, sizeof(val));
    if (ret < 0) {
        return ret;
    }
K
Kevin Wolf 已提交
481
    ret = bdrv_flush(bs->file->bs);
482 483 484 485 486 487 488 489 490
    if (ret < 0) {
        return ret;
    }

    /* Only treat image as dirty if the header was updated successfully */
    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
    return 0;
}

S
Stefan Hajnoczi 已提交
491 492 493 494 495 496 497
/*
 * Clear the dirty bit, flushing the qcow2 caches first when the bit was set.
 * Only call this function when there are no pending requests; it does not
 * guard against concurrent requests dirtying the image.
 *
 * Returns 0 if the image was not dirty, otherwise the result of
 * qcow2_flush_caches() on failure or of qcow2_update_header().
 */
static int qcow2_mark_clean(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (!(s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        return 0;
    }

    s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;

    ret = qcow2_flush_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return qcow2_update_header(bs);
}

M
Max Reitz 已提交
515 516 517 518 519
/*
 * Mark the image as corrupt: set the corrupt bit in the in-memory
 * incompatible features and rewrite the header.
 *
 * Returns the result of qcow2_update_header().
 */
int qcow2_mark_corrupt(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
    ret = qcow2_update_header(bs);

    return ret;
}

/*
 * Mark the image as consistent, i.e. clear the corrupt bit.  When the bit
 * was set, the qcow2 caches are flushed before it is cleared and the header
 * is rewritten.
 *
 * Returns 0 if the image was not marked corrupt, otherwise the result of
 * qcow2_flush_caches() on failure or of qcow2_update_header().
 */
int qcow2_mark_consistent(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (!(s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)) {
        return 0;
    }

    ret = qcow2_flush_caches(bs);
    if (ret < 0) {
        return ret;
    }

    s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
    return qcow2_update_header(bs);
}

546 547 548
/*
 * Run the refcount check on @bs and, when repairing (@fix is non-zero) and
 * the check reports neither check errors nor corruptions, clear the dirty
 * and corrupt marks of the image.
 *
 * The name suggests that the caller holds s->lock; qcow2_co_check() takes
 * it around this call.
 *
 * Returns a negative value if the check or one of the mark operations
 * fails; otherwise the result of the last step that was performed.
 */
static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
                                              BdrvCheckResult *result,
                                              BdrvCheckMode fix)
{
    int ret;

    ret = qcow2_check_refcounts(bs, result, fix);
    if (ret < 0) {
        return ret;
    }

    if (!fix || result->check_errors != 0 || result->corruptions != 0) {
        /* Nothing was repaired, or problems remain: leave the marks alone */
        return ret;
    }

    ret = qcow2_mark_clean(bs);
    if (ret < 0) {
        return ret;
    }
    return qcow2_mark_consistent(bs);
}

565 566 567 568 569 570 571 572 573 574 575 576 577
/*
 * Image check entry point: runs qcow2_co_check_locked() while holding
 * s->lock and returns its result.
 */
static int coroutine_fn qcow2_co_check(BlockDriverState *bs,
                                       BdrvCheckResult *result,
                                       BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&s->lock);
    rc = qcow2_co_check_locked(bs, result, fix);
    qemu_co_mutex_unlock(&s->lock);

    return rc;
}

578 579 580 581
/*
 * Validate the location and size of a table stored in the image.
 *
 * @offset:         offset of the table in the image file
 * @entries:        number of entries in the table
 * @entry_len:      size of one entry in bytes
 * @max_size_bytes: maximum allowed table size in bytes
 * @table_name:     name used in the error message
 *
 * Returns 0 if the table is acceptable, -EFBIG if @entries exceeds
 * @max_size_bytes / @entry_len, and -EINVAL if the table would end beyond
 * INT64_MAX or offset_into_cluster() is non-zero for @offset.  @errp is set
 * on failure.
 */
int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
                         uint64_t entries, size_t entry_len,
                         int64_t max_size_bytes, const char *table_name,
                         Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    bool past_limit, misaligned;

    if (entries > max_size_bytes / entry_len) {
        error_setg(errp, "%s too large", table_name);
        return -EFBIG;
    }

    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
     * because values will be passed to qemu functions taking int64_t. */
    past_limit = INT64_MAX - entries * entry_len < offset;
    misaligned = offset_into_cluster(s, offset) != 0;

    if (past_limit || misaligned) {
        error_setg(errp, "%s offset invalid", table_name);
        return -EINVAL;
    }

    return 0;
}

601 602 603 604 605
static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    .desc = {
        {
606
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
607 608 609
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
        },
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625
        {
            .name = QCOW2_OPT_DISCARD_REQUEST,
            .type = QEMU_OPT_BOOL,
            .help = "Pass guest discard requests to the layer below",
        },
        {
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when snapshot related space "
                    "is freed",
        },
        {
            .name = QCOW2_OPT_DISCARD_OTHER,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when other clusters are freed",
        },
M
Max Reitz 已提交
626 627 628 629 630 631
        {
            .name = QCOW2_OPT_OVERLAP,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
632 633 634 635 636 637
        {
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
M
Max Reitz 已提交
638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the main qcow2 header",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the active L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an active L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the refcount table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into a refcount block",
        },
        {
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the snapshot table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
678 679 680 681 682 683 684 685 686 687 688
        {
            .name = QCOW2_OPT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
                    "cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum L2 table cache size",
        },
689 690 691 692 693
        {
            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Size of each entry in the L2 cache",
        },
694 695 696 697 698
        {
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum refcount block cache size",
        },
699 700 701 702 703
        {
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
            .type = QEMU_OPT_NUMBER,
            .help = "Clean unused cache entries after this time (in seconds)",
        },
704 705
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
706 707 708 709
        { /* end of list */ }
    },
};

710 711 712 713 714 715 716 717 718 719 720
/*
 * Names of the boolean overlap-check runtime options, indexed by the
 * QCOW2_OL_*_BITNR constant each one corresponds to.
 */
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
};

721 722 723
static void cache_clean_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
724
    BDRVQcow2State *s = bs->opaque;
725 726
    qcow2_cache_clean_unused(s->l2_table_cache);
    qcow2_cache_clean_unused(s->refcount_block_cache);
727 728 729 730 731 732
    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
              (int64_t) s->cache_clean_interval * 1000);
}

/*
 * Create the cache cleaning timer in @context and arm it for the first
 * time.  Does nothing when s->cache_clean_interval is 0.
 */
static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t interval_ms;

    if (s->cache_clean_interval <= 0) {
        return;
    }

    s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
                                         SCALE_MS, cache_clean_timer_cb,
                                         bs);

    /* The interval is kept in seconds, the timer runs in milliseconds */
    interval_ms = (int64_t) s->cache_clean_interval * 1000;
    timer_mod(s->cache_clean_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + interval_ms);
}

/*
 * Stop and free the cache cleaning timer, if there is one, and reset
 * s->cache_clean_timer to NULL.
 */
static void cache_clean_timer_del(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (!s->cache_clean_timer) {
        return;
    }

    timer_del(s->cache_clean_timer);
    timer_free(s->cache_clean_timer);
    s->cache_clean_timer = NULL;
}

/* AioContext detach hook: deletes the cache cleaning timer of @bs. */
static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}

/*
 * AioContext attach hook: (re)creates the cache cleaning timer of @bs in
 * @new_context.
 */
static void qcow2_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    cache_clean_timer_init(bs, new_context);
}

M
Max Reitz 已提交
764 765
/*
 * Work out the metadata cache sizes from the runtime options in @opts.
 *
 * Three byte-size options interact here: the combined cache size, the L2
 * cache size and the refcount cache size.
 *  - If the combined size is given, at most one of the two individual sizes
 *    may be given as well; the other one receives the remainder. If neither
 *    is given, the L2 cache gets as much as it can use for the whole virtual
 *    disk and the refcount cache gets the rest (or, if there is not enough
 *    for that, the refcount cache gets its minimum and the L2 cache the
 *    rest).
 *  - If the combined size is not given, each individual size that is not
 *    set falls back to its default.
 *
 * The L2 cache entry size defaults to the cluster size and must be a power
 * of two between 1 << MIN_CLUSTER_BITS and the cluster size.
 *
 * @l2_cache_size:       returns the L2 cache size in bytes
 * @l2_cache_entry_size: returns the size of one L2 cache entry in bytes
 * @refcount_cache_size: returns the refcount cache size in bytes
 * @errp:                set on invalid option combinations; the output
 *                       values must not be used in that case
 */
static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                             uint64_t *l2_cache_size,
                             uint64_t *l2_cache_entry_size,
                             uint64_t *refcount_cache_size, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t combined_cache_size;
    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
    int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;

    /* Remember which options were given explicitly... */
    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    /* ...and read their values (0 if absent) */
    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
    *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
    *refcount_cache_size = qemu_opt_get_size(opts,
                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);

    *l2_cache_entry_size = qemu_opt_get_size(
        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);

    if (combined_cache_size_set) {
        if (l2_cache_size_set && refcount_cache_size_set) {
            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
                       "at the same time");
            return;
        } else if (*l2_cache_size > combined_cache_size) {
            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        } else if (*refcount_cache_size > combined_cache_size) {
            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        }

        if (l2_cache_size_set) {
            *refcount_cache_size = combined_cache_size - *l2_cache_size;
        } else if (refcount_cache_size_set) {
            *l2_cache_size = combined_cache_size - *refcount_cache_size;
        } else {
            uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
            uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);

            /* Assign as much memory as possible to the L2 cache, and
             * use the remainder for the refcount cache */
            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
                *l2_cache_size = max_l2_cache;
                *refcount_cache_size = combined_cache_size - *l2_cache_size;
            } else {
                *refcount_cache_size =
                    MIN(combined_cache_size, min_refcount_cache);
                *l2_cache_size = combined_cache_size - *refcount_cache_size;
            }
        }
    } else {
        if (!l2_cache_size_set) {
            *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
                                 (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
                                 * s->cluster_size);
        }
        if (!refcount_cache_size_set) {
            *refcount_cache_size = min_refcount_cache;
        }
    }

    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
        *l2_cache_entry_size > s->cluster_size ||
        !is_power_of_2(*l2_cache_entry_size)) {
        error_setg(errp, "L2 cache entry size must be a power of two "
                   "between %d and the cluster size (%d)",
                   1 << MIN_CLUSTER_BITS, s->cluster_size);
        return;
    }
}

typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache;
    Qcow2Cache *refcount_block_cache;
845
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
846 847 848 849
    bool use_lazy_refcounts;
    int overlap_check;
    bool discard_passthrough[QCOW2_DISCARD_MAX];
    uint64_t cache_clean_interval;
850
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
851 852 853 854 855 856
} Qcow2ReopenState;

/*
 * First phase of a runtime option update: parse @options and build the new
 * settings in @r without modifying the live state in bs->opaque.
 *
 * All "encrypt."-prefixed keys and all keys known to qcow2_runtime_opts are
 * removed from @options. New metadata caches are allocated (after flushing
 * the current ones, if any) and stored in @r together with the parsed
 * lazy-refcounts, overlap-check, discard and encryption settings.
 *
 * The existing image may be written to: the old caches are flushed and, when
 * lazy refcounts go from enabled to disabled, the image is marked clean.
 *
 * @flags: BDRV_O_* open flags; BDRV_O_UNMAP provides the default for
 *         passing through discard requests
 *
 * Returns 0 on success or a negative errno with @errp set. In both cases
 * @r may own resources; the caller must follow up with
 * qcow2_update_options_commit() or qcow2_update_options_abort().
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QemuOpts *opts = NULL;
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
    int i;
    const char *encryptfmt;
    QDict *encryptopts = NULL;
    Error *local_err = NULL;
    int ret;

    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
                     &refcount_cache_size, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* Convert the byte sizes into a number of cache entries */
    l2_cache_size /= l2_cache_entry_size;
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
                                           l2_cache_entry_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
                                                 s->cluster_size);
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
    r->cache_clean_interval =
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
                            s->cache_clean_interval);
#ifndef CONFIG_LINUX
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
    if (r->cache_clean_interval > UINT_MAX) {
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

    /* lazy-refcounts; flush if going from enabled to disabled */
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }

    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

    /* Overlap check options */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    if (!opt_overlap_check) {
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

    r->overlap_check = 0;
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
        r->overlap_check |=
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

    /* The encryption format given in the options, if any, must agree with
     * the crypt method recorded in the image header */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "qcow");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "luks");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    default:
        /* r->crypto_opts stays NULL, so the check below fails the update */
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        break;
    }
    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;
fail:
    /* Shared exit path: the temporaries are released on success as well */
    qobject_unref(encryptopts);
    qemu_opts_del(opts);
    opts = NULL;
    return ret;
}

/*
 * Second phase of a runtime option update: make the settings prepared in @r
 * the live ones.
 *
 * The old metadata caches and the old encryption options are destroyed and
 * replaced by the ones from @r, whose ownership moves to the driver state.
 * The cache cleanup timer is recreated only if the interval changed.
 */
static void qcow2_update_options_commit(BlockDriverState *bs,
                                        Qcow2ReopenState *r)
{
    BDRVQcow2State *s = bs->opaque;
    int i;

    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    s->l2_table_cache = r->l2_table_cache;
    s->refcount_block_cache = r->refcount_block_cache;
    s->l2_slice_size = r->l2_slice_size;

    s->overlap_check = r->overlap_check;
    s->use_lazy_refcounts = r->use_lazy_refcounts;

    for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
        s->discard_passthrough[i] = r->discard_passthrough[i];
    }

    if (s->cache_clean_interval != r->cache_clean_interval) {
        cache_clean_timer_del(bs);
        s->cache_clean_interval = r->cache_clean_interval;
        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
    }

    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    s->crypto_opts = r->crypto_opts;
}

/*
 * Roll back a prepared runtime option update: release the caches and the
 * encryption options that qcow2_update_options_prepare() stored in @r.
 * The live driver state is left as it is.
 */
static void qcow2_update_options_abort(BlockDriverState *bs,
                                       Qcow2ReopenState *r)
{
    if (r->l2_table_cache) {
        qcow2_cache_destroy(r->l2_table_cache);
    }
    if (r->refcount_block_cache) {
        qcow2_cache_destroy(r->refcount_block_cache);
    }
    qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
}

/*
 * Parse the runtime options in @options and apply them to @bs in one go:
 * prepare the new settings, then commit them on success or abort on error.
 *
 * Returns the result of qcow2_update_options_prepare(): 0 on success or a
 * negative errno with @errp set.
 */
static int qcow2_update_options(BlockDriverState *bs, QDict *options,
                                int flags, Error **errp)
{
    Qcow2ReopenState r = {};
    int ret;

    ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
    if (ret >= 0) {
        qcow2_update_options_commit(bs, &r);
    } else {
        qcow2_update_options_abort(bs, &r);
    }

    return ret;
}

/*
 * Open a qcow2 image: read and validate the header from bs->file, set up
 * the driver state in bs->opaque, load the L1 table, apply the runtime
 * options, then read header extensions, the backing file name, snapshots
 * and dirty bitmaps. A dirty image opened read/write is repaired.
 *
 * Called with s->lock held.
 *
 * Returns a non-negative value on success. On failure a negative errno is
 * returned, @errp is set and everything allocated so far is released.
 */
static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
                                      int flags, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
    bool update_header = false;
    bool header_updated = false;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
        goto fail;
    }
    /* The header is stored big-endian on disk */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image is not in qcow2 format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

    /* Initialise cluster size */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
                   header.cluster_bits);
        ret = -EINVAL;
        goto fail;
    }

    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - BDRV_SECTOR_BITS);

    /* Initialise version 3 header fields */
    if (header.version == 2) {
        /* Version 2 headers lack these fields; fill in fixed values */
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
        be64_to_cpus(&header.incompatible_features);
        be64_to_cpus(&header.compatible_features);
        be64_to_cpus(&header.autoclear_features);
        be32_to_cpus(&header.refcount_order);
        be32_to_cpus(&header.header_length);

        if (header.header_length < 104) {
            error_setg(errp, "qcow2 header too short");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (header.header_length > s->cluster_size) {
        error_setg(errp, "qcow2 header exceeds cluster size");
        ret = -EINVAL;
        goto fail;
    }

    /* Keep header bytes beyond the fields known to this driver */
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
                             "fields");
            goto fail;
        }
    }

    if (header.backing_file_offset > s->cluster_size) {
        error_setg(errp, "Invalid backing file offset");
        ret = -EINVAL;
        goto fail;
    }

    /* Header extensions end at the backing file name if there is one,
     * otherwise at the end of the first cluster */
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
        /* The extensions are read only to obtain the feature table used
         * for the error message; the result of that read is not checked */
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, flags, NULL, NULL);
        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
        g_free(feature_table);
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* Corrupt images may not be written to unless they are being repaired
         */
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
                       "read/write");
            ret = -EACCES;
            goto fail;
        }
    }

    /* Check support for various header values */
    if (header.refcount_order > 6) {
        error_setg(errp, "Reference count entry width too large; may not "
                   "exceed 64 bits");
        ret = -EINVAL;
        goto fail;
    }
    s->refcount_order = header.refcount_order;
    s->refcount_bits = 1 << s->refcount_order;
    /* Computed in two steps so that a 64 bit width does not overflow */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow2 images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow2, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
        }

        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            s->crypt_physical_offset = false;
        } else {
            /* Assuming LUKS and any future crypt methods we
             * add will all use physical offsets, due to the
             * fact that the alternative is insecure...  */
            s->crypt_physical_offset = true;
        }

        bs->encrypted = true;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;

    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
        error_setg(errp, "Image does not contain a reference count table");
        ret = -EINVAL;
        goto fail;
    }

    ret = qcow2_validate_table(bs, s->refcount_table_offset,
                               header.refcount_table_clusters,
                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
                               "Reference count table", errp);
    if (ret < 0) {
        goto fail;
    }

    /* The total size in bytes of the snapshot table is checked in
     * qcow2_read_snapshots() because the size of each snapshot is
     * variable and we don't know it yet.
     * Here we only check the offset and number of snapshots. */
    ret = qcow2_validate_table(bs, header.snapshots_offset,
                               header.nb_snapshots,
                               sizeof(QCowSnapshotHeader),
                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
                               "Snapshot table", errp);
    if (ret < 0) {
        goto fail;
    }

    /* read the level 1 table */
    ret = qcow2_validate_table(bs, header.l1_table_offset,
                               header.l1_size, sizeof(uint64_t),
                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
    if (ret < 0) {
        goto fail;
    }
    s->l1_size = header.l1_size;
    s->l1_table_offset = header.l1_table_offset;

    l1_vm_state_index = size_to_l1(s, header.size);
    if (l1_vm_state_index > INT_MAX) {
        error_setg(errp, "Image is too big");
        ret = -EFBIG;
        goto fail;
    }
    s->l1_vm_state_index = l1_vm_state_index;

    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        error_setg(errp, "L1 table is too small");
        ret = -EINVAL;
        goto fail;
    }

    if (s->l1_size > 0) {
        s->l1_table = qemu_try_blockalign(bs->file->bs,
            ROUND_UP(s->l1_size * sizeof(uint64_t), 512));
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
            goto fail;
        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read L1 table");
            goto fail;
        }
        for(i = 0;i < s->l1_size; i++) {
            be64_to_cpus(&s->l1_table[i]);
        }
    }

    /* Parse driver-specific options */
    ret = qcow2_update_options(bs, options, flags, errp);
    if (ret < 0) {
        goto fail;
    }

    s->cluster_cache_offset = -1;
    s->flags = flags;

    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
        goto fail;
    }

    QLIST_INIT(&s->cluster_allocs);
    QTAILQ_INIT(&s->discards);

    /* read qcow2 extensions */
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
                              flags, &update_header, &local_err)) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* qcow2_read_extensions() may have set up the crypto context if the
     * crypt method needs a header region. Some methods don't need header
     * extensions, so this must be checked here.
     */
    if (s->crypt_method_header && !s->crypto) {
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            unsigned int cflags = 0;
            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
                                           NULL, NULL, cflags, errp);
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else if (!(flags & BDRV_O_NO_IO)) {
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
                       s->crypt_method_header);
            ret = -EINVAL;
            goto fail;
        }
    }

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        /* The name must fit into the first cluster and into the buffer,
         * leaving room for the terminating NUL */
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
            len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read backing file name");
            goto fail;
        }
        bs->backing_file[len] = '\0';
        s->image_backing_file = g_strdup(bs->backing_file);
    }

    /* Internal snapshots */
    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read snapshots");
        goto fail;
    }

    /* Clear unknown autoclear feature bits */
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
    /* The header is only rewritten for writable, active images */
    update_header =
        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
    if (update_header) {
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
    }

    if (s->dirty_bitmaps_loaded) {
        /* It's some kind of reopen. There are no known cases where we need to
         * reload bitmaps in such a situation, so it's safer to skip them.
         *
         * Moreover, if we have some readonly bitmaps and we are reopening for
         * rw we should reopen bitmaps correspondingly.
         */
        if (bdrv_has_readonly_bitmaps(bs) &&
            !bdrv_is_read_only(bs) && !(bdrv_get_flags(bs) & BDRV_O_INACTIVE))
        {
            qcow2_reopen_bitmaps_rw_hint(bs, &header_updated, &local_err);
        }
    } else {
        header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);
        s->dirty_bitmaps_loaded = true;
    }
    /* No need to write the header again if the bitmap code already did */
    update_header = update_header && !header_updated;
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    if (update_header) {
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
            goto fail;
        }
    }

    bs->supported_zero_flags = header.version >= 3 ? BDRV_REQ_MAY_UNMAP : 0;

    /* Repair image if dirty */
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

        ret = qcow2_co_check_locked(bs, &result,
                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0 || result.check_errors) {
            if (ret >= 0) {
                ret = -EIO;
            }
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
        }
    }

#ifdef DEBUG_ALLOC
    {
        BdrvCheckResult result = {0};
        qcow2_check_refcounts(bs, &result, 0);
    }
#endif
    return ret;

 fail:
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    cache_clean_timer_del(bs);
    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    qcrypto_block_free(s->crypto);
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    return ret;
}

1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583
/*
 * Argument/result bundle handed by qcow2_open() to the coroutine entry point
 * qcow2_open_entry().
 */
typedef struct QCow2OpenCo {
    /* The following four fields are forwarded verbatim to qcow2_do_open() */
    BlockDriverState *bs;
    QDict *options;
    int flags;
    Error **errp;
    /*
     * Receives the return value of qcow2_do_open(). qcow2_open() presets it
     * to -EINPROGRESS and polls until it changes.
     */
    int ret;
} QCow2OpenCo;

/*
 * Coroutine entry point used by qcow2_open().
 *
 * @opaque is a QCow2OpenCo. qcow2_do_open() is run with s->lock held and its
 * return value is stored in the ret field of that structure.
 */
static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *args = opaque;
    BDRVQcow2State *state = args->bs->opaque;

    qemu_co_mutex_lock(&state->lock);
    args->ret = qcow2_do_open(args->bs, args->options, args->flags,
                              args->errp);
    qemu_co_mutex_unlock(&state->lock);
}

1584 1585 1586
/*
 * Opens the "file" child of @bs, initialises the state lock and then runs
 * qcow2_open_entry() -- directly when already in coroutine context, otherwise
 * in a freshly created coroutine that is polled until it has finished.
 *
 * Returns -EINVAL if the "file" child cannot be opened, otherwise the value
 * that qcow2_open_entry() stored in the request structure.
 */
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2OpenCo qoc = {
        .ret = -EINPROGRESS,
        .errp = errp,
        .flags = flags,
        .options = options,
        .bs = bs,
    };

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (bs->file == NULL) {
        return -EINVAL;
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

    if (!qemu_in_coroutine()) {
        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
    } else {
        /* From bdrv_co_create.  */
        qcow2_open_entry(&qoc);
    }

    return qoc.ret;
}

1615
/*
 * Fills in the block limits of @bs: write-zeroes and discard alignment are
 * set to the cluster size, and for encrypted images the request alignment is
 * set to one sector. @errp is not used.
 */
static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;

    bs->bl.pdiscard_alignment = s->cluster_size;
    bs->bl.pwrite_zeroes_alignment = s->cluster_size;

    if (bs->encrypted) {
        /* Encryption works on a sector granularity */
        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
    }
}

J
Jeff Cody 已提交
1627 1628 1629
/*
 * Prepare stage of a reopen.
 *
 * Allocates a Qcow2ReopenState, stores it in state->opaque and lets
 * qcow2_update_options_prepare() fill it in. When the image is being
 * reopened without BDRV_O_RDWR, bitmaps are switched to read-only, the node
 * is flushed and the image is marked clean.
 *
 * Returns 0 on success. On failure the option update is aborted, the
 * Qcow2ReopenState is freed, state->opaque is reset to NULL (so that no
 * dangling pointer is left behind) and a negative errno is returned.
 */
static int qcow2_reopen_prepare(BDRVReopenState *state,
                                BlockReopenQueue *queue, Error **errp)
{
    Qcow2ReopenState *r;
    int ret;

    r = g_new0(Qcow2ReopenState, 1);
    state->opaque = r;

    ret = qcow2_update_options_prepare(state->bs, r, state->options,
                                       state->flags, errp);
    if (ret < 0) {
        goto fail;
    }

    /* We need to write out any unwritten data if we reopen read-only. */
    if ((state->flags & BDRV_O_RDWR) == 0) {
        ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
        if (ret < 0) {
            goto fail;
        }

        ret = bdrv_flush(state->bs);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_mark_clean(state->bs);
        if (ret < 0) {
            goto fail;
        }
    }

    return 0;

fail:
    qcow2_update_options_abort(state->bs, r);
    g_free(r);
    /* r is gone; do not leave a stale pointer in the reopen state */
    state->opaque = NULL;
    return ret;
}

/*
 * Commit stage of a reopen: applies the prepared option update stored in
 * state->opaque and then frees it.
 */
static void qcow2_reopen_commit(BDRVReopenState *state)
{
    void *prepared = state->opaque;

    qcow2_update_options_commit(state->bs, prepared);
    g_free(prepared);
}

/*
 * Abort stage of a reopen: discards the prepared option update stored in
 * state->opaque and then frees it.
 */
static void qcow2_reopen_abort(BDRVReopenState *state)
{
    void *prepared = state->opaque;

    qcow2_update_options_abort(state->bs, prepared);
    g_free(prepared);
}

1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725
/*
 * Merges @old_options into @options for a reopen, resolving conflicts
 * between related qcow2 options:
 *
 *  - a new overlap-check option or template removes every old overlap-check
 *    option before merging;
 *  - a new total cache size removes the old L2 and refcount cache sizes
 *    before merging;
 *  - if, after merging, all three cache size options are present and the
 *    total cache size was not newly given, the (old) total is dropped.
 *
 * @old_options is modified in the process.
 */
static void qcow2_join_options(QDict *options, QDict *old_options)
{
    bool has_new_overlap_template =
        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
    bool has_new_total_cache_size =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
    bool has_all_cache_options;

    /* New overlap template overrides all old overlap options */
    if (has_new_overlap_template) {
        qdict_del(old_options, QCOW2_OPT_OVERLAP);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
    }

    /* New total cache size overrides all old options */
    if (has_new_total_cache_size) {
        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    }

    qdict_join(options, old_options, false);

    /*
     * If after merging all cache size options are set, an old total size is
     * overwritten. Do keep all options, however, if all three are new. The
     * resulting error message is what we want to happen.
     *
     * All three keys must be present for this to apply, hence the logical
     * AND: an inherited total size on its own (or together with only one of
     * the other two) must be preserved.
     */
    has_all_cache_options =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    if (has_all_cache_options && !has_new_total_cache_size) {
        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
    }
}

1726 1727 1728 1729 1730
/*
 * Reports the allocation status of the range starting at @offset.
 *
 * At most MIN(INT_MAX, @count) bytes are looked up through
 * qcow2_get_cluster_offset() with s->lock held; the number of bytes the
 * answer is valid for is stored in *@pnum.
 *
 * *@map and *@file are only set (and BDRV_BLOCK_OFFSET_VALID reported) when
 * the lookup yielded a non-zero cluster offset, the cluster is not
 * compressed and the image has no crypto context.
 *
 * Returns a combination of BDRV_BLOCK_* flags, or the negative value
 * returned by qcow2_get_cluster_offset() on failure.
 */
static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
                                              bool want_zero,
                                              int64_t offset, int64_t count,
                                              int64_t *pnum, int64_t *map,
                                              BlockDriverState **file)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t host_offset;
    unsigned int nbytes = MIN(INT_MAX, count);
    int cluster_type;
    int status = 0;

    qemu_co_mutex_lock(&s->lock);
    cluster_type = qcow2_get_cluster_offset(bs, offset, &nbytes, &host_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (cluster_type < 0) {
        return cluster_type;
    }

    *pnum = nbytes;

    if (host_offset != 0 && cluster_type != QCOW2_CLUSTER_COMPRESSED &&
        !s->crypto) {
        int index_in_cluster = offset & (s->cluster_size - 1);

        *map = host_offset | index_in_cluster;
        *file = bs->file->bs;
        status |= BDRV_BLOCK_OFFSET_VALID;
    }

    switch (cluster_type) {
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        status |= BDRV_BLOCK_ZERO;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* neither data nor zero */
        break;
    default:
        status |= BDRV_BLOCK_DATA;
        break;
    }

    return status;
}

F
Fam Zheng 已提交
1763 1764 1765 1766 1767 1768 1769 1770 1771 1772
/*
 * Walks the QCowL2Meta list *@pl2meta, finishing and freeing each entry.
 *
 * For every entry: if @link_l2 is set, qcow2_alloc_cluster_link_l2() is
 * called first; then the entry is taken off the in-flight list (when it
 * covers at least one cluster), its dependent requests are restarted and it
 * is freed.
 *
 * Processing stops at the first entry for which linking returns non-zero.
 * On return *@pl2meta points at the first unprocessed entry (NULL when the
 * whole list was handled). Returns 0 or the non-zero linking result.
 */
static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
                                            QCowL2Meta **pl2meta,
                                            bool link_l2)
{
    QCowL2Meta *cur;
    QCowL2Meta *following;
    int ret = 0;

    for (cur = *pl2meta; cur != NULL; cur = following) {
        if (link_l2) {
            ret = qcow2_alloc_cluster_link_l2(bs, cur);
            if (ret) {
                break;
            }
        }

        /* Take the request off the list of running requests */
        if (cur->nb_clusters != 0) {
            QLIST_REMOVE(cur, next_in_flight);
        }

        qemu_co_queue_restart_all(&cur->dependent_requests);

        following = cur->next;
        g_free(cur);
    }

    *pl2meta = cur;
    return ret;
}

K
Kevin Wolf 已提交
1796 1797 1798
/*
 * Reads @bytes bytes starting at guest offset @offset into @qiov.
 *
 * The request is processed in chunks. For each chunk
 * qcow2_get_cluster_offset() determines the cluster type and how many bytes
 * can be handled at once:
 *
 *  - unallocated: read from the backing file if there is one, else zeroes;
 *  - zero clusters: zeroes;
 *  - compressed: decompress the cluster and copy from s->cluster_cache;
 *  - normal: read from bs->file; for encrypted images the data is read into
 *    a temporary contiguous buffer, decrypted there and then copied to @qiov.
 *
 * s->lock is held except while reading from bs->backing or bs->file.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                        uint64_t bytes, QEMUIOVector *qiov,
                                        int flags)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector hd_qiov;
    uint8_t *crypt_buf = NULL;
    uint64_t host_offset = 0;
    uint64_t done = 0;
    unsigned int chunk; /* number of bytes in current iteration */
    int in_cluster;
    int ret;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    for (; bytes != 0; bytes -= chunk, offset += chunk, done += chunk) {
        /* prepare next request */
        chunk = MIN(bytes, INT_MAX);
        if (s->crypto) {
            chunk = MIN(chunk, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        ret = qcow2_get_cluster_offset(bs, offset, &chunk, &host_offset);
        if (ret < 0) {
            goto fail;
        }

        in_cluster = offset_into_cluster(s, offset);

        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, done, chunk);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (!bs->backing) {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&hd_qiov, 0, 0, chunk);
                break;
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->backing, offset, chunk, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            qemu_iovec_memset(&hd_qiov, 0, 0, chunk);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, host_offset);
            if (ret < 0) {
                goto fail;
            }

            qemu_iovec_from_buf(&hd_qiov, 0, s->cluster_cache + in_cluster,
                                chunk);
            break;

        case QCOW2_CLUSTER_NORMAL:
            if ((host_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (bs->encrypted) {
                assert(s->crypto);

                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!crypt_buf) {
                    crypt_buf = qemu_try_blockalign(bs->file->bs,
                                                    QCOW_MAX_CRYPT_CLUSTERS
                                                    * s->cluster_size);
                    if (crypt_buf == NULL) {
                        ret = -ENOMEM;
                        goto fail;
                    }
                }

                assert(chunk <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                qemu_iovec_reset(&hd_qiov);
                qemu_iovec_add(&hd_qiov, crypt_buf, chunk);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->file, host_offset + in_cluster,
                                 chunk, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }

            if (bs->encrypted) {
                assert(s->crypto);
                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
                assert((chunk & (BDRV_SECTOR_SIZE - 1)) == 0);
                if (qcrypto_block_decrypt(s->crypto,
                                          (s->crypt_physical_offset ?
                                           host_offset + in_cluster :
                                           offset),
                                          crypt_buf,
                                          chunk,
                                          NULL) < 0) {
                    ret = -EIO;
                    goto fail;
                }
                /* hand the decrypted data to the caller's vector */
                qemu_iovec_from_buf(qiov, done, crypt_buf, chunk);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(crypt_buf);

    return ret;
}

1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
/*
 * Check whether a guest write of @bytes bytes at @offset can be merged with
 * the writing of the COW regions of one of the entries in @l2meta.
 *
 * An entry qualifies when it has at least one non-empty COW region, the
 * write starts right where its start COW region ends, the write ends right
 * where its end COW region begins, and @hd_qiov still has room for two more
 * elements without exceeding IOV_MAX.
 *
 * For the first qualifying entry, data_qiov is set to @hd_qiov and true is
 * returned. Returns false if no entry qualifies.
 */
static bool merge_cow(uint64_t offset, unsigned bytes,
                      QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
{
    QCowL2Meta *entry = l2meta;

    while (entry != NULL) {
        /* If both COW regions are empty then there's nothing to merge */
        bool has_cow = !(entry->cow_start.nb_bytes == 0 &&
                         entry->cow_end.nb_bytes == 0);

        if (has_cow &&
            /* The data (middle) region must be immediately after the
             * start region */
            l2meta_cow_start(entry) + entry->cow_start.nb_bytes == offset &&
            /* The end region must be immediately after the data (middle)
             * region */
            entry->offset + entry->cow_end.offset == offset + bytes &&
            /* Adding both COW regions to the QEMUIOVector must not
             * exceed IOV_MAX */
            hd_qiov->niov <= IOV_MAX - 2) {
            entry->data_qiov = hd_qiov;
            return true;
        }

        entry = entry->next;
    }

    return false;
}

K
Kevin Wolf 已提交
1982 1983 1984
/*
 * Writes @bytes bytes from @qiov to guest offset @offset.
 *
 * The request is processed in chunks. For each chunk clusters are allocated
 * with qcow2_alloc_cluster_offset(); for encrypted images the data is first
 * copied into a temporary buffer and encrypted there. After the pre-write
 * overlap check, the data is either merged with pending COW writes (see
 * merge_cow()) or written to bs->file right away, and finally the L2 entries
 * are linked through qcow2_handle_l2meta().
 *
 * s->lock is held except while writing to bs->file.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector hd_qiov;
    QCowL2Meta *l2meta = NULL;
    uint8_t *crypt_buf = NULL;
    uint64_t host_offset;
    uint64_t done = 0;
    unsigned int chunk; /* number of bytes in current iteration */
    int in_cluster;
    int ret;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {
        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        in_cluster = offset_into_cluster(s, offset);
        chunk = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            chunk = MIN(chunk,
                        QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                        - in_cluster);
        }

        ret = qcow2_alloc_cluster_offset(bs, offset, &chunk,
                                         &host_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((host_offset & 511) == 0);

        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, done, chunk);

        if (bs->encrypted) {
            assert(s->crypto);
            if (!crypt_buf) {
                crypt_buf = qemu_try_blockalign(bs->file->bs,
                                                QCOW_MAX_CRYPT_CLUSTERS
                                                * s->cluster_size);
                if (crypt_buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }

            assert(hd_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&hd_qiov, 0, crypt_buf, hd_qiov.size);

            if (qcrypto_block_encrypt(s->crypto,
                                      (s->crypt_physical_offset ?
                                       host_offset + in_cluster :
                                       offset),
                                      crypt_buf,
                                      chunk, NULL) < 0) {
                ret = -EIO;
                goto fail;
            }

            /* from here on, write out the encrypted copy */
            qemu_iovec_reset(&hd_qiov);
            qemu_iovec_add(&hd_qiov, crypt_buf, chunk);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
                host_offset + in_cluster, chunk);
        if (ret < 0) {
            goto fail;
        }

        /* If we need to do COW, check if it's possible to merge the
         * writing of the guest data together with that of the COW regions.
         * If it's not possible (or not necessary) then write the
         * guest data now. */
        if (!merge_cow(offset, chunk, &hd_qiov, l2meta)) {
            qemu_co_mutex_unlock(&s->lock);
            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
            trace_qcow2_writev_data(qemu_coroutine_self(),
                                    host_offset + in_cluster);
            ret = bdrv_co_pwritev(bs->file, host_offset + in_cluster,
                                  chunk, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
        }

        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= chunk;
        offset += chunk;
        done += chunk;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), chunk);
    }
    ret = 0;

fail:
    /* release whatever is left on the l2meta list without linking it */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(crypt_buf);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

K
Kevin Wolf 已提交
2106 2107 2108 2109
/*
 * Writes out pending state of @bs: stores persistent dirty bitmaps and
 * flushes the L2 table cache and the refcount block cache. Every failure is
 * reported with error_report() and the remaining steps are still attempted.
 * If nothing failed, the image is marked clean.
 *
 * Returns 0 if all steps succeeded, otherwise the error code of the last
 * step that failed (-EINVAL for the bitmap step).
 */
static int qcow2_inactivate(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Error *local_err = NULL;
    int result = 0;
    int ret;

    qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
    if (local_err != NULL) {
        result = -EINVAL;
        error_report_err(local_err);
        error_report("Persistent bitmaps are lost for node '%s'",
                     bdrv_get_device_or_node_name(bs));
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret != 0) {
        result = ret;
        error_report("Failed to flush the L2 table cache: %s",
                     strerror(-ret));
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret != 0) {
        result = ret;
        error_report("Failed to flush the refcount block cache: %s",
                     strerror(-ret));
    }

    if (result != 0) {
        return result;
    }

    qcow2_mark_clean(bs);
    return 0;
}

2141
/*
 * Releases everything the qcow2 driver state of @bs owns.
 *
 * Unless the image was opened with BDRV_O_INACTIVE, qcow2_inactivate() is
 * called first to write out pending state. Afterwards the caches, the crypto
 * context and its open options, unknown header data, backing file strings,
 * cluster buffers, refcount structures and snapshots are freed.
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    cache_clean_timer_del(bs);
    qcow2_cache_destroy(s->l2_table_cache);
    qcow2_cache_destroy(s->refcount_block_cache);

    qcrypto_block_free(s->crypto);
    s->crypto = NULL;
    /*
     * The open options belong to the driver state as well (the open failure
     * path frees them the same way); without this they would be leaked.
     */
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    s->crypto_opts = NULL;

    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}

2171 2172
/*
 * Drops all in-memory qcow2 state of @bs and rebuilds it from the image.
 *
 * The crypto context is detached before qcow2_close() so that it survives,
 * the state structure is zeroed, and qcow2_do_open() is run again with a
 * shallow clone of bs->options and with BDRV_O_INACTIVE cleared. On success
 * the saved crypto context is put back. On failure an error is set in @errp
 * and bs->drv is cleared.
 */
static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
                                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int flags = s->flags;
    QCryptoBlock *saved_crypto = s->crypto;
    Error *local_err = NULL;
    QDict *options;
    int ret;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    /* keep qcow2_close() from freeing the crypto context */
    s->crypto = NULL;

    qcow2_close(bs);

    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

    flags &= ~BDRV_O_INACTIVE;
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_do_open(bs, options, flags, &local_err);
    qemu_co_mutex_unlock(&s->lock);
    qobject_unref(options);

    if (local_err || ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
            error_prepend(errp, "Could not reopen qcow2 layer: ");
        } else {
            error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
        }
        bs->drv = NULL;
        return;
    }

    s->crypto = saved_crypto;
}

K
Kevin Wolf 已提交
2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
    size_t len, size_t buflen)
{
    QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);

    if (buflen < ext_len) {
        return -ENOSPC;
    }

    *ext_backing_fmt = (QCowExtension) {
        .magic  = cpu_to_be32(magic),
        .len    = cpu_to_be32(len),
    };
2227 2228 2229 2230

    if (len) {
        memcpy(buf + sizeof(QCowExtension), s, len);
    }
K
Kevin Wolf 已提交
2231 2232 2233 2234

    return ext_len;
}

K
Kevin Wolf 已提交
2235
/*
K
Kevin Wolf 已提交
2236 2237 2238 2239
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
K
Kevin Wolf 已提交
2240 2241 2242
 *
 * Returns 0 on success, -errno in error cases.
 */
K
Kevin Wolf 已提交
2243
int qcow2_update_header(BlockDriverState *bs)
K
Kevin Wolf 已提交
2244
{
2245
    BDRVQcow2State *s = bs->opaque;
K
Kevin Wolf 已提交
2246 2247 2248
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
K
Kevin Wolf 已提交
2249
    int ret;
K
Kevin Wolf 已提交
2250 2251
    uint64_t total_size;
    uint32_t refcount_table_clusters;
K
Kevin Wolf 已提交
2252
    size_t header_length;
2253
    Qcow2UnknownHeaderExtension *uext;
K
Kevin Wolf 已提交
2254

K
Kevin Wolf 已提交
2255
    buf = qemu_blockalign(bs, buflen);
K
Kevin Wolf 已提交
2256

K
Kevin Wolf 已提交
2257 2258
    /* Header structure */
    header = (QCowHeader*) buf;
K
Kevin Wolf 已提交
2259

K
Kevin Wolf 已提交
2260 2261 2262
    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
K
Kevin Wolf 已提交
2263 2264
    }

K
Kevin Wolf 已提交
2265
    header_length = sizeof(*header) + s->unknown_header_fields_size;
K
Kevin Wolf 已提交
2266 2267 2268 2269
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
K
Kevin Wolf 已提交
2270
        /* Version 2 fields */
K
Kevin Wolf 已提交
2271
        .magic                  = cpu_to_be32(QCOW_MAGIC),
K
Kevin Wolf 已提交
2272
        .version                = cpu_to_be32(s->qcow_version),
K
Kevin Wolf 已提交
2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
K
Kevin Wolf 已提交
2284 2285 2286 2287 2288

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
2289
        .refcount_order         = cpu_to_be32(s->refcount_order),
K
Kevin Wolf 已提交
2290
        .header_length          = cpu_to_be32(header_length),
K
Kevin Wolf 已提交
2291
    };
K
Kevin Wolf 已提交
2292

K
Kevin Wolf 已提交
2293 2294 2295 2296 2297 2298 2299 2300 2301
    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
2302 2303
        ret = -EINVAL;
        goto fail;
K
Kevin Wolf 已提交
2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320
    }

    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }
K
Kevin Wolf 已提交
2321

K
Kevin Wolf 已提交
2322
    /* Backing file format header extension */
2323
    if (s->image_backing_format) {
K
Kevin Wolf 已提交
2324
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
2325 2326
                             s->image_backing_format,
                             strlen(s->image_backing_format),
K
Kevin Wolf 已提交
2327 2328 2329
                             buflen);
        if (ret < 0) {
            goto fail;
K
Kevin Wolf 已提交
2330 2331
        }

K
Kevin Wolf 已提交
2332 2333
        buf += ret;
        buflen -= ret;
K
Kevin Wolf 已提交
2334 2335
    }

2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351
    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
        cpu_to_be64s(&s->crypto_header.offset);
        cpu_to_be64s(&s->crypto_header.length);
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
        be64_to_cpus(&s->crypto_header.offset);
        be64_to_cpus(&s->crypto_header.length);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2352
    /* Feature table */
2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
2379 2380
    }

2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399
    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410
    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

K
Kevin Wolf 已提交
2411 2412
    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
K
Kevin Wolf 已提交
2413 2414 2415 2416
    if (ret < 0) {
        goto fail;
    }

K
Kevin Wolf 已提交
2417 2418
    buf += ret;
    buflen -= ret;
K
Kevin Wolf 已提交
2419

K
Kevin Wolf 已提交
2420
    /* Backing file name */
2421 2422
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);
K
Kevin Wolf 已提交
2423 2424 2425 2426 2427 2428

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

2429
        /* Using strncpy is ok here, since buf is not NUL-terminated. */
2430
        strncpy(buf, s->image_backing_file, buflen);
K
Kevin Wolf 已提交
2431 2432 2433

        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
K
Kevin Wolf 已提交
2434 2435
    }

K
Kevin Wolf 已提交
2436
    /* Write the new header */
2437
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
K
Kevin Wolf 已提交
2438 2439 2440 2441 2442 2443
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
K
Kevin Wolf 已提交
2444
    qemu_vfree(header);
K
Kevin Wolf 已提交
2445 2446 2447 2448 2449 2450
    return ret;
}

/*
 * Record a new backing file name and format for @bs and rewrite the image
 * header so that the change is stored in the image.
 *
 * @backing_file: new backing file name, or NULL for no backing file
 * @backing_fmt:  new backing format name, or NULL for none
 *
 * Returns -EINVAL if @backing_file is longer than 1023 bytes; otherwise the
 * return value of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BDRVQcow2State *s = bs->opaque;

    if (backing_file && strlen(backing_file) > 1023) {
        return -EINVAL;
    }

    /* A NULL name or format is stored as an empty string in @bs */
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");

    /* Drop the previous copies before duplicating the new values */
    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
    s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;

    return qcow2_update_header(bs);
}

2469 2470 2471 2472 2473 2474 2475 2476 2477 2478
/*
 * Translate an encryption format name into the corresponding QCOW_CRYPT_*
 * value.
 *
 * Returns QCOW_CRYPT_LUKS for "luks", QCOW_CRYPT_AES for "aes" and -EINVAL
 * for any other string.
 */
static int qcow2_crypt_method_from_format(const char *encryptfmt)
{
    int method = -EINVAL;

    if (g_str_equal(encryptfmt, "luks")) {
        method = QCOW_CRYPT_LUKS;
    } else if (g_str_equal(encryptfmt, "aes")) {
        method = QCOW_CRYPT_AES;
    }

    return method;
}
2479

2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497
/*
 * Set up encryption for the image @bs according to @cryptoopts.
 *
 * The crypto format is mapped to a QCOW_CRYPT_* method which is stored in
 * s->crypt_method_header, the crypto block is created with the qcow2 header
 * init/write callbacks, and the image header is rewritten.
 *
 * Returns 0 on success. Returns -EINVAL (with @errp set) if the format is not
 * LUKS or QCOW, or if qcrypto_block_create() fails; returns the negative
 * error from qcow2_update_header() if the header cannot be written.
 */
static int qcow2_set_up_encryption(BlockDriverState *bs,
                                   QCryptoBlockCreateOptions *cryptoopts,
                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCryptoBlock *crypto = NULL;
    int fmt, ret;

    switch (cryptoopts->format) {
    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
        fmt = QCOW_CRYPT_LUKS;
        break;
    case Q_CRYPTO_BLOCK_FORMAT_QCOW:
        fmt = QCOW_CRYPT_AES;
        break;
    default:
        error_setg(errp, "Crypto format not supported in qcow2");
        return -EINVAL;
    }

    s->crypt_method_header = fmt;

    crypto = qcrypto_block_create(cryptoopts, "encrypt.",
                                  qcow2_crypto_hdr_init_func,
                                  qcow2_crypto_hdr_write_func,
                                  bs, errp);
    if (!crypto) {
        return -EINVAL;
    }

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        goto out;
    }

    ret = 0;
 out:
    /* The crypto block is only needed while creating; free it in any case */
    qcrypto_block_free(crypto);
    return ret;
}

2522 2523 2524 2525 2526 2527 2528
/**
 * Preallocates metadata structures for data clusters between @offset (in the
 * guest disk) and @new_length (which is thus generally the new guest disk
 * size).
 *
 * Returns: 0 on success, -errno on failure.
 */
2529 2530
static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
                                       uint64_t new_length)
K
Kevin Wolf 已提交
2531
{
K
Kevin Wolf 已提交
2532
    uint64_t bytes;
K
Kevin Wolf 已提交
2533
    uint64_t host_offset = 0;
K
Kevin Wolf 已提交
2534
    unsigned int cur_bytes;
2535
    int ret;
2536
    QCowL2Meta *meta;
K
Kevin Wolf 已提交
2537

2538 2539
    assert(offset <= new_length);
    bytes = new_length - offset;
K
Kevin Wolf 已提交
2540

K
Kevin Wolf 已提交
2541 2542 2543
    while (bytes) {
        cur_bytes = MIN(bytes, INT_MAX);
        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
K
Kevin Wolf 已提交
2544
                                         &host_offset, &meta);
2545
        if (ret < 0) {
2546
            return ret;
K
Kevin Wolf 已提交
2547 2548
        }

2549 2550 2551
        while (meta) {
            QCowL2Meta *next = meta->next;

H
Hu Tao 已提交
2552 2553 2554 2555
            ret = qcow2_alloc_cluster_link_l2(bs, meta);
            if (ret < 0) {
                qcow2_free_any_clusters(bs, meta->alloc_offset,
                                        meta->nb_clusters, QCOW2_DISCARD_NEVER);
2556
                return ret;
H
Hu Tao 已提交
2557 2558 2559 2560
            }

            /* There are no dependent requests, but we need to remove our
             * request from the list of in-flight requests */
2561
            QLIST_REMOVE(meta, next_in_flight);
2562 2563 2564

            g_free(meta);
            meta = next;
2565
        }
2566

K
Kevin Wolf 已提交
2567 2568
        /* TODO Preallocate data if requested */

K
Kevin Wolf 已提交
2569 2570
        bytes -= cur_bytes;
        offset += cur_bytes;
K
Kevin Wolf 已提交
2571 2572 2573 2574 2575 2576 2577
    }

    /*
     * It is expected that the image file is large enough to actually contain
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
K
Kevin Wolf 已提交
2578
    if (host_offset != 0) {
K
Kevin Wolf 已提交
2579
        uint8_t data = 0;
2580
        ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
K
Kevin Wolf 已提交
2581
                          &data, 1);
2582
        if (ret < 0) {
2583
            return ret;
2584
        }
K
Kevin Wolf 已提交
2585 2586
    }

2587
    return 0;
K
Kevin Wolf 已提交
2588 2589
}

2590 2591 2592 2593
/* qcow2_refcount_metadata_size:
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
 * @cluster_size: size of a cluster, in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
 *                     needs to be
 * @refblock_count: if not NULL, set to the computed number of refcount block
 *                  clusters
 *
 * Returns: Number of bytes required for refcount blocks and table metadata.
 */
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count)
{
    /*
     * Every host cluster is reference-counted, including metadata (even
     * refcount metadata is recursively included).
     *
     * An accurate formula for the size of refcount metadata size is difficult
     * to derive.  An easier method of calculation is finding the fixed point
     * where no further refcount blocks or table clusters are required to
     * reference count every cluster.
     */
    int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t);
    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
    int64_t table = 0;  /* number of refcount table clusters */
    int64_t blocks = 0; /* number of refcount block clusters */
    int64_t last;
    int64_t n = 0;

    do {
        last = n;
        blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
        table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
        n = clusters + blocks + table;

        /*
         * Once the fixed point is reached, add half of the table clusters to
         * the number of clusters to cover and iterate again. This is done at
         * most once, because the flag is cleared here.
         */
        if (n == last && generous_increase) {
            clusters += DIV_ROUND_UP(table, 2);
            n = 0; /* force another loop */
            generous_increase = false;
        }
    } while (n != last);

    if (refblock_count) {
        *refblock_count = blocks;
    }

    return (blocks + table) * cluster_size;
}

2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * The result is the sum of one header cluster, the L2 and L1 tables, the
 * refcount table and blocks, and @total_size rounded up to a multiple of
 * @cluster_size.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    int64_t meta_size = 0;
    uint64_t nl1e, nl2e;
    int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);

    /* header: 1 cluster */
    meta_size += cluster_size;

    /* total size of L2 tables */
    nl2e = aligned_total_size / cluster_size;
    nl2e = ROUND_UP(nl2e, cluster_size / sizeof(uint64_t));
    meta_size += nl2e * sizeof(uint64_t);

    /* total size of L1 tables */
    nl1e = nl2e * sizeof(uint64_t) / cluster_size;
    nl1e = ROUND_UP(nl1e, cluster_size / sizeof(uint64_t));
    meta_size += nl1e * sizeof(uint64_t);

    /* total size of refcount table and blocks */
    meta_size += qcow2_refcount_metadata_size(
            (meta_size + aligned_total_size) / cluster_size,
            cluster_size, refcount_order, false, NULL);

    return meta_size + aligned_total_size;
}

2677
/*
 * Check that @cluster_size is a power of two whose exponent lies between
 * MIN_CLUSTER_BITS and MAX_CLUSTER_BITS (inclusive).
 *
 * Returns true if the size is acceptable; otherwise sets @errp and returns
 * false.
 */
static bool validate_cluster_size(size_t cluster_size, Error **errp)
{
    int cluster_bits = ctz32(cluster_size);

    /*
     * The range check comes first; the final comparison rejects any value
     * that is not exactly 1 << cluster_bits.
     */
    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
        (1 << cluster_bits) != cluster_size)
    {
        error_setg(errp, "Cluster size must be a power of two between %d and "
                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
        return false;
    }
    return true;
}

/*
 * Fetch the cluster size option from @opts via qemu_opt_get_size_del(),
 * falling back to DEFAULT_CLUSTER_SIZE, and validate it.
 *
 * Returns the cluster size, or 0 (with @errp set by validate_cluster_size())
 * if the value is not acceptable.
 */
static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
{
    size_t cluster_size;

    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                                         DEFAULT_CLUSTER_SIZE);
    if (!validate_cluster_size(cluster_size, errp)) {
        return 0;
    }
    return cluster_size;
}

/*
 * Fetch the compatibility level option from @opts via qemu_opt_get_del() and
 * convert it to a qcow2 version number.
 *
 * Returns 2 for "0.10", 3 for "1.1" or when the option is absent, and
 * -EINVAL (with @errp set) for any other value.
 */
static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
{
    char *compat = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
    int version = 3; /* default; also the result for "1.1" */

    if (compat != NULL && strcmp(compat, "1.1") != 0) {
        if (strcmp(compat, "0.10") == 0) {
            version = 2;
        } else {
            error_setg(errp, "Invalid compatibility level: '%s'", compat);
            version = -EINVAL;
        }
    }

    g_free(compat);
    return version;
}

/*
 * Fetch the refcount width option from @opts via qemu_opt_get_number_del()
 * (default 16) and validate it against @version.
 *
 * Returns the refcount width in bits, or 0 with @errp set if the width is
 * larger than 64, is not a power of two, or differs from 16 while @version
 * is below 3.
 */
static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
                                                Error **errp)
{
    uint64_t refcount_bits;

    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
        return 0;
    }

    if (version < 3 && refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
                   "compatibility level 1.1 or above (use compat=1.1 or "
                   "greater)");
        return 0;
    }

    return refcount_bits;
}

2744
static int coroutine_fn
2745
qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
2746
{
2747
    BlockdevCreateOptionsQcow2 *qcow2_opts;
2748 2749
    QDict *options;

K
Kevin Wolf 已提交
2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761
    /*
     * Open the image file and write a minimal qcow2 header.
     *
     * We keep things simple and start with a zero-sized image. We also
     * do without refcount blocks or a L1 table for now. We'll fix the
     * inconsistency later.
     *
     * We do need a refcount table because growing the refcount table means
     * allocating two new refcount blocks - the seconds of which would be at
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
2762 2763
    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;
2764
    QCowHeader *header;
2765 2766 2767
    size_t cluster_size;
    int version;
    int refcount_order;
2768
    uint64_t* refcount_table;
M
Max Reitz 已提交
2769
    Error *local_err = NULL;
K
Kevin Wolf 已提交
2770 2771
    int ret;

2772 2773 2774
    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
    qcow2_opts = &create_options->u.qcow2;

2775 2776 2777 2778 2779 2780
    bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
    if (bs == NULL) {
        return -EIO;
    }

    /* Validate options and set default values */
2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808
    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of 512 bytes");
        ret = -EINVAL;
        goto out;
    }

    if (qcow2_opts->has_version) {
        switch (qcow2_opts->version) {
        case BLOCKDEV_QCOW2_VERSION_V2:
            version = 2;
            break;
        case BLOCKDEV_QCOW2_VERSION_V3:
            version = 3;
            break;
        default:
            g_assert_not_reached();
        }
    } else {
        version = 3;
    }

    if (qcow2_opts->has_cluster_size) {
        cluster_size = qcow2_opts->cluster_size;
    } else {
        cluster_size = DEFAULT_CLUSTER_SIZE;
    }

    if (!validate_cluster_size(cluster_size, errp)) {
2809 2810
        ret = -EINVAL;
        goto out;
2811 2812 2813 2814 2815 2816 2817 2818 2819 2820
    }

    if (!qcow2_opts->has_preallocation) {
        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
    }
    if (qcow2_opts->has_backing_file &&
        qcow2_opts->preallocation != PREALLOC_MODE_OFF)
    {
        error_setg(errp, "Backing file and preallocation cannot be used at "
                   "the same time");
2821 2822
        ret = -EINVAL;
        goto out;
2823 2824 2825
    }
    if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) {
        error_setg(errp, "Backing format cannot be used without backing file");
2826 2827
        ret = -EINVAL;
        goto out;
2828 2829 2830 2831 2832 2833 2834
    }

    if (!qcow2_opts->has_lazy_refcounts) {
        qcow2_opts->lazy_refcounts = false;
    }
    if (version < 3 && qcow2_opts->lazy_refcounts) {
        error_setg(errp, "Lazy refcounts only supported with compatibility "
2835
                   "level 1.1 and above (use version=v3 or greater)");
2836 2837
        ret = -EINVAL;
        goto out;
2838 2839 2840 2841 2842 2843 2844 2845 2846 2847
    }

    if (!qcow2_opts->has_refcount_bits) {
        qcow2_opts->refcount_bits = 16;
    }
    if (qcow2_opts->refcount_bits > 64 ||
        !is_power_of_2(qcow2_opts->refcount_bits))
    {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
2848 2849
        ret = -EINVAL;
        goto out;
2850 2851 2852
    }
    if (version < 3 && qcow2_opts->refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
2853
                   "compatibility level 1.1 or above (use version=v3 or "
2854
                   "greater)");
2855 2856
        ret = -EINVAL;
        goto out;
2857 2858 2859 2860 2861
    }
    refcount_order = ctz32(qcow2_opts->refcount_bits);


    /* Create BlockBackend to write to the image */
2862 2863
    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
    ret = blk_insert_bs(blk, bs, errp);
K
Kevin Wolf 已提交
2864
    if (ret < 0) {
2865
        goto out;
K
Kevin Wolf 已提交
2866
    }
2867 2868
    blk_set_allow_write_beyond_eof(blk, true);

2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887
    /* Clear the protocol layer and preallocate it if necessary */
    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
    if (ret < 0) {
        goto out;
    }

    if (qcow2_opts->preallocation == PREALLOC_MODE_FULL ||
        qcow2_opts->preallocation == PREALLOC_MODE_FALLOC)
    {
        int64_t prealloc_size =
            qcow2_calc_prealloc_size(qcow2_opts->size, cluster_size,
                                     refcount_order);

        ret = blk_truncate(blk, prealloc_size, qcow2_opts->preallocation, errp);
        if (ret < 0) {
            goto out;
        }
    }

K
Kevin Wolf 已提交
2888
    /* Write the header */
2889 2890 2891 2892 2893
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    header = g_malloc0(cluster_size);
    *header = (QCowHeader) {
        .magic                      = cpu_to_be32(QCOW_MAGIC),
        .version                    = cpu_to_be32(version),
2894
        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
2895 2896 2897 2898 2899
        .size                       = cpu_to_be64(0),
        .l1_table_offset            = cpu_to_be64(0),
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
2900
        .refcount_order             = cpu_to_be32(refcount_order),
2901 2902
        .header_length              = cpu_to_be32(sizeof(*header)),
    };
K
Kevin Wolf 已提交
2903

2904 2905
    /* We'll update this to correct value later */
    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
K
Kevin Wolf 已提交
2906

2907
    if (qcow2_opts->lazy_refcounts) {
2908
        header->compatible_features |=
2909 2910 2911
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

2912
    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
2913
    g_free(header);
K
Kevin Wolf 已提交
2914
    if (ret < 0) {
M
Max Reitz 已提交
2915
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
K
Kevin Wolf 已提交
2916 2917 2918
        goto out;
    }

2919 2920 2921
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
2922
    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
2923
    g_free(refcount_table);
K
Kevin Wolf 已提交
2924 2925

    if (ret < 0) {
M
Max Reitz 已提交
2926
        error_setg_errno(errp, -ret, "Could not write refcount table");
K
Kevin Wolf 已提交
2927 2928 2929
        goto out;
    }

2930 2931
    blk_unref(blk);
    blk = NULL;
K
Kevin Wolf 已提交
2932 2933 2934 2935 2936 2937

    /*
     * And now open the image and make it consistent first (i.e. increase the
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
2938
    options = qdict_new();
2939
    qdict_put_str(options, "driver", "qcow2");
2940 2941
    qdict_put_str(options, "file", bs->node_name);
    blk = blk_new_open(NULL, NULL, options,
2942 2943
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
                       &local_err);
2944
    if (blk == NULL) {
M
Max Reitz 已提交
2945
        error_propagate(errp, local_err);
2946
        ret = -EIO;
K
Kevin Wolf 已提交
2947 2948 2949
        goto out;
    }

2950
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
K
Kevin Wolf 已提交
2951
    if (ret < 0) {
M
Max Reitz 已提交
2952 2953
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
K
Kevin Wolf 已提交
2954 2955 2956 2957 2958 2959 2960
        goto out;

    } else if (ret != 0) {
        error_report("Huh, first cluster in empty image is already in use?");
        abort();
    }

2961
    /* Create a full header (including things like feature table) */
2962
    ret = qcow2_update_header(blk_bs(blk));
2963 2964 2965 2966 2967
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

K
Kevin Wolf 已提交
2968
    /* Okay, now that we have a valid image, let's give it the right size */
2969
    ret = blk_truncate(blk, qcow2_opts->size, PREALLOC_MODE_OFF, errp);
K
Kevin Wolf 已提交
2970
    if (ret < 0) {
2971
        error_prepend(errp, "Could not resize image: ");
K
Kevin Wolf 已提交
2972 2973 2974 2975
        goto out;
    }

    /* Want a backing file? There you go.*/
2976 2977 2978 2979 2980 2981 2982 2983 2984
    if (qcow2_opts->has_backing_file) {
        const char *backing_format = NULL;

        if (qcow2_opts->has_backing_fmt) {
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
        }

        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
                                       backing_format);
K
Kevin Wolf 已提交
2985
        if (ret < 0) {
M
Max Reitz 已提交
2986
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
2987 2988
                             "with format '%s'", qcow2_opts->backing_file,
                             backing_format);
K
Kevin Wolf 已提交
2989 2990 2991 2992
            goto out;
        }
    }

2993
    /* Want encryption? There you go. */
2994 2995
    if (qcow2_opts->has_encrypt) {
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
2996 2997 2998 2999 3000
        if (ret < 0) {
            goto out;
        }
    }

K
Kevin Wolf 已提交
3001
    /* And if we're supposed to preallocate metadata, do that now */
3002
    if (qcow2_opts->preallocation != PREALLOC_MODE_OFF) {
3003 3004
        BDRVQcow2State *s = blk_bs(blk)->opaque;
        qemu_co_mutex_lock(&s->lock);
3005
        ret = preallocate_co(blk_bs(blk), 0, qcow2_opts->size);
3006 3007
        qemu_co_mutex_unlock(&s->lock);

K
Kevin Wolf 已提交
3008
        if (ret < 0) {
M
Max Reitz 已提交
3009
            error_setg_errno(errp, -ret, "Could not preallocate metadata");
K
Kevin Wolf 已提交
3010 3011 3012 3013
            goto out;
        }
    }

3014 3015
    blk_unref(blk);
    blk = NULL;
M
Max Reitz 已提交
3016

3017 3018 3019 3020 3021 3022
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
     * have to setup decryption context. We're not doing any I/O on the top
     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
     * not have effect.
     */
3023
    options = qdict_new();
3024
    qdict_put_str(options, "driver", "qcow2");
3025 3026
    qdict_put_str(options, "file", bs->node_name);
    blk = blk_new_open(NULL, NULL, options,
3027 3028
                       BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
                       &local_err);
3029
    if (blk == NULL) {
M
Max Reitz 已提交
3030
        error_propagate(errp, local_err);
3031
        ret = -EIO;
M
Max Reitz 已提交
3032 3033 3034
        goto out;
    }

K
Kevin Wolf 已提交
3035 3036
    ret = 0;
out:
3037 3038
    blk_unref(blk);
    bdrv_unref(bs);
K
Kevin Wolf 已提交
3039 3040
    return ret;
}
K
Kevin Wolf 已提交
3041

3042 3043
/*
 * Create a qcow2 image at @filename from legacy QemuOpts.
 *
 * The options are converted to a QDict, legacy values and key names are
 * rewritten to their BlockdevCreateOptions equivalents, the protocol-level
 * file is created and opened, and the image itself is created through
 * qcow2_co_create().
 *
 * Returns 0 on success, a negative errno value on failure (with @errp set).
 */
static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts,
                                             Error **errp)
{
    BlockdevCreateOptions *create_options = NULL;
    QDict *qdict;
    Visitor *v;
    BlockDriverState *bs = NULL;
    Error *local_err = NULL;
    const char *val;
    int ret;

    /* Only the keyval visitor supports the dotted syntax needed for
     * encryption, so go through a QDict before getting a QAPI type. Ignore
     * options meant for the protocol layer so that the visitor doesn't
     * complain. */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
                                        true);

    /* Handle encryption options */
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
    if (val && !strcmp(val, "on")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
    } else if (val && !strcmp(val, "off")) {
        qdict_del(qdict, BLOCK_OPT_ENCRYPT);
    }

    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
    if (val && !strcmp(val, "aes")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
    }

    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
     * version=v2/v3 below. */
    val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
    if (val && !strcmp(val, "0.10")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
    } else if (val && !strcmp(val, "1.1")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
    }

    /* Change legacy command line options into QMP ones */
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
        { NULL, NULL },
    };

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
        ret = -EINVAL;
        goto finish;
    }

    /* Create and open the file (protocol layer) */
    ret = bdrv_create_file(filename, opts, errp);
    if (ret < 0) {
        goto finish;
    }

    bs = bdrv_open(filename, NULL, NULL,
                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (bs == NULL) {
        ret = -EIO;
        goto finish;
    }

    /* Set 'driver' and 'file' options; 'file' refers to the node just opened */
    qdict_put_str(qdict, "driver", "qcow2");
    qdict_put_str(qdict, "file", bs->node_name);

    /* Now get the QAPI type BlockdevCreateOptions */
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (!v) {
        ret = -EINVAL;
        goto finish;
    }

    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
    visit_free(v);

    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto finish;
    }

    /* Silently round up size */
    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
                                            BDRV_SECTOR_SIZE);

    /* Create the qcow2 image (format layer) */
    ret = qcow2_co_create(create_options, errp);
    if (ret < 0) {
        goto finish;
    }

    ret = 0;
finish:
    qobject_unref(qdict);
    bdrv_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
}

3150

3151
/*
 * Check whether the range of @bytes bytes starting at @offset reads as zero.
 *
 * The range is clamped to the image length first. An empty range counts as
 * zero. Otherwise the result is true only if bdrv_block_status_above()
 * succeeds, reports BDRV_BLOCK_ZERO, and covers the whole range.
 */
static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    int64_t nr;
    int res;

    /* Clamp to image length, before checking status of underlying sectors */
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
    }

    if (!bytes) {
        return true;
    }
    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
}

3168
/*
 * Write zeroes to @bytes bytes at @offset using zero clusters.
 *
 * A request that is not cluster-aligned must fit within one cluster
 * (asserted). It is only handled if the rest of that cluster already reads
 * as zero and the cluster is unallocated or a zero cluster; the request is
 * then widened to the whole cluster. Otherwise -ENOTSUP is returned.
 *
 * Returns the result of qcow2_cluster_zeroize(), or -ENOTSUP as described
 * above.
 */
static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;

    uint32_t head = offset % s->cluster_size;
    uint32_t tail = (offset + bytes) % s->cluster_size;

    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
    /* A request ending exactly at the end of the image has no tail */
    if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
        tail = 0;
    }

    if (head || tail) {
        uint64_t off;
        unsigned int nr;

        assert(head + bytes <= s->cluster_size);

        /* check whether remainder of cluster already reads as zero */
        if (!(is_zero(bs, offset - head, head) &&
              is_zero(bs, offset + bytes,
                      tail ? s->cluster_size - tail : 0))) {
            return -ENOTSUP;
        }

        qemu_co_mutex_lock(&s->lock);
        /* We can have new write after previous check */
        offset = QEMU_ALIGN_DOWN(offset, s->cluster_size);
        bytes = s->cluster_size;
        nr = s->cluster_size;
        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
        if (ret != QCOW2_CLUSTER_UNALLOCATED &&
            ret != QCOW2_CLUSTER_ZERO_PLAIN &&
            ret != QCOW2_CLUSTER_ZERO_ALLOC) {
            qemu_co_mutex_unlock(&s->lock);
            return -ENOTSUP;
        }
    } else {
        qemu_co_mutex_lock(&s->lock);
    }

    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);

    /* Whatever is left can use real zero clusters */
    ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

3220
/*
 * Discard @bytes bytes at @offset.
 *
 * A request that is not cluster-aligned must be smaller than one cluster
 * (asserted). It is rejected with -ENOTSUP unless it starts on a cluster
 * boundary and ends exactly at the end of the image.
 *
 * Returns the result of qcow2_cluster_discard(), or -ENOTSUP as described
 * above.
 */
static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
                                          int64_t offset, int bytes)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;

    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
        assert(bytes < s->cluster_size);
        /* Ignore partial clusters, except for the special case of the
         * complete partial cluster at the end of an unaligned file */
        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
            offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
            return -ENOTSUP;
        }
    }

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
                                false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

F
Fam Zheng 已提交
3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402
/*
 * Source side of a copy-range request on a qcow2 image.
 *
 * The request is processed in chunks. For each chunk the cluster type at
 * src_offset is looked up and the chunk is forwarded with
 * bdrv_co_copy_range_from():
 *   - normal clusters are read from bs->file at the mapped host offset;
 *   - unallocated clusters are read from the backing file when one exists
 *     and src_offset lies inside it, otherwise they are written as zeroes
 *     (BDRV_REQ_ZERO_WRITE);
 *   - zero clusters are written as zeroes;
 *   - compressed clusters are not supported (-ENOTSUP).
 *
 * s->lock is held while metadata is inspected and released around the
 * nested copy call.
 *
 * Returns 0 on success or a negative errno.
 */
static int coroutine_fn
qcow2_co_copy_range_from(BlockDriverState *bs,
                         BdrvChild *src, uint64_t src_offset,
                         BdrvChild *dst, uint64_t dst_offset,
                         uint64_t bytes, BdrvRequestFlags flags)
{
    BDRVQcow2State *s = bs->opaque;
    /* Kept outside the loop on purpose: iterations that only set
     * BDRV_REQ_ZERO_WRITE do not assign it, so it carries over whatever a
     * previous iteration selected (or NULL on the first one). */
    BdrvChild *from = NULL;
    int ret;

    assert(!bs->encrypted);
    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {
        /* Per-iteration request parameters */
        uint64_t host_offset = 0;
        unsigned int chunk = MIN(bytes, INT_MAX);
        BdrvRequestFlags req_flags = flags;

        ret = qcow2_get_cluster_offset(bs, src_offset, &chunk, &host_offset);
        if (ret < 0) {
            goto out;
        }

        switch (ret) {
        case QCOW2_CLUSTER_NORMAL:
            from = bs->file;
            host_offset += offset_into_cluster(s, src_offset);
            if (host_offset & 511) {
                /* Host offset must be 512-byte aligned */
                ret = -EIO;
                goto out;
            }
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            ret = -ENOTSUP;
            goto out;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            req_flags |= BDRV_REQ_ZERO_WRITE;
            break;

        case QCOW2_CLUSTER_UNALLOCATED:
        {
            int64_t backing_length;

            if (!bs->backing || !bs->backing->bs) {
                /* No backing file: unallocated data reads as zeroes */
                req_flags |= BDRV_REQ_ZERO_WRITE;
                break;
            }

            backing_length = bdrv_getlength(bs->backing->bs);
            if (src_offset >= backing_length) {
                req_flags |= BDRV_REQ_ZERO_WRITE;
            } else {
                /* Read from the backing file, clamped to its length */
                from = bs->backing;
                chunk = MIN(chunk, backing_length - src_offset);
                host_offset = src_offset;
            }
            break;
        }

        default:
            abort();
        }

        /* Do not hold the lock across the nested request */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_from(from, host_offset,
                                      dst, dst_offset,
                                      chunk, req_flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto out;
        }

        src_offset += chunk;
        dst_offset += chunk;
        bytes -= chunk;
    }
    ret = 0;

out:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

/*
 * Destination side of a copy-range request on a qcow2 image.
 *
 * The request is processed in chunks. For each chunk, host clusters are
 * allocated for the guest range starting at dst_offset, the target area is
 * checked against metadata overlaps, and the data is copied from
 * src/src_offset into bs->file with bdrv_co_copy_range_to(). The pending
 * L2 updates are then committed with qcow2_handle_l2meta().
 *
 * s->lock is held while metadata is manipulated and released around the
 * nested copy call.
 *
 * Returns 0 on success or a negative errno.
 */
static int coroutine_fn
qcow2_co_copy_range_to(BlockDriverState *bs,
                       BdrvChild *src, uint64_t src_offset,
                       BdrvChild *dst, uint64_t dst_offset,
                       uint64_t bytes, BdrvRequestFlags flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QCowL2Meta *l2meta = NULL;

    assert(!bs->encrypted);
    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        l2meta = NULL;

        offset_in_cluster = offset_into_cluster(s, dst_offset);
        cur_bytes = MIN(bytes, INT_MAX);

        /* TODO:
         * If src->bs == dst->bs, we could simply copy by incrementing
         * the refcnt, without copying user data.
         * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
        ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);

        ret = qcow2_pre_write_overlap_check(bs, 0,
                cluster_offset + offset_in_cluster, cur_bytes);
        if (ret < 0) {
            goto fail;
        }

        /* Do not hold the lock across the nested request */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_to(src, src_offset,
                                    bs->file,
                                    cluster_offset + offset_in_cluster,
                                    cur_bytes, flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= cur_bytes;
        /* Source and destination must advance together; otherwise every
         * chunk after the first would be copied from the wrong place */
        src_offset += cur_bytes;
        dst_offset += cur_bytes;
    }
    ret = 0;

fail:
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

3403 3404
/*
 * Resize the image to @offset bytes.
 *
 * Rejected up front:
 *   - preallocation modes other than off/metadata/falloc/full (-ENOTSUP);
 *   - sizes that are not a multiple of 512 (-EINVAL);
 *   - images with snapshots or bitmaps (-ENOTSUP);
 *   - any preallocation when shrinking (-EINVAL).
 *
 * Shrinking discards the clusters past the new end, shrinks the L1 table
 * and the refcount table, and then tries to truncate the unused tail of
 * the image file (a failure there is only reported as a warning).
 * Growing enlarges the L1 table and then preallocates according to
 * @prealloc.
 *
 * Finally the new size is written to the image header and
 * s->l1_vm_state_index is updated. The whole operation runs under s->lock.
 *
 * Returns 0 on success; on failure a negative errno is returned and @errp
 * is set.
 */
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
                                          PreallocMode prealloc, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_length;
    int64_t new_l1_size;
    int ret;

    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
    {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (offset & 511) {
        error_setg(errp, "The new size must be a multiple of 512");
        return -EINVAL;
    }

    qemu_co_mutex_lock(&s->lock);

    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
        error_setg(errp, "Can't resize an image which has snapshots");
        ret = -ENOTSUP;
        goto fail;
    }

    /* cannot proceed if image has bitmaps */
    if (s->nb_bitmaps) {
        /* TODO: resize bitmaps in the image */
        error_setg(errp, "Can't resize an image which has bitmaps");
        ret = -ENOTSUP;
        goto fail;
    }

    old_length = bs->total_sectors * 512;
    new_l1_size = size_to_l1(s, offset);

    if (offset < old_length) {
        int64_t last_cluster, old_file_size;
        if (prealloc != PREALLOC_MODE_OFF) {
            error_setg(errp,
                       "Preallocation can't be used for shrinking an image");
            ret = -EINVAL;
            goto fail;
        }

        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
                                    old_length - ROUND_UP(offset,
                                                          s->cluster_size),
                                    QCOW2_DISCARD_ALWAYS, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
            goto fail;
        }

        ret = qcow2_shrink_l1_table(bs, new_l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to reduce the number of L2 tables");
            goto fail;
        }

        ret = qcow2_shrink_reftable(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to discard unused refblocks");
            goto fail;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster < 0) {
            error_setg_errno(errp, -last_cluster,
                             "Failed to find the last cluster");
            ret = last_cluster;
            goto fail;
        }
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
            Error *local_err = NULL;

            /* Best effort: failing to trim the file tail only produces a
             * warning and does not fail the resize */
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
                             PREALLOC_MODE_OFF, &local_err);
            if (local_err) {
                warn_reportf_err(local_err,
                                 "Failed to truncate the tail of the image: ");
            }
        }
    } else {
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
            goto fail;
        }
    }

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
        break;

    case PREALLOC_MODE_METADATA:
        ret = preallocate_co(bs, old_length, offset);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Preallocation failed");
            goto fail;
        }
        break;

    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        old_file_size = ROUND_UP(old_file_size, s->cluster_size);

        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
                                            s->cluster_size);

        /* This is an overestimation; we will not actually allocate space for
         * these in the file but just make sure the new refcount structures are
         * able to cover them so we will not have to allocate new refblocks
         * while entering the data blocks in the potentially new L2 tables.
         * (We do not actually care where the L2 tables are placed. Maybe they
         *  are already allocated or they can be placed somewhere before
         *  @old_file_size. It does not matter because they will be fully
         *  allocated automatically, so they do not need to be covered by the
         *  preallocation. All that matters is that we will not have to allocate
         *  new refcount structures for them.) */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / sizeof(uint64_t));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
            ret = allocation_start;
            goto fail;
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
            ret = clusters_allocated;
            goto fail;
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
        ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
            goto fail;
        }

        /* Create the necessary L2 entries */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
            int64_t nb_clusters = MIN(
                nb_new_data_clusters,
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
            QCowL2Meta allocation = {
                .offset       = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters  = nb_clusters,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                /* Release the clusters that have not been linked yet */
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
                goto fail;
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

    default:
        g_assert_not_reached();
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
        ret = qcow2_write_caches(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
            goto fail;
        }
    }

    /* write updated header.size */
    offset = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &offset, sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image size");
        goto fail;
    }

    s->l1_vm_state_index = new_l1_size;
    ret = 0;
fail:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

B
Blue Swirl 已提交
3650 3651
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
3652 3653 3654
static coroutine_fn int
qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                            uint64_t bytes, QEMUIOVector *qiov)
B
Blue Swirl 已提交
3655
{
3656
    BDRVQcow2State *s = bs->opaque;
3657 3658
    QEMUIOVector hd_qiov;
    struct iovec iov;
B
Blue Swirl 已提交
3659 3660
    z_stream strm;
    int ret, out_len;
3661
    uint8_t *buf, *out_buf;
3662
    int64_t cluster_offset;
B
Blue Swirl 已提交
3663

3664
    if (bytes == 0) {
B
Blue Swirl 已提交
3665 3666
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
K
Kevin Wolf 已提交
3667
        cluster_offset = bdrv_getlength(bs->file->bs);
3668 3669 3670
        if (cluster_offset < 0) {
            return cluster_offset;
        }
3671 3672
        return bdrv_co_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF,
                                NULL);
B
Blue Swirl 已提交
3673 3674
    }

3675 3676 3677 3678
    if (offset_into_cluster(s, offset)) {
        return -EINVAL;
    }

3679
    buf = qemu_blockalign(bs, s->cluster_size);
3680
    if (bytes != s->cluster_size) {
3681 3682
        if (bytes > s->cluster_size ||
            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
3683
        {
3684 3685
            qemu_vfree(buf);
            return -EINVAL;
3686
        }
3687 3688
        /* Zero-pad last write if image size is not cluster aligned */
        memset(buf + bytes, 0, s->cluster_size - bytes);
3689
    }
3690
    qemu_iovec_to_buf(qiov, 0, buf, bytes);
B
Blue Swirl 已提交
3691

3692
    out_buf = g_malloc(s->cluster_size);
B
Blue Swirl 已提交
3693 3694 3695 3696 3697 3698 3699

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
3700 3701
        ret = -EINVAL;
        goto fail;
B
Blue Swirl 已提交
3702 3703 3704 3705 3706 3707 3708 3709 3710 3711
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
3712 3713
        ret = -EINVAL;
        goto fail;
B
Blue Swirl 已提交
3714 3715 3716 3717 3718 3719 3720
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
3721
        ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
3722 3723 3724
        if (ret < 0) {
            goto fail;
        }
3725 3726
        goto success;
    }
3727

3728 3729 3730 3731 3732 3733 3734 3735 3736
    qemu_co_mutex_lock(&s->lock);
    cluster_offset =
        qcow2_alloc_compressed_cluster_offset(bs, offset, out_len);
    if (!cluster_offset) {
        qemu_co_mutex_unlock(&s->lock);
        ret = -EIO;
        goto fail;
    }
    cluster_offset &= s->cluster_offset_mask;
3737

3738 3739 3740 3741
    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        goto fail;
B
Blue Swirl 已提交
3742 3743
    }

3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755
    iov = (struct iovec) {
        .iov_base   = out_buf,
        .iov_len    = out_len,
    };
    qemu_iovec_init_external(&hd_qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
    if (ret < 0) {
        goto fail;
    }
success:
3756 3757
    ret = 0;
fail:
3758
    qemu_vfree(buf);
3759
    g_free(out_buf);
3760
    return ret;
B
Blue Swirl 已提交
3761 3762
}

M
Max Reitz 已提交
3763 3764
/*
 * Empty the image by rebuilding its metadata from scratch instead of
 * discarding clusters one by one.
 *
 * The L2 and refblock caches are emptied, the image is marked dirty, the
 * current L1 table and the clusters right after the header are zeroed, and
 * the header is rewritten to point at a fresh layout: refcount table at
 * cluster 1, first refblock at cluster 2, L1 table starting at cluster 3.
 * Those clusters are then allocated, the image is marked clean again, and
 * the file is truncated to (3 + l1_clusters) clusters.
 *
 * If a step fails after the on-disk refcount structures may have been
 * overwritten, bs->drv is set to NULL (see fail_broken_refcounts).
 *
 * Returns 0 on success or a negative errno.
 */
static int make_completely_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Error *local_err = NULL;
    int ret, l1_clusters;
    int64_t offset;
    uint64_t *new_reftable = NULL;
    uint64_t rt_entry, l1_size2;
    /* Written to the header in a single call, starting at the
     * l1_table_offset field */
    struct {
        uint64_t l1_offset;
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED l1_ofs_rt_ofs_cls;

    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Refcounts will be broken utterly */
    ret = qcow2_mark_dirty(bs);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);

    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
    l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);

    /* After this call, neither the in-memory nor the on-disk refcount
     * information accurately describe the actual references */

    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
                             l1_clusters * s->cluster_size, 0);
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
    memset(s->l1_table, 0, l1_size2);

    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);

    /* Overwrite enough clusters at the beginning of the sectors to place
     * the refcount table, a refcount block and the L1 table in; this may
     * overwrite parts of the existing refcount and L1 table, which is not
     * an issue because the dirty flag is set, complete data loss is in fact
     * desired and partial data loss is consequently fine as well */
    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
                             (2 + l1_clusters) * s->cluster_size, 0);
    /* This call (even if it failed overall) may have overwritten on-disk
     * refcount structures; in that case, the in-memory refcount information
     * will probably differ from the on-disk information which makes the BDS
     * unusable */
    if (ret < 0) {
        goto fail_broken_refcounts;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);

    /* "Create" an empty reftable (one cluster) directly after the image
     * header and an empty L1 table three clusters after the image header;
     * the cluster between those two will be used as the first refblock */
    l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
    l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
    l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
    if (ret < 0) {
        goto fail_broken_refcounts;
    }

    s->l1_table_offset = 3 * s->cluster_size;

    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
    if (!new_reftable) {
        ret = -ENOMEM;
        goto fail_broken_refcounts;
    }

    s->refcount_table_offset = s->cluster_size;
    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
    s->max_refcount_table_index = 0;

    g_free(s->refcount_table);
    s->refcount_table = new_reftable;
    new_reftable = NULL;

    /* Now the in-memory refcount information again corresponds to the on-disk
     * information (reftable is empty and no refblocks (the refblock cache is
     * empty)); however, this means some clusters (e.g. the image header) are
     * referenced, but not refcounted, but the normal qcow2 code assumes that
     * the in-memory information is always correct */

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Enter the first refblock into the reftable */
    rt_entry = cpu_to_be64(2 * s->cluster_size);
    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
                           &rt_entry, sizeof(rt_entry));
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
    s->refcount_table[0] = 2 * s->cluster_size;

    s->free_cluster_index = 0;
    assert(3 + l1_clusters <= s->refcount_block_size);
    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
    if (offset < 0) {
        ret = offset;
        goto fail_broken_refcounts;
    } else if (offset > 0) {
        error_report("First cluster in emptied image is in use");
        abort();
    }

    /* Now finally the in-memory information corresponds to the on-disk
     * structures and is correct */
    ret = qcow2_mark_clean(bs);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
                        PREALLOC_MODE_OFF, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto fail;
    }

    return 0;

fail_broken_refcounts:
    /* The BDS is unusable at this point. If we wanted to make it usable, we
     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
     * again. However, because the functions which could have caused this error
     * path to be taken are used by those functions as well, it's very likely
     * that that sequence will fail as well. Therefore, just eject the BDS. */
    bs->drv = NULL;

fail:
    g_free(new_reftable);
    return ret;
}

M
Max Reitz 已提交
3914 3915
/*
 * Drop all guest data from the image.
 *
 * When the image qualifies (see the condition below), the whole metadata
 * is rebuilt by make_completely_empty(). Otherwise every cluster of the
 * virtual disk is discarded, in steps of at most INT_MAX bytes rounded
 * down to the cluster size.
 *
 * Returns 0 on success or a negative errno.
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t offset, end_offset;
    int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
    int l1_clusters, ret = 0;

    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));

    if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
        3 + l1_clusters <= s->refcount_block_size &&
        s->crypt_method_header != QCOW_CRYPT_LUKS) {
        /* The following function only works for qcow2 v3 images (it
         * requires the dirty flag) and only as long as there are no
         * features that reserve extra clusters (such as snapshots,
         * LUKS header, or persistent bitmaps), because it completely
         * empties the image.  Furthermore, the L1 table and three
         * additional clusters (image header, refcount table, one
         * refcount block) have to fit inside one refcount block. */
        return make_completely_empty(bs);
    }

    /* This fallback code simply discards every active cluster; this is slow,
     * but works in all cases */
    end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    for (offset = 0; offset < end_offset; offset += step) {
        /* As this function is generally used after committing an external
         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
         * default action for this kind of discard is to pass the discard,
         * which will ideally result in an actually smaller image file, as
         * is probably desired. */
        ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
                                    QCOW2_DISCARD_SNAPSHOT, true);
        if (ret < 0) {
            break;
        }
    }

    return ret;
}

3955
/*
 * Flush callback: writes out the qcow2 metadata caches via
 * qcow2_write_caches() while holding s->lock and returns its result.
 */
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_write_caches(bs);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998
/*
 * Estimate the file size a new qcow2 image would need.
 *
 * The cluster size, version, refcount width, preallocation mode, backing
 * file and size options are read (and removed) from @opts. When @in_bs is
 * given, its length replaces the size option and, unless a backing file is
 * requested, its block status is walked to count only the clusters that
 * hold allocated data.
 *
 * Returns a newly allocated BlockMeasureInfo with 'fully_allocated' and
 * 'required' filled in, or NULL with @errp set on failure.
 */
static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
                                       Error **errp)
{
    Error *local_err = NULL;
    BlockMeasureInfo *info;
    uint64_t required = 0; /* bytes that contribute to required size */
    uint64_t virtual_size; /* disk size as seen by guest */
    uint64_t refcount_bits;
    uint64_t l2_tables;
    size_t cluster_size;
    int version;
    char *optstr;
    PreallocMode prealloc;
    bool has_backing_file;

    /* Parse image creation options */
    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    version = qcow2_opt_get_version_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
    if (local_err) {
        goto err;
    }

    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
                               PREALLOC_MODE_OFF, &local_err);
    g_free(optstr);
    if (local_err) {
        goto err;
    }

    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    has_backing_file = !!optstr;
    g_free(optstr);

    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
    virtual_size = ROUND_UP(virtual_size, cluster_size);

    /* Check that virtual disk size is valid */
    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
                             cluster_size / sizeof(uint64_t));
    if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
        error_setg(&local_err, "The image size is too large "
                               "(try using a larger cluster size)");
        goto err;
    }

    /* Account for input image */
    if (in_bs) {
        int64_t ssize = bdrv_getlength(in_bs);
        if (ssize < 0) {
            error_setg_errno(&local_err, -ssize,
                             "Unable to get image virtual_size");
            goto err;
        }

        virtual_size = ROUND_UP(ssize, cluster_size);

        if (has_backing_file) {
            /* We don't know how much of the backing chain is shared by the
             * input image and the new image file.  In the worst case the new
             * image's backing file has nothing in common with the input
             * image.  Be conservative and assume all clusters need to be
             * written.
             */
            required = virtual_size;
        } else {
            int64_t offset;
            int64_t pnum = 0;

            for (offset = 0; offset < ssize; offset += pnum) {
                int ret;

                ret = bdrv_block_status_above(in_bs, NULL, offset,
                                              ssize - offset, &pnum, NULL,
                                              NULL);
                if (ret < 0) {
                    error_setg_errno(&local_err, -ret,
                                     "Unable to get block status");
                    goto err;
                }

                if (ret & BDRV_BLOCK_ZERO) {
                    /* Skip zero regions (safe with no backing file) */
                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
                    /* Extend pnum to end of cluster for next iteration */
                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;

                    /* Count clusters we've seen */
                    required += offset % cluster_size + pnum;
                }
            }
        }
    }

    /* Take into account preallocation.  Nothing special is needed for
     * PREALLOC_MODE_METADATA since metadata is always counted.
     */
    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
        required = virtual_size;
    }

    info = g_new(BlockMeasureInfo, 1);
    info->fully_allocated =
        qcow2_calc_prealloc_size(virtual_size, cluster_size,
                                 ctz32(refcount_bits));

    /* Remove data clusters that are not required.  This overestimates the
     * required size because metadata needed for the fully allocated file is
     * still counted.
     */
    info->required = info->fully_allocated - virtual_size + required;
    return info;

err:
    error_propagate(errp, local_err);
    return NULL;
}

4094
/*
 * Fills @bdi with the driver information of the qcow2 image @bs:
 * unallocated_blocks_are_zero is set, and the cluster size and the VM state
 * offset are taken from the qcow2 state.
 *
 * Always returns 0.
 */
static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcow2State *s = bs->opaque;

    bdi->unallocated_blocks_are_zero = true;
    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = qcow2_vm_state_offset(s);
    return 0;
}

4103 4104
/*
 * Builds the qcow2-specific image information for @bs.
 *
 * The result always carries the compatibility level string ("0.10" for
 * version 2 images, "1.1" for version 3 images) and the refcount width.
 * Version 3 images additionally report the lazy-refcounts and corrupt flags.
 * If the image has a crypto context (s->crypto), the encryption format and,
 * for LUKS, the LUKS details are attached as well.
 *
 * Returns a newly allocated ImageInfoSpecific.
 */
static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    ImageInfoSpecific *spec_info;
    QCryptoBlockInfo *encrypt_info = NULL;

    if (s->crypto != NULL) {
        encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort);
    }

    spec_info = g_new(ImageInfoSpecific, 1);
    *spec_info = (ImageInfoSpecific){
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
        .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
            .corrupt            = s->incompatible_features &
                                  QCOW2_INCOMPAT_CORRUPT,
            .has_corrupt        = true,
            .refcount_bits      = s->refcount_bits,
        };
    } else {
        /* if this assertion fails, this probably means a new version was
         * added without having it covered here */
        assert(false);
    }

    if (encrypt_info) {
        ImageInfoSpecificQCow2Encryption *qencrypt =
            g_new(ImageInfoSpecificQCow2Encryption, 1);
        switch (encrypt_info->format) {
        case Q_CRYPTO_BLOCK_FORMAT_QCOW:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
            break;
        case Q_CRYPTO_BLOCK_FORMAT_LUKS:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
            /* Shallow copy: qencrypt now references the LUKS data */
            qencrypt->u.luks = encrypt_info->u.luks;
            break;
        default:
            abort();
        }
        /* Since we did shallow copy above, erase any pointers
         * in the original info */
        memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
        qapi_free_QCryptoBlockInfo(encrypt_info);

        spec_info->u.qcow2.data->has_encrypt = true;
        spec_info->u.qcow2.data->encrypt = qencrypt;
    }

    return spec_info;
}

4166 4167
/*
 * Writes the VM state data in @qiov at position @pos of the VM state area,
 * i.e. at offset qcow2_vm_state_offset(s) + @pos, through the driver's
 * bdrv_co_pwritev callback.
 *
 * Returns the value returned by that callback.
 */
static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
                                    qiov->size, qiov, 0);
}

4176 4177
/*
 * Reads VM state data into @qiov from position @pos of the VM state area,
 * i.e. from offset qcow2_vm_state_offset(s) + @pos, through the driver's
 * bdrv_co_preadv callback.
 *
 * Returns the value returned by that callback.
 */
static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
                                   qiov->size, qiov, 0);
}

M
Max Reitz 已提交
4186 4187 4188 4189
/*
 * Downgrades an image's version. To achieve this, any incompatible features
 * have to be removed.
 */
4190
static int qcow2_downgrade(BlockDriverState *bs, int target_version,
4191 4192
                           BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
                           Error **errp)
M
Max Reitz 已提交
4193
{
4194
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
4195 4196 4197
    int current_version = s->qcow_version;
    int ret;

4198 4199 4200 4201 4202
    /* This is qcow2_downgrade(), not qcow2_upgrade() */
    assert(target_version < current_version);

    /* There are no other versions (now) that you can downgrade to */
    assert(target_version == 2);
M
Max Reitz 已提交
4203 4204

    if (s->refcount_order != 4) {
4205
        error_setg(errp, "compat=0.10 requires refcount_bits=16");
M
Max Reitz 已提交
4206 4207 4208 4209 4210 4211 4212
        return -ENOTSUP;
    }

    /* clear incompatible features */
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
4213
            error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
4214 4215 4216 4217 4218 4219 4220 4221 4222
            return ret;
        }
    }

    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
     * the first place; if that happens nonetheless, returning -ENOTSUP is the
     * best thing to do anyway */

    if (s->incompatible_features) {
4223 4224
        error_setg(errp, "Cannot downgrade an image with incompatible features "
                   "%#" PRIx64 " set", s->incompatible_features);
M
Max Reitz 已提交
4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235
        return -ENOTSUP;
    }

    /* since we can ignore compatible features, we can set them to 0 as well */
    s->compatible_features = 0;
    /* if lazy refcounts have been used, they have already been fixed through
     * clearing the dirty flag */

    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

4236
    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
M
Max Reitz 已提交
4237
    if (ret < 0) {
4238
        error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
M
Max Reitz 已提交
4239 4240 4241 4242 4243 4244 4245
        return ret;
    }

    s->qcow_version = target_version;
    ret = qcow2_update_header(bs);
    if (ret < 0) {
        s->qcow_version = current_version;
4246
        error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4247 4248 4249 4250 4251
        return ret;
    }
    return 0;
}

4252 4253 4254 4255 4256 4257
/* Identifies which amend operation is currently reporting progress */
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    /* The refcount width of the image is being changed */
    QCOW2_CHANGING_REFCOUNT_ORDER,
    /* The image is being downgraded to an older version */
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;

/*
 * State shared between the code coordinating the amend operations and
 * qcow2_amend_helper_cb(), which combines the progress of the individual
 * operations into a single progress report for the original callback.
 */
typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */

    /* Callback the combined progress is forwarded to, and its opaque value */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    /* Operation that is about to run / currently running */
    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    /* Operation and work size seen on the previous CB invocation; used to
     * detect a change of operation and to account for the finished one */
    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;

/*
 * Progress callback handed to the individual amend operations.
 *
 * @operation_offset and @operation_work_size describe the progress of the
 * operation named by info->current_operation only.  This helper adds the work
 * of the operations already completed, projects the work of the operations
 * still to come, and forwards the combined values to the original callback
 * stored in @opaque (a Qcow2AmendHelperCBInfo).
 *
 * A NULL original_status_cb is tolerated: the bookkeeping is still updated,
 * but nothing is reported.
 */
static void qcow2_amend_helper_cb(BlockDriverState *bs,
                                  int64_t operation_offset,
                                  int64_t operation_work_size, void *opaque)
{
    Qcow2AmendHelperCBInfo *info = opaque;
    int64_t current_work_size;
    int64_t projected_work_size;

    if (info->current_operation != info->last_operation) {
        /* A new operation has started; account for the one that finished
         * (unless this is the very first invocation) */
        if (info->last_operation != QCOW2_NO_OPERATION) {
            info->offset_completed += info->last_work_size;
            info->operations_completed++;
        }

        info->last_operation = info->current_operation;
    }

    assert(info->total_operations > 0);
    assert(info->operations_completed < info->total_operations);

    info->last_work_size = operation_work_size;

    current_work_size = info->offset_completed + operation_work_size;

    /* current_work_size is the total work size for (operations_completed + 1)
     * operations (which includes this one), so multiply it by the number of
     * operations not covered and divide it by the number of operations
     * covered to get a projection for the operations not covered */
    projected_work_size = current_work_size * (info->total_operations -
                                               info->operations_completed - 1)
                                            / (info->operations_completed + 1);

    /* Do not dereference a missing callback */
    if (info->original_status_cb) {
        info->original_status_cb(bs, info->offset_completed + operation_offset,
                                 current_work_size + projected_work_size,
                                 info->original_cb_opaque);
    }
}

4322
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
4323
                               BlockDriverAmendStatusCB *status_cb,
4324 4325
                               void *cb_opaque,
                               Error **errp)
M
Max Reitz 已提交
4326
{
4327
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
4328 4329 4330 4331
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
    const char *backing_file = NULL, *backing_format = NULL;
    bool lazy_refcounts = s->use_lazy_refcounts;
4332 4333 4334
    const char *compat = NULL;
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
4335
    int encformat;
4336
    int refcount_bits = s->refcount_bits;
M
Max Reitz 已提交
4337
    int ret;
4338
    QemuOptDesc *desc = opts->list->desc;
4339
    Qcow2AmendHelperCBInfo helper_cb_info;
M
Max Reitz 已提交
4340

4341 4342
    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
M
Max Reitz 已提交
4343
            /* only change explicitly defined options */
4344
            desc++;
M
Max Reitz 已提交
4345 4346 4347
            continue;
        }

4348 4349
        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
4350
            if (!compat) {
M
Max Reitz 已提交
4351
                /* preserve default */
4352
            } else if (!strcmp(compat, "0.10")) {
M
Max Reitz 已提交
4353
                new_version = 2;
4354
            } else if (!strcmp(compat, "1.1")) {
M
Max Reitz 已提交
4355 4356
                new_version = 3;
            } else {
4357
                error_setg(errp, "Unknown compatibility level %s", compat);
M
Max Reitz 已提交
4358 4359
                return -EINVAL;
            }
4360
        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
4361
            error_setg(errp, "Cannot change preallocation mode");
M
Max Reitz 已提交
4362
            return -ENOTSUP;
4363 4364 4365 4366 4367 4368 4369 4370
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
4371
                                        !!s->crypto);
4372

4373
            if (encrypt != !!s->crypto) {
4374 4375
                error_setg(errp,
                           "Changing the encryption flag is not supported");
M
Max Reitz 已提交
4376 4377
                return -ENOTSUP;
            }
4378 4379 4380 4381 4382
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
            encformat = qcow2_crypt_method_from_format(
                qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));

            if (encformat != s->crypt_method_header) {
4383 4384
                error_setg(errp,
                           "Changing the encryption format is not supported");
4385 4386
                return -ENOTSUP;
            }
4387
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
4388 4389
            error_setg(errp,
                       "Changing the encryption parameters is not supported");
4390
            return -ENOTSUP;
4391 4392
        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
4393 4394
                                             cluster_size);
            if (cluster_size != s->cluster_size) {
4395
                error_setg(errp, "Changing the cluster size is not supported");
M
Max Reitz 已提交
4396 4397
                return -ENOTSUP;
            }
4398 4399
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
4400
                                               lazy_refcounts);
4401
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
4402 4403 4404 4405 4406 4407
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
4408 4409
                error_setg(errp, "Refcount width must be a power of two and "
                           "may not exceed 64 bits");
4410 4411
                return -EINVAL;
            }
M
Max Reitz 已提交
4412
        } else {
4413
            /* if this point is reached, this probably means a new option was
M
Max Reitz 已提交
4414
             * added without having it covered here */
4415
            abort();
M
Max Reitz 已提交
4416
        }
4417 4418

        desc++;
M
Max Reitz 已提交
4419 4420
    }

4421 4422 4423 4424
    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
        .total_operations = (new_version < old_version)
4425
                          + (s->refcount_bits != refcount_bits)
4426 4427
    };

4428 4429 4430 4431 4432 4433
    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
        s->qcow_version = new_version;
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            s->qcow_version = old_version;
4434
            error_setg_errno(errp, -ret, "Failed to update the image header");
4435
            return ret;
M
Max Reitz 已提交
4436 4437 4438
        }
    }

4439 4440 4441 4442
    if (s->refcount_bits != refcount_bits) {
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
4443 4444 4445
            error_setg(errp, "Refcount widths other than 16 bits require "
                       "compatibility level 1.1 or above (use compat=1.1 or "
                       "greater)");
4446 4447 4448 4449 4450 4451
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
4452
                                          &helper_cb_info, errp);
4453 4454 4455 4456 4457
        if (ret < 0) {
            return ret;
        }
    }

M
Max Reitz 已提交
4458
    if (backing_file || backing_format) {
4459 4460 4461
        ret = qcow2_change_backing_file(bs,
                    backing_file ?: s->image_backing_file,
                    backing_format ?: s->image_backing_format);
M
Max Reitz 已提交
4462
        if (ret < 0) {
4463
            error_setg_errno(errp, -ret, "Failed to change the backing file");
M
Max Reitz 已提交
4464 4465 4466 4467 4468 4469
            return ret;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
4470
            if (new_version < 3) {
4471 4472 4473
                error_setg(errp, "Lazy refcounts only supported with "
                           "compatibility level 1.1 and above (use compat=1.1 "
                           "or greater)");
M
Max Reitz 已提交
4474 4475 4476 4477 4478 4479
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4480
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4481 4482 4483 4484 4485 4486 4487
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
4488
                error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
4489 4490 4491 4492 4493 4494 4495
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4496
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4497 4498 4499 4500 4501 4502 4503
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    if (new_size) {
K
Kevin Wolf 已提交
4504
        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
4505
        ret = blk_insert_bs(blk, bs, errp);
4506 4507 4508 4509 4510
        if (ret < 0) {
            blk_unref(blk);
            return ret;
        }

4511
        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
4512
        blk_unref(blk);
M
Max Reitz 已提交
4513 4514 4515 4516 4517
        if (ret < 0) {
            return ret;
        }
    }

4518 4519
    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
4520 4521
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
4522
                              &helper_cb_info, errp);
4523 4524 4525 4526 4527
        if (ret < 0) {
            return ret;
        }
    }

M
Max Reitz 已提交
4528 4529 4530
    return 0;
}

M
Max Reitz 已提交
4531 4532 4533 4534 4535 4536 4537 4538 4539
/*
 * Reports a corruption of the qcow2 image @bs: prints a message to stderr and
 * emits a BLOCK_IMAGE_CORRUPTED event.  The message is built from
 * @message_format and the following printf-style arguments.
 *
 * If offset or size are negative, respectively, they will not be included in
 * the BLOCK_IMAGE_CORRUPTED event emitted.
 * fatal will be ignored for read-only BDS; corruptions found there will always
 * be considered non-fatal.
 *
 * A fatal corruption additionally marks the image as corrupt and makes the
 * BDS unusable.  Once a corruption has been signaled, further non-fatal ones
 * are suppressed; fatal ones are suppressed as well if the image already has
 * the corrupt flag set.
 */
void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
                             int64_t size, const char *message_format, ...)
{
    BDRVQcow2State *s = bs->opaque;
    const char *node_name;
    char *message;
    va_list ap;

    fatal = fatal && bdrv_is_writable(bs);

    if (s->signaled_corruption &&
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
    {
        return;
    }

    va_start(ap, message_format);
    message = g_strdup_vprintf(message_format, ap);
    va_end(ap);

    if (fatal) {
        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
                "corruption events will be suppressed\n", message);
    } else {
        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
                "corruption events will be suppressed\n", message);
    }

    node_name = bdrv_get_node_name(bs);
    /* The node name is only included in the event if it is not empty */
    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
                                          *node_name != '\0', node_name,
                                          message, offset >= 0, offset,
                                          size >= 0, size,
                                          fatal, &error_abort);
    g_free(message);

    if (fatal) {
        qcow2_mark_corrupt(bs);
        bs->drv = NULL; /* make BDS unusable */
    }

    s->signaled_corruption = true;
}

4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607
/*
 * Options accepted by the qcow2 driver; referenced by bdrv_qcow2.create_opts.
 * The list is terminated by an empty entry.
 */
static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_COMPAT_LEVEL,
            .type = QEMU_OPT_STRING,
            .help = "Compatibility level (0.10 or 1.1)"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image"
        },
        {
            .name = BLOCK_OPT_ENCRYPT,
            .type = QEMU_OPT_BOOL,
            .help = "Encrypt the image with format 'aes'. (Deprecated "
                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
        },
        {
            .name = BLOCK_OPT_ENCRYPT_FORMAT,
            .type = QEMU_OPT_STRING,
            .help = "Encrypt the image, format choices: 'aes', 'luks'",
        },
        /* Encryption parameters, all using the "encrypt." prefix */
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow AES key or LUKS passphrase"),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "qcow2 cluster size",
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, metadata, "
                    "falloc, full)"
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
            .def_value_str = "off"
        },
        {
            .name = BLOCK_OPT_REFCOUNT_BITS,
            .type = QEMU_OPT_NUMBER,
            .help = "Width of a reference count entry in bits",
            .def_value_str = "16"
        },
        { /* end of list */ }
    }
};

4652
BlockDriver bdrv_qcow2 = {
4653
    .format_name        = "qcow2",
4654
    .instance_size      = sizeof(BDRVQcow2State),
4655 4656 4657
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
J
Jeff Cody 已提交
4658
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
4659 4660
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
4661
    .bdrv_join_options    = qcow2_join_options,
4662
    .bdrv_child_perm      = bdrv_format_default_perms,
4663
    .bdrv_co_create_opts  = qcow2_co_create_opts,
4664
    .bdrv_co_create       = qcow2_co_create,
4665
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
4666
    .bdrv_co_block_status = qcow2_co_block_status,
4667

K
Kevin Wolf 已提交
4668
    .bdrv_co_preadv         = qcow2_co_preadv,
K
Kevin Wolf 已提交
4669
    .bdrv_co_pwritev        = qcow2_co_pwritev,
K
Kevin Wolf 已提交
4670
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
4671

4672
    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
4673
    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
F
Fam Zheng 已提交
4674 4675
    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
4676
    .bdrv_co_truncate       = qcow2_co_truncate,
4677
    .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
M
Max Reitz 已提交
4678
    .bdrv_make_empty        = qcow2_make_empty,
B
Blue Swirl 已提交
4679 4680 4681 4682 4683

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
4684
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
4685
    .bdrv_measure           = qcow2_measure,
4686
    .bdrv_get_info          = qcow2_get_info,
4687
    .bdrv_get_specific_info = qcow2_get_specific_info,
B
Blue Swirl 已提交
4688

4689 4690
    .bdrv_save_vmstate    = qcow2_save_vmstate,
    .bdrv_load_vmstate    = qcow2_load_vmstate,
B
Blue Swirl 已提交
4691

4692
    .supports_backing           = true,
B
Blue Swirl 已提交
4693 4694
    .bdrv_change_backing_file   = qcow2_change_backing_file,

4695
    .bdrv_refresh_limits        = qcow2_refresh_limits,
4696
    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
K
Kevin Wolf 已提交
4697
    .bdrv_inactivate            = qcow2_inactivate,
4698

4699
    .create_opts         = &qcow2_create_opts,
4700
    .bdrv_co_check       = qcow2_co_check,
C
Chunyan Liu 已提交
4701
    .bdrv_amend_options  = qcow2_amend_options,
4702 4703 4704

    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
4705 4706

    .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
4707
    .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
4708
    .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap,
B
Blue Swirl 已提交
4709 4710
};

4711 4712 4713 4714 4715 4716
/* Registers the qcow2 block driver (bdrv_qcow2) via bdrv_register(). */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/* Hand bdrv_qcow2_init() to block_init() so that it is run as a block
 * initialization function */
block_init(bdrv_qcow2_init);