qcow2.c 160.2 KB
Newer Older
B
bellard 已提交
1 2
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24

P
Peter Maydell 已提交
25
#include "qemu/osdep.h"
26 27 28 29

#define ZLIB_CONST
#include <zlib.h>

30
#include "block/block_int.h"
31
#include "block/qdict.h"
32
#include "sysemu/block-backend.h"
33
#include "qemu/module.h"
34
#include "qcow2.h"
35
#include "qemu/error-report.h"
36
#include "qapi/error.h"
37
#include "qapi/qapi-events-block-core.h"
M
Markus Armbruster 已提交
38 39
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
K
Kevin Wolf 已提交
40
#include "trace.h"
41
#include "qemu/option_int.h"
42
#include "qemu/cutils.h"
43
#include "qemu/bswap.h"
44 45
#include "qapi/qobject-input-visitor.h"
#include "qapi/qapi-visit-block-core.h"
46
#include "crypto.h"
47
#include "block/thread-pool.h"
B
bellard 已提交
48 49 50 51 52 53 54 55

/*
  Differences with QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters which have a reference count of one have the bit
    QCOW_OFLAG_COPIED to optimize write performance.
  - Size of compressed clusters is stored in sectors to reduce bit usage
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
    snapshots.
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/

65 66 67 68

/*
 * Fixed-size header that precedes every qcow2 header extension in the image
 * file.  Both fields are converted with be32_to_cpu() after being read.
 */
typedef struct {
    uint32_t magic; /* extension type, matched against QCOW2_EXT_MAGIC_* */
    uint32_t len;   /* number of payload bytes following this header */
} QEMU_PACKED QCowExtension;
J
Jeff Cody 已提交
70

71 72
/* Header extension type identifiers, stored in QCowExtension.magic */
#define  QCOW2_EXT_MAGIC_END 0
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
#define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
#define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
76

77
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
B
bellard 已提交
78 79
{
    const QCowHeader *cow_header = (const void *)buf;
80

B
bellard 已提交
81 82
    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
K
Kevin Wolf 已提交
83
        be32_to_cpu(cow_header->version) >= 2)
B
bellard 已提交
84 85 86 87 88
        return 100;
    else
        return 0;
}

89

90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
/*
 * Read callback handed to the crypto layer: fetch @buflen bytes located
 * @offset bytes into the encryption header area described by
 * s->crypto_header.
 *
 * Returns the value of bdrv_pread() on success.  Returns -1 and sets @errp
 * when the request reaches past the recorded header length or the read fails.
 */
static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
                                          uint8_t *buf, size_t buflen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *qs = bs->opaque;
    ssize_t nread;

    /* The request must lie entirely within the header area */
    if (qs->crypto_header.length < (offset + buflen)) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    nread = bdrv_pread(bs->file, qs->crypto_header.offset + offset,
                       buf, buflen);
    if (nread >= 0) {
        return nread;
    }

    error_setg_errno(errp, -nread, "Could not read encryption header");
    return -1;
}


/*
 * Init callback handed to the crypto layer: allocate clusters for an
 * encryption header of @headerlen bytes, record its location in
 * s->crypto_header, and zero the part of the allocation that the header
 * itself does not cover.
 *
 * Returns the (non-negative) result of bdrv_pwrite_zeroes() on success.
 * Returns -1 and sets @errp when allocation or zero-filling fails.
 */
static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    int64_t ret;
    int64_t clusterlen;

    /* On success ret holds the offset of the newly allocated clusters */
    ret = qcow2_alloc_clusters(bs, headerlen);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Cannot allocate cluster for LUKS header size %zu",
                         headerlen);
        return -1;
    }

    s->crypto_header.length = headerlen;
    s->crypto_header.offset = ret;

    /* Zero fill remaining space in cluster so it has predictable
     * content in case of future spec changes */
    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
    /* NOTE(review): the overlap check only runs while assert() is active */
    assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen) == 0);
    /* ret is reused from here on: it now carries the write status */
    ret = bdrv_pwrite_zeroes(bs->file,
                             ret + headerlen,
                             clusterlen - headerlen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
        return -1;
    }

    return ret;
}


/*
 * Write callback handed to the crypto layer: store @buflen bytes from @buf
 * at @offset bytes into the encryption header area described by
 * s->crypto_header.
 *
 * Returns the value of bdrv_pwrite() on success.  Returns -1 and sets @errp
 * when the request reaches past the recorded header length or the write
 * fails.
 */
static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
                                           const uint8_t *buf, size_t buflen,
                                           void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    if ((offset + buflen) > s->crypto_header.length) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pwrite(bs->file,
                      s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        /* This is the write path; the message used to say "read" */
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return -1;
    }
    return ret;
}


171 172 173 174 175 176 177
/* 
 * read qcow2 extension and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 */
178
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
M
Max Reitz 已提交
179
                                 uint64_t end_offset, void **p_feature_table,
180 181
                                 int flags, bool *need_update_header,
                                 Error **errp)
182
{
183
    BDRVQcow2State *s = bs->opaque;
184 185
    QCowExtension ext;
    uint64_t offset;
186
    int ret;
187 188 189 190 191
    Qcow2BitmapHeaderExt bitmaps_ext;

    if (need_update_header != NULL) {
        *need_update_header = false;
    }
192 193

#ifdef DEBUG_EXT
194
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
195 196 197 198 199 200 201
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
202
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
203

D
Dong Xu Wang 已提交
204
        printf("attempting to read extended header in offset %lu\n", offset);
205 206
#endif

207
        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
M
Max Reitz 已提交
208 209 210
        if (ret < 0) {
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
                             "pread fail from offset %" PRIu64, offset);
211 212
            return 1;
        }
213 214
        ext.magic = be32_to_cpu(ext.magic);
        ext.len = be32_to_cpu(ext.len);
215 216 217 218
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
219
        if (offset > end_offset || ext.len > end_offset - offset) {
M
Max Reitz 已提交
220
            error_setg(errp, "Header extension too large");
221 222 223
            return -EINVAL;
        }

224
        switch (ext.magic) {
225
        case QCOW2_EXT_MAGIC_END:
226
            return 0;
227

228
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
229
            if (ext.len >= sizeof(bs->backing_format)) {
230 231 232
                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
                           " too large (>=%zu)", ext.len,
                           sizeof(bs->backing_format));
233 234
                return 2;
            }
235
            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
M
Max Reitz 已提交
236 237 238
            if (ret < 0) {
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
                                 "Could not read format name");
239
                return 3;
M
Max Reitz 已提交
240
            }
241
            bs->backing_format[ext.len] = '\0';
242
            s->image_backing_format = g_strdup(bs->backing_format);
243 244 245 246 247
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            break;

248 249 250
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
251
                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
252
                if (ret < 0) {
M
Max Reitz 已提交
253 254
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
                                     "Could not read table");
255 256 257 258 259 260 261
                    return ret;
                }

                *p_feature_table = feature_table;
            }
            break;

262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
            unsigned int cflags = 0;
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp, "CRYPTO header extension only "
                           "expected with LUKS encryption method");
                return -EINVAL;
            }
            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
                error_setg(errp, "CRYPTO header extension size %u, "
                           "but expected size %zu", ext.len,
                           sizeof(Qcow2CryptoHeaderExtension));
                return -EINVAL;
            }

            ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret,
                                 "Unable to read CRYPTO header extension");
                return ret;
            }
282 283
            s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
            s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
284 285 286 287 288 289 290 291 292 293 294

            if ((s->crypto_header.offset % s->cluster_size) != 0) {
                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
                           "not a multiple of cluster size '%u'",
                           s->crypto_header.offset, s->cluster_size);
                return -EINVAL;
            }

            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
295
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
296 297 298 299 300 301 302
                                           qcow2_crypto_hdr_read_func,
                                           bs, cflags, errp);
            if (!s->crypto) {
                return -EINVAL;
            }
        }   break;

303 304 305 306 307 308 309 310
        case QCOW2_EXT_MAGIC_BITMAPS:
            if (ext.len != sizeof(bitmaps_ext)) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Invalid extension length");
                return -EINVAL;
            }

            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
311 312 313 314 315 316 317 318 319 320 321
                if (s->qcow_version < 3) {
                    /* Let's be a bit more specific */
                    warn_report("This qcow2 v2 image contains bitmaps, but "
                                "they may have been modified by a program "
                                "without persistent bitmap support; so now "
                                "they must all be considered inconsistent");
                } else {
                    warn_report("a program lacking bitmap support "
                                "modified this file, so all bitmaps are now "
                                "considered inconsistent");
                }
322 323
                error_printf("Some clusters may be leaked, "
                             "run 'qemu-img check -r' on the image "
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
                             "file to fix.");
                if (need_update_header != NULL) {
                    /* Updating is needed to drop invalid bitmap extension. */
                    *need_update_header = true;
                }
                break;
            }

            ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Could not read ext header");
                return ret;
            }

            if (bitmaps_ext.reserved32 != 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Reserved field is not zero");
                return -EINVAL;
            }

345 346 347 348 349
            bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
            bitmaps_ext.bitmap_directory_size =
                be64_to_cpu(bitmaps_ext.bitmap_directory_size);
            bitmaps_ext.bitmap_directory_offset =
                be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392

            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
                error_setg(errp,
                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
                           "exceeding the QEMU supported maximum of %d",
                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
                return -EINVAL;
            }

            if (bitmaps_ext.nb_bitmaps == 0) {
                error_setg(errp, "found bitmaps extension with zero bitmaps");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) {
                error_setg(errp, "bitmaps_ext: "
                                 "invalid bitmap directory offset");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_size >
                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
                error_setg(errp, "bitmaps_ext: "
                                 "bitmap directory size (%" PRIu64 ") exceeds "
                                 "the maximum supported size (%d)",
                                 bitmaps_ext.bitmap_directory_size,
                                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
                return -EINVAL;
            }

            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
            s->bitmap_directory_offset =
                    bitmaps_ext.bitmap_directory_offset;
            s->bitmap_directory_size =
                    bitmaps_ext.bitmap_directory_size;

#ifdef DEBUG_EXT
            printf("Qcow2: Got bitmaps extension: "
                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
                   s->bitmap_directory_offset, s->nb_bitmaps);
#endif
            break;

393
        default:
394
            /* unknown magic - save it in case we need to rewrite the header */
395 396
            /* If you add a new feature, make sure to also update the fast
             * path of qcow2_make_empty() to deal with it. */
397 398 399 400 401 402 403 404
            {
                Qcow2UnknownHeaderExtension *uext;

                uext = g_malloc0(sizeof(*uext)  + ext.len);
                uext->magic = ext.magic;
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

405
                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
406
                if (ret < 0) {
M
Max Reitz 已提交
407 408
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
                                     "Could not read data");
409 410 411
                    return ret;
                }
            }
412 413
            break;
        }
414 415

        offset += ((ext.len + 7) & ~7);
416 417 418 419 420
    }

    return 0;
}

421 422
/*
 * Unlink and free every entry stored on s->unknown_header_ext.
 */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2UnknownHeaderExtension *uext, *next;

    /* The _SAFE variant is required because entries are removed in the loop */
    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
        QLIST_REMOVE(uext, next);
        g_free(uext);
    }
}
431

432 433
/*
 * Set @errp to a message listing the unsupported incompatible features
 * selected by @mask.
 *
 * @table: optional feature name table (may be NULL); iteration stops at the
 *         first entry whose name is empty.  Each incompatible-feature entry
 *         whose bit is set in @mask contributes its name and clears that bit.
 * @mask:  bitmask of unsupported incompatible features.  Bits without a table
 *         entry are reported together in hexadecimal.
 */
static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
                                       uint64_t mask)
{
    char *features = g_strdup("");
    char *old;

    while (table && table->name[0] != '\0') {
        /*
         * The table is read from the image file.  Skip entries whose bit
         * number is 64 or more: shifting a 64-bit value that far is undefined.
         */
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE && table->bit < 64) {
            uint64_t feature_bit = 1ULL << table->bit;

            if (mask & feature_bit) {
                old = features;
                /* Names are printed with a 46 character limit */
                features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "",
                                           table->name);
                g_free(old);
                mask &= ~feature_bit;
            }
        }
        table++;
    }

    /* Whatever is left had no usable name in the table */
    if (mask) {
        old = features;
        features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64,
                                   old, *old ? ", " : "", mask);
        g_free(old);
    }

    error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
    g_free(features);
}

462 463 464 465 466 467 468
/*
 * Sets the dirty bit and flushes afterwards if necessary.
 *
 * The incompatible_features bit is only set if the image file header was
 * updated successfully.  Therefore it is not required to check the return
 * value of this function.
 */
469
int qcow2_mark_dirty(BlockDriverState *bs)
470
{
471
    BDRVQcow2State *s = bs->opaque;
472 473 474 475 476 477 478 479 480 481
    uint64_t val;
    int ret;

    assert(s->qcow_version >= 3);

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        return 0; /* already dirty */
    }

    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
482
    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
483 484 485 486
                      &val, sizeof(val));
    if (ret < 0) {
        return ret;
    }
K
Kevin Wolf 已提交
487
    ret = bdrv_flush(bs->file->bs);
488 489 490 491 492 493 494 495 496
    if (ret < 0) {
        return ret;
    }

    /* Only treat image as dirty if the header was updated successfully */
    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
    return 0;
}

S
Stefan Hajnoczi 已提交
497 498 499 500 501 502 503
/*
 * Clears the dirty bit and flushes before if necessary.  Only call this
 * function when there are no pending requests, it does not guard against
 * concurrent requests dirtying the image.
 *
 * Returns 0 when the image was not dirty; otherwise the negative result of
 * qcow2_flush_caches() on failure, or the result of qcow2_update_header().
 */
static int qcow2_mark_clean(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        int ret;

        /* The in-memory bit is cleared before the caches are flushed */
        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;

        ret = qcow2_flush_caches(bs);
        if (ret < 0) {
            return ret;
        }

        return qcow2_update_header(bs);
    }
    return 0;
}

M
Max Reitz 已提交
521 522 523 524 525
/*
 * Marks the image as corrupt: sets QCOW2_INCOMPAT_CORRUPT in the in-memory
 * incompatible features and returns the result of qcow2_update_header().
 */
int qcow2_mark_corrupt(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
    return qcow2_update_header(bs);
}

/*
 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
 * before if necessary.
 *
 * Returns 0 when the corrupt bit was not set; otherwise the negative result
 * of qcow2_flush_caches() on failure, or the result of qcow2_update_header().
 */
int qcow2_mark_consistent(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        int ret = qcow2_flush_caches(bs);
        if (ret < 0) {
            return ret;
        }

        /* Unlike the dirty bit, this bit is cleared only after the flush */
        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
        return qcow2_update_header(bs);
    }
    return 0;
}

552 553 554
/*
 * Run qcow2_check_refcounts() and, when repairs were requested and the check
 * reports neither check errors nor corruptions, clear the dirty and corrupt
 * markers of the image.
 *
 * The only caller visible in this file, qcow2_co_check(), holds s->lock
 * around this call.
 *
 * Returns a negative value when the refcount check or marking the image
 * clean fails; otherwise the result of qcow2_mark_consistent(), or the
 * result of the refcount check when no markers were touched.
 */
static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
                                              BdrvCheckResult *result,
                                              BdrvCheckMode fix)
{
    int ret = qcow2_check_refcounts(bs, result, fix);
    if (ret < 0) {
        return ret;
    }

    if (fix && result->check_errors == 0 && result->corruptions == 0) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            return ret;
        }
        return qcow2_mark_consistent(bs);
    }
    return ret;
}

571 572 573 574 575 576 577 578 579 580 581 582 583
/*
 * Wrapper around qcow2_co_check_locked() that takes s->lock for the
 * duration of the check and hands back its result unchanged.
 */
static int coroutine_fn qcow2_co_check(BlockDriverState *bs,
                                       BdrvCheckResult *result,
                                       BdrvCheckMode fix)
{
    BDRVQcow2State *state = bs->opaque;
    int status;

    qemu_co_mutex_lock(&state->lock);
    status = qcow2_co_check_locked(bs, result, fix);
    qemu_co_mutex_unlock(&state->lock);

    return status;
}

584 585 586 587
/*
 * Validate the location and size of an on-disk table.
 *
 * @offset:         start of the table in the image file
 * @entries:        number of table entries
 * @entry_len:      size of one entry in bytes (used as a divisor, so it
 *                  must not be 0)
 * @max_size_bytes: largest table size that is accepted, in bytes
 * @table_name:     name used in error messages
 *
 * Returns 0 when the table is acceptable, -EFBIG when it has more entries
 * than fit in @max_size_bytes, and -EINVAL when its end would exceed
 * INT64_MAX or @offset is not cluster aligned.  @errp is set on failure.
 */
int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
                         uint64_t entries, size_t entry_len,
                         int64_t max_size_bytes, const char *table_name,
                         Error **errp)
{
    BDRVQcow2State *s = bs->opaque;

    /* Dividing rather than multiplying keeps this check free of overflow */
    if (entries > max_size_bytes / entry_len) {
        error_setg(errp, "%s too large", table_name);
        return -EFBIG;
    }

    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
     * because values will be passed to qemu functions taking int64_t. */
    if ((INT64_MAX - entries * entry_len < offset) ||
        (offset_into_cluster(s, offset) != 0)) {
        error_setg(errp, "%s offset invalid", table_name);
        return -EINVAL;
    }

    return 0;
}

607 608 609 610 611
static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    .desc = {
        {
612
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
613 614 615
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
        },
616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
        {
            .name = QCOW2_OPT_DISCARD_REQUEST,
            .type = QEMU_OPT_BOOL,
            .help = "Pass guest discard requests to the layer below",
        },
        {
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when snapshot related space "
                    "is freed",
        },
        {
            .name = QCOW2_OPT_DISCARD_OTHER,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when other clusters are freed",
        },
M
Max Reitz 已提交
632 633 634 635 636 637
        {
            .name = QCOW2_OPT_OVERLAP,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
638 639 640 641 642 643
        {
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
M
Max Reitz 已提交
644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the main qcow2 header",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the active L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an active L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the refcount table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into a refcount block",
        },
        {
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the snapshot table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
684 685 686 687 688
        {
            .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the bitmap directory",
        },
689 690 691 692 693 694 695 696 697 698 699
        {
            .name = QCOW2_OPT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
                    "cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum L2 table cache size",
        },
700 701 702 703 704
        {
            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Size of each entry in the L2 cache",
        },
705 706 707 708 709
        {
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum refcount block cache size",
        },
710 711 712 713 714
        {
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
            .type = QEMU_OPT_NUMBER,
            .help = "Clean unused cache entries after this time (in seconds)",
        },
715 716
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
717 718 719 720
        { /* end of list */ }
    },
};

721
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
722 723 724 725 726 727 728 729 730
    [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
731 732
};

733 734 735
static void cache_clean_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
736
    BDRVQcow2State *s = bs->opaque;
737 738
    qcow2_cache_clean_unused(s->l2_table_cache);
    qcow2_cache_clean_unused(s->refcount_block_cache);
739 740 741 742 743 744
    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
              (int64_t) s->cache_clean_interval * 1000);
}

/*
 * Create the cache-clean timer in @context and arm it for the first time.
 * Does nothing unless s->cache_clean_interval is greater than zero.
 */
static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
{
    BDRVQcow2State *s = bs->opaque;
    if (s->cache_clean_interval > 0) {
        s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
                                             SCALE_MS, cache_clean_timer_cb,
                                             bs);
        /* First expiry is one full interval (seconds converted to ms) away */
        timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
                  (int64_t) s->cache_clean_interval * 1000);
    }
}

/*
 * Stop and free the cache-clean timer if one exists, and reset the pointer
 * to NULL so that calling this again does nothing.
 */
static void cache_clean_timer_del(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    if (s->cache_clean_timer) {
        timer_del(s->cache_clean_timer);
        timer_free(s->cache_clean_timer);
        s->cache_clean_timer = NULL;
    }
}

/*
 * AioContext detach hook: removes the cache-clean timer of @bs via
 * cache_clean_timer_del().
 */
static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}

/*
 * AioContext attach hook: sets up the cache-clean timer of @bs in
 * @new_context via cache_clean_timer_init().
 */
static void qcow2_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    cache_clean_timer_init(bs, new_context);
}

M
Max Reitz 已提交
776 777
/*
 * Derive the metadata cache sizes from the runtime options in @opts.
 *
 * @bs:                  image being opened/reopened; its cluster size and
 *                       total_sectors bound the useful L2 cache size
 * @opts:                parsed qcow2 runtime options
 * @l2_cache_size:       out: L2 table cache size, in bytes
 * @l2_cache_entry_size: out: size of one L2 cache entry, in bytes
 * @refcount_cache_size: out: refcount block cache size, in bytes
 * @errp:                set on conflicting or invalid option values
 *
 * Without QCOW2_OPT_CACHE_SIZE, the L2 cache gets the smaller of the
 * requested (or default maximum) size and the size needed to map the whole
 * virtual disk, and the refcount cache gets the requested size (0 if
 * unset).  With QCOW2_OPT_CACHE_SIZE, the combined budget is split between
 * the two caches; setting all three size options at once is an error.
 *
 * The returned sizes are not clamped to their minimum values here; that is
 * done by qcow2_update_options_prepare().  On error the output values must
 * not be relied upon.
 */
static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                             uint64_t *l2_cache_size,
                             uint64_t *l2_cache_entry_size,
                             uint64_t *refcount_cache_size, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t combined_cache_size, l2_cache_max_setting;
    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
    int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
    uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    /* Each 8-byte L2 entry maps one cluster, so this is the number of L2
     * bytes needed to cover the whole virtual disk */
    uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);

    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
    l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
                                             DEFAULT_L2_CACHE_MAX_SIZE);
    *refcount_cache_size = qemu_opt_get_size(opts,
                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);

    *l2_cache_entry_size = qemu_opt_get_size(
        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);

    /* Never use more L2 cache than is needed to map the whole disk */
    *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);

    if (combined_cache_size_set) {
        if (l2_cache_size_set && refcount_cache_size_set) {
            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
                       "at the same time");
            return;
        } else if (l2_cache_size_set &&
                   (l2_cache_max_setting > combined_cache_size)) {
            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        } else if (*refcount_cache_size > combined_cache_size) {
            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        }

        if (l2_cache_size_set) {
            /* The refcount cache gets whatever the L2 cache leaves over */
            *refcount_cache_size = combined_cache_size - *l2_cache_size;
        } else if (refcount_cache_size_set) {
            /* The L2 cache gets whatever the refcount cache leaves over */
            *l2_cache_size = combined_cache_size - *refcount_cache_size;
        } else {
            /* Assign as much memory as possible to the L2 cache, and
             * use the remainder for the refcount cache */
            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
                *l2_cache_size = max_l2_cache;
                *refcount_cache_size = combined_cache_size - *l2_cache_size;
            } else {
                *refcount_cache_size =
                    MIN(combined_cache_size, min_refcount_cache);
                *l2_cache_size = combined_cache_size - *refcount_cache_size;
            }
        }
    }
    /* l2_cache_size and refcount_cache_size are ensured to have at least
     * their minimum values in qcow2_update_options_prepare() */

    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
        *l2_cache_entry_size > s->cluster_size ||
        !is_power_of_2(*l2_cache_entry_size)) {
        error_setg(errp, "L2 cache entry size must be a power of two "
                   "between %d and the cluster size (%d)",
                   1 << MIN_CLUSTER_BITS, s->cluster_size);
    }
}

850 851 852
typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache;
    Qcow2Cache *refcount_block_cache;
853
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
854 855 856 857
    bool use_lazy_refcounts;
    int overlap_check;
    bool discard_passthrough[QCOW2_DISCARD_MAX];
    uint64_t cache_clean_interval;
858
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
859 860 861 862 863 864
} Qcow2ReopenState;

/*
 * Parse the qcow2 runtime options in @options and build the new state in @r
 * without modifying the option values currently active in @bs.
 *
 * @bs:      image whose options are being (re)configured
 * @r:       receives the new caches, flags and encryption options; the
 *           caller hands it to qcow2_update_options_commit() on success or
 *           to qcow2_update_options_abort() on failure
 * @options: option dictionary; "encrypt."-prefixed entries are extracted
 *           from it and the known runtime options are absorbed
 * @flags:   BDRV_O_* open flags (BDRV_O_UNMAP selects the default for
 *           request discards)
 * @errp:    error object set on failure
 *
 * Note that this function already has side effects on the image: the
 * existing metadata caches are flushed, and the image is marked clean when
 * lazy refcounts are being switched off.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QemuOpts *opts = NULL;
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
    int i;
    const char *encryptfmt;
    QDict *encryptopts = NULL;
    Error *local_err = NULL;
    int ret;

    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
                     &refcount_cache_size, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* Convert the byte sizes into numbers of cache entries and enforce the
     * minimum table counts */
    l2_cache_size /= l2_cache_entry_size;
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
                                           l2_cache_entry_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
                                                 s->cluster_size);
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
        /* Whichever cache was created is released by the caller through
         * qcow2_update_options_abort() */
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
    r->cache_clean_interval =
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
                            DEFAULT_CACHE_CLEAN_INTERVAL);
#ifndef CONFIG_LINUX
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
    if (r->cache_clean_interval > UINT_MAX) {
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

    /* lazy-refcounts; flush if going from enabled to disabled */
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }

    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

    /* Overlap check options */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    if (!opt_overlap_check) {
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

    r->overlap_check = 0;
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
        r->overlap_check |=
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

    /* The encryption format requested through the options (if any) must
     * agree with the method recorded in the image header */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "qcow");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "luks");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    default:
        /* r->crypto_opts stays NULL, so the check below fails the call */
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        break;
    }
    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;
fail:
    /* Reached on success as well: the temporary option containers are
     * always released here */
    qobject_unref(encryptopts);
    qemu_opts_del(opts);
    return ret;
}

/*
 * Install the option values prepared in @r into the state of @bs.
 *
 * The previous metadata caches and crypto options are destroyed and
 * replaced by the ones from @r, whose ownership moves to @bs.  The cache
 * cleanup timer is only recreated when the interval actually changed.
 */
static void qcow2_update_options_commit(BlockDriverState *bs,
                                        Qcow2ReopenState *r)
{
    BDRVQcow2State *s = bs->opaque;
    int i;

    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    s->l2_table_cache = r->l2_table_cache;
    s->refcount_block_cache = r->refcount_block_cache;
    s->l2_slice_size = r->l2_slice_size;

    s->overlap_check = r->overlap_check;
    s->use_lazy_refcounts = r->use_lazy_refcounts;

    for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
        s->discard_passthrough[i] = r->discard_passthrough[i];
    }

    if (s->cache_clean_interval != r->cache_clean_interval) {
        /* Drop the old timer before changing the interval it is based on */
        cache_clean_timer_del(bs);
        s->cache_clean_interval = r->cache_clean_interval;
        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
    }

    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    s->crypto_opts = r->crypto_opts;
}

/*
 * Release everything qcow2_update_options_prepare() allocated in @r.
 *
 * Used when the prepared options are not going to be committed; the state
 * of @bs is not touched.
 */
static void qcow2_update_options_abort(BlockDriverState *bs,
                                       Qcow2ReopenState *r)
{
    if (r->l2_table_cache) {
        qcow2_cache_destroy(r->l2_table_cache);
    }
    if (r->refcount_block_cache) {
        qcow2_cache_destroy(r->refcount_block_cache);
    }
    qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
}

/*
 * Parse the runtime options in @options and apply them to @bs in one step.
 *
 * The new state is prepared first; it is committed when preparation
 * succeeded and discarded otherwise.
 *
 * Returns the result of qcow2_update_options_prepare(): 0 on success, a
 * negative errno value (with @errp set) on failure.
 */
static int qcow2_update_options(BlockDriverState *bs, QDict *options,
                                int flags, Error **errp)
{
    Qcow2ReopenState r = {};
    int ret;

    ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
    if (ret >= 0) {
        qcow2_update_options_commit(bs, &r);
    } else {
        qcow2_update_options_abort(bs, &r);
    }

    return ret;
}

1146 1147 1148
/*
 * Open a qcow2 image: read and validate the header, set up the in-memory
 * driver state of @bs, and load the L1 table, header extensions, backing
 * file name, snapshots and persistent dirty bitmaps.
 *
 * @bs:      block node being opened; bs->file must already be attached
 * @options: driver options, handed to qcow2_update_options()
 * @flags:   BDRV_O_* open flags
 * @errp:    error object set on failure
 *
 * Called with s->lock held.
 *
 * Returns a negative errno value on failure, after releasing the resources
 * acquired so far.  On success the value of the last step is returned.
 */
static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
                                      int flags, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
    bool update_header = false;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
        goto fail;
    }
    /* All header fields are stored big-endian on disk */
    header.magic = be32_to_cpu(header.magic);
    header.version = be32_to_cpu(header.version);
    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
    header.backing_file_size = be32_to_cpu(header.backing_file_size);
    header.size = be64_to_cpu(header.size);
    header.cluster_bits = be32_to_cpu(header.cluster_bits);
    header.crypt_method = be32_to_cpu(header.crypt_method);
    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
    header.l1_size = be32_to_cpu(header.l1_size);
    header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
    header.refcount_table_clusters =
        be32_to_cpu(header.refcount_table_clusters);
    header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
    header.nb_snapshots = be32_to_cpu(header.nb_snapshots);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image is not in qcow2 format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

    /* Initialise cluster size */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
                   header.cluster_bits);
        ret = -EINVAL;
        goto fail;
    }

    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - BDRV_SECTOR_BITS);

    /* Initialise version 3 header fields */
    if (header.version == 2) {
        /* Version 2 headers lack these fields; use fixed values */
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
        header.incompatible_features =
            be64_to_cpu(header.incompatible_features);
        header.compatible_features = be64_to_cpu(header.compatible_features);
        header.autoclear_features = be64_to_cpu(header.autoclear_features);
        header.refcount_order = be32_to_cpu(header.refcount_order);
        header.header_length = be32_to_cpu(header.header_length);

        if (header.header_length < 104) {
            error_setg(errp, "qcow2 header too short");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (header.header_length > s->cluster_size) {
        error_setg(errp, "qcow2 header exceeds cluster size");
        ret = -EINVAL;
        goto fail;
    }

    /* Keep header bytes beyond the fields known to this implementation */
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
                             "fields");
            goto fail;
        }
    }

    if (header.backing_file_offset > s->cluster_size) {
        error_setg(errp, "Invalid backing file offset");
        ret = -EINVAL;
        goto fail;
    }

    /* Header extensions end where the backing file name starts, or at the
     * end of the first cluster when there is no backing file */
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
        /* Read only the feature name table so that the unsupported bits
         * can be reported by name */
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, flags, NULL, NULL);
        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
        g_free(feature_table);
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* Corrupt images may not be written to unless they are being repaired
         */
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
                       "read/write");
            ret = -EACCES;
            goto fail;
        }
    }

    /* Check support for various header values */
    if (header.refcount_order > 6) {
        error_setg(errp, "Reference count entry width too large; may not "
                   "exceed 64 bits");
        ret = -EINVAL;
        goto fail;
    }
    s->refcount_order = header.refcount_order;
    s->refcount_bits = 1 << s->refcount_order;
    /* Computed in two steps so that a 64-bit refcount width does not
     * require shifting by 64 */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow2 images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow2, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
        }

        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            s->crypt_physical_offset = false;
        } else {
            /* Assuming LUKS and any future crypt methods we
             * add will all use physical offsets, due to the
             * fact that the alternative is insecure...  */
            s->crypt_physical_offset = true;
        }

        bs->encrypted = true;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;

    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
        error_setg(errp, "Image does not contain a reference count table");
        ret = -EINVAL;
        goto fail;
    }

    ret = qcow2_validate_table(bs, s->refcount_table_offset,
                               header.refcount_table_clusters,
                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
                               "Reference count table", errp);
    if (ret < 0) {
        goto fail;
    }

    /* The total size in bytes of the snapshot table is checked in
     * qcow2_read_snapshots() because the size of each snapshot is
     * variable and we don't know it yet.
     * Here we only check the offset and number of snapshots. */
    ret = qcow2_validate_table(bs, header.snapshots_offset,
                               header.nb_snapshots,
                               sizeof(QCowSnapshotHeader),
                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
                               "Snapshot table", errp);
    if (ret < 0) {
        goto fail;
    }

    /* read the level 1 table */
    ret = qcow2_validate_table(bs, header.l1_table_offset,
                               header.l1_size, sizeof(uint64_t),
                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
    if (ret < 0) {
        goto fail;
    }
    s->l1_size = header.l1_size;
    s->l1_table_offset = header.l1_table_offset;

    l1_vm_state_index = size_to_l1(s, header.size);
    if (l1_vm_state_index > INT_MAX) {
        error_setg(errp, "Image is too big");
        ret = -EFBIG;
        goto fail;
    }
    s->l1_vm_state_index = l1_vm_state_index;

    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        error_setg(errp, "L1 table is too small");
        ret = -EINVAL;
        goto fail;
    }

    if (s->l1_size > 0) {
        s->l1_table = qemu_try_blockalign(bs->file->bs,
            ROUND_UP(s->l1_size * sizeof(uint64_t), 512));
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
            goto fail;
        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read L1 table");
            goto fail;
        }
        /* L1 entries are stored big-endian on disk */
        for (i = 0; i < s->l1_size; i++) {
            s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
        }
    }

    /* Parse driver-specific options */
    ret = qcow2_update_options(bs, options, flags, errp);
    if (ret < 0) {
        goto fail;
    }

    s->cluster_cache_offset = -1;
    s->flags = flags;

    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
        goto fail;
    }

    QLIST_INIT(&s->cluster_allocs);
    QTAILQ_INIT(&s->discards);

    /* read qcow2 extensions */
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
                              flags, &update_header, &local_err)) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* qcow2_read_extension may have set up the crypto context
     * if the crypt method needs a header region, some methods
     * don't need header extensions, so must check here
     */
    if (s->crypt_method_header && !s->crypto) {
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            unsigned int cflags = 0;
            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
                                           NULL, NULL, cflags, errp);
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else if (!(flags & BDRV_O_NO_IO)) {
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
                       s->crypt_method_header);
            ret = -EINVAL;
            goto fail;
        }
    }

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        /* The name must fit into the first cluster and leave room for the
         * terminating NUL in bs->backing_file */
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
            len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read backing file name");
            goto fail;
        }
        bs->backing_file[len] = '\0';
        s->image_backing_file = g_strdup(bs->backing_file);
    }

    /* Internal snapshots */
    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read snapshots");
        goto fail;
    }

    /* Clear unknown autoclear feature bits */
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
    update_header =
        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
    if (update_header) {
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
    }

    /* == Handle persistent dirty bitmaps ==
     *
     * We want to load dirty bitmaps in three cases:
     *
     * 1. Normal open of the disk in active mode, not related to invalidation
     *    after migration.
     *
     * 2. Invalidation of the target vm after pre-copy phase of migration, if
     *    bitmaps are _not_ migrating through migration channel, i.e.
     *    'dirty-bitmaps' capability is disabled.
     *
     * 3. Invalidation of source vm after failed or canceled migration.
     *    This is a very interesting case. There are two possible types of
     *    bitmaps:
     *
     *    A. Stored on inactivation and removed. They should be loaded from the
     *       image.
     *
     *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
     *       the migration channel (with dirty-bitmaps capability).
     *
     *    On the other hand, there are two possible sub-cases:
     *
     *    3.1 disk was changed by somebody else while we were inactive. In
     *        this case all in-RAM dirty bitmaps (both persistent and not) are
     *        definitely invalid. And we don't have any method to determine
     *        this.
     *
     *        Simple and safe thing is to just drop all the bitmaps of type B on
     *        inactivation. But in this case we lose bitmaps in valid 3.2 case.
     *
     *        On the other hand, resuming source vm, if disk was already changed
     *        is a bad thing anyway: not only bitmaps, the whole vm state is
     *        out of sync with disk.
     *
     *        This means, that user or management tool, who for some reason
     *        decided to resume source vm, after disk was already changed by
     *        target vm, should at least drop all dirty bitmaps by hand.
     *
     *        So, we can ignore this case for now, but TODO: "generation"
     *        extension for qcow2, to determine, that image was changed after
     *        last inactivation. And if it is changed, we will drop (or at least
     *        mark as 'invalid') all the bitmaps of type B, both persistent
     *        and not.
     *
     *    3.2 disk was _not_ changed while we were inactive. Bitmaps may be
     *        saved to disk ('dirty-bitmaps' capability disabled), or not saved
     *        ('dirty-bitmaps' capability enabled), but we don't need to care
     *        of: let's load bitmaps as always: stored bitmaps will be loaded,
     *        and not stored has flag IN_USE=1 in the image and will be skipped
     *        on loading.
     *
     * One remaining possible case when we don't want to load bitmaps:
     *
     * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
     *    will be loaded on invalidation, no need to try loading them before)
     */

    if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
        /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
        bool header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);

        /* No need to write the header again if loading already did so */
        update_header = update_header && !header_updated;
    }
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    if (update_header) {
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
            goto fail;
        }
    }

    bs->supported_zero_flags = header.version >= 3 ? BDRV_REQ_MAY_UNMAP : 0;

    /* Repair image if dirty */
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

        ret = qcow2_co_check_locked(bs, &result,
                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0 || result.check_errors) {
            if (ret >= 0) {
                /* The check ran but reported internal errors */
                ret = -EIO;
            }
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
        }
    }

#ifdef DEBUG_ALLOC
    {
        BdrvCheckResult result = {0};
        qcow2_check_refcounts(bs, &result, 0);
    }
#endif

    qemu_co_queue_init(&s->compress_wait_queue);

    return ret;

 fail:
    /* NOTE(review): s->unknown_header_fields is not reset to NULL after
     * being freed, and s->image_backing_file is not released on this path;
     * presumably the caller discards the driver state after a failed open -
     * confirm. */
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    cache_clean_timer_del(bs);
    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    qcrypto_block_free(s->crypto);
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    return ret;
}

1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
/*
 * Argument/result bundle passed to qcow2_open_entry() so that
 * qcow2_do_open() can be run inside a coroutine.
 */
typedef struct QCow2OpenCo {
    BlockDriverState *bs; /* node being opened */
    QDict *options; /* forwarded unchanged to qcow2_do_open() */
    int flags; /* forwarded unchanged to qcow2_do_open() */
    Error **errp; /* forwarded unchanged to qcow2_do_open() */
    int ret; /* return value of qcow2_do_open(), written by the entry point */
} QCow2OpenCo;

/*
 * Coroutine entry point: calls qcow2_do_open() with the driver lock held and
 * stores its return value in the QCow2OpenCo passed as @opaque.
 */
static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *args = opaque;
    BlockDriverState *bs = args->bs;
    BDRVQcow2State *state = bs->opaque;

    qemu_co_mutex_lock(&state->lock);
    args->ret = qcow2_do_open(bs, args->options, args->flags, args->errp);
    qemu_co_mutex_unlock(&state->lock);
}

1643 1644 1645
/*
 * Opens the "file" child of @bs and then runs qcow2_do_open() in coroutine
 * context (directly if we already are in a coroutine, otherwise in a freshly
 * created one that is polled until it has stored a result).
 *
 * Returns the value produced by qcow2_do_open(), or -EINVAL if the "file"
 * child could not be opened (in which case @errp has been set by
 * bdrv_open_child()).
 */
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2OpenCo qoc = {
        .bs = bs,
        .options = options,
        .flags = flags,
        .errp = errp,
        /* Sentinel that the polling loop below waits to see replaced */
        .ret = -EINPROGRESS
    };

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

    if (qemu_in_coroutine()) {
        /* From bdrv_co_create.  */
        qcow2_open_entry(&qoc);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
    }
    return qoc.ret;
}

1674
/*
 * Fills in the block limits of @bs: request alignment follows the crypto
 * sector size for encrypted images, and write-zeroes/discard alignment is
 * one cluster. @errp is not used.
 */
static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;

    if (bs->encrypted) {
        /* Encryption works on a sector granularity */
        bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
    }
    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
    bs->bl.pdiscard_alignment = s->cluster_size;
}

J
Jeff Cody 已提交
1686 1687 1688
/*
 * Prepare stage of a reopen: allocates the Qcow2ReopenState, validates the
 * new options and, when the node is about to become read-only, writes out
 * everything that is still pending (bitmaps, caches, dirty flag).
 *
 * Returns 0 on success (state->opaque then owns the Qcow2ReopenState) or a
 * negative errno on failure, in which case the state has been released and
 * state->opaque is NULL.
 */
static int qcow2_reopen_prepare(BDRVReopenState *state,
                                BlockReopenQueue *queue, Error **errp)
{
    Qcow2ReopenState *r;
    int ret;

    r = g_new0(Qcow2ReopenState, 1);
    state->opaque = r;

    ret = qcow2_update_options_prepare(state->bs, r, state->options,
                                       state->flags, errp);
    if (ret < 0) {
        goto fail;
    }

    /* We need to write out any unwritten data if we reopen read-only. */
    if ((state->flags & BDRV_O_RDWR) == 0) {
        ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
        if (ret < 0) {
            goto fail;
        }

        ret = bdrv_flush(state->bs);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_mark_clean(state->bs);
        if (ret < 0) {
            goto fail;
        }
    }

    return 0;

fail:
    qcow2_update_options_abort(state->bs, r);
    g_free(r);
    /* Do not leave a dangling pointer behind for later commit/abort calls */
    state->opaque = NULL;
    return ret;
}

/*
 * Commit stage of a reopen: applies the prepared options and releases the
 * per-reopen state stored in state->opaque.
 */
static void qcow2_reopen_commit(BDRVReopenState *state)
{
    Qcow2ReopenState *prepared = state->opaque;

    qcow2_update_options_commit(state->bs, prepared);
    g_free(prepared);
}

/*
 * Abort stage of a reopen: discards the prepared options and releases the
 * per-reopen state stored in state->opaque.
 */
static void qcow2_reopen_abort(BDRVReopenState *state)
{
    qcow2_update_options_abort(state->bs, state->opaque);
    g_free(state->opaque);
}

1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784
/*
 * Merges @old_options into @options (the newly requested options), taking
 * care of options that override each other:
 *
 *  - a new overlap-check option or template discards every old overlap option;
 *  - a new total cache size discards the old L2/refcount cache sizes;
 *  - if, after merging, all three cache size options are present and the
 *    total cache size is not one of the new options, the (old) total cache
 *    size is dropped so that the new individual sizes take effect.
 *
 * Entries are removed from @old_options as a side effect.
 */
static void qcow2_join_options(QDict *options, QDict *old_options)
{
    bool has_new_overlap_template =
        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
    bool has_new_total_cache_size =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
    bool has_all_cache_options;

    /* New overlap template overrides all old overlap options */
    if (has_new_overlap_template) {
        qdict_del(old_options, QCOW2_OPT_OVERLAP);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
    }

    /* New total cache size overrides all old options */
    if (has_new_total_cache_size) {
        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    }

    qdict_join(options, old_options, false);

    /*
     * If after merging all cache size options are set, an old total size is
     * overwritten. Do keep all options, however, if all three are new. The
     * resulting error message is what we want to happen.
     *
     * This must be a conjunction: with a disjunction an inherited total
     * cache size would be dropped as soon as any single cache option is
     * present, including the inherited total size itself.
     */
    has_all_cache_options =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    if (has_all_cache_options && !has_new_total_cache_size) {
        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
    }
}

1785 1786 1787 1788 1789
/*
 * Reports the allocation status of the range starting at @offset.
 *
 * At most INT_MAX bytes of @count are examined; *pnum receives the number of
 * bytes (as returned by qcow2_get_cluster_offset()) that share the reported
 * status. When the cluster has a host offset, is not compressed and the
 * image has no crypto layer, *map and *file are set and
 * BDRV_BLOCK_OFFSET_VALID is included in the result.
 *
 * @want_zero is not used by this implementation.
 *
 * Returns a BDRV_BLOCK_* bit mask on success, or a negative errno if the
 * cluster lookup failed.
 */
static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
                                              bool want_zero,
                                              int64_t offset, int64_t count,
                                              int64_t *pnum, int64_t *map,
                                              BlockDriverState **file)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
    int index_in_cluster, ret;
    unsigned int bytes;
    int status = 0;

    bytes = MIN(INT_MAX, count);
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

    *pnum = bytes;

    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
        !s->crypto) {
        index_in_cluster = offset & (s->cluster_size - 1);
        *map = cluster_offset | index_in_cluster;
        *file = bs->file->bs;
        status |= BDRV_BLOCK_OFFSET_VALID;
    }
    if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
        status |= BDRV_BLOCK_ZERO;
    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
        status |= BDRV_BLOCK_DATA;
    }
    return status;
}

F
Fam Zheng 已提交
1822 1823 1824 1825 1826 1827 1828 1829 1830 1831
/*
 * Walks the list of allocation requests in *pl2meta and finishes each one:
 * with @link_l2 set the new clusters are linked into the L2 tables,
 * otherwise the allocation is aborted. Every processed entry is removed from
 * the in-flight list (if it covers any clusters), its dependent requests are
 * restarted, and the entry is freed.
 *
 * On return *pl2meta points at the first entry that was not processed (NULL
 * when the whole list was handled).
 *
 * Returns 0 on success, or the non-zero value returned by
 * qcow2_alloc_cluster_link_l2() for the entry that failed; that entry and
 * its successors are left in *pl2meta.
 */
static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
                                            QCowL2Meta **pl2meta,
                                            bool link_l2)
{
    int ret = 0;
    QCowL2Meta *l2meta = *pl2meta;

    while (l2meta != NULL) {
        QCowL2Meta *next;

        if (link_l2) {
            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
            if (ret) {
                goto out;
            }
        } else {
            qcow2_alloc_cluster_abort(bs, l2meta);
        }

        /* Take the request off the list of running requests */
        if (l2meta->nb_clusters != 0) {
            QLIST_REMOVE(l2meta, next_in_flight);
        }

        qemu_co_queue_restart_all(&l2meta->dependent_requests);

        next = l2meta->next;
        g_free(l2meta);
        l2meta = next;
    }
out:
    *pl2meta = l2meta;
    return ret;
}

K
Kevin Wolf 已提交
1857 1858 1859
/*
 * Reads @bytes bytes starting at guest offset @offset into @qiov.
 *
 * The request is processed in chunks of the size reported by
 * qcow2_get_cluster_offset(). Depending on the cluster type the data comes
 * from the backing file (or zeroes if there is none), is zero-filled, is
 * copied out of the decompressed cluster cache, or is read from bs->file
 * (and decrypted through a bounce buffer for encrypted images).
 *
 * s->lock is held throughout, except while the actual reads from the backing
 * file or from bs->file are in flight.
 *
 * @flags is not used by this implementation.
 *
 * Returns 0 on success, -errno on failure.
 */
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                        uint64_t bytes, QEMUIOVector *qiov,
                                        int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    QEMUIOVector hd_qiov;
    uint8_t *cluster_data = NULL;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        if (s->crypto) {
            /* The bounce buffer used for decryption has a fixed size */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

        offset_in_cluster = offset_into_cluster(s, offset);

        /* hd_qiov describes the part of qiov served by this iteration */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:

            if (bs->backing) {
                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
                                     &hd_qiov, 0);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, cluster_offset);
            if (ret < 0) {
                goto fail;
            }

            qemu_iovec_from_buf(&hd_qiov, 0,
                                s->cluster_cache + offset_in_cluster,
                                cur_bytes);
            break;

        case QCOW2_CLUSTER_NORMAL:
            if ((cluster_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (bs->encrypted) {
                assert(s->crypto);

                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!cluster_data) {
                    cluster_data =
                        qemu_try_blockalign(bs->file->bs,
                                            QCOW_MAX_CRYPT_CLUSTERS
                                            * s->cluster_size);
                    if (cluster_data == NULL) {
                        ret = -ENOMEM;
                        goto fail;
                    }
                }

                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                qemu_iovec_reset(&hd_qiov);
                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->file,
                                 cluster_offset + offset_in_cluster,
                                 cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (bs->encrypted) {
                assert(s->crypto);
                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                if (qcrypto_block_decrypt(s->crypto,
                                          (s->crypt_physical_offset ?
                                           cluster_offset + offset_in_cluster :
                                           offset),
                                          cluster_data,
                                          cur_bytes,
                                          NULL) < 0) {
                    ret = -EIO;
                    goto fail;
                }
                /* Copy the decrypted data from the bounce buffer to qiov */
                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);

    return ret;
}

2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042
/*
 * Looks for an allocation request in @l2meta whose COW regions directly
 * surround the guest write [@offset, @offset + @bytes). If one is found (and
 * the vector still has room for the two COW regions), @hd_qiov is attached
 * to it as data_qiov and true is returned; otherwise nothing is modified
 * and false is returned.
 */
static bool merge_cow(uint64_t offset, unsigned bytes,
                      QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
{
    QCowL2Meta *cur = l2meta;

    while (cur != NULL) {
        /* With both COW regions empty there is nothing to merge */
        bool has_cow = cur->cow_start.nb_bytes != 0 ||
                       cur->cow_end.nb_bytes != 0;

        if (has_cow &&
            /* data (middle) region starts right after the start region */
            l2meta_cow_start(cur) + cur->cow_start.nb_bytes == offset &&
            /* end region starts right after the data (middle) region */
            cur->offset + cur->cow_end.offset == offset + bytes &&
            /* adding both COW regions must not exceed IOV_MAX */
            hd_qiov->niov <= IOV_MAX - 2) {
            cur->data_qiov = hd_qiov;
            return true;
        }

        cur = cur->next;
    }

    return false;
}

K
Kevin Wolf 已提交
2043 2044 2045
/*
 * Writes @bytes bytes from @qiov to guest offset @offset.
 *
 * The request is processed in chunks: for each chunk clusters are allocated,
 * the data is encrypted into a bounce buffer if the image is encrypted, the
 * target range is checked against metadata overlaps, and the data is written
 * to bs->file unless merge_cow() attached it to an allocation request.
 * Afterwards the allocation requests are completed with
 * qcow2_handle_l2meta().
 *
 * s->lock is held throughout, except while the data write to bs->file is in
 * flight.
 *
 * @flags is not used by this implementation.
 *
 * Returns 0 on success, -errno on failure. On failure any allocation
 * requests that were not completed are aborted.
 */
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL;
    QCowL2Meta *l2meta = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            /* The bounce buffer used for encryption has a fixed size */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);

        /* hd_qiov describes the part of qiov written by this iteration */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        if (bs->encrypted) {
            assert(s->crypto);
            if (!cluster_data) {
                cluster_data = qemu_try_blockalign(bs->file->bs,
                                                   QCOW_MAX_CRYPT_CLUSTERS
                                                   * s->cluster_size);
                if (cluster_data == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }

            assert(hd_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

            if (qcrypto_block_encrypt(s->crypto,
                                      (s->crypt_physical_offset ?
                                       cluster_offset + offset_in_cluster :
                                       offset),
                                      cluster_data,
                                      cur_bytes, NULL) < 0) {
                ret = -EIO;
                goto fail;
            }

            /* From here on write the encrypted copy, not the guest buffer */
            qemu_iovec_reset(&hd_qiov);
            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
                cluster_offset + offset_in_cluster, cur_bytes);
        if (ret < 0) {
            goto fail;
        }

        /* If we need to do COW, check if it's possible to merge the
         * writing of the guest data together with that of the COW regions.
         * If it's not possible (or not necessary) then write the
         * guest data now. */
        if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
            qemu_co_mutex_unlock(&s->lock);
            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
            trace_qcow2_writev_data(qemu_coroutine_self(),
                                    cluster_offset + offset_in_cluster);
            ret = bdrv_co_pwritev(bs->file,
                                  cluster_offset + offset_in_cluster,
                                  cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
        }

        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    ret = 0;

fail:
    /* Abort whatever allocation requests are still pending (none on success) */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

K
Kevin Wolf 已提交
2167 2168 2169 2170
/*
 * Writes out all state that is only held in memory: persistent dirty
 * bitmaps, the L2 table cache and the refcount block cache. Every step is
 * attempted even if an earlier one failed; failures are reported with
 * error_report(). The image is marked clean only if all steps succeeded.
 *
 * Returns 0 on success, otherwise the error of the last step that failed
 * (-EINVAL for the bitmap step). The return value of qcow2_mark_clean() is
 * not taken into account.
 */
static int qcow2_inactivate(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, result = 0;
    Error *local_err = NULL;

    qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
    if (local_err != NULL) {
        result = -EINVAL;
        error_reportf_err(local_err, "Lost persistent bitmaps during "
                          "inactivation of node '%s': ",
                          bdrv_get_device_or_node_name(bs));
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret) {
        result = ret;
        error_report("Failed to flush the L2 table cache: %s",
                     strerror(-ret));
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret) {
        result = ret;
        error_report("Failed to flush the refcount block cache: %s",
                     strerror(-ret));
    }

    if (result == 0) {
        qcow2_mark_clean(bs);
    }

    return result;
}

2202
/*
 * Releases everything the driver state of @bs owns. Unless the image is
 * inactive, pending metadata is written out first via qcow2_inactivate()
 * (whose result is ignored here).
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    cache_clean_timer_del(bs);
    qcow2_cache_destroy(s->l2_table_cache);
    qcow2_cache_destroy(s->refcount_block_cache);

    qcrypto_block_free(s->crypto);
    s->crypto = NULL;

    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}

2232 2233
/*
 * Drops all cached state of @bs and re-reads it from the image by closing
 * the qcow2 layer and opening it again with BDRV_O_INACTIVE cleared. The
 * crypto context is detached before the close and re-attached after a
 * successful reopen, so it survives the operation.
 *
 * On failure @errp is set, bs->drv is cleared and the saved crypto context
 * is released, since nothing references it any more.
 */
static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
                                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int flags = s->flags;
    QCryptoBlock *crypto = NULL;
    QDict *options;
    Error *local_err = NULL;
    int ret;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    /* Detach the crypto context so that qcow2_close() does not free it */
    crypto = s->crypto;
    s->crypto = NULL;

    qcow2_close(bs);

    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

    flags &= ~BDRV_O_INACTIVE;
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_do_open(bs, options, flags, &local_err);
    qemu_co_mutex_unlock(&s->lock);
    qobject_unref(options);
    if (local_err) {
        error_propagate_prepend(errp, local_err,
                                "Could not reopen qcow2 layer: ");
        bs->drv = NULL;
        qcrypto_block_free(crypto);
        return;
    } else if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
        bs->drv = NULL;
        qcrypto_block_free(crypto);
        return;
    }

    s->crypto = crypto;
}

K
Kevin Wolf 已提交
2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287
/*
 * Serialises one header extension at @buf: a QCowExtension record holding
 * @magic and @len in big-endian byte order, followed by @len bytes of
 * payload copied from @s. The space reserved for the payload is @len rounded
 * up to a multiple of 8; the padding bytes are not written here.
 *
 * @buflen is the space available at @buf.
 *
 * Returns the number of bytes the extension occupies. If @buflen is too
 * small, -ENOSPC is returned through the size_t return type; callers store
 * the result in an int and test it for < 0.
 */
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
    size_t len, size_t buflen)
{
    QCowExtension *ext = (QCowExtension*) buf;
    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);

    if (buflen < ext_len) {
        return -ENOSPC;
    }

    *ext = (QCowExtension) {
        .magic  = cpu_to_be32(magic),
        .len    = cpu_to_be32(len),
    };

    if (len) {
        memcpy(buf + sizeof(QCowExtension), s, len);
    }

    return ext_len;
}

K
Kevin Wolf 已提交
2296
/*
K
Kevin Wolf 已提交
2297 2298 2299 2300
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
K
Kevin Wolf 已提交
2301 2302 2303
 *
 * Returns 0 on success, -errno in error cases.
 */
K
Kevin Wolf 已提交
2304
int qcow2_update_header(BlockDriverState *bs)
K
Kevin Wolf 已提交
2305
{
2306
    BDRVQcow2State *s = bs->opaque;
K
Kevin Wolf 已提交
2307 2308 2309
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
K
Kevin Wolf 已提交
2310
    int ret;
K
Kevin Wolf 已提交
2311 2312
    uint64_t total_size;
    uint32_t refcount_table_clusters;
K
Kevin Wolf 已提交
2313
    size_t header_length;
2314
    Qcow2UnknownHeaderExtension *uext;
K
Kevin Wolf 已提交
2315

K
Kevin Wolf 已提交
2316
    buf = qemu_blockalign(bs, buflen);
K
Kevin Wolf 已提交
2317

K
Kevin Wolf 已提交
2318 2319
    /* Header structure */
    header = (QCowHeader*) buf;
K
Kevin Wolf 已提交
2320

K
Kevin Wolf 已提交
2321 2322 2323
    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
K
Kevin Wolf 已提交
2324 2325
    }

K
Kevin Wolf 已提交
2326
    header_length = sizeof(*header) + s->unknown_header_fields_size;
K
Kevin Wolf 已提交
2327 2328 2329 2330
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
K
Kevin Wolf 已提交
2331
        /* Version 2 fields */
K
Kevin Wolf 已提交
2332
        .magic                  = cpu_to_be32(QCOW_MAGIC),
K
Kevin Wolf 已提交
2333
        .version                = cpu_to_be32(s->qcow_version),
K
Kevin Wolf 已提交
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
K
Kevin Wolf 已提交
2345 2346 2347 2348 2349

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
2350
        .refcount_order         = cpu_to_be32(s->refcount_order),
K
Kevin Wolf 已提交
2351
        .header_length          = cpu_to_be32(header_length),
K
Kevin Wolf 已提交
2352
    };
K
Kevin Wolf 已提交
2353

K
Kevin Wolf 已提交
2354 2355 2356 2357 2358 2359 2360 2361 2362
    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
2363 2364
        ret = -EINVAL;
        goto fail;
K
Kevin Wolf 已提交
2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381
    }

    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }
K
Kevin Wolf 已提交
2382

K
Kevin Wolf 已提交
2383
    /* Backing file format header extension */
2384
    if (s->image_backing_format) {
K
Kevin Wolf 已提交
2385
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
2386 2387
                             s->image_backing_format,
                             strlen(s->image_backing_format),
K
Kevin Wolf 已提交
2388 2389 2390
                             buflen);
        if (ret < 0) {
            goto fail;
K
Kevin Wolf 已提交
2391 2392
        }

K
Kevin Wolf 已提交
2393 2394
        buf += ret;
        buflen -= ret;
K
Kevin Wolf 已提交
2395 2396
    }

2397 2398
    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
2399 2400
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
2401 2402 2403
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
2404 2405
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
2406 2407 2408 2409 2410 2411 2412
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2413
    /* Feature table */
2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
2440 2441
    }

2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460
    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471
    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

K
Kevin Wolf 已提交
2472 2473
    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
K
Kevin Wolf 已提交
2474 2475 2476 2477
    if (ret < 0) {
        goto fail;
    }

K
Kevin Wolf 已提交
2478 2479
    buf += ret;
    buflen -= ret;
K
Kevin Wolf 已提交
2480

K
Kevin Wolf 已提交
2481
    /* Backing file name */
2482 2483
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);
K
Kevin Wolf 已提交
2484 2485 2486 2487 2488 2489

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

2490
        /* Using strncpy is ok here, since buf is not NUL-terminated. */
2491
        strncpy(buf, s->image_backing_file, buflen);
K
Kevin Wolf 已提交
2492 2493 2494

        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
K
Kevin Wolf 已提交
2495 2496
    }

K
Kevin Wolf 已提交
2497
    /* Write the new header */
2498
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
K
Kevin Wolf 已提交
2499 2500 2501 2502 2503 2504
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
K
Kevin Wolf 已提交
2505
    qemu_vfree(header);
K
Kevin Wolf 已提交
2506 2507 2508 2509 2510 2511
    return ret;
}

/*
 * Stores @backing_file and @backing_fmt as the new backing file name and
 * format of @bs, mirrors them into the qcow2 driver state and rewrites the
 * image header.
 *
 * A NULL @backing_file or @backing_fmt is stored as an empty string in @bs
 * and as NULL in the driver state.
 *
 * Returns: -EINVAL if @backing_file is longer than 1023 bytes, otherwise the
 * return value of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BDRVQcow2State *state = bs->opaque;
    char *file_copy = NULL;
    char *format_copy = NULL;

    if (backing_file != NULL && strlen(backing_file) > 1023) {
        return -EINVAL;
    }

    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
            backing_file ? backing_file : "");
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_fmt ? backing_fmt : "");

    /* The driver state keeps its own copies of what was stored in @bs */
    if (backing_file != NULL) {
        file_copy = g_strdup(bs->backing_file);
    }
    if (backing_fmt != NULL) {
        format_copy = g_strdup(bs->backing_format);
    }

    g_free(state->image_backing_file);
    g_free(state->image_backing_format);
    state->image_backing_file = file_copy;
    state->image_backing_format = format_copy;

    return qcow2_update_header(bs);
}

2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
/*
 * Translates the encryption format name @encryptfmt ("luks" or "aes") into
 * the matching QCOW_CRYPT_* constant.
 *
 * Returns: the constant, or -EINVAL for any other name.
 */
static int qcow2_crypt_method_from_format(const char *encryptfmt)
{
    if (g_str_equal(encryptfmt, "luks")) {
        return QCOW_CRYPT_LUKS;
    }

    if (g_str_equal(encryptfmt, "aes")) {
        return QCOW_CRYPT_AES;
    }

    return -EINVAL;
}
2540

2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558
/*
 * Records the encryption method selected by @cryptoopts in the qcow2 state
 * of @bs, creates the crypto block (using the qcow2 crypto header callbacks)
 * and rewrites the image header.
 *
 * Returns: 0 on success; -EINVAL if the crypto format is not LUKS or QCOW or
 * if the crypto block could not be created; the negative return value of
 * qcow2_update_header() if the header could not be written.
 */
static int qcow2_set_up_encryption(BlockDriverState *bs,
                                   QCryptoBlockCreateOptions *cryptoopts,
                                   Error **errp)
{
    BDRVQcow2State *state = bs->opaque;
    QCryptoBlock *block;
    int method;
    int rc;

    if (cryptoopts->format == Q_CRYPTO_BLOCK_FORMAT_LUKS) {
        method = QCOW_CRYPT_LUKS;
    } else if (cryptoopts->format == Q_CRYPTO_BLOCK_FORMAT_QCOW) {
        method = QCOW_CRYPT_AES;
    } else {
        error_setg(errp, "Crypto format not supported in qcow2");
        return -EINVAL;
    }

    state->crypt_method_header = method;

    block = qcrypto_block_create(cryptoopts, "encrypt.",
                                 qcow2_crypto_hdr_init_func,
                                 qcow2_crypto_hdr_write_func,
                                 bs, errp);
    if (block == NULL) {
        return -EINVAL;
    }

    rc = qcow2_update_header(bs);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "Could not write encryption header");
    } else {
        rc = 0;
    }

    /* The crypto block is released on both the success and the error path */
    qcrypto_block_free(block);
    return rc;
}

2583 2584 2585 2586 2587 2588 2589
/**
 * Preallocates metadata structures for data clusters between @offset (in the
 * guest disk) and @new_length (which is thus generally the new guest disk
 * size).
 *
 * Returns: 0 on success, -errno on failure.
 */
2590 2591
static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
                                       uint64_t new_length)
K
Kevin Wolf 已提交
2592
{
K
Kevin Wolf 已提交
2593
    uint64_t bytes;
K
Kevin Wolf 已提交
2594
    uint64_t host_offset = 0;
K
Kevin Wolf 已提交
2595
    unsigned int cur_bytes;
2596
    int ret;
2597
    QCowL2Meta *meta;
K
Kevin Wolf 已提交
2598

2599 2600
    assert(offset <= new_length);
    bytes = new_length - offset;
K
Kevin Wolf 已提交
2601

K
Kevin Wolf 已提交
2602 2603 2604
    while (bytes) {
        cur_bytes = MIN(bytes, INT_MAX);
        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
K
Kevin Wolf 已提交
2605
                                         &host_offset, &meta);
2606
        if (ret < 0) {
2607
            return ret;
K
Kevin Wolf 已提交
2608 2609
        }

2610 2611 2612
        while (meta) {
            QCowL2Meta *next = meta->next;

H
Hu Tao 已提交
2613 2614 2615 2616
            ret = qcow2_alloc_cluster_link_l2(bs, meta);
            if (ret < 0) {
                qcow2_free_any_clusters(bs, meta->alloc_offset,
                                        meta->nb_clusters, QCOW2_DISCARD_NEVER);
2617
                return ret;
H
Hu Tao 已提交
2618 2619 2620 2621
            }

            /* There are no dependent requests, but we need to remove our
             * request from the list of in-flight requests */
2622
            QLIST_REMOVE(meta, next_in_flight);
2623 2624 2625

            g_free(meta);
            meta = next;
2626
        }
2627

K
Kevin Wolf 已提交
2628 2629
        /* TODO Preallocate data if requested */

K
Kevin Wolf 已提交
2630 2631
        bytes -= cur_bytes;
        offset += cur_bytes;
K
Kevin Wolf 已提交
2632 2633 2634 2635 2636 2637 2638
    }

    /*
     * It is expected that the image file is large enough to actually contain
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
K
Kevin Wolf 已提交
2639
    if (host_offset != 0) {
K
Kevin Wolf 已提交
2640
        uint8_t data = 0;
2641
        ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
K
Kevin Wolf 已提交
2642
                          &data, 1);
2643
        if (ret < 0) {
2644
            return ret;
2645
        }
K
Kevin Wolf 已提交
2646 2647
    }

2648
    return 0;
K
Kevin Wolf 已提交
2649 2650
}

2651 2652 2653 2654
/* qcow2_refcount_metadata_size:
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
 * @cluster_size: size of a cluster, in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
 *                     needs to be
 * @refblock_count: if non-NULL, receives the number of refcount block
 *                  clusters that were computed
 *
 * Returns: Number of bytes required for refcount blocks and table metadata.
 */
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count)
{
    /*
     * All host clusters carry a refcount, and that includes the clusters
     * holding the refcount table and the refcount blocks themselves.  Rather
     * than deriving a closed formula, iterate until adding the refcount
     * metadata no longer changes the total number of clusters.
     */
    const int64_t table_entries_per_cluster = cluster_size / sizeof(uint64_t);
    const int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
    int64_t table_clusters = 0; /* clusters used by the refcount table */
    int64_t block_clusters = 0; /* clusters used by refcount blocks */
    int64_t total = 0;

    for (;;) {
        const int64_t previous_total = total;

        block_clusters = DIV_ROUND_UP(clusters + table_clusters + block_clusters,
                                      refcounts_per_block);
        table_clusters = DIV_ROUND_UP(block_clusters,
                                      table_entries_per_cluster);
        total = clusters + block_clusters + table_clusters;

        if (total == previous_total && generous_increase) {
            /* Account for half a table's worth of extra clusters, once */
            clusters += DIV_ROUND_UP(table_clusters, 2);
            generous_increase = false;
            total = 0; /* request another pass */
        }

        if (total == previous_total) {
            break;
        }
    }

    if (refblock_count != NULL) {
        *refblock_count = block_clusters;
    }

    return (block_clusters + table_clusters) * cluster_size;
}

2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * The result is the cluster-aligned virtual disk size plus one header
 * cluster, the L2 and L1 tables and the refcount metadata.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    const int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);
    uint64_t l2_entries;
    uint64_t l1_entries;
    int64_t meta_size;

    /* The header occupies one cluster */
    meta_size = cluster_size;

    /* L2 tables: one entry per data cluster, in whole tables */
    l2_entries = aligned_total_size / cluster_size;
    l2_entries = ROUND_UP(l2_entries, cluster_size / sizeof(uint64_t));
    meta_size += l2_entries * sizeof(uint64_t);

    /* L1 table: one entry per L2 table, rounded the same way */
    l1_entries = l2_entries * sizeof(uint64_t) / cluster_size;
    l1_entries = ROUND_UP(l1_entries, cluster_size / sizeof(uint64_t));
    meta_size += l1_entries * sizeof(uint64_t);

    /* Refcount table and blocks covering everything counted so far */
    meta_size += qcow2_refcount_metadata_size(
            (meta_size + aligned_total_size) / cluster_size,
            cluster_size, refcount_order, false, NULL);

    return aligned_total_size + meta_size;
}

2738
/*
 * Checks that @cluster_size is a power of two whose exponent lies between
 * MIN_CLUSTER_BITS and MAX_CLUSTER_BITS.
 *
 * Returns: true if the size is acceptable; otherwise sets @errp and returns
 * false.
 */
static bool validate_cluster_size(size_t cluster_size, Error **errp)
{
    const int bits = ctz32(cluster_size);
    const bool bits_in_range = bits >= MIN_CLUSTER_BITS &&
                               bits <= MAX_CLUSTER_BITS;

    /* The shift is only evaluated once the exponent is known to be sane */
    if (bits_in_range && (1 << bits) == cluster_size) {
        return true;
    }

    error_setg(errp, "Cluster size must be a power of two between %d and "
               "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
    return false;
}

/*
 * Fetches the cluster size option from @opts via qemu_opt_get_size_del()
 * (falling back to DEFAULT_CLUSTER_SIZE) and validates it.
 *
 * Returns: the cluster size, or 0 with @errp set if validation fails.
 */
static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
{
    size_t size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
                                        DEFAULT_CLUSTER_SIZE);

    return validate_cluster_size(size, errp) ? size : 0;
}

/*
 * Fetches the compatibility level option from @opts via qemu_opt_get_del()
 * and maps it to a qcow2 version number: "0.10" is version 2, "1.1" (also the
 * default when the option is absent) is version 3.
 *
 * Returns: the version, or -EINVAL with @errp set for any other value.
 */
static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
{
    char *compat = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
    int version;

    if (compat == NULL || strcmp(compat, "1.1") == 0) {
        version = 3;
    } else if (strcmp(compat, "0.10") == 0) {
        version = 2;
    } else {
        error_setg(errp, "Invalid compatibility level: '%s'", compat);
        version = -EINVAL;
    }

    g_free(compat);
    return version;
}

/*
 * Fetches the refcount width option from @opts via qemu_opt_get_number_del()
 * (default: 16) and validates it against @version.
 *
 * Returns: the refcount width in bits, or 0 with @errp set if the width is
 * not a power of two, exceeds 64, or differs from 16 while @version is below
 * 3.
 */
static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
                                                Error **errp)
{
    uint64_t bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);

    if (!is_power_of_2(bits) || bits > 64) {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
        return 0;
    }

    if (bits != 16 && version < 3) {
        error_setg(errp, "Different refcount widths than 16 bits require "
                   "compatibility level 1.1 or above (use compat=1.1 or "
                   "greater)");
        return 0;
    }

    return bits;
}

2805
static int coroutine_fn
2806
qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
2807
{
2808
    BlockdevCreateOptionsQcow2 *qcow2_opts;
2809 2810
    QDict *options;

K
Kevin Wolf 已提交
2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
    /*
     * Open the image file and write a minimal qcow2 header.
     *
     * We keep things simple and start with a zero-sized image. We also
     * do without refcount blocks or a L1 table for now. We'll fix the
     * inconsistency later.
     *
     * We do need a refcount table because growing the refcount table means
     * allocating two new refcount blocks - the seconds of which would be at
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
2823 2824
    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;
2825
    QCowHeader *header;
2826 2827 2828
    size_t cluster_size;
    int version;
    int refcount_order;
2829
    uint64_t* refcount_table;
M
Max Reitz 已提交
2830
    Error *local_err = NULL;
K
Kevin Wolf 已提交
2831 2832
    int ret;

2833 2834 2835
    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
    qcow2_opts = &create_options->u.qcow2;

2836 2837 2838 2839 2840 2841
    bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
    if (bs == NULL) {
        return -EIO;
    }

    /* Validate options and set default values */
2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869
    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of 512 bytes");
        ret = -EINVAL;
        goto out;
    }

    if (qcow2_opts->has_version) {
        switch (qcow2_opts->version) {
        case BLOCKDEV_QCOW2_VERSION_V2:
            version = 2;
            break;
        case BLOCKDEV_QCOW2_VERSION_V3:
            version = 3;
            break;
        default:
            g_assert_not_reached();
        }
    } else {
        version = 3;
    }

    if (qcow2_opts->has_cluster_size) {
        cluster_size = qcow2_opts->cluster_size;
    } else {
        cluster_size = DEFAULT_CLUSTER_SIZE;
    }

    if (!validate_cluster_size(cluster_size, errp)) {
2870 2871
        ret = -EINVAL;
        goto out;
2872 2873 2874 2875 2876 2877 2878 2879 2880 2881
    }

    if (!qcow2_opts->has_preallocation) {
        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
    }
    if (qcow2_opts->has_backing_file &&
        qcow2_opts->preallocation != PREALLOC_MODE_OFF)
    {
        error_setg(errp, "Backing file and preallocation cannot be used at "
                   "the same time");
2882 2883
        ret = -EINVAL;
        goto out;
2884 2885 2886
    }
    if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) {
        error_setg(errp, "Backing format cannot be used without backing file");
2887 2888
        ret = -EINVAL;
        goto out;
2889 2890 2891 2892 2893 2894 2895
    }

    if (!qcow2_opts->has_lazy_refcounts) {
        qcow2_opts->lazy_refcounts = false;
    }
    if (version < 3 && qcow2_opts->lazy_refcounts) {
        error_setg(errp, "Lazy refcounts only supported with compatibility "
2896
                   "level 1.1 and above (use version=v3 or greater)");
2897 2898
        ret = -EINVAL;
        goto out;
2899 2900 2901 2902 2903 2904 2905 2906 2907 2908
    }

    if (!qcow2_opts->has_refcount_bits) {
        qcow2_opts->refcount_bits = 16;
    }
    if (qcow2_opts->refcount_bits > 64 ||
        !is_power_of_2(qcow2_opts->refcount_bits))
    {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
2909 2910
        ret = -EINVAL;
        goto out;
2911 2912 2913
    }
    if (version < 3 && qcow2_opts->refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
2914
                   "compatibility level 1.1 or above (use version=v3 or "
2915
                   "greater)");
2916 2917
        ret = -EINVAL;
        goto out;
2918 2919 2920 2921 2922
    }
    refcount_order = ctz32(qcow2_opts->refcount_bits);


    /* Create BlockBackend to write to the image */
2923 2924
    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
    ret = blk_insert_bs(blk, bs, errp);
K
Kevin Wolf 已提交
2925
    if (ret < 0) {
2926
        goto out;
K
Kevin Wolf 已提交
2927
    }
2928 2929
    blk_set_allow_write_beyond_eof(blk, true);

2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948
    /* Clear the protocol layer and preallocate it if necessary */
    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
    if (ret < 0) {
        goto out;
    }

    if (qcow2_opts->preallocation == PREALLOC_MODE_FULL ||
        qcow2_opts->preallocation == PREALLOC_MODE_FALLOC)
    {
        int64_t prealloc_size =
            qcow2_calc_prealloc_size(qcow2_opts->size, cluster_size,
                                     refcount_order);

        ret = blk_truncate(blk, prealloc_size, qcow2_opts->preallocation, errp);
        if (ret < 0) {
            goto out;
        }
    }

K
Kevin Wolf 已提交
2949
    /* Write the header */
2950 2951 2952 2953 2954
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    header = g_malloc0(cluster_size);
    *header = (QCowHeader) {
        .magic                      = cpu_to_be32(QCOW_MAGIC),
        .version                    = cpu_to_be32(version),
2955
        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
2956 2957 2958 2959 2960
        .size                       = cpu_to_be64(0),
        .l1_table_offset            = cpu_to_be64(0),
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
2961
        .refcount_order             = cpu_to_be32(refcount_order),
2962 2963
        .header_length              = cpu_to_be32(sizeof(*header)),
    };
K
Kevin Wolf 已提交
2964

2965 2966
    /* We'll update this to correct value later */
    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
K
Kevin Wolf 已提交
2967

2968
    if (qcow2_opts->lazy_refcounts) {
2969
        header->compatible_features |=
2970 2971 2972
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }

2973
    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
2974
    g_free(header);
K
Kevin Wolf 已提交
2975
    if (ret < 0) {
M
Max Reitz 已提交
2976
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
K
Kevin Wolf 已提交
2977 2978 2979
        goto out;
    }

2980 2981 2982
    /* Write a refcount table with one refcount block */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
2983
    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
2984
    g_free(refcount_table);
K
Kevin Wolf 已提交
2985 2986

    if (ret < 0) {
M
Max Reitz 已提交
2987
        error_setg_errno(errp, -ret, "Could not write refcount table");
K
Kevin Wolf 已提交
2988 2989 2990
        goto out;
    }

2991 2992
    blk_unref(blk);
    blk = NULL;
K
Kevin Wolf 已提交
2993 2994 2995 2996 2997 2998

    /*
     * And now open the image and make it consistent first (i.e. increase the
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
2999
    options = qdict_new();
3000
    qdict_put_str(options, "driver", "qcow2");
3001 3002
    qdict_put_str(options, "file", bs->node_name);
    blk = blk_new_open(NULL, NULL, options,
3003 3004
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
                       &local_err);
3005
    if (blk == NULL) {
M
Max Reitz 已提交
3006
        error_propagate(errp, local_err);
3007
        ret = -EIO;
K
Kevin Wolf 已提交
3008 3009 3010
        goto out;
    }

3011
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
K
Kevin Wolf 已提交
3012
    if (ret < 0) {
M
Max Reitz 已提交
3013 3014
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
K
Kevin Wolf 已提交
3015 3016 3017 3018 3019 3020 3021
        goto out;

    } else if (ret != 0) {
        error_report("Huh, first cluster in empty image is already in use?");
        abort();
    }

3022
    /* Create a full header (including things like feature table) */
3023
    ret = qcow2_update_header(blk_bs(blk));
3024 3025 3026 3027 3028
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

K
Kevin Wolf 已提交
3029
    /* Okay, now that we have a valid image, let's give it the right size */
3030
    ret = blk_truncate(blk, qcow2_opts->size, PREALLOC_MODE_OFF, errp);
K
Kevin Wolf 已提交
3031
    if (ret < 0) {
3032
        error_prepend(errp, "Could not resize image: ");
K
Kevin Wolf 已提交
3033 3034 3035 3036
        goto out;
    }

    /* Want a backing file? There you go.*/
3037 3038 3039 3040 3041 3042 3043 3044 3045
    if (qcow2_opts->has_backing_file) {
        const char *backing_format = NULL;

        if (qcow2_opts->has_backing_fmt) {
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
        }

        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
                                       backing_format);
K
Kevin Wolf 已提交
3046
        if (ret < 0) {
M
Max Reitz 已提交
3047
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
3048 3049
                             "with format '%s'", qcow2_opts->backing_file,
                             backing_format);
K
Kevin Wolf 已提交
3050 3051 3052 3053
            goto out;
        }
    }

3054
    /* Want encryption? There you go. */
3055 3056
    if (qcow2_opts->has_encrypt) {
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
3057 3058 3059 3060 3061
        if (ret < 0) {
            goto out;
        }
    }

K
Kevin Wolf 已提交
3062
    /* And if we're supposed to preallocate metadata, do that now */
3063
    if (qcow2_opts->preallocation != PREALLOC_MODE_OFF) {
3064 3065
        BDRVQcow2State *s = blk_bs(blk)->opaque;
        qemu_co_mutex_lock(&s->lock);
3066
        ret = preallocate_co(blk_bs(blk), 0, qcow2_opts->size);
3067 3068
        qemu_co_mutex_unlock(&s->lock);

K
Kevin Wolf 已提交
3069
        if (ret < 0) {
M
Max Reitz 已提交
3070
            error_setg_errno(errp, -ret, "Could not preallocate metadata");
K
Kevin Wolf 已提交
3071 3072 3073 3074
            goto out;
        }
    }

3075 3076
    blk_unref(blk);
    blk = NULL;
M
Max Reitz 已提交
3077

3078 3079 3080 3081 3082 3083
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
     * have to setup decryption context. We're not doing any I/O on the top
     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
     * not have effect.
     */
3084
    options = qdict_new();
3085
    qdict_put_str(options, "driver", "qcow2");
3086 3087
    qdict_put_str(options, "file", bs->node_name);
    blk = blk_new_open(NULL, NULL, options,
3088 3089
                       BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
                       &local_err);
3090
    if (blk == NULL) {
M
Max Reitz 已提交
3091
        error_propagate(errp, local_err);
3092
        ret = -EIO;
M
Max Reitz 已提交
3093 3094 3095
        goto out;
    }

K
Kevin Wolf 已提交
3096 3097
    ret = 0;
out:
3098 3099
    blk_unref(blk);
    bdrv_unref(bs);
K
Kevin Wolf 已提交
3100 3101
    return ret;
}
K
Kevin Wolf 已提交
3102

3103 3104
/*
 * Legacy (QemuOpts based) entry point for image creation.
 *
 * Converts @opts into a QDict, rewrites legacy option values and names into
 * their QMP equivalents, creates and opens the protocol-level file
 * @filename, builds a BlockdevCreateOptions object from the QDict and hands
 * it to qcow2_co_create().
 *
 * Returns: 0 on success, -errno on failure (with @errp set).
 */
static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts,
                                             Error **errp)
{
    /* Legacy command line option names and their QMP counterparts */
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
        { NULL, NULL },
    };
    BlockdevCreateOptions *create_options = NULL;
    BlockDriverState *file_bs = NULL;
    Error *local_err = NULL;
    const char *value;
    Visitor *visitor;
    QDict *qdict;
    int ret;

    /* Only the keyval visitor supports the dotted syntax needed for
     * encryption, so go through a QDict before getting a QAPI type. Ignore
     * options meant for the protocol layer so that the visitor doesn't
     * complain. */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
                                        true);

    /* encryption=on selects the "qcow" format, encryption=off is dropped */
    value = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
    if (value) {
        if (strcmp(value, "on") == 0) {
            qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
        } else if (strcmp(value, "off") == 0) {
            qdict_del(qdict, BLOCK_OPT_ENCRYPT);
        }
    }

    /* "aes" is the legacy spelling of the "qcow" encryption format */
    value = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
    if (value && strcmp(value, "aes") == 0) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
    }

    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
     * version=v2/v3 below. */
    value = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
    if (value) {
        if (strcmp(value, "0.10") == 0) {
            qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
        } else if (strcmp(value, "1.1") == 0) {
            qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
        }
    }

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
        ret = -EINVAL;
        goto finish;
    }

    /* Protocol layer: create the file, then open it */
    ret = bdrv_create_file(filename, opts, errp);
    if (ret < 0) {
        goto finish;
    }

    file_bs = bdrv_open(filename, NULL, NULL,
                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (!file_bs) {
        ret = -EIO;
        goto finish;
    }

    /* Add the 'driver' and 'file' keys needed by BlockdevCreateOptions */
    qdict_put_str(qdict, "driver", "qcow2");
    qdict_put_str(qdict, "file", file_bs->node_name);

    /* Turn the QDict into the QAPI type */
    visitor = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (visitor == NULL) {
        ret = -EINVAL;
        goto finish;
    }

    visit_type_BlockdevCreateOptions(visitor, NULL, &create_options,
                                     &local_err);
    visit_free(visitor);

    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto finish;
    }

    /* The size is silently rounded up to a multiple of the sector size */
    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
                                            BDRV_SECTOR_SIZE);

    /* Format layer: create the qcow2 image itself */
    ret = qcow2_co_create(create_options, errp);
    if (ret >= 0) {
        ret = 0;
    }

finish:
    qobject_unref(qdict);
    bdrv_unref(file_bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
}

3211

3212
/*
 * Reports whether the @bytes bytes of @bs starting at @offset read as zero
 * according to bdrv_block_status_above().  The range is clamped to the image
 * length first; an empty range counts as zero.
 *
 * Returns: true only if the status query succeeds, reports BDRV_BLOCK_ZERO
 * and covers the whole (clamped) range.
 */
static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    int64_t covered;
    int status;

    /* Clamp to image length, before checking status of underlying sectors */
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
    }

    if (bytes == 0) {
        return true;
    }

    status = bdrv_block_status_above(bs, NULL, offset, bytes, &covered,
                                     NULL, NULL);
    if (status < 0) {
        return false;
    }
    if (!(status & BDRV_BLOCK_ZERO)) {
        return false;
    }
    return covered == bytes;
}

3229
/*
 * Writes zeroes to @bytes bytes of @bs at @offset using zero clusters.
 *
 * A request that does not cover whole clusters is only handled if it lies
 * within a single cluster, the rest of that cluster already reads as zero
 * and the cluster is unallocated or a zero cluster; the request is then
 * widened to the full cluster.  A request ending exactly at the end of the
 * image is treated as having an aligned tail.
 *
 * Returns: -ENOTSUP if an unaligned request cannot be handled this way,
 * otherwise the return value of qcow2_cluster_zeroize().
 */
static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BDRVQcow2State *state = bs->opaque;
    uint32_t head = offset % state->cluster_size;
    uint32_t tail = (offset + bytes) % state->cluster_size;
    int ret;

    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
    if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
        tail = 0;
    }

    if (!head && !tail) {
        qemu_co_mutex_lock(&state->lock);
    } else {
        uint64_t cluster_offset;
        unsigned int cluster_bytes;

        assert(head + bytes <= state->cluster_size);

        /* The parts of the cluster around the request must read as zero */
        if (!is_zero(bs, offset - head, head) ||
            !is_zero(bs, offset + bytes,
                     tail ? state->cluster_size - tail : 0)) {
            return -ENOTSUP;
        }

        qemu_co_mutex_lock(&state->lock);
        /* A new write may have come in since the check above, so look at
         * the cluster type again now that the lock is held */
        offset = QEMU_ALIGN_DOWN(offset, state->cluster_size);
        bytes = state->cluster_size;
        cluster_bytes = state->cluster_size;
        ret = qcow2_get_cluster_offset(bs, offset, &cluster_bytes,
                                       &cluster_offset);
        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            break;
        default:
            qemu_co_mutex_unlock(&state->lock);
            return -ENOTSUP;
        }
    }

    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);

    /* Everything that remains can be turned into real zero clusters */
    ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
    qemu_co_mutex_unlock(&state->lock);

    return ret;
}

3281
/*
 * Discards @bytes bytes of @bs at @offset while holding the qcow2 state
 * lock.
 *
 * Requests that are not cluster-aligned are rejected, except for a partial
 * cluster that starts on a cluster boundary and ends exactly at the end of
 * the image.
 *
 * Returns: -ENOTSUP for a rejected unaligned request, otherwise the return
 * value of qcow2_cluster_discard().
 */
static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
                                          int64_t offset, int bytes)
{
    BDRVQcow2State *state = bs->opaque;
    int ret;

    if (!QEMU_IS_ALIGNED(offset | bytes, state->cluster_size)) {
        bool is_image_tail;

        assert(bytes < state->cluster_size);

        /* Ignore partial clusters, except for the special case of the
         * complete partial cluster at the end of an unaligned file */
        is_image_tail = QEMU_IS_ALIGNED(offset, state->cluster_size) &&
            offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE;
        if (!is_image_tail) {
            return -ENOTSUP;
        }
    }

    qemu_co_mutex_lock(&state->lock);
    ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
                                false);
    qemu_co_mutex_unlock(&state->lock);

    return ret;
}

F
Fam Zheng 已提交
3304 3305 3306 3307
/*
 * Copy @bytes bytes from guest offset @src_offset of @bs to @dst_offset of
 * @dst, one cluster run at a time.
 *
 * For every run the cluster type reported by qcow2_get_cluster_offset()
 * decides where the data comes from:
 *  - unallocated: the backing file if there is one and @src_offset lies
 *    inside it, otherwise the destination is asked to write zeroes
 *    (BDRV_REQ_ZERO_WRITE is added to the write flags);
 *  - zero clusters: the destination is asked to write zeroes;
 *  - compressed: not supported, fails with -ENOTSUP;
 *  - normal: bs->file at the host offset of the cluster.
 *
 * s->lock is dropped around the bdrv_co_copy_range_from() call and
 * re-taken afterwards.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int coroutine_fn
qcow2_co_copy_range_from(BlockDriverState *bs,
                         BdrvChild *src, uint64_t src_offset,
                         BdrvChild *dst, uint64_t dst_offset,
                         uint64_t bytes, BdrvRequestFlags read_flags,
                         BdrvRequestFlags write_flags)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    BdrvChild *child = NULL;
    BdrvRequestFlags cur_write_flags;

    assert(!bs->encrypted);
    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {
        uint64_t copy_offset = 0;
        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        cur_write_flags = write_flags;

        ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, &copy_offset);
        if (ret < 0) {
            goto out;
        }

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (bs->backing && bs->backing->bs) {
                int64_t backing_length = bdrv_getlength(bs->backing->bs);

                /* A negative length is an error code.  It must not be
                 * compared against the unsigned @src_offset or used to
                 * clamp @cur_bytes, so report it instead. */
                if (backing_length < 0) {
                    ret = backing_length;
                    goto out;
                }

                if (src_offset >= backing_length) {
                    /* Past the end of the backing file: reads as zeroes */
                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
                } else {
                    child = bs->backing;
                    /* Do not read beyond the end of the backing file */
                    cur_bytes = MIN(cur_bytes, backing_length - src_offset);
                    copy_offset = src_offset;
                }
            } else {
                cur_write_flags |= BDRV_REQ_ZERO_WRITE;
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            cur_write_flags |= BDRV_REQ_ZERO_WRITE;
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            ret = -ENOTSUP;
            goto out;

        case QCOW2_CLUSTER_NORMAL:
            child = bs->file;
            copy_offset += offset_into_cluster(s, src_offset);
            /* The host offset has to be 512-byte aligned */
            if ((copy_offset & 511) != 0) {
                ret = -EIO;
                goto out;
            }
            break;

        default:
            abort();
        }

        /* Do not hold s->lock across the actual copy */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_from(child,
                                      copy_offset,
                                      dst, dst_offset,
                                      cur_bytes, read_flags, cur_write_flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto out;
        }

        bytes -= cur_bytes;
        src_offset += cur_bytes;
        dst_offset += cur_bytes;
    }
    ret = 0;

out:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

/*
 * Copy @bytes bytes from @src_offset of @src into guest offset @dst_offset
 * of @bs.
 *
 * Each iteration allocates host clusters for the next part of the
 * destination range, runs the pre-write overlap check on the target host
 * range, drops s->lock while bdrv_co_copy_range_to() writes into bs->file,
 * and finally hands the allocation to qcow2_handle_l2meta().
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int coroutine_fn
qcow2_co_copy_range_to(BlockDriverState *bs,
                       BdrvChild *src, uint64_t src_offset,
                       BdrvChild *dst, uint64_t dst_offset,
                       uint64_t bytes, BdrvRequestFlags read_flags,
                       BdrvRequestFlags write_flags)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *l2meta = NULL;
    int ret;

    assert(!bs->encrypted);
    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {
        unsigned int chunk = MIN(bytes, INT_MAX); /* bytes in this iteration */
        int in_cluster = offset_into_cluster(s, dst_offset);
        uint64_t cluster_offset;
        uint64_t host_offset;

        l2meta = NULL;

        /* TODO:
         * If src->bs == dst->bs, we could simply copy by incrementing
         * the refcnt, without copying user data.
         * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
        ret = qcow2_alloc_cluster_offset(bs, dst_offset, &chunk,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);
        host_offset = cluster_offset + in_cluster;

        ret = qcow2_pre_write_overlap_check(bs, 0, host_offset, chunk);
        if (ret < 0) {
            goto fail;
        }

        /* The data transfer itself runs without s->lock */
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_copy_range_to(src, src_offset,
                                    bs->file, host_offset,
                                    chunk, read_flags, write_flags);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= chunk;
        src_offset += chunk;
        dst_offset += chunk;
    }
    ret = 0;

fail:
    /* Reached on success as well; passes false instead of true */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

3464 3465
/*
 * Change the virtual disk size of @bs to @offset bytes.
 *
 * @offset must be a multiple of 512.  Images with internal snapshots or
 * bitmaps are refused.  Shrinking is only possible with
 * PREALLOC_MODE_OFF; when growing, @prealloc selects whether nothing,
 * only metadata, or metadata plus data clusters are allocated for the
 * new area.
 *
 * The whole operation runs with s->lock held.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
                                          PreallocMode prealloc, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_length;
    int64_t new_l1_size;
    uint64_t be_size;
    QDict *options;
    int ret;

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
    case PREALLOC_MODE_METADATA:
    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
        break;

    default:
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (offset & 511) {
        error_setg(errp, "The new size must be a multiple of 512");
        return -EINVAL;
    }

    qemu_co_mutex_lock(&s->lock);

    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
        error_setg(errp, "Can't resize an image which has snapshots");
        ret = -ENOTSUP;
        goto fail;
    }

    /* cannot proceed if image has bitmaps */
    if (s->nb_bitmaps) {
        /* TODO: resize bitmaps in the image */
        error_setg(errp, "Can't resize an image which has bitmaps");
        ret = -ENOTSUP;
        goto fail;
    }

    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
    new_l1_size = size_to_l1(s, offset);

    if (offset < old_length) {
        /* Shrinking: first guest offset whose cluster is dropped entirely */
        int64_t discard_start = ROUND_UP(offset, s->cluster_size);
        int64_t last_cluster, old_file_size;

        if (prealloc != PREALLOC_MODE_OFF) {
            error_setg(errp,
                       "Preallocation can't be used for shrinking an image");
            ret = -EINVAL;
            goto fail;
        }

        ret = qcow2_cluster_discard(bs, discard_start,
                                    old_length - discard_start,
                                    QCOW2_DISCARD_ALWAYS, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
            goto fail;
        }

        ret = qcow2_shrink_l1_table(bs, new_l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to reduce the number of L2 tables");
            goto fail;
        }

        ret = qcow2_shrink_reftable(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to discard unused refblocks");
            goto fail;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster < 0) {
            error_setg_errno(errp, -last_cluster,
                             "Failed to find the last cluster");
            ret = last_cluster;
            goto fail;
        }
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
            Error *local_err = NULL;

            /* Cutting off the unused tail of the image file is best
             * effort only: a failure is reported as a warning and does
             * not fail the resize */
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
                             PREALLOC_MODE_OFF, &local_err);
            if (local_err) {
                warn_reportf_err(local_err,
                                 "Failed to truncate the tail of the image: ");
            }
        }
    } else {
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
            goto fail;
        }
    }

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
        break;

    case PREALLOC_MODE_METADATA:
        ret = preallocate_co(bs, old_length, offset);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Preallocation failed");
            goto fail;
        }
        break;

    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        old_file_size = ROUND_UP(old_file_size, s->cluster_size);

        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
                                            s->cluster_size);

        /* Overestimate the number of L2 tables.  No file space is
         * reserved for them here; the point is only that the refcount
         * structures created below already cover them, so that no new
         * refblocks have to be allocated while the data clusters are
         * entered into (potentially new) L2 tables.  Where the L2 tables
         * end up does not matter: they may exist already or may be placed
         * before @old_file_size, and they are fully allocated
         * automatically, so the preallocation need not cover them. */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / sizeof(uint64_t));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
            ret = allocation_start;
            goto fail;
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
            ret = clusters_allocated;
            goto fail;
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
        ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            /* Give back the data clusters that were just allocated */
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
            goto fail;
        }

        /* Create the necessary L2 entries, one L2 slice at a time */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
            int64_t nb_clusters = MIN(
                nb_new_data_clusters,
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
            QCowL2Meta allocation = {
                .offset       = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters  = nb_clusters,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                /* Free the clusters that have not been linked yet */
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
                goto fail;
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

    default:
        g_assert_not_reached();
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
        ret = qcow2_write_caches(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
            goto fail;
        }
    }

    bs->total_sectors = offset / BDRV_SECTOR_SIZE;

    /* write updated header.size (stored big endian) */
    be_size = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &be_size, sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image size");
        goto fail;
    }

    s->l1_vm_state_index = new_l1_size;

    /* Update cache sizes */
    options = qdict_clone_shallow(bs->options);
    ret = qcow2_update_options(bs, options, s->flags, errp);
    qobject_unref(options);
    if (ret < 0) {
        goto fail;
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763
/*
 * qcow2_compress()
 *
 * Compress @size bytes from @src into @dest with zlib's deflate.  At most
 * @size - 1 output bytes are produced, so a successful result is always
 * smaller than the input.
 *
 * @dest - destination buffer, at least of @size-1 bytes
 * @src - source buffer, @size bytes
 *
 * Returns: compressed size on success
 *          -1 if compression is inefficient
 *          -2 on any other error
 */
static ssize_t qcow2_compress(void *dest, const void *src, size_t size)
{
    z_stream strm;
    ssize_t result;

    /* default compression level, windowBits -12, memLevel 9 */
    memset(&strm, 0, sizeof(strm));
    if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                     -12, 9, Z_DEFAULT_STRATEGY) != Z_OK) {
        return -2;
    }

    /* strm.next_in is not const in old zlib versions, such as those used on
     * OpenBSD/NetBSD, so cast the const away */
    strm.next_in = (void *) src;
    strm.avail_in = size;
    strm.next_out = dest;
    strm.avail_out = size - 1;

    switch (deflate(&strm, Z_FINISH)) {
    case Z_STREAM_END:
        /* Everything was consumed: report the bytes written to @dest */
        result = size - 1 - strm.avail_out;
        break;

    case Z_OK:
        /* Stream not finished within @size - 1 output bytes */
        result = -1;
        break;

    default:
        result = -2;
        break;
    }

    deflateEnd(&strm);

    return result;
}

3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819
/* Limit that qcow2_co_compress() enforces on s->nb_compress_threads, i.e.
 * on the number of compression jobs it has submitted and not yet seen
 * complete */
#define MAX_COMPRESS_THREADS 4

/* Arguments and result of one qcow2_compress() call, passed as the opaque
 * pointer to qcow2_compress_pool_func() */
typedef struct Qcow2CompressData {
    void *dest;         /* forwarded as @dest of qcow2_compress() */
    const void *src;    /* forwarded as @src of qcow2_compress() */
    size_t size;        /* forwarded as @size of qcow2_compress() */
    ssize_t ret;        /* return value of qcow2_compress() */
} Qcow2CompressData;

/*
 * Thread pool worker: run qcow2_compress() on the Qcow2CompressData job
 * passed in @opaque and store the outcome in its @ret field.
 *
 * Always returns 0; the compression result travels through the job.
 */
static int qcow2_compress_pool_func(void *opaque)
{
    Qcow2CompressData *job = opaque;
    ssize_t outcome = qcow2_compress(job->dest, job->src, job->size);

    job->ret = outcome;
    return 0;
}

/*
 * Completion callback handed to thread_pool_submit_aio() by
 * qcow2_co_compress(): @opaque is the coroutine that submitted the job and
 * is entered again here.  @ret is not used.
 */
static void qcow2_compress_complete(void *opaque, int ret)
{
    qemu_coroutine_enter(opaque);
}

/*
 * Run qcow2_compress() in the thread pool of @bs's AioContext and yield
 * the calling coroutine until the job has completed.
 *
 * At most MAX_COMPRESS_THREADS jobs are kept in flight; additional callers
 * wait on s->compress_wait_queue.
 *
 * @dest - destination buffer, at least of @size-1 bytes
 * @src - source buffer, @size bytes
 *
 * Returns: compressed size on success
 *          -1 if compression is inefficient
 *          -2 on any other error, including failure to submit the job
 *
 * Must be called from coroutine context (it calls qemu_coroutine_yield()).
 */
static ssize_t qcow2_co_compress(BlockDriverState *bs,
                                 void *dest, const void *src, size_t size)
{
    BDRVQcow2State *s = bs->opaque;
    BlockAIOCB *acb;
    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    Qcow2CompressData arg = {
        .dest = dest,
        .src = src,
        .size = size,
    };

    /* Throttle: wait until one of the running jobs has finished */
    while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) {
        qemu_co_queue_wait(&s->compress_wait_queue, NULL);
    }

    s->nb_compress_threads++;
    acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg,
                                 qcow2_compress_complete,
                                 qemu_coroutine_self());

    if (!acb) {
        s->nb_compress_threads--;
        /* Stay within the -1/-2 contract of qcow2_compress(): a negative
         * errno here would not be recognised as an error by a caller that
         * only checks for those two values. */
        return -2;
    }

    /* qcow2_compress_complete() re-enters this coroutine when done; @arg
     * lives on this coroutine's stack until then */
    qemu_coroutine_yield();
    s->nb_compress_threads--;
    qemu_co_queue_next(&s->compress_wait_queue);

    return arg.ret;
}

B
Blue Swirl 已提交
3820 3821
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
3822 3823 3824
static coroutine_fn int
qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                            uint64_t bytes, QEMUIOVector *qiov)
B
Blue Swirl 已提交
3825
{
3826
    BDRVQcow2State *s = bs->opaque;
3827 3828
    QEMUIOVector hd_qiov;
    struct iovec iov;
3829 3830
    int ret;
    size_t out_len;
3831
    uint8_t *buf, *out_buf;
3832
    int64_t cluster_offset;
B
Blue Swirl 已提交
3833

3834
    if (bytes == 0) {
B
Blue Swirl 已提交
3835 3836
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
K
Kevin Wolf 已提交
3837
        cluster_offset = bdrv_getlength(bs->file->bs);
3838 3839 3840
        if (cluster_offset < 0) {
            return cluster_offset;
        }
3841 3842
        return bdrv_co_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF,
                                NULL);
B
Blue Swirl 已提交
3843 3844
    }

3845 3846 3847 3848
    if (offset_into_cluster(s, offset)) {
        return -EINVAL;
    }

3849
    buf = qemu_blockalign(bs, s->cluster_size);
3850
    if (bytes != s->cluster_size) {
3851 3852
        if (bytes > s->cluster_size ||
            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
3853
        {
3854 3855
            qemu_vfree(buf);
            return -EINVAL;
3856
        }
3857 3858
        /* Zero-pad last write if image size is not cluster aligned */
        memset(buf + bytes, 0, s->cluster_size - bytes);
3859
    }
3860
    qemu_iovec_to_buf(qiov, 0, buf, bytes);
B
Blue Swirl 已提交
3861

3862
    out_buf = g_malloc(s->cluster_size);
B
Blue Swirl 已提交
3863

3864
    out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size);
3865
    if (out_len == -2) {
3866 3867
        ret = -EINVAL;
        goto fail;
3868
    } else if (out_len == -1) {
B
Blue Swirl 已提交
3869
        /* could not compress: write normal cluster */
3870
        ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
3871 3872 3873
        if (ret < 0) {
            goto fail;
        }
3874 3875
        goto success;
    }
3876

3877 3878 3879 3880 3881 3882 3883 3884 3885
    qemu_co_mutex_lock(&s->lock);
    cluster_offset =
        qcow2_alloc_compressed_cluster_offset(bs, offset, out_len);
    if (!cluster_offset) {
        qemu_co_mutex_unlock(&s->lock);
        ret = -EIO;
        goto fail;
    }
    cluster_offset &= s->cluster_offset_mask;
3886

3887 3888 3889 3890
    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        goto fail;
B
Blue Swirl 已提交
3891 3892
    }

3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904
    iov = (struct iovec) {
        .iov_base   = out_buf,
        .iov_len    = out_len,
    };
    qemu_iovec_init_external(&hd_qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
    if (ret < 0) {
        goto fail;
    }
success:
3905 3906
    ret = 0;
fail:
3907
    qemu_vfree(buf);
3908
    g_free(out_buf);
3909
    return ret;
B
Blue Swirl 已提交
3910 3911
}

M
Max Reitz 已提交
3912 3913
/*
 * Empty the image by rebuilding its metadata from scratch rather than
 * discarding clusters one by one.
 *
 * The resulting layout is, in clusters: image header (0), refcount table
 * (1), first refcount block (2), L1 table (3 onwards).  The image is
 * marked dirty while the refcount information is inconsistent and marked
 * clean again once the new structures are in place; finally bs->file is
 * truncated to the end of the L1 table.
 *
 * If a step fails while in-memory and on-disk refcount information may
 * disagree, bs->drv is set to NULL.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int make_completely_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Error *local_err = NULL;
    int ret, l1_clusters;
    int64_t offset;
    uint64_t *new_reftable = NULL;
    uint64_t first_rt_entry, l1_bytes;
    /* Image of the header fields that are rewritten in one go, starting at
     * offsetof(QCowHeader, l1_table_offset); values are big endian */
    struct {
        uint64_t l1_offset;
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED header_patch;

    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Refcounts will be broken utterly */
    ret = qcow2_mark_dirty(bs);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);

    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
    l1_bytes = (uint64_t)s->l1_size * sizeof(uint64_t);

    /* After this call, neither the in-memory nor the on-disk refcount
     * information accurately describe the actual references */

    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
                             l1_clusters * s->cluster_size, 0);
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
    memset(s->l1_table, 0, l1_bytes);

    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);

    /* Zero enough clusters right after the image header to hold the
     * refcount table, one refcount block and the L1 table.  This may
     * overwrite parts of the existing refcount and L1 tables; that is
     * fine because the dirty flag is set and complete data loss is the
     * goal anyway, so partial data loss is acceptable as well */
    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
                             (2 + l1_clusters) * s->cluster_size, 0);
    /* This call (even if it failed overall) may have overwritten on-disk
     * refcount structures; in that case, the in-memory refcount information
     * will probably differ from the on-disk information which makes the BDS
     * unusable */
    if (ret < 0) {
        goto fail_broken_refcounts;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);

    /* "Create" an empty reftable (one cluster) directly after the image
     * header and an empty L1 table three clusters after the image header;
     * the cluster between those two will be used as the first refblock */
    header_patch.l1_offset = cpu_to_be64(3 * s->cluster_size);
    header_patch.reftable_offset = cpu_to_be64(s->cluster_size);
    header_patch.reftable_clusters = cpu_to_be32(1);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
                           &header_patch, sizeof(header_patch));
    if (ret < 0) {
        goto fail_broken_refcounts;
    }

    s->l1_table_offset = 3 * s->cluster_size;

    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
    if (!new_reftable) {
        ret = -ENOMEM;
        goto fail_broken_refcounts;
    }

    s->refcount_table_offset = s->cluster_size;
    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
    s->max_refcount_table_index = 0;

    /* Swap in the new table; it now belongs to @s, so the local pointer is
     * cleared to keep the exit path from freeing it */
    g_free(s->refcount_table);
    s->refcount_table = new_reftable;
    new_reftable = NULL;

    /* Now the in-memory refcount information again corresponds to the on-disk
     * information (reftable is empty and no refblocks (the refblock cache is
     * empty)); however, this means some clusters (e.g. the image header) are
     * referenced, but not refcounted, but the normal qcow2 code assumes that
     * the in-memory information is always correct */

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Enter the first refblock into the reftable */
    first_rt_entry = cpu_to_be64(2 * s->cluster_size);
    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
                           &first_rt_entry, sizeof(first_rt_entry));
    if (ret < 0) {
        goto fail_broken_refcounts;
    }
    s->refcount_table[0] = 2 * s->cluster_size;

    /* Allocate header, reftable, refblock and L1 table in one request;
     * the allocation is expected to start at offset 0 */
    s->free_cluster_index = 0;
    assert(3 + l1_clusters <= s->refcount_block_size);
    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_bytes);
    if (offset < 0) {
        ret = offset;
        goto fail_broken_refcounts;
    } else if (offset > 0) {
        error_report("First cluster in emptied image is in use");
        abort();
    }

    /* Now finally the in-memory information corresponds to the on-disk
     * structures and is correct */
    ret = qcow2_mark_clean(bs);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
                        PREALLOC_MODE_OFF, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto fail;
    }

    return 0;

fail_broken_refcounts:
    /* The BDS is unusable at this point. If we wanted to make it usable, we
     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
     * again. However, because the functions which could have caused this error
     * path to be taken are used by those functions as well, it's very likely
     * that that sequence will fail as well. Therefore, just eject the BDS. */
    bs->drv = NULL;

fail:
    g_free(new_reftable);
    return ret;
}

M
Max Reitz 已提交
4063 4064
/*
 * Drop all guest data from the image.
 *
 * Uses make_completely_empty() when the image qualifies for it, otherwise
 * discards the whole guest address range in cluster-aligned chunks.
 *
 * Returns the value of make_completely_empty(), or the value of the last
 * qcow2_cluster_discard() call on the fallback path (0 for an image of
 * size zero).
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t offset, end_offset;
    /* Largest cluster-aligned length that fits into an int */
    int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
    int l1_clusters;
    int ret = 0;

    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));

    /* The following function only works for qcow2 v3 images (it
     * requires the dirty flag) and only as long as there are no
     * features that reserve extra clusters (such as snapshots,
     * LUKS header, or persistent bitmaps), because it completely
     * empties the image.  Furthermore, the L1 table and three
     * additional clusters (image header, refcount table, one
     * refcount block) have to fit inside one refcount block. */
    if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
        3 + l1_clusters <= s->refcount_block_size &&
        s->crypt_method_header != QCOW_CRYPT_LUKS) {
        return make_completely_empty(bs);
    }

    /* This fallback code simply discards every active cluster; this is slow,
     * but works in all cases */
    end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    offset = 0;
    while (offset < end_offset) {
        /* As this function is generally used after committing an external
         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
         * default action for this kind of discard is to pass the discard,
         * which will ideally result in an actually smaller image file, as
         * is probably desired. */
        ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
                                    QCOW2_DISCARD_SNAPSHOT, true);
        if (ret < 0) {
            return ret;
        }
        offset += step;
    }

    return ret;
}

4104
/*
 * Write out the qcow2 caches of @bs via qcow2_write_caches(), holding
 * s->lock for the duration of the call.
 *
 * Returns the result of qcow2_write_caches().
 */
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int status;

    qemu_co_mutex_lock(&s->lock);
    status = qcow2_write_caches(bs);
    qemu_co_mutex_unlock(&s->lock);

    return status;
}

4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147
/*
 * Estimate the file size needed for a new qcow2 image.
 *
 * @opts:  image creation options.  The cluster size, version, refcount
 *         width, preallocation mode, backing file and size options are
 *         read with the deleting accessors, i.e. they are consumed.
 * @in_bs: optional source node.  When non-NULL, its length (rounded up to
 *         the cluster size) replaces the size option, and its block status
 *         is used to work out how many data bytes are required.
 * @errp:  error destination.
 *
 * Returns a newly allocated BlockMeasureInfo with both .required and
 * .fully_allocated filled in, or NULL with @errp set on failure.
 */
static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
                                       Error **errp)
{
    Error *local_err = NULL;
    BlockMeasureInfo *info;
    uint64_t required = 0; /* bytes that contribute to required size */
    uint64_t virtual_size; /* disk size as seen by guest */
    uint64_t refcount_bits;
    uint64_t l2_tables;
    size_t cluster_size;
    int version;
    char *optstr;
    PreallocMode prealloc;
    bool has_backing_file;

    /* Parse image creation options */
    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    version = qcow2_opt_get_version_del(opts, &local_err);
    if (local_err) {
        goto err;
    }

    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
    if (local_err) {
        goto err;
    }

    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
                               PREALLOC_MODE_OFF, &local_err);
    g_free(optstr);
    if (local_err) {
        goto err;
    }

    /* Only the presence of a backing file matters here, not its name */
    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    has_backing_file = !!optstr;
    g_free(optstr);

    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
    virtual_size = ROUND_UP(virtual_size, cluster_size);

    /* Check that virtual disk size is valid.
     * NOTE(review): this runs before @in_bs may override virtual_size below,
     * so the length of @in_bs is not covered by this check -- confirm whether
     * that is intended. */
    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
                             cluster_size / sizeof(uint64_t));
    if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
        error_setg(&local_err, "The image size is too large "
                               "(try using a larger cluster size)");
        goto err;
    }

    /* Account for input image */
    if (in_bs) {
        int64_t ssize = bdrv_getlength(in_bs);
        if (ssize < 0) {
            error_setg_errno(&local_err, -ssize,
                             "Unable to get image virtual_size");
            goto err;
        }

        virtual_size = ROUND_UP(ssize, cluster_size);

        if (has_backing_file) {
            /* We don't know how much of the backing chain is shared by the
             * input image and the new image file.  In the worst case the new
             * image's backing file has nothing in common with the input
             * image.  Be conservative and assume all clusters need to be
             * written.
             */
            required = virtual_size;
        } else {
            int64_t offset;
            int64_t pnum = 0;

            for (offset = 0; offset < ssize; offset += pnum) {
                int ret;

                ret = bdrv_block_status_above(in_bs, NULL, offset,
                                              ssize - offset, &pnum, NULL,
                                              NULL);
                if (ret < 0) {
                    error_setg_errno(&local_err, -ret,
                                     "Unable to get block status");
                    goto err;
                }

                if (ret & BDRV_BLOCK_ZERO) {
                    /* Skip zero regions (safe with no backing file) */
                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
                    /* Extend pnum to end of cluster for next iteration */
                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;

                    /* Count clusters we've seen */
                    required += offset % cluster_size + pnum;
                }
            }
        }
    }

    /* Take into account preallocation.  Nothing special is needed for
     * PREALLOC_MODE_METADATA since metadata is always counted.
     */
    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
        required = virtual_size;
    }

    info = g_new(BlockMeasureInfo, 1);
    info->fully_allocated =
        qcow2_calc_prealloc_size(virtual_size, cluster_size,
                                 ctz32(refcount_bits));

    /* Remove data clusters that are not required.  This overestimates the
     * required size because metadata needed for the fully allocated file is
     * still counted.
     */
    info->required = info->fully_allocated - virtual_size + required;
    return info;

err:
    error_propagate(errp, local_err);
    return NULL;
}

4243
/*
 * Fill @bdi with driver information for @bs: unallocated blocks are
 * reported as reading zero, the cluster size is taken from the driver
 * state, and the VM state offset comes from qcow2_vm_state_offset().
 *
 * Always returns 0.
 */
static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcow2State *s = bs->opaque;
    bdi->unallocated_blocks_are_zero = true;
    bdi->cluster_size = s->cluster_size;
    bdi->vm_state_offset = qcow2_vm_state_offset(s);
    return 0;
}

4252 4253
/*
 * Build the qcow2-specific image information for @bs.
 *
 * The result always carries the compat level and refcount width.  For
 * version 3 images it also carries the lazy-refcounts and corrupt flags,
 * and if the image has a crypto context (s->crypto) the encryption details
 * are attached as well.
 *
 * Returns a newly allocated ImageInfoSpecific.
 */
static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    ImageInfoSpecific *spec_info;
    QCryptoBlockInfo *encrypt_info = NULL;

    if (s->crypto != NULL) {
        encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort);
    }

    spec_info = g_new(ImageInfoSpecific, 1);
    *spec_info = (ImageInfoSpecific){
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
        .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
    };
    if (s->qcow_version == 2) {
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("0.10"),
            .refcount_bits      = s->refcount_bits,
        };
    } else if (s->qcow_version == 3) {
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
            .compat             = g_strdup("1.1"),
            .lazy_refcounts     = s->compatible_features &
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
            .has_lazy_refcounts = true,
            .corrupt            = s->incompatible_features &
                                  QCOW2_INCOMPAT_CORRUPT,
            .has_corrupt        = true,
            .refcount_bits      = s->refcount_bits,
        };
    } else {
        /* if this assertion fails, this probably means a new version was
         * added without having it covered here */
        assert(false);
    }

    if (encrypt_info) {
        ImageInfoSpecificQCow2Encryption *qencrypt =
            g_new(ImageInfoSpecificQCow2Encryption, 1);
        switch (encrypt_info->format) {
        case Q_CRYPTO_BLOCK_FORMAT_QCOW:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
            break;
        case Q_CRYPTO_BLOCK_FORMAT_LUKS:
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
            qencrypt->u.luks = encrypt_info->u.luks;
            break;
        default:
            abort();
        }
        /* Since we did shallow copy above, erase any pointers
         * in the original info so that freeing it below does not free
         * data now referenced from qencrypt */
        memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
        qapi_free_QCryptoBlockInfo(encrypt_info);

        spec_info->u.qcow2.data->has_encrypt = true;
        spec_info->u.qcow2.data->encrypt = qencrypt;
    }

    return spec_info;
}

4315 4316
/*
 * Write the VM state data in @qiov at position @pos of the VM state area.
 *
 * The request is issued through the driver's own bdrv_co_pwritev callback
 * at offset qcow2_vm_state_offset(s) + @pos; its return value is passed on.
 */
static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
                                    qiov->size, qiov, 0);
}

4325 4326
/*
 * Read VM state data from position @pos of the VM state area into @qiov.
 *
 * The request is issued through the driver's own bdrv_co_preadv callback
 * at offset qcow2_vm_state_offset(s) + @pos; its return value is passed on.
 */
static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                              int64_t pos)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
                                   qiov->size, qiov, 0);
}

M
Max Reitz 已提交
4335 4336 4337 4338
/*
 * Downgrades an image's version. To achieve this, any incompatible features
 * have to be removed.
 */
4339
static int qcow2_downgrade(BlockDriverState *bs, int target_version,
4340 4341
                           BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
                           Error **errp)
M
Max Reitz 已提交
4342
{
4343
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
4344 4345 4346
    int current_version = s->qcow_version;
    int ret;

4347 4348 4349 4350 4351
    /* This is qcow2_downgrade(), not qcow2_upgrade() */
    assert(target_version < current_version);

    /* There are no other versions (now) that you can downgrade to */
    assert(target_version == 2);
M
Max Reitz 已提交
4352 4353

    if (s->refcount_order != 4) {
4354
        error_setg(errp, "compat=0.10 requires refcount_bits=16");
M
Max Reitz 已提交
4355 4356 4357 4358 4359 4360 4361
        return -ENOTSUP;
    }

    /* clear incompatible features */
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
4362
            error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
4363 4364 4365 4366 4367 4368 4369 4370 4371
            return ret;
        }
    }

    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
     * the first place; if that happens nonetheless, returning -ENOTSUP is the
     * best thing to do anyway */

    if (s->incompatible_features) {
4372 4373
        error_setg(errp, "Cannot downgrade an image with incompatible features "
                   "%#" PRIx64 " set", s->incompatible_features);
M
Max Reitz 已提交
4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384
        return -ENOTSUP;
    }

    /* since we can ignore compatible features, we can set them to 0 as well */
    s->compatible_features = 0;
    /* if lazy refcounts have been used, they have already been fixed through
     * clearing the dirty flag */

    /* clearing autoclear features is trivial */
    s->autoclear_features = 0;

4385
    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
M
Max Reitz 已提交
4386
    if (ret < 0) {
4387
        error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
M
Max Reitz 已提交
4388 4389 4390 4391 4392 4393 4394
        return ret;
    }

    s->qcow_version = target_version;
    ret = qcow2_update_header(bs);
    if (ret < 0) {
        s->qcow_version = current_version;
4395
        error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4396 4397 4398 4399 4400
        return ret;
    }
    return 0;
}

4401 4402 4403 4404 4405 4406
/* Identifies the amend step that is currently reporting progress */
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;

/*
 * State shared between the code driving the amend operations and
 * qcow2_amend_helper_cb(), which is passed a pointer to this structure as
 * its opaque argument.
 */
typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */

    /* Callback (and its opaque argument) the helper CB forwards progress to */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    /* Operation that is about to run or is running; the helper CB compares
     * it with last_operation to notice that a new operation has started */
    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    /* Operation seen on the previous CB invocation and the work size it
     * reported then */
    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;

/*
 * Progress callback used for the individual amend operations.
 *
 * It translates the per-operation progress (@operation_offset out of
 * @operation_work_size) into an overall progress across all operations and
 * forwards that to the original status callback stored in @opaque (a
 * Qcow2AmendHelperCBInfo).
 */
static void qcow2_amend_helper_cb(BlockDriverState *bs,
                                  int64_t operation_offset,
                                  int64_t operation_work_size, void *opaque)
{
    Qcow2AmendHelperCBInfo *cb = opaque;
    int64_t known_work;
    int64_t estimated_rest;
    int ops_covered;
    int ops_pending;

    /* A different operation than last time means the previous one (if there
     * was one) has finished, so account for it */
    if (cb->last_operation != cb->current_operation) {
        if (cb->last_operation != QCOW2_NO_OPERATION) {
            cb->operations_completed++;
            cb->offset_completed += cb->last_work_size;
        }

        cb->last_operation = cb->current_operation;
    }

    assert(cb->total_operations > 0);
    assert(cb->operations_completed < cb->total_operations);

    cb->last_work_size = operation_work_size;

    /* Operations whose work size is known: the completed ones plus the
     * current one; the remaining ones still have to be estimated */
    ops_covered = cb->operations_completed + 1;
    ops_pending = cb->total_operations - ops_covered;

    known_work = cb->offset_completed + operation_work_size;

    /* Assume the pending operations need, on average, as much work as the
     * ones covered so far */
    estimated_rest = known_work * ops_pending / ops_covered;

    cb->original_status_cb(bs, cb->offset_completed + operation_offset,
                           known_work + estimated_rest,
                           cb->original_cb_opaque);
}

4471
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
4472
                               BlockDriverAmendStatusCB *status_cb,
4473 4474
                               void *cb_opaque,
                               Error **errp)
M
Max Reitz 已提交
4475
{
4476
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
4477 4478 4479 4480
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
    const char *backing_file = NULL, *backing_format = NULL;
    bool lazy_refcounts = s->use_lazy_refcounts;
4481 4482 4483
    const char *compat = NULL;
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
4484
    int encformat;
4485
    int refcount_bits = s->refcount_bits;
M
Max Reitz 已提交
4486
    int ret;
4487
    QemuOptDesc *desc = opts->list->desc;
4488
    Qcow2AmendHelperCBInfo helper_cb_info;
M
Max Reitz 已提交
4489

4490 4491
    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
M
Max Reitz 已提交
4492
            /* only change explicitly defined options */
4493
            desc++;
M
Max Reitz 已提交
4494 4495 4496
            continue;
        }

4497 4498
        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
4499
            if (!compat) {
M
Max Reitz 已提交
4500
                /* preserve default */
4501
            } else if (!strcmp(compat, "0.10")) {
M
Max Reitz 已提交
4502
                new_version = 2;
4503
            } else if (!strcmp(compat, "1.1")) {
M
Max Reitz 已提交
4504 4505
                new_version = 3;
            } else {
4506
                error_setg(errp, "Unknown compatibility level %s", compat);
M
Max Reitz 已提交
4507 4508
                return -EINVAL;
            }
4509
        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
4510
            error_setg(errp, "Cannot change preallocation mode");
M
Max Reitz 已提交
4511
            return -ENOTSUP;
4512 4513 4514 4515 4516 4517 4518 4519
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
4520
                                        !!s->crypto);
4521

4522
            if (encrypt != !!s->crypto) {
4523 4524
                error_setg(errp,
                           "Changing the encryption flag is not supported");
M
Max Reitz 已提交
4525 4526
                return -ENOTSUP;
            }
4527 4528 4529 4530 4531
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
            encformat = qcow2_crypt_method_from_format(
                qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));

            if (encformat != s->crypt_method_header) {
4532 4533
                error_setg(errp,
                           "Changing the encryption format is not supported");
4534 4535
                return -ENOTSUP;
            }
4536
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
4537 4538
            error_setg(errp,
                       "Changing the encryption parameters is not supported");
4539
            return -ENOTSUP;
4540 4541
        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
4542 4543
                                             cluster_size);
            if (cluster_size != s->cluster_size) {
4544
                error_setg(errp, "Changing the cluster size is not supported");
M
Max Reitz 已提交
4545 4546
                return -ENOTSUP;
            }
4547 4548
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
4549
                                               lazy_refcounts);
4550
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
4551 4552 4553 4554 4555 4556
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
4557 4558
                error_setg(errp, "Refcount width must be a power of two and "
                           "may not exceed 64 bits");
4559 4560
                return -EINVAL;
            }
M
Max Reitz 已提交
4561
        } else {
4562
            /* if this point is reached, this probably means a new option was
M
Max Reitz 已提交
4563
             * added without having it covered here */
4564
            abort();
M
Max Reitz 已提交
4565
        }
4566 4567

        desc++;
M
Max Reitz 已提交
4568 4569
    }

4570 4571 4572 4573
    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
        .total_operations = (new_version < old_version)
4574
                          + (s->refcount_bits != refcount_bits)
4575 4576
    };

4577 4578 4579 4580 4581 4582
    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
        s->qcow_version = new_version;
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            s->qcow_version = old_version;
4583
            error_setg_errno(errp, -ret, "Failed to update the image header");
4584
            return ret;
M
Max Reitz 已提交
4585 4586 4587
        }
    }

4588 4589 4590 4591
    if (s->refcount_bits != refcount_bits) {
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
4592 4593 4594
            error_setg(errp, "Refcount widths other than 16 bits require "
                       "compatibility level 1.1 or above (use compat=1.1 or "
                       "greater)");
4595 4596 4597 4598 4599 4600
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
4601
                                          &helper_cb_info, errp);
4602 4603 4604 4605 4606
        if (ret < 0) {
            return ret;
        }
    }

M
Max Reitz 已提交
4607
    if (backing_file || backing_format) {
4608 4609 4610
        ret = qcow2_change_backing_file(bs,
                    backing_file ?: s->image_backing_file,
                    backing_format ?: s->image_backing_format);
M
Max Reitz 已提交
4611
        if (ret < 0) {
4612
            error_setg_errno(errp, -ret, "Failed to change the backing file");
M
Max Reitz 已提交
4613 4614 4615 4616 4617 4618
            return ret;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
4619
            if (new_version < 3) {
4620 4621 4622
                error_setg(errp, "Lazy refcounts only supported with "
                           "compatibility level 1.1 and above (use compat=1.1 "
                           "or greater)");
M
Max Reitz 已提交
4623 4624 4625 4626 4627 4628
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4629
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4630 4631 4632 4633 4634 4635 4636
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
4637
                error_setg_errno(errp, -ret, "Failed to make the image clean");
M
Max Reitz 已提交
4638 4639 4640 4641 4642 4643 4644
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4645
                error_setg_errno(errp, -ret, "Failed to update the image header");
M
Max Reitz 已提交
4646 4647 4648 4649 4650 4651 4652
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    if (new_size) {
K
Kevin Wolf 已提交
4653
        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
4654
        ret = blk_insert_bs(blk, bs, errp);
4655 4656 4657 4658 4659
        if (ret < 0) {
            blk_unref(blk);
            return ret;
        }

4660
        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
4661
        blk_unref(blk);
M
Max Reitz 已提交
4662 4663 4664 4665 4666
        if (ret < 0) {
            return ret;
        }
    }

4667 4668
    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
4669 4670
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
4671
                              &helper_cb_info, errp);
4672 4673 4674 4675 4676
        if (ret < 0) {
            return ret;
        }
    }

M
Max Reitz 已提交
4677 4678 4679
    return 0;
}

M
Max Reitz 已提交
4680 4681 4682 4683 4684 4685 4686 4687 4688
/*
 * Reports an image corruption: prints a message to stderr and emits a
 * BLOCK_IMAGE_CORRUPTED event.  The message is built printf-style from
 * message_format and the variable arguments.
 *
 * If offset or size are negative, respectively, they will not be included in
 * the BLOCK_IMAGE_CORRUPTED event emitted.
 * fatal will be ignored for read-only BDS; corruptions found there will always
 * be considered non-fatal.
 *
 * A fatal corruption additionally marks the image as corrupt and makes the
 * BDS unusable by clearing bs->drv.  Once a corruption has been signaled,
 * further non-fatal ones are suppressed, as are fatal ones if the image
 * already carries the corrupt flag.
 */
void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
                             int64_t size, const char *message_format, ...)
{
    BDRVQcow2State *s = bs->opaque;
    const char *node_name;
    char *message;
    va_list ap;

    fatal = fatal && bdrv_is_writable(bs);

    if (s->signaled_corruption &&
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
    {
        return;
    }

    va_start(ap, message_format);
    message = g_strdup_vprintf(message_format, ap);
    va_end(ap);

    if (fatal) {
        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
                "corruption events will be suppressed\n", message);
    } else {
        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
                "corruption events will be suppressed\n", message);
    }

    node_name = bdrv_get_node_name(bs);
    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
                                          *node_name != '\0', node_name,
                                          message, offset >= 0, offset,
                                          size >= 0, size,
                                          fatal);
    g_free(message);

    if (fatal) {
        qcow2_mark_corrupt(bs);
        bs->drv = NULL; /* make BDS unusable */
    }

    s->signaled_corruption = true;
}

4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756
/*
 * Options accepted when creating a qcow2 image.  The driver description
 * below points to this list through its .create_opts member, and
 * qcow2_amend_options() expects to know every option named here (it aborts
 * on an unknown one).
 */
static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_COMPAT_LEVEL,
            .type = QEMU_OPT_STRING,
            .help = "Compatibility level (0.10 or 1.1)"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image"
        },
        {
            .name = BLOCK_OPT_ENCRYPT,
            .type = QEMU_OPT_BOOL,
            .help = "Encrypt the image with format 'aes'. (Deprecated "
                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
        },
        {
            .name = BLOCK_OPT_ENCRYPT_FORMAT,
            .type = QEMU_OPT_STRING,
            .help = "Encrypt the image, format choices: 'aes', 'luks'",
        },
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow AES key or LUKS passphrase"),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "qcow2 cluster size",
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, metadata, "
                    "falloc, full)"
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
            .def_value_str = "off"
        },
        {
            .name = BLOCK_OPT_REFCOUNT_BITS,
            .type = QEMU_OPT_NUMBER,
            .help = "Width of a reference count entry in bits",
            .def_value_str = "16"
        },
        { /* end of list */ }
    }
};

4801
BlockDriver bdrv_qcow2 = {
4802
    .format_name        = "qcow2",
4803
    .instance_size      = sizeof(BDRVQcow2State),
4804 4805 4806
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
J
Jeff Cody 已提交
4807
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
4808 4809
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
4810
    .bdrv_join_options    = qcow2_join_options,
4811
    .bdrv_child_perm      = bdrv_format_default_perms,
4812
    .bdrv_co_create_opts  = qcow2_co_create_opts,
4813
    .bdrv_co_create       = qcow2_co_create,
4814
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
4815
    .bdrv_co_block_status = qcow2_co_block_status,
4816

K
Kevin Wolf 已提交
4817
    .bdrv_co_preadv         = qcow2_co_preadv,
K
Kevin Wolf 已提交
4818
    .bdrv_co_pwritev        = qcow2_co_pwritev,
K
Kevin Wolf 已提交
4819
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
4820

4821
    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
4822
    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
F
Fam Zheng 已提交
4823 4824
    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
4825
    .bdrv_co_truncate       = qcow2_co_truncate,
4826
    .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
M
Max Reitz 已提交
4827
    .bdrv_make_empty        = qcow2_make_empty,
B
Blue Swirl 已提交
4828 4829 4830 4831 4832

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
4833
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
4834
    .bdrv_measure           = qcow2_measure,
4835
    .bdrv_get_info          = qcow2_get_info,
4836
    .bdrv_get_specific_info = qcow2_get_specific_info,
B
Blue Swirl 已提交
4837

4838 4839
    .bdrv_save_vmstate    = qcow2_save_vmstate,
    .bdrv_load_vmstate    = qcow2_load_vmstate,
B
Blue Swirl 已提交
4840

4841
    .supports_backing           = true,
B
Blue Swirl 已提交
4842 4843
    .bdrv_change_backing_file   = qcow2_change_backing_file,

4844
    .bdrv_refresh_limits        = qcow2_refresh_limits,
4845
    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
K
Kevin Wolf 已提交
4846
    .bdrv_inactivate            = qcow2_inactivate,
4847

4848
    .create_opts         = &qcow2_create_opts,
4849
    .bdrv_co_check       = qcow2_co_check,
C
Chunyan Liu 已提交
4850
    .bdrv_amend_options  = qcow2_amend_options,
4851 4852 4853

    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
4854 4855

    .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
4856
    .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
4857
    .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap,
B
Blue Swirl 已提交
4858 4859
};

4860 4861 4862 4863 4864 4865
/* Hands the qcow2 driver description (bdrv_qcow2) to bdrv_register(). */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/* NOTE(review): block_init() presumably arranges for bdrv_qcow2_init() to be
 * called during start-up -- confirm against its definition */
block_init(bdrv_qcow2_init);