qcow2.c 47.1 KB
Newer Older
B
bellard 已提交
1 2
/*
 * Block driver for the QCOW version 2 format
3
 *
B
bellard 已提交
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
B
bellard 已提交
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
P
pbrook 已提交
24
#include "qemu-common.h"
B
bellard 已提交
25
#include "block_int.h"
26
#include "module.h"
B
bellard 已提交
27 28
#include <zlib.h>
#include "aes.h"
K
Kevin Wolf 已提交
29
#include "block/qcow2.h"
K
Kevin Wolf 已提交
30
#include "qemu-error.h"
K
Kevin Wolf 已提交
31
#include "qerror.h"
K
Kevin Wolf 已提交
32
#include "trace.h"
B
bellard 已提交
33 34 35 36 37 38 39 40

/*
  Differences with QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters which have a reference count of one have the bit
    QCOW_OFLAG_COPIED to optimize write performance.
41
  - Size of compressed clusters is stored in sectors to reduce bit usage
B
bellard 已提交
42 43
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
44
    snapshots.
B
bellard 已提交
45 46 47 48 49
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/

50 51 52 53 54

/*
 * Fixed-size descriptor that precedes every header extension in the image.
 * Both fields are stored big-endian on disk; the payload that follows is
 * padded so that the next descriptor starts on a multiple of 8 bytes.
 */
typedef struct {
    uint32_t magic;     /* extension type (one of QCOW2_EXT_MAGIC_*) */
    uint32_t len;       /* payload length in bytes, excluding padding */
} QCowExtension;

/* Known extension types */
#define  QCOW2_EXT_MAGIC_END 0
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
58

59
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
B
bellard 已提交
60 61
{
    const QCowHeader *cow_header = (const void *)buf;
62

B
bellard 已提交
63 64
    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
K
Kevin Wolf 已提交
65
        be32_to_cpu(cow_header->version) >= 2)
B
bellard 已提交
66 67 68 69 70
        return 100;
    else
        return 0;
}

71 72 73 74 75 76 77 78

/* 
 * read qcow2 extension and fill bs
 * start reading from start_offset
 * finish reading upon magic of value 0 or when end_offset reached
 * unknown magic is skipped (future extension this version knows nothing about)
 * return 0 upon success, non-0 otherwise
 */
79
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
80
                                 uint64_t end_offset, void **p_feature_table)
81
{
82
    BDRVQcowState *s = bs->opaque;
83 84
    QCowExtension ext;
    uint64_t offset;
85
    int ret;
86 87

#ifdef DEBUG_EXT
88
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
89 90 91 92 93 94 95
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
96
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
97

D
Dong Xu Wang 已提交
98
        printf("attempting to read extended header in offset %lu\n", offset);
99 100
#endif

101
        if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
102
            fprintf(stderr, "qcow2_read_extension: ERROR: "
B
Blue Swirl 已提交
103 104
                    "pread fail from offset %" PRIu64 "\n",
                    offset);
105 106 107 108 109 110 111 112
            return 1;
        }
        be32_to_cpus(&ext.magic);
        be32_to_cpus(&ext.len);
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
113 114 115 116 117
        if (ext.len > end_offset - offset) {
            error_report("Header extension too large");
            return -EINVAL;
        }

118
        switch (ext.magic) {
119
        case QCOW2_EXT_MAGIC_END:
120
            return 0;
121

122
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
123 124
            if (ext.len >= sizeof(bs->backing_format)) {
                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
125
                        " (>=%zu)\n",
126 127 128
                        ext.len, sizeof(bs->backing_format));
                return 2;
            }
129
            if (bdrv_pread(bs->file, offset , bs->backing_format,
130 131 132 133 134 135 136 137
                           ext.len) != ext.len)
                return 3;
            bs->backing_format[ext.len] = '\0';
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            break;

138 139 140 141 142 143 144 145 146 147 148 149
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
                if (ret < 0) {
                    return ret;
                }

                *p_feature_table = feature_table;
            }
            break;

150
        default:
151 152 153 154 155 156 157 158 159 160 161 162 163 164
            /* unknown magic - save it in case we need to rewrite the header */
            {
                Qcow2UnknownHeaderExtension *uext;

                uext = g_malloc0(sizeof(*uext)  + ext.len);
                uext->magic = ext.magic;
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
                if (ret < 0) {
                    return ret;
                }
            }
165 166
            break;
        }
167 168

        offset += ((ext.len + 7) & ~7);
169 170 171 172 173
    }

    return 0;
}

174 175 176 177 178 179 180 181 182 183
/* Unlink and free every entry stored on the s->unknown_header_ext list. */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    Qcow2UnknownHeaderExtension *entry;
    Qcow2UnknownHeaderExtension *following;

    /* Entries are removed while walking, hence the _SAFE iterator */
    QLIST_FOREACH_SAFE(entry, &s->unknown_header_ext, next, following) {
        QLIST_REMOVE(entry, next);
        g_free(entry);
    }
}
184

185 186
/*
 * Report an unsupported qcow2 feature for bs->device_name.
 *
 * fmt and the following arguments are formatted printf-style into a 64 byte
 * buffer (longer messages are truncated) and passed on to qerror_report()
 * as a QERR_UNKNOWN_BLOCK_FORMAT_FEATURE error for the "qcow2" format.
 */
static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
    const char *fmt, ...)
{
    va_list args;
    char text[64];

    va_start(args, fmt);
    vsnprintf(text, sizeof(text), fmt, args);
    va_end(args);

    qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
        bs->device_name, "qcow2", text);
}

199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
/*
 * Report every incompatible feature bit set in mask.
 *
 * table may be NULL; otherwise it is walked until an entry with an empty
 * name is found. Each bit of mask that has a matching entry of type
 * QCOW2_FEAT_TYPE_INCOMPATIBLE is reported by name and cleared. Whatever is
 * left in mask afterwards is reported once as a hexadecimal value.
 */
static void report_unsupported_feature(BlockDriverState *bs,
    Qcow2Feature *table, uint64_t mask)
{
    while (table && table->name[0] != '\0') {
        /*
         * mask is 64 bits wide, so the bit has to be built from a 64 bit
         * constant: shifting a plain int by 31 or more is undefined. Bit
         * numbers outside of the mask cannot match anything and are skipped.
         */
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE && table->bit < 64) {
            uint64_t feature_bit = 1ULL << table->bit;

            if (mask & feature_bit) {
                report_unsupported(bs, "%.46s",table->name);
                mask &= ~feature_bit;
            }
        }
        table++;
    }

    if (mask) {
        report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
    }
}

217
/*
 * Open a qcow2 image: read and validate the header found at offset 0 of
 * bs->file, then set up the driver state in bs->opaque (L1 table, L2 and
 * refcount block caches, cluster buffers, refcounts, header extensions,
 * backing file name and snapshots).
 *
 * flags is stored in s->flags.
 *
 * Returns a negative errno on failure, after releasing everything that was
 * set up so far. On success the result of the last step is returned, which
 * is expected to be 0.
 *
 * All values taken from the header are untrusted: cluster_bits and
 * header_length are validated before they are used in shifts or as
 * allocation sizes.
 */
static int qcow2_open(BlockDriverState *bs, int flags)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, ret = 0;
    QCowHeader header;
    uint64_t ext_end;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        goto fail;
    }
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC) {
        ret = -EINVAL;
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
        report_unsupported(bs, "QCOW version %d", header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

    /*
     * The cluster size is derived from cluster_bits by shifting, so the
     * value must be range checked before its first use below.
     */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
        ret = -EINVAL;
        goto fail;
    }

    /* Initialise version 3 header fields */
    if (header.version == 2) {
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
        be64_to_cpus(&header.incompatible_features);
        be64_to_cpus(&header.compatible_features);
        be64_to_cpus(&header.autoclear_features);
        be32_to_cpus(&header.refcount_order);
        be32_to_cpus(&header.header_length);
    }

    /*
     * The header is limited to the first cluster. Rejecting anything larger
     * also bounds the allocation for the unknown header fields.
     */
    if (header.header_length > (1U << header.cluster_bits)) {
        ret = -EINVAL;
        goto fail;
    }

    /* Keep header fields this version doesn't know for header rewrites */
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Extensions end at the backing file name or at the end of cluster 0 */
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

    if (s->incompatible_features != 0) {
        void *feature_table = NULL;

        /*
         * Best effort: the feature table is only used to give the unknown
         * bits a name, so a failure to read it is not an error of its own.
         */
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table);
        report_unsupported_feature(bs, feature_table,
                                   s->incompatible_features);
        g_free(feature_table);
        ret = -ENOTSUP;
        goto fail;
    }

    /* Check support for various header values */
    if (header.refcount_order != 4) {
        if (header.refcount_order < 31) {
            report_unsupported(bs, "%d bit reference counts",
                               1 << header.refcount_order);
        } else {
            /* The width can't be computed in an int without overflowing */
            report_unsupported(bs, "refcount order %" PRIu32,
                               header.refcount_order);
        }
        ret = -ENOTSUP;
        goto fail;
    }

    if (header.crypt_method > QCOW_CRYPT_AES) {
        ret = -EINVAL;
        goto fail;
    }
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        bs->encrypted = 1;
    }
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    /* read the level 1 table */
    s->l1_size = header.l1_size;
    s->l1_vm_state_index = size_to_l1(s, header.size);
    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        ret = -EINVAL;
        goto fail;
    }
    s->l1_table_offset = header.l1_table_offset;
    if (s->l1_size > 0) {
        s->l1_table = g_malloc0(
            align_offset(s->l1_size * sizeof(uint64_t), 512));
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            goto fail;
        }
        for(i = 0;i < s->l1_size; i++) {
            be64_to_cpus(&s->l1_table[i]);
        }
    }

    /* alloc L2 table/refcount block cache */
    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);

    s->cluster_cache = g_malloc(s->cluster_size);
    /* one more sector for decompressed data alignment */
    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                                  + 512);
    s->cluster_cache_offset = -1;
    s->flags = flags;

    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
        goto fail;
    }

    QLIST_INIT(&s->cluster_allocs);

    /* read qcow2 extensions */
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
        ret = -EINVAL;
        goto fail;
    }

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        /*
         * Clamp in the unsigned domain first: backing_file_size is a 32 bit
         * unsigned value and would turn negative if it was narrowed to int
         * before the comparison. Names are truncated to 1023 bytes.
         */
        if (header.backing_file_size > 1023) {
            len = 1023;
        } else {
            len = header.backing_file_size;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
        if (ret < 0) {
            goto fail;
        }
        bs->backing_file[len] = '\0';
    }

    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        goto fail;
    }

    /* Clear unknown autoclear feature bits */
    if (!bs->read_only && s->autoclear_features != 0) {
        s->autoclear_features = 0;
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);

#ifdef DEBUG_ALLOC
    {
        BdrvCheckResult result = {0};
        qcow2_check_refcounts(bs, &result, 0);
    }
#endif
    return ret;

 fail:
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
    g_free(s->l1_table);
    /* The caches only exist if the failure happened after their creation */
    if (s->l2_table_cache) {
        qcow2_cache_destroy(bs, s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(bs, s->refcount_block_cache);
    }
    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    return ret;
}

437
/*
 * Install the AES key used for encrypted images.
 *
 * At most the first 16 bytes of key are used; shorter keys are padded with
 * zero bytes. s->crypt_method is switched to the method announced in the
 * image header before the AES key schedules are set up.
 *
 * Returns 0 on success and -1 if either key schedule could not be set up.
 */
static int qcow2_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16] = { 0 };
    int key_len;
    int pos;

    key_len = strlen(key);
    if (key_len > 16) {
        key_len = 16;
    }

    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for (pos = 0; pos < key_len; pos++) {
        keybuf[pos] = key[pos];
    }

    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) {
        return -1;
    }
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) {
        return -1;
    }

    return 0;
}

479 480
/*
 * Tell whether the cluster containing sector_num is allocated in this image.
 *
 * *pnum is set to nb_sectors before the lookup and may be adjusted by
 * qcow2_get_cluster_offset(). If the lookup fails, *pnum is set to 0 and
 * the sectors are reported as not allocated.
 *
 * Returns non-zero if the lookup produced a non-zero cluster offset,
 * 0 otherwise.
 */
static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
{
    BDRVQcowState *s = bs->opaque;
    /*
     * Start from "not allocated": the value is read below even when the
     * lookup fails and may not have been written in that case.
     */
    uint64_t cluster_offset = 0;
    int ret;

    *pnum = nb_sectors;
    /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
     * can't pass them on today */
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        *pnum = 0;
    }

    return (cluster_offset != 0);
}

B
bellard 已提交
499
/* handle reading after the end of the backing file */
500 501
int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                  int64_t sector_num, int nb_sectors)
B
bellard 已提交
502 503 504 505 506 507 508 509
{
    int n1;
    if ((sector_num + nb_sectors) <= bs->total_sectors)
        return nb_sectors;
    if (sector_num >= bs->total_sectors)
        n1 = 0;
    else
        n1 = bs->total_sectors - sector_num;
510

511
    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
512

B
bellard 已提交
513 514 515
    return n1;
}

516
/*
 * Read remaining_sectors sectors starting at sector_num into qiov.
 *
 * The request is processed in chunks. For each chunk
 * qcow2_get_cluster_offset() reports how many sectors can be handled at once
 * and which kind of cluster they are in:
 *
 *  - unallocated: read from the backing file, or zeros without one
 *  - zero:        zeros (an error for images older than version 3)
 *  - compressed:  copied out of the decompressed cluster cache
 *  - normal:      read from bs->file, through a bounce buffer that is
 *                 decrypted afterwards if the image is encrypted
 *
 * s->lock is held except while the actual I/O is in flight.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                          int remaining_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    QEMUIOVector chunk_qiov;        /* part of qiov used by this chunk */
    uint8_t *crypt_buf = NULL;      /* bounce buffer for encrypted data */
    uint64_t cluster_offset = 0;
    uint64_t done_bytes = 0;
    int sector_in_cluster;
    int backing_sectors;
    int chunk_sectors;              /* number of sectors in this chunk */
    int chunk_bytes;
    int ret;

    qemu_iovec_init(&chunk_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (remaining_sectors != 0) {

        /* prepare next request */
        chunk_sectors = remaining_sectors;
        if (s->crypt_method) {
            /* Encrypted chunks must fit into the bounce buffer */
            chunk_sectors = MIN(chunk_sectors,
                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
        }

        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
            &chunk_sectors, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

        sector_in_cluster = sector_num & (s->cluster_sectors - 1);
        chunk_bytes = chunk_sectors * 512;

        qemu_iovec_reset(&chunk_qiov);
        qemu_iovec_concat(&chunk_qiov, qiov, done_bytes, chunk_bytes);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (!bs->backing_hd) {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&chunk_qiov, 0, 0, chunk_bytes);
                break;
            }

            /* read from the base image */
            backing_sectors = qcow2_backing_read1(bs->backing_hd, &chunk_qiov,
                sector_num, chunk_sectors);
            if (backing_sectors > 0) {
                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_readv(bs->backing_hd, sector_num,
                                    backing_sectors, &chunk_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            if (s->qcow_version < 3) {
                ret = -EIO;
                goto fail;
            }
            qemu_iovec_memset(&chunk_qiov, 0, 0, chunk_bytes);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, cluster_offset);
            if (ret < 0) {
                goto fail;
            }

            qemu_iovec_from_buf(&chunk_qiov, 0,
                s->cluster_cache + sector_in_cluster * 512,
                chunk_bytes);
            break;

        case QCOW2_CLUSTER_NORMAL:
            if ((cluster_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (s->crypt_method) {
                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!crypt_buf) {
                    crypt_buf =
                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                }

                assert(chunk_sectors <=
                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
                qemu_iovec_reset(&chunk_qiov);
                qemu_iovec_add(&chunk_qiov, crypt_buf, chunk_bytes);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file,
                                (cluster_offset >> 9) + sector_in_cluster,
                                chunk_sectors, &chunk_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (s->crypt_method) {
                /* Decrypt in place, then hand the data to the caller */
                qcow2_encrypt_sectors(s, sector_num,  crypt_buf,
                    crypt_buf, chunk_sectors, 0, &s->aes_decrypt_key);
                qemu_iovec_from_buf(qiov, done_bytes,
                    crypt_buf, chunk_bytes);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        remaining_sectors -= chunk_sectors;
        sector_num += chunk_sectors;
        done_bytes += chunk_bytes;
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&chunk_qiov);
    qemu_vfree(crypt_buf);

    return ret;
}

K
Kevin Wolf 已提交
657
/*
 * Finish the allocating request described by m: take it off the list of
 * running requests (only if it covers any clusters) and restart all requests
 * queued on m->dependent_requests.
 *
 * Called with s->lock held; the lock is dropped while the dependent
 * requests are restarted and taken again before returning.
 */
static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
{
    /* Take the request off the list of running requests */
    if (m->nb_clusters != 0) {
        QLIST_REMOVE(m, next_in_flight);
    }

    if (qemu_co_queue_empty(&m->dependent_requests)) {
        return;
    }

    /* Restart all dependent requests */
    qemu_co_mutex_unlock(&s->lock);
    qemu_co_queue_restart_all(&m->dependent_requests);
    qemu_co_mutex_lock(&s->lock);
}

672
/*
 * Write remaining_sectors sectors from qiov, starting at sector_num.
 *
 * The request is processed in chunks. For each chunk clusters are allocated
 * with qcow2_alloc_cluster_offset(), the data is written to bs->file
 * (encrypted through a bounce buffer if the image is encrypted), the new
 * clusters are linked into the L2 table and requests waiting for this
 * allocation are restarted.
 *
 * s->lock is held except while the data write is in flight.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                           int64_t sector_num,
                           int remaining_sectors,
                           QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    QCowL2Meta l2meta = {
        .nb_clusters = 0,
    };
    QEMUIOVector chunk_qiov;        /* part of qiov used by this chunk */
    uint8_t *crypt_buf = NULL;      /* bounce buffer for encrypted data */
    uint64_t cluster_offset;
    uint64_t host_sector;
    uint64_t done_bytes = 0;
    int sector_in_cluster;
    int end_in_cluster;
    int chunk_sectors;              /* number of sectors in this chunk */
    int ret;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
                                 remaining_sectors);

    qemu_co_queue_init(&l2meta.dependent_requests);

    qemu_iovec_init(&chunk_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (remaining_sectors != 0) {

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        sector_in_cluster = sector_num & (s->cluster_sectors - 1);
        end_in_cluster = sector_in_cluster + remaining_sectors;
        if (s->crypt_method &&
            end_in_cluster > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
            /* Encrypted chunks must fit into the bounce buffer */
            end_in_cluster = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
        }

        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
            sector_in_cluster, end_in_cluster, &chunk_sectors, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        cluster_offset = l2meta.cluster_offset;
        assert((cluster_offset & 511) == 0);
        host_sector = (cluster_offset >> 9) + sector_in_cluster;

        qemu_iovec_reset(&chunk_qiov);
        qemu_iovec_concat(&chunk_qiov, qiov, done_bytes,
            chunk_sectors * 512);

        if (s->crypt_method) {
            if (!crypt_buf) {
                crypt_buf = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
                                                 s->cluster_size);
            }

            assert(chunk_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&chunk_qiov, 0, crypt_buf, chunk_qiov.size);

            qcow2_encrypt_sectors(s, sector_num, crypt_buf,
                crypt_buf, chunk_sectors, 1, &s->aes_encrypt_key);

            /* From here on the encrypted copy is what gets written */
            qemu_iovec_reset(&chunk_qiov);
            qemu_iovec_add(&chunk_qiov, crypt_buf,
                chunk_sectors * 512);
        }

        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        qemu_co_mutex_unlock(&s->lock);
        trace_qcow2_writev_data(qemu_coroutine_self(), host_sector);
        ret = bdrv_co_writev(bs->file, host_sector,
                             chunk_sectors, &chunk_qiov);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        run_dependent_requests(s, &l2meta);

        remaining_sectors -= chunk_sectors;
        sector_num += chunk_sectors;
        done_bytes += chunk_sectors * 512;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), chunk_sectors);
    }
    ret = 0;

fail:
    run_dependent_requests(s, &l2meta);

    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&chunk_qiov);
    qemu_vfree(crypt_buf);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}

780
/*
 * Release everything held by the driver state in bs->opaque.
 *
 * The L2 table and refcount block caches are flushed before they are
 * destroyed; the results of the flushes are not checked.
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcowState *state = bs->opaque;

    g_free(state->l1_table);

    /* Flush both caches first, then tear them down */
    qcow2_cache_flush(bs, state->l2_table_cache);
    qcow2_cache_flush(bs, state->refcount_block_cache);

    qcow2_cache_destroy(bs, state->l2_table_cache);
    qcow2_cache_destroy(bs, state->refcount_block_cache);

    /* Header parts that were only kept for rewriting the header */
    g_free(state->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    /* Cluster buffers */
    g_free(state->cluster_cache);
    qemu_vfree(state->cluster_data);

    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}

800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
/*
 * Drop all cached metadata by closing the image and opening it again with
 * the flags it was originally opened with.
 *
 * Reopening starts from a zeroed driver state, so an active encryption
 * method and its AES keys are saved before and put back afterwards. The
 * result of qcow2_open() is not checked.
 */
static void qcow2_invalidate_cache(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    const int open_flags = s->flags;
    uint32_t saved_method = 0;
    AES_KEY saved_encrypt_key;
    AES_KEY saved_decrypt_key;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    if (s->crypt_method) {
        saved_method = s->crypt_method;
        memcpy(&saved_encrypt_key, &s->aes_encrypt_key,
               sizeof(saved_encrypt_key));
        memcpy(&saved_decrypt_key, &s->aes_decrypt_key,
               sizeof(saved_decrypt_key));
    }

    qcow2_close(bs);

    memset(s, 0, sizeof(BDRVQcowState));
    qcow2_open(bs, open_flags);

    /* The saved keys are only valid if a method was active before */
    if (saved_method) {
        s->crypt_method = saved_method;
        memcpy(&s->aes_encrypt_key, &saved_encrypt_key,
               sizeof(saved_encrypt_key));
        memcpy(&s->aes_decrypt_key, &saved_decrypt_key,
               sizeof(saved_decrypt_key));
    }
}

K
Kevin Wolf 已提交
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849
/*
 * Serialise one header extension into buf.
 *
 * buf:    destination; receives a QCowExtension descriptor followed by the
 *         payload, zero-padded to a multiple of 8 bytes
 * magic:  extension type, stored big-endian
 * s:      payload
 * len:    payload length in bytes, stored big-endian
 * buflen: space available in buf
 *
 * Returns the number of bytes used in buf. If the extension doesn't fit,
 * nothing is written and -ENOSPC is returned; note that the return type is
 * unsigned, so the error is only visible after conversion to a signed type.
 */
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
    size_t len, size_t buflen)
{
    QCowExtension *ext_hdr = (QCowExtension*) buf;
    size_t padded_len;
    size_t ext_len;

    /*
     * A payload that is larger than the whole buffer can never fit. Checking
     * this first keeps the rounding below from wrapping around.
     */
    if (len > buflen) {
        return -ENOSPC;
    }

    padded_len = (len + 7) & ~(size_t)7;
    ext_len = sizeof(QCowExtension) + padded_len;

    if (buflen < ext_len) {
        return -ENOSPC;
    }

    *ext_hdr = (QCowExtension) {
        .magic  = cpu_to_be32(magic),
        .len    = cpu_to_be32(len),
    };
    memcpy(buf + sizeof(QCowExtension), s, len);
    /* Don't let stale buffer contents end up in the padding */
    memset(buf + sizeof(QCowExtension) + len, 0, padded_len - len);

    return ext_len;
}

K
Kevin Wolf 已提交
850
/*
K
Kevin Wolf 已提交
851 852 853 854
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
K
Kevin Wolf 已提交
855 856 857
 *
 * Returns 0 on success, -errno in error cases.
 */
K
Kevin Wolf 已提交
858
int qcow2_update_header(BlockDriverState *bs)
K
Kevin Wolf 已提交
859 860
{
    BDRVQcowState *s = bs->opaque;
K
Kevin Wolf 已提交
861 862 863
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
K
Kevin Wolf 已提交
864
    int ret;
K
Kevin Wolf 已提交
865 866
    uint64_t total_size;
    uint32_t refcount_table_clusters;
K
Kevin Wolf 已提交
867
    size_t header_length;
868
    Qcow2UnknownHeaderExtension *uext;
K
Kevin Wolf 已提交
869

K
Kevin Wolf 已提交
870
    buf = qemu_blockalign(bs, buflen);
K
Kevin Wolf 已提交
871

K
Kevin Wolf 已提交
872 873
    /* Header structure */
    header = (QCowHeader*) buf;
K
Kevin Wolf 已提交
874

K
Kevin Wolf 已提交
875 876 877
    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
K
Kevin Wolf 已提交
878 879
    }

K
Kevin Wolf 已提交
880
    header_length = sizeof(*header) + s->unknown_header_fields_size;
K
Kevin Wolf 已提交
881 882 883 884
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
K
Kevin Wolf 已提交
885
        /* Version 2 fields */
K
Kevin Wolf 已提交
886
        .magic                  = cpu_to_be32(QCOW_MAGIC),
K
Kevin Wolf 已提交
887
        .version                = cpu_to_be32(s->qcow_version),
K
Kevin Wolf 已提交
888 889 890 891 892 893 894 895 896 897 898
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
K
Kevin Wolf 已提交
899 900 901 902 903 904 905

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(3 + REFCOUNT_SHIFT),
        .header_length          = cpu_to_be32(header_length),
K
Kevin Wolf 已提交
906
    };
K
Kevin Wolf 已提交
907

K
Kevin Wolf 已提交
908 909 910 911 912 913 914 915 916
    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
917 918
        ret = -EINVAL;
        goto fail;
K
Kevin Wolf 已提交
919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935
    }

    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }
K
Kevin Wolf 已提交
936

K
Kevin Wolf 已提交
937 938 939 940 941 942 943
    /* Backing file format header extension */
    if (*bs->backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             bs->backing_format, strlen(bs->backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
K
Kevin Wolf 已提交
944 945
        }

K
Kevin Wolf 已提交
946 947
        buf += ret;
        buflen -= ret;
K
Kevin Wolf 已提交
948 949
    }

950 951 952 953 954 955 956 957 958 959 960 961 962
    /* Feature table */
    Qcow2Feature features[] = {
        /* no feature defined yet */
    };

    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                         features, sizeof(features), buflen);
    if (ret < 0) {
        goto fail;
    }
    buf += ret;
    buflen -= ret;

963 964 965 966 967 968 969 970 971 972 973
    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

K
Kevin Wolf 已提交
974 975
    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
K
Kevin Wolf 已提交
976 977 978 979
    if (ret < 0) {
        goto fail;
    }

K
Kevin Wolf 已提交
980 981
    buf += ret;
    buflen -= ret;
K
Kevin Wolf 已提交
982

K
Kevin Wolf 已提交
983 984 985 986 987 988 989 990 991 992 993 994 995
    /* Backing file name */
    if (*bs->backing_file) {
        size_t backing_file_len = strlen(bs->backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        strncpy(buf, bs->backing_file, buflen);

        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
K
Kevin Wolf 已提交
996 997
    }

K
Kevin Wolf 已提交
998 999
    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
K
Kevin Wolf 已提交
1000 1001 1002 1003 1004 1005
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
K
Kevin Wolf 已提交
1006
    qemu_vfree(header);
K
Kevin Wolf 已提交
1007 1008 1009 1010 1011 1012
    return ret;
}

/*
 * Records a new backing file name and backing format in @bs and rewrites the
 * image header accordingly. A NULL @backing_file or @backing_fmt is stored as
 * an empty string.
 *
 * Returns the result of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    const char *new_file = backing_file ? backing_file : "";
    const char *new_fmt = backing_fmt ? backing_fmt : "";

    pstrcpy(bs->backing_file, sizeof(bs->backing_file), new_file);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), new_fmt);

    return qcow2_update_header(bs);
}

K
Kevin Wolf 已提交
1019 1020 1021 1022 1023
/*
 * Allocates clusters and links them into the L2 tables for the whole virtual
 * disk of @bs, then extends the image file so that it covers the last
 * allocated sector.
 *
 * Returns 0 on success and a negative value on failure. The callers in this
 * file hold s->lock around this function.
 */
static int preallocate(BlockDriverState *bs)
{
    int64_t length;
    uint64_t nb_sectors;
    uint64_t offset;
    int num = 0;
    int ret;
    QCowL2Meta meta;

    /*
     * A failed length query must not be turned into a sector count: shifted
     * and stored in an unsigned variable it would look like a huge disk.
     */
    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    nb_sectors = length >> 9;
    offset = 0;
    qemu_co_queue_init(&meta.dependent_requests);
    meta.cluster_offset = 0;

    while (nb_sectors) {
        /* Handle at most INT_MAX bytes per iteration; num is updated to the
         * number of sectors that were actually covered */
        num = MIN(nb_sectors, INT_MAX >> 9);
        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
        if (ret < 0) {
            return ret;
        }

        ret = qcow2_alloc_cluster_link_l2(bs, &meta);
        if (ret < 0) {
            qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
            return ret;
        }

        /* There are no dependent requests, but we need to remove our request
         * from the list of in-flight requests */
        run_dependent_requests(bs->opaque, &meta);

        /* TODO Preallocate data if requested */

        nb_sectors -= num;
        offset += num << 9;
    }

    /*
     * It is expected that the image file is large enough to actually contain
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
    if (meta.cluster_offset != 0) {
        uint8_t buf[512];
        memset(buf, 0, 512);
        ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

1072 1073 1074
static int qcow2_create2(const char *filename, int64_t total_size,
                         const char *backing_file, const char *backing_format,
                         int flags, size_t cluster_size, int prealloc,
K
Kevin Wolf 已提交
1075
                         QEMUOptionParameter *options, int version)
K
Kevin Wolf 已提交
1076
{
D
Dong Xu Wang 已提交
1077
    /* Calculate cluster_bits */
K
Kevin Wolf 已提交
1078 1079 1080 1081 1082 1083
    int cluster_bits;
    cluster_bits = ffs(cluster_size) - 1;
    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
        (1 << cluster_bits) != cluster_size)
    {
        error_report(
1084
            "Cluster size must be a power of two between %d and %dk",
K
Kevin Wolf 已提交
1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118
            1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
        return -EINVAL;
    }

    /*
     * Open the image file and write a minimal qcow2 header.
     *
     * We keep things simple and start with a zero-sized image. We also
     * do without refcount blocks or a L1 table for now. We'll fix the
     * inconsistency later.
     *
     * We do need a refcount table because growing the refcount table means
     * allocating two new refcount blocks - the seconds of which would be at
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
    BlockDriverState* bs;
    QCowHeader header;
    uint8_t* refcount_table;
    int ret;

    ret = bdrv_create_file(filename, options);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
    if (ret < 0) {
        return ret;
    }

    /* Write the header */
    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
K
Kevin Wolf 已提交
1119
    header.version = cpu_to_be32(version);
K
Kevin Wolf 已提交
1120 1121 1122 1123 1124 1125
    header.cluster_bits = cpu_to_be32(cluster_bits);
    header.size = cpu_to_be64(0);
    header.l1_table_offset = cpu_to_be64(0);
    header.l1_size = cpu_to_be32(0);
    header.refcount_table_offset = cpu_to_be64(cluster_size);
    header.refcount_table_clusters = cpu_to_be32(1);
K
Kevin Wolf 已提交
1126 1127
    header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
    header.header_length = cpu_to_be32(sizeof(header));
K
Kevin Wolf 已提交
1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140

    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }

    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
    if (ret < 0) {
        goto out;
    }

    /* Write an empty refcount table */
1141
    refcount_table = g_malloc0(cluster_size);
K
Kevin Wolf 已提交
1142
    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1143
    g_free(refcount_table);
K
Kevin Wolf 已提交
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157

    if (ret < 0) {
        goto out;
    }

    bdrv_close(bs);

    /*
     * And now open the image and make it consistent first (i.e. increase the
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
    BlockDriver* drv = bdrv_find_format("qcow2");
    assert(drv != NULL);
1158 1159
    ret = bdrv_open(bs, filename,
        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
K
Kevin Wolf 已提交
1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188
    if (ret < 0) {
        goto out;
    }

    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
    if (ret < 0) {
        goto out;

    } else if (ret != 0) {
        error_report("Huh, first cluster in empty image is already in use?");
        abort();
    }

    /* Okay, now that we have a valid image, let's give it the right size */
    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto out;
    }

    /* Want a backing file? There you go.*/
    if (backing_file) {
        ret = bdrv_change_backing_file(bs, backing_file, backing_format);
        if (ret < 0) {
            goto out;
        }
    }

    /* And if we're supposed to preallocate metadata, do that now */
    if (prealloc) {
Z
Zhi Yong Wu 已提交
1189 1190
        BDRVQcowState *s = bs->opaque;
        qemu_co_mutex_lock(&s->lock);
K
Kevin Wolf 已提交
1191
        ret = preallocate(bs);
Z
Zhi Yong Wu 已提交
1192
        qemu_co_mutex_unlock(&s->lock);
K
Kevin Wolf 已提交
1193 1194 1195 1196 1197 1198 1199 1200 1201 1202
        if (ret < 0) {
            goto out;
        }
    }

    ret = 0;
out:
    bdrv_delete(bs);
    return ret;
}
K
Kevin Wolf 已提交
1203

1204
/*
 * bdrv_create callback: translates the creation options into the arguments
 * of qcow2_create2().
 *
 * Defaults when an option is absent: size 0, no backing file, no encryption,
 * DEFAULT_CLUSTER_SIZE, no preallocation, version 2.
 *
 * Returns -EINVAL for an unknown preallocation mode or compatibility level
 * and when a backing file is combined with preallocation; otherwise the
 * result of qcow2_create2().
 */
static int qcow2_create(const char *filename, QEMUOptionParameter *options)
{
    const char *backing_file = NULL;
    const char *backing_fmt = NULL;
    uint64_t sectors = 0;
    size_t cluster_size = DEFAULT_CLUSTER_SIZE;
    int flags = 0;
    int prealloc = 0;
    int version = 2;

    /* Read out options */
    for (; options && options->name; options++) {
        const char *name = options->name;

        if (!strcmp(name, BLOCK_OPT_SIZE)) {
            sectors = options->value.n / 512;
        } else if (!strcmp(name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(name, BLOCK_OPT_BACKING_FMT)) {
            backing_fmt = options->value.s;
        } else if (!strcmp(name, BLOCK_OPT_ENCRYPT)) {
            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
        } else if (!strcmp(name, BLOCK_OPT_CLUSTER_SIZE)) {
            /* A value of 0 keeps the default */
            if (options->value.n) {
                cluster_size = options->value.n;
            }
        } else if (!strcmp(name, BLOCK_OPT_PREALLOC)) {
            if (!options->value.s || !strcmp(options->value.s, "off")) {
                prealloc = 0;
            } else if (!strcmp(options->value.s, "metadata")) {
                prealloc = 1;
            } else {
                fprintf(stderr, "Invalid preallocation mode: '%s'\n",
                    options->value.s);
                return -EINVAL;
            }
        } else if (!strcmp(name, BLOCK_OPT_COMPAT_LEVEL)) {
            if (!options->value.s || !strcmp(options->value.s, "0.10")) {
                version = 2;
            } else if (!strcmp(options->value.s, "1.1")) {
                version = 3;
            } else {
                fprintf(stderr, "Invalid compatibility level: '%s'\n",
                    options->value.s);
                return -EINVAL;
            }
        }
    }

    if (backing_file && prealloc) {
        fprintf(stderr, "Backing file and preallocation cannot be used at "
            "the same time\n");
        return -EINVAL;
    }

    /*
     * Note that options has been advanced by the loop above: it is either
     * NULL or points at the terminating entry, so qcow2_create2() is handed
     * an empty option list.
     * NOTE(review): presumably intended, as qcow2_create2() starts out with
     * a zero-sized image file - confirm before changing.
     */
    return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
                         cluster_size, prealloc, options, version);
}

1262
/*
 * bdrv_make_empty callback. The actual implementation is compiled out
 * because it is marked as not correct, so this currently does nothing and
 * reports success.
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    /* Clear the L1 table in memory and on disk */
    memset(s->l1_table, 0, l1_length);
    ret = bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length);
    if (ret < 0) {
        return -1;
    }

    /* Cut the image file off right behind the L1 table */
    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0) {
        return ret;
    }

    l2_cache_reset(bs);
#endif
    return 0;
}

K
Kevin Wolf 已提交
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301
/*
 * bdrv_co_write_zeroes callback: marks @nb_sectors sectors starting at
 * @sector_num as zero using qcow2_zero_clusters().
 *
 * Only requests whose start and length are both multiples of the cluster
 * size are handled; anything else is rejected with -ENOTSUP.
 *
 * Returns the result of qcow2_zero_clusters() otherwise.
 */
static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    /* Emulate misaligned zero writes */
    if (sector_num % s->cluster_sectors != 0) {
        return -ENOTSUP;
    }
    if (nb_sectors % s->cluster_sectors != 0) {
        return -ENOTSUP;
    }

    /* Whatever is left can use real zero clusters */
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

1302 1303
/*
 * bdrv_co_discard callback: discards @nb_sectors sectors starting at
 * @sector_num by calling qcow2_discard_clusters() with s->lock held.
 *
 * Returns the result of qcow2_discard_clusters().
 */
static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int64_t byte_offset = sector_num << BDRV_SECTOR_BITS;
    int ret;

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_discard_clusters(bs, byte_offset, nb_sectors);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

1315 1316 1317 1318 1319 1320
/*
 * bdrv_truncate callback: grows the virtual disk to @offset bytes.
 *
 * The request is refused with -EINVAL if @offset is not a multiple of 512,
 * and with -ENOTSUP if the image has snapshots or if @offset is smaller than
 * the current size. Otherwise the L1 table is grown and the size field of
 * the on-disk header is updated.
 *
 * Returns 0 on success, a negative value on failure.
 */
static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t be_size;
    int new_l1_size;
    int ret;

    if ((offset & 511) != 0) {
        error_report("The new size must be a multiple of 512");
        return -EINVAL;
    }

    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
        error_report("Can't resize an image which has snapshots");
        return -ENOTSUP;
    }

    /* shrinking is currently not supported */
    if (offset < bs->total_sectors * 512) {
        error_report("qcow2 doesn't support shrinking images yet");
        return -ENOTSUP;
    }

    new_l1_size = size_to_l1(s, offset);
    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
    if (ret < 0) {
        return ret;
    }

    /* write updated header.size (stored big-endian) */
    be_size = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &be_size, sizeof(uint64_t));
    if (ret < 0) {
        return ret;
    }

    s->l1_vm_state_index = new_l1_size;
    return 0;
}

B
Blue Swirl 已提交
1355 1356
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
1357 1358
static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors)
B
Blue Swirl 已提交
1359 1360 1361 1362 1363 1364 1365 1366 1367 1368
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
1369
        cluster_offset = bdrv_getlength(bs->file);
B
Blue Swirl 已提交
1370
        cluster_offset = (cluster_offset + 511) & ~511;
1371
        bdrv_truncate(bs->file, cluster_offset);
B
Blue Swirl 已提交
1372 1373 1374 1375 1376 1377
        return 0;
    }

    if (nb_sectors != s->cluster_sectors)
        return -EINVAL;

1378
    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
B
Blue Swirl 已提交
1379 1380 1381 1382 1383 1384 1385

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
1386 1387
        ret = -EINVAL;
        goto fail;
B
Blue Swirl 已提交
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
1398 1399
        ret = -EINVAL;
        goto fail;
B
Blue Swirl 已提交
1400 1401 1402 1403 1404 1405 1406
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
1407 1408 1409 1410
        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
B
Blue Swirl 已提交
1411 1412 1413
    } else {
        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
            sector_num << 9, out_len);
1414 1415 1416 1417
        if (!cluster_offset) {
            ret = -EIO;
            goto fail;
        }
B
Blue Swirl 已提交
1418
        cluster_offset &= s->cluster_offset_mask;
1419
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1420 1421 1422
        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
B
Blue Swirl 已提交
1423 1424 1425
        }
    }

1426 1427
    ret = 0;
fail:
1428
    g_free(out_buf);
1429
    return ret;
B
Blue Swirl 已提交
1430 1431
}

1432
/*
 * bdrv_co_flush_to_os callback: flushes the L2 table cache and then the
 * refcount block cache with s->lock held. The refcount block cache is only
 * flushed if flushing the L2 table cache did not fail.
 *
 * Returns 0 on success, or the negative value of the flush that failed.
 */
static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    qemu_co_mutex_lock(&s->lock);

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret >= 0) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    }

    qemu_co_mutex_unlock(&s->lock);

    return ret < 0 ? ret : 0;
}

1454
/*
 * Returns the offset used as the base for VM state accesses:
 * l1_vm_state_index shifted left by (cluster_bits + l2_bits).
 */
static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
{
    const int shift = s->cluster_bits + s->l2_bits;

    return (int64_t)s->l1_vm_state_index << shift;
}

1459
/*
 * bdrv_get_info callback: fills in the cluster size and the VM state offset
 * of @bdi. Always returns 0.
 */
static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;

    bdi->vm_state_offset = qcow2_vm_state_offset(s);
    bdi->cluster_size = s->cluster_size;

    return 0;
}


1468 1469
/*
 * bdrv_check callback: forwards all arguments to qcow2_check_refcounts() and
 * returns its result.
 */
static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
                       BdrvCheckMode fix)
{
    int ret = qcow2_check_refcounts(bs, result, fix);

    return ret;
}

#if 0
/*
 * Debugging aid (compiled out): prints one line per run of consecutive
 * clusters that share the same refcount, giving the first cluster of the
 * run, the refcount and the number of clusters in the run.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* k is already one past the run here, so label it with its start */
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k1, refcount,
               k - k1);
    }
}
#endif

1495 1496
/*
 * bdrv_save_vmstate callback: writes @size bytes from @buf at position @pos
 * relative to the VM state offset.
 *
 * bs->growable is set for the duration of the write and restored to its
 * previous value afterwards.
 *
 * Returns the result of bdrv_pwrite().
 */
static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                              int64_t pos, int size)
{
    BDRVQcowState *s = bs->opaque;
    const int saved_growable = bs->growable;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);

    bs->growable = 1;
    ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
    bs->growable = saved_growable;

    return ret;
}

1510 1511
/*
 * bdrv_load_vmstate callback: reads @size bytes into @buf from position @pos
 * relative to the VM state offset.
 *
 * bs->growable is set for the duration of the read and restored to its
 * previous value afterwards.
 *
 * Returns the result of bdrv_pread().
 */
static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                              int64_t pos, int size)
{
    BDRVQcowState *s = bs->opaque;
    const int saved_growable = bs->growable;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);

    bs->growable = 1;
    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
    bs->growable = saved_growable;

    return ret;
}

1525
static QEMUOptionParameter qcow2_create_options[] = {
B
Blue Swirl 已提交
1526 1527 1528 1529 1530
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
K
Kevin Wolf 已提交
1531 1532 1533 1534 1535
    {
        .name = BLOCK_OPT_COMPAT_LEVEL,
        .type = OPT_STRING,
        .help = "Compatibility level (0.10 or 1.1)"
    },
B
Blue Swirl 已提交
1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_BACKING_FMT,
        .type = OPT_STRING,
        .help = "Image format of the base image"
    },
    {
        .name = BLOCK_OPT_ENCRYPT,
        .type = OPT_FLAG,
        .help = "Encrypt the image"
    },
    {
        .name = BLOCK_OPT_CLUSTER_SIZE,
        .type = OPT_SIZE,
1554 1555
        .help = "qcow2 cluster size",
        .value = { .n = DEFAULT_CLUSTER_SIZE },
B
Blue Swirl 已提交
1556 1557 1558 1559 1560 1561 1562 1563 1564 1565
    },
    {
        .name = BLOCK_OPT_PREALLOC,
        .type = OPT_STRING,
        .help = "Preallocation mode (allowed values: off, metadata)"
    },
    { NULL }
};

static BlockDriver bdrv_qcow2 = {
1566 1567 1568 1569 1570 1571
    .format_name        = "qcow2",
    .instance_size      = sizeof(BDRVQcowState),
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
    .bdrv_create        = qcow2_create,
1572
    .bdrv_co_is_allocated = qcow2_co_is_allocated,
1573 1574 1575
    .bdrv_set_key       = qcow2_set_key,
    .bdrv_make_empty    = qcow2_make_empty,

1576 1577
    .bdrv_co_readv          = qcow2_co_readv,
    .bdrv_co_writev         = qcow2_co_writev,
K
Kevin Wolf 已提交
1578
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
1579

K
Kevin Wolf 已提交
1580
    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
1581
    .bdrv_co_discard        = qcow2_co_discard,
1582
    .bdrv_truncate          = qcow2_truncate,
1583
    .bdrv_write_compressed  = qcow2_write_compressed,
B
Blue Swirl 已提交
1584 1585 1586 1587 1588

    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
E
edison 已提交
1589
    .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
1590
    .bdrv_get_info      = qcow2_get_info,
B
Blue Swirl 已提交
1591

1592 1593
    .bdrv_save_vmstate    = qcow2_save_vmstate,
    .bdrv_load_vmstate    = qcow2_load_vmstate,
B
Blue Swirl 已提交
1594 1595 1596

    .bdrv_change_backing_file   = qcow2_change_backing_file,

1597 1598
    .bdrv_invalidate_cache      = qcow2_invalidate_cache,

1599 1600
    .create_options = qcow2_create_options,
    .bdrv_check = qcow2_check,
B
Blue Swirl 已提交
1601 1602
};

1603 1604 1605 1606 1607 1608
/* Registers the qcow2 driver description with the block layer. */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/* Hook the registration function into the block module initialisation. */
block_init(bdrv_qcow2_init);