qcow2-cluster.c 58.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <zlib.h>

#include "qemu-common.h"
28
#include "block/block_int.h"
29
#include "block/qcow2.h"
K
Kevin Wolf 已提交
30
#include "trace.h"
31

32 33
/*
 * qcow2_grow_l1_table
 *
 * Makes sure the L1 table has room for at least min_size entries.
 *
 * If exact_size is true the table is resized to exactly min_size entries;
 * otherwise the current size is repeatedly grown by about 50% until it is
 * large enough, to reduce the number of future resizes.
 *
 * The enlarged table is first written to newly allocated clusters. Only
 * after that write succeeded are the l1_size and L1 table offset fields of
 * the image header rewritten, the in-memory state switched over and the
 * clusters of the old table freed.
 *
 * Returns 0 on success (also when the table is already large enough),
 * -EFBIG if the requested size is too large, -ENOMEM if the new table
 * cannot be allocated in memory, or the negative value reported by the
 * failing cluster allocation, cache flush, overlap check or write.
 */
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size) {
        return 0;
    }

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }

    /* The size in bytes is kept in an int below, so it must fit */
    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
                                       align_offset(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

    /* Only copy when there are old entries: with an empty table the old
     * pointer may be NULL, and memcpy() from a NULL pointer is undefined
     * behaviour even for a length of zero. */
    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    /* Convert the copied entries to big endian for the on-disk write; the
     * remaining entries are zero and need no conversion. */
    for (i = 0; i < s->l1_size; i++) {
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    }
    ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0) {
        goto fail;
    }
    /* Back to host byte order for the in-memory copy */
    for (i = 0; i < s->l1_size; i++) {
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
    }

    /* set new table: 4 bytes of big-endian size followed by 8 bytes of
     * big-endian table offset, written at the header's l1_size field */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    cpu_to_be32w((uint32_t *)data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* Switch the in-memory state over and release the old table */
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;

 fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * Loads a L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * Returns a pointer to the L2 table on success, or NULL if the read from
 * the image file failed.
 */

150 151
static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
    uint64_t **l2_table)
152
{
153
    BDRVQcow2State *s = bs->opaque;
154
    int ret;
155

K
Kevin Wolf 已提交
156
    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
157

K
Kevin Wolf 已提交
158
    return ret;
159 160
}

K
Kevin Wolf 已提交
161 162 163 164 165
/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pwrite to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
166
/*
 * Writes the L1_ENTRIES_PER_SECTOR-aligned group of L1 entries that contains
 * l1_index to the image file. Entries beyond the end of the in-memory table
 * are written as zero.
 *
 * Returns 0 on success, or the negative value returned by the overlap check
 * or by the write.
 */
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t sector[L1_ENTRIES_PER_SECTOR] = { 0 };
    /* Index of the first entry in the group that contains l1_index */
    int first = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    int k, rc;

    /* Fill the buffer in on-disk (big endian) byte order */
    for (k = 0; k < L1_ENTRIES_PER_SECTOR; k++) {
        if (first + k >= s->l1_size) {
            break;
        }
        sector[k] = cpu_to_be64(s->l1_table[first + k]);
    }

    rc = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
                                       s->l1_table_offset + 8 * first,
                                       sizeof(sector));
    if (rc < 0) {
        return rc;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    rc = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset + 8 * first,
                          sector, sizeof(sector));

    return rc < 0 ? rc : 0;
}

197 198 199 200 201 202 203 204 205 206
/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 */

207
static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
208
{
209
    BDRVQcow2State *s = bs->opaque;
K
Kevin Wolf 已提交
210
    uint64_t old_l2_offset;
211
    uint64_t *l2_table = NULL;
K
Kevin Wolf 已提交
212
    int64_t l2_offset;
213
    int ret;
214 215 216

    old_l2_offset = s->l1_table[l1_index];

K
Kevin Wolf 已提交
217 218
    trace_qcow2_l2_allocate(bs, l1_index);

219 220
    /* allocate a new l2 entry */

K
Kevin Wolf 已提交
221
    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
222
    if (l2_offset < 0) {
223 224
        ret = l2_offset;
        goto fail;
225
    }
K
Kevin Wolf 已提交
226 227 228 229 230

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }
231 232 233

    /* allocate a new entry in the l2 cache */

K
Kevin Wolf 已提交
234
    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
K
Kevin Wolf 已提交
235 236
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
237
        goto fail;
K
Kevin Wolf 已提交
238 239 240
    }

    l2_table = *table;
241

242
    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
243 244 245
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
K
Kevin Wolf 已提交
246 247
        uint64_t* old_table;

248
        /* if there was an old l2 table, read it from the disk */
249
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
250 251
        ret = qcow2_cache_get(bs, s->l2_table_cache,
            old_l2_offset & L1E_OFFSET_MASK,
K
Kevin Wolf 已提交
252 253 254 255 256 257 258
            (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

259
        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
260
    }
K
Kevin Wolf 已提交
261

262
    /* write the l2 table to the file */
263
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
K
Kevin Wolf 已提交
264

K
Kevin Wolf 已提交
265
    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
266
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
K
Kevin Wolf 已提交
267
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
268
    if (ret < 0) {
269 270 271 272
        goto fail;
    }

    /* update the L1 entry */
K
Kevin Wolf 已提交
273
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
274
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
275
    ret = qcow2_write_l1_entry(bs, l1_index);
276 277
    if (ret < 0) {
        goto fail;
278
    }
279

280
    *table = l2_table;
K
Kevin Wolf 已提交
281
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
282
    return 0;
283 284

fail:
K
Kevin Wolf 已提交
285
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
286 287 288
    if (l2_table != NULL) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    }
289
    s->l1_table[l1_index] = old_l2_offset;
290 291 292 293
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
294
    return ret;
295 296
}

297 298 299 300 301 302 303
/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous. (This allows it, for example, to stop at the first compressed
 * cluster which may require a different handling)
 */
304
static int count_contiguous_clusters(int nb_clusters, int cluster_size,
305
        uint64_t *l2_table, uint64_t stop_flags)
306 307
{
    int i;
308
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
309 310
    uint64_t first_entry = be64_to_cpu(l2_table[0]);
    uint64_t offset = first_entry & mask;
311 312 313 314

    if (!offset)
        return 0;

315
    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
316

317
    for (i = 0; i < nb_clusters; i++) {
318 319
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
320
            break;
321 322
        }
    }
323

324
	return i;
325 326
}

327 328 329
/*
 * Returns the length of the leading run of entries in l2_table (looking at
 * no more than nb_clusters entries) whose cluster type equals wanted_type.
 */
static int count_contiguous_clusters_by_type(int nb_clusters,
                                             uint64_t *l2_table,
                                             int wanted_type)
{
    int count = 0;

    while (count < nb_clusters) {
        int entry_type = qcow2_get_cluster_type(be64_to_cpu(l2_table[count]));

        if (entry_type != wanted_type) {
            break;
        }
        count++;
    }

    return count;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
347
int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
348 349 350
                          uint8_t *out_buf, const uint8_t *in_buf,
                          int nb_sectors, bool enc,
                          Error **errp)
351 352 353 354 355 356
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;
357
    int ret;
358 359 360 361

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
        if (qcrypto_cipher_setiv(s->cipher,
                                 ivec.b, G_N_ELEMENTS(ivec.b),
                                 errp) < 0) {
            return -1;
        }
        if (enc) {
            ret = qcrypto_cipher_encrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        } else {
            ret = qcrypto_cipher_decrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        }
        if (ret < 0) {
            return -1;
        }
383 384 385 386
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
387
    return 0;
388 389
}

390 391 392 393
/*
 * Copies sectors [n_start, n_end), counted relative to guest sector
 * start_sect, into the host cluster at cluster_offset: the data is read
 * through this driver's own read function, encrypted if the image is
 * encrypted, and written to the image file at the same relative position
 * within the cluster.
 *
 * Returns 0 on success or if the range is empty, -ENOMEM if the bounce
 * buffer cannot be allocated, -ENOMEDIUM if bs has no driver, -EIO if
 * encryption fails, or the negative value of the failing read, overlap
 * check or write.
 */
static int coroutine_fn copy_sectors(BlockDriverState *bs,
                                     uint64_t start_sect,
                                     uint64_t cluster_offset,
                                     int n_start, int n_end)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
    int count = n_end - n_start;
    int ret;

    if (count <= 0) {
        return 0;
    }

    iov.iov_len = count * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
    if (iov.iov_base == NULL) {
        return -ENOMEM;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        ret = -ENOMEDIUM;
        goto done;
    }

    /* Call .bdrv_co_readv() directly instead of using the public block-layer
     * interface.  This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, count, &qiov);
    if (ret < 0) {
        goto done;
    }

    if (bs->encrypted) {
        Error *err = NULL;

        assert(s->cipher);
        /* Encrypt in place; the detailed error is dropped, only -EIO is
         * reported to the caller */
        if (qcow2_encrypt_sectors(s, start_sect + n_start,
                                  iov.iov_base, iov.iov_base, count,
                                  true, &err) < 0) {
            ret = -EIO;
            error_free(err);
            goto done;
        }
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + n_start * BDRV_SECTOR_SIZE,
            count * BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto done;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, count,
                         &qiov);
    if (ret >= 0) {
        ret = 0;
    }

done:
    qemu_vfree(iov.iov_base);
    return ret;
}


/*
 * get_cluster_offset
 *
464 465
 * For a given offset of the disk image, find the cluster offset in
 * qcow2 file. The offset is stored in *cluster_offset.
466
 *
467
 * on entry, *num is the number of contiguous sectors we'd like to
468 469
 * access following offset.
 *
470
 * on exit, *num is the number of contiguous sectors we can read.
471
 *
472 473
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
474
 */
475 476
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *cluster_offset)
477
{
478
    BDRVQcow2State *s = bs->opaque;
479 480
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
481
    int l1_bits, c;
482 483
    unsigned int index_in_cluster, nb_clusters;
    uint64_t nb_available, nb_needed;
484
    int ret;
485 486 487 488 489 490 491 492 493 494

    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */

495
    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
496 497 498 499 500 501 502 503

    /* compute the number of available sectors */

    nb_available = (nb_available >> 9) + index_in_cluster;

    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }
504
    assert(nb_needed <= INT_MAX);
505

506
    *cluster_offset = 0;
507

508
    /* seek to the l2 offset in the l1 table */
509 510

    l1_index = offset >> l1_bits;
511 512
    if (l1_index >= s->l1_size) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
513
        goto out;
514
    }
515

516 517 518
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
519
        goto out;
520
    }
521

522 523 524 525 526 527 528
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

529 530
    /* load the l2 table in memory */

531 532 533
    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
534
    }
535 536 537 538

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
539
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
540 541

    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
542 543
    nb_clusters = size_to_clusters(s, nb_needed << 9);

544 545 546 547 548 549 550
    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
551
    case QCOW2_CLUSTER_ZERO:
552
        if (s->qcow_version < 3) {
553 554 555 556 557
            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                    " in pre-v3 image (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
558
        }
559 560
        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
                                              QCOW2_CLUSTER_ZERO);
561 562
        *cluster_offset = 0;
        break;
563
    case QCOW2_CLUSTER_UNALLOCATED:
564
        /* how many empty clusters ? */
565 566
        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
                                              QCOW2_CLUSTER_UNALLOCATED);
567 568 569
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
570 571
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
572
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
573
        *cluster_offset &= L2E_OFFSET_MASK;
574 575 576 577 578 579 580 581
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", *cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
582
        break;
K
Kevin Wolf 已提交
583 584
    default:
        abort();
585 586
    }

K
Kevin Wolf 已提交
587 588
    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

589 590
    nb_available = (c * s->cluster_sectors);

591 592 593 594 595 596
out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    *num = nb_available - index_in_cluster;

597
    return ret;
598 599 600 601

fail:
    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
    return ret;
602 603 604 605 606 607 608 609 610 611 612
}

/*
 * get_cluster_table
 *
 * For a given disk offset, loads the L2 table that covers it, growing the
 * L1 table and/or allocating a new L2 table first where necessary.
 *
 * The L2 table and the index of the cluster's entry within it are returned
 * through *new_l2_table and *new_l2_index.
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *loaded_table = NULL;
    uint64_t l1_index, old_l2_offset;
    int ret;

    /* Find the L1 entry, extending the L1 table if the offset is beyond it */
    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    old_l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, old_l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                old_l2_offset, l1_index);
        return -EIO;
    }

    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &loaded_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (old_l2_offset) {
            qcow2_free_clusters(bs, old_l2_offset,
                                s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    } else {
        /* The existing L2 table can be used as is: load it */
        ret = l2_load(bs, old_l2_offset, &loaded_table);
        if (ret < 0) {
            return ret;
        }
    }

    /* Hand back the table and the entry index for the given disk offset */
    *new_l2_table = loaded_table;
    *new_l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful,
 * Return 0, otherwise.
 *
 */

K
Kevin Wolf 已提交
689 690 691
uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
692
{
693
    BDRVQcow2State *s = bs->opaque;
694
    int l2_index, ret;
695
    uint64_t *l2_table;
K
Kevin Wolf 已提交
696
    int64_t cluster_offset;
697 698
    int nb_csectors;

699
    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
700
    if (ret < 0) {
701
        return 0;
702
    }
703

704 705
    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
706
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
707
    if (cluster_offset & L2E_OFFSET_MASK) {
708 709 710
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }
711

K
Kevin Wolf 已提交
712
    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
713
    if (cluster_offset < 0) {
K
Kevin Wolf 已提交
714
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
715 716 717
        return 0;
    }

718 719 720 721 722 723 724 725 726 727
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

728
    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
729
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
730
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
731
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
732

K
Kevin Wolf 已提交
733
    return cluster_offset;
734 735
}

K
Kevin Wolf 已提交
736 737
/*
 * Copies the sectors described by COW region r of allocation m into the
 * newly allocated cluster(s) at m->alloc_offset. s->lock is released while
 * the copy runs and re-acquired afterwards.
 *
 * Returns 0 on success or if the region is empty, otherwise the negative
 * value returned by copy_sectors().
 */
static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcow2State *s = bs->opaque;
    int ret = 0;

    if (r->nb_sectors != 0) {
        qemu_co_mutex_unlock(&s->lock);
        ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
                           r->offset / BDRV_SECTOR_SIZE,
                           r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
        qemu_co_mutex_lock(&s->lock);

        if (ret >= 0) {
            /*
             * Before we update the L2 table to actually point to the new
             * cluster, we need to be sure that the refcounts have been
             * increased and COW was handled.
             */
            qcow2_cache_depends_on_flush(s->l2_table_cache);
            ret = 0;
        }
    }

    return ret;
}

765
/*
 * Completes the allocating write described by m: performs the copy on write
 * for the head and tail regions (m->cow_start, m->cow_end) and then makes
 * the m->nb_clusters L2 entries starting at guest offset m->offset point to
 * the contiguous host clusters starting at m->alloc_offset, with
 * QCOW_OFLAG_COPIED set.
 *
 * Clusters previously referenced by the overwritten L2 entries are released
 * afterwards through qcow2_free_any_clusters().
 *
 * Returns 0 on success, -ENOMEM if the bookkeeping array cannot be
 * allocated, or the negative value returned by the failing COW or L2 table
 * lookup.
 */
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    /* Raw (big-endian) L2 entries that get overwritten below */
    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
         * The first one to complete updates l2 table with pointer to its
         * cluster the second one has to do RMW (which is done above by
         * copy_sectors()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        /* Widen i before shifting: shifting the int itself could overflow
         * for large cluster indexes and cluster sizes, which is undefined
         * behaviour and would yield a wrong host offset. */
        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    ((uint64_t)i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    for (i = 0; i < j; i++) {
        qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                QCOW2_DISCARD_NEVER);
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

844 845 846 847 848
/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes yet unallocated space,
 * which must copy from the backing file)
 */
849
static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
850 851
    uint64_t *l2_table, int l2_index)
{
K
Kevin Wolf 已提交
852
    int i;
853

K
Kevin Wolf 已提交
854 855 856 857 858 859 860 861 862
    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        int cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
863
            break;
K
Kevin Wolf 已提交
864 865
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
866
        case QCOW2_CLUSTER_ZERO:
867
            break;
K
Kevin Wolf 已提交
868 869 870
        default:
            abort();
        }
871 872
    }

K
Kevin Wolf 已提交
873
out:
874 875 876 877
    assert(i <= nb_clusters);
    return i;
}

878
/*
879 880 881
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
882 883 884 885 886 887 888 889 890
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
891
 */
892
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
893
    uint64_t *cur_bytes, QCowL2Meta **m)
894
{
895
    BDRVQcow2State *s = bs->opaque;
896
    QCowL2Meta *old_alloc;
897
    uint64_t bytes = *cur_bytes;
898 899 900

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

901 902 903 904
        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);
905

906
        if (end <= old_start || start >= old_end) {
907 908 909 910
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
911
                bytes = old_start - start;
912
            } else {
913
                bytes = 0;
914 915
            }

916 917 918 919 920 921 922 923 924
            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

925
            if (bytes == 0) {
926 927 928 929 930 931 932 933 934 935
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_mutex_unlock(&s->lock);
                qemu_co_queue_wait(&old_alloc->dependent_requests);
                qemu_co_mutex_lock(&s->lock);
                return -EAGAIN;
            }
        }
    }

936 937 938
    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;
939

940 941 942
    return 0;
}

K
Kevin Wolf 已提交
943 944 945 946 947 948
/*
 * Determines how many clusters at guest_offset (covering at most *bytes) are
 * already allocated and can be written to without copy on write. If
 * *host_offset is non-zero, only clusters that are physically contiguous
 * starting at this host offset are accepted.
 *
 * guest_offset doesn't have to be cluster aligned. The *host_offset returned
 * then refers to the exact byte addressed by guest_offset, i.e. it carries
 * the same offset into the cluster.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally left alone; it is set to 0 if the cluster is
 *          allocated and needs no COW, but isn't located at the requested
 *          physical offset.
 *
 *   1:     if allocated clusters that need no COW are available at the
 *          requested offset. *bytes may have been decreased and describes
 *          the length of the area that can be written to; *host_offset is
 *          updated.
 *
 *  -errno: in error cases (-EIO if the L2 entry holds an unaligned data
 *          cluster offset, after signalling image corruption)
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    uint64_t first_entry;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int l2_index;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 ||
           offset_into_cluster(s, guest_offset) ==
           offset_into_cluster(s, *host_offset));

    /*
     * Number of clusters to look at. To keep things simple, never go beyond
     * the end of the current L2 table.
     */
    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Load the L2 table and look at the entry of the first cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    first_entry = be64_to_cpu(l2_table[l2_index]);

    /* Anything but a normal cluster with the COPIED flag is not for us */
    if (qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_NORMAL
        || !(first_entry & QCOW_OFLAG_COPIED))
    {
        ret = 0;
        goto out;
    }

    if (offset_into_cluster(s, first_entry & L2E_OFFSET_MASK)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                "%#llx unaligned (guest offset: %#" PRIx64
                                ")", first_entry & L2E_OFFSET_MASK,
                                guest_offset);
        ret = -EIO;
        goto out;
    }

    /* If the caller insists on a specific host offset, it has to match */
    if (*host_offset != 0
        && (first_entry & L2E_OFFSET_MASK) != *host_offset)
    {
        *bytes = 0;
        ret = 0;
        goto out;
    }

    /* Keep every contiguous QCOW_OFLAG_COPIED cluster */
    keep_clusters =
        count_contiguous_clusters(nb_clusters, s->cluster_size,
                                  &l2_table[l2_index],
                                  QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
    assert(keep_clusters <= nb_clusters);

    *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));
    ret = 1;

out:
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* A host offset is only reported when progress was made; otherwise
     * handle_alloc() would be given a requirement that it can't fulfill */
    if (ret > 0) {
        *host_offset = (first_entry & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073
/*
 * Allocates new clusters in the image file for the given guest_offset.
 *
 * If *host_offset is zero, *nb_clusters clusters are allocated anywhere in
 * the image file via qcow2_alloc_clusters() and *host_offset is set to the
 * offset at which the allocation starts; *nb_clusters is left unchanged.
 *
 * If *host_offset is non-zero, the allocation has to start exactly there and
 * is done via qcow2_alloc_clusters_at(). *nb_clusters is then replaced by
 * that function's (non-negative) return value, which can be 0 if the cluster
 * at *host_offset is already in use.
 *
 * Returns 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                                   uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());

    if (*host_offset != 0) {
        /* A fixed position was requested */
        int64_t allocated =
            qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (allocated < 0) {
            return allocated;
        }
        *nb_clusters = allocated;
    } else {
        /* No preference: take whatever the allocator hands out */
        int64_t new_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (new_offset < 0) {
            return new_offset;
        }
        *host_offset = new_offset;
    }

    return 0;
}

K
Kevin Wolf 已提交
1101 1102 1103 1104 1105
/*
 * Allocates new clusters for an area that is either unallocated so far or
 * requires copy on write. If *host_offset is non-zero, clusters are only
 * allocated if the new allocation can start at the specified host offset.
 *
 * guest_offset doesn't have to be cluster aligned. The *host_offset returned
 * then refers to the exact byte addressed by guest_offset, i.e. it carries
 * the same offset into the cluster.
 *
 * On success a new QCowL2Meta describing the allocation is put at the head
 * of the list in *m (its .next is the previous *m) and is inserted into
 * s->cluster_allocs.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to point into the first newly allocated
 *          cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    uint64_t first_entry;
    uint64_t nb_clusters;
    uint64_t alloc_cluster_offset;
    int l2_index;
    int ret;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Number of clusters to look at. To keep things simple, never go beyond
     * the end of the current L2 table.
     */
    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Load the L2 table and look at the entry of the first cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    first_entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, compressed clusters are overwritten one at a time */
    nb_clusters = (first_entry & QCOW_OFLAG_COMPRESSED)
                  ? 1
                  : count_cow_clusters(s, nb_clusters, l2_table, l2_index);

    /* We only get here when there were no non-COW clusters. Not finding any
     * unallocated or COW clusters either means our code is broken. */
    assert(nb_clusters > 0);

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Allocate, at a fixed offset in the image file if one was requested */
    alloc_cluster_offset = start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* The contiguous allocation can't be extended */
    if (nb_clusters == 0) {
        *bytes = 0;
        return 0;
    }

    /* A host offset of 0 would overwrite the image header, and the value is
     * reserved for "no host offset preferred" anyway. If 0 was a valid host
     * offset, it would trip the overlap check below; run that check now so
     * that no invalid value ever ends up in *host_offset. */
    if (!alloc_cluster_offset) {
        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
                                            nb_clusters * s->cluster_size);
        assert(ret < 0);
        goto fail;
    }

    /*
     * Gather what the later metadata update needs, all counted in sectors
     * from the start of the first newly allocated cluster:
     *
     * alloc_n_start:     up to the first sector the request writes (this is
     *                    the part before the request that needs COW)
     *
     * requested_sectors: up to the end of the (possibly already shortened)
     *                    write request
     *
     * avail_sectors:     up to the end of the last newly allocated cluster
     *
     * nb_sectors:        up to the end of the area that the request really
     *                    writes to (excluding COW at the end)
     */
    int alloc_n_start = offset_into_cluster(s, guest_offset)
                        >> BDRV_SECTOR_BITS;
    int requested_sectors =
        (*bytes + offset_into_cluster(s, guest_offset))
        >> BDRV_SECTOR_BITS;
    int avail_sectors = nb_clusters
                        << (s->cluster_bits - BDRV_SECTOR_BITS);
    int nb_sectors = MIN(requested_sectors, avail_sectors);
    QCowL2Meta *old_m = *m;
    QCowL2Meta *new_m;

    /* g_malloc0() leaves every field that isn't set below zeroed */
    new_m = g_malloc0(sizeof(*new_m));

    new_m->next                 = old_m;
    new_m->alloc_offset         = alloc_cluster_offset;
    new_m->offset               = start_of_cluster(s, guest_offset);
    new_m->nb_clusters          = nb_clusters;
    new_m->nb_available         = nb_sectors;
    new_m->cow_start.offset     = 0;
    new_m->cow_start.nb_sectors = alloc_n_start;
    new_m->cow_end.offset       = nb_sectors * BDRV_SECTOR_SIZE;
    new_m->cow_end.nb_sectors   = avail_sectors - nb_sectors;

    *m = new_m;
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
                         - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    /* NOTE(review): *m is still the caller's previous l2meta at this point;
     * it is taken off the in-flight list but not freed here - presumably the
     * caller owns it, confirm against the callers' error paths. */
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

1255 1256 1257
/*
 * alloc_cluster_offset
 *
1258 1259
 * For a given offset on the virtual disk, find the cluster offset in qcow2
 * file. If the offset is not found, allocate a new cluster.
1260
 *
1261
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
1262
 * other fields in m are meaningless.
1263 1264
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
K
Kevin Wolf 已提交
1265 1266 1267
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
1268
 *
K
Kevin Wolf 已提交
1269 1270
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
1271 1272
 *
 * Return 0 on success and -errno in error cases
1273
 */
K
Kevin Wolf 已提交
1274
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1275
    int *num, uint64_t *host_offset, QCowL2Meta **m)
1276
{
1277
    BDRVQcow2State *s = bs->opaque;
1278
    uint64_t start, remaining;
1279
    uint64_t cluster_offset;
1280
    uint64_t cur_bytes;
1281
    int ret;
1282

1283
    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
K
Kevin Wolf 已提交
1284

1285
    assert((offset & ~BDRV_SECTOR_MASK) == 0);
1286

1287
again:
1288
    start = offset;
M
Max Reitz 已提交
1289
    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
K
Kevin Wolf 已提交
1290 1291
    cluster_offset = 0;
    *host_offset = 0;
1292 1293
    cur_bytes = 0;
    *m = NULL;
K
Kevin Wolf 已提交
1294

1295
    while (true) {
1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312

        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        start           += cur_bytes;
        remaining       -= cur_bytes;
        cluster_offset  += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330
        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-fight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
1331
        ret = handle_dependencies(bs, start, &cur_bytes, m);
1332
        if (ret == -EAGAIN) {
1333 1334 1335 1336
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
1337 1338 1339
            goto again;
        } else if (ret < 0) {
            return ret;
1340 1341
        } else if (cur_bytes == 0) {
            break;
1342 1343 1344 1345
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
K
Kevin Wolf 已提交
1346
        }
1347

1348 1349 1350 1351 1352 1353 1354
        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
1355
            continue;
1356 1357 1358
        } else if (cur_bytes == 0) {
            break;
        }
K
Kevin Wolf 已提交
1359

1360 1361 1362 1363 1364 1365 1366 1367
        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
1368
            continue;
1369 1370 1371 1372
        } else {
            assert(cur_bytes == 0);
            break;
        }
1373
    }
K
Kevin Wolf 已提交
1374

1375
    *num -= remaining >> BDRV_SECTOR_BITS;
1376 1377
    assert(*num > 0);
    assert(*host_offset != 0);
1378

1379
    return 0;
1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408
}

/*
 * Inflates buf_size bytes from buf into out_buf using zlib. The stream is
 * set up with windowBits = -12, i.e. raw deflate data without a zlib header.
 *
 * Returns 0 if exactly out_buf_size bytes were produced and inflate() ended
 * with Z_STREAM_END or Z_BUF_ERROR; returns -1 on any other outcome or if
 * the stream could not be initialised.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream stream;
    int zret, produced;

    memset(&stream, 0, sizeof(stream));

    stream.next_in = (uint8_t *)buf;
    stream.avail_in = buf_size;
    stream.next_out = out_buf;
    stream.avail_out = out_buf_size;

    if (inflateInit2(&stream, -12) != Z_OK) {
        return -1;
    }

    zret = inflate(&stream, Z_FINISH);
    produced = stream.next_out - out_buf;

    /* The stream is torn down regardless of the outcome */
    inflateEnd(&stream);

    if ((zret != Z_STREAM_END && zret != Z_BUF_ERROR) ||
        produced != out_buf_size) {
        return -1;
    }
    return 0;
}

1409
/*
 * Makes sure that s->cluster_cache holds the decompressed contents of the
 * compressed cluster described by cluster_offset.
 *
 * cluster_offset packs the host byte offset (extracted with
 * s->cluster_offset_mask) and, shifted by s->csize_shift and masked with
 * s->csize_mask, the number of additional 512-byte sectors the compressed
 * data spans. Nothing is done if s->cluster_cache_offset already equals the
 * host offset.
 *
 * Returns 0 on success, the negative bdrv_read() result if reading fails,
 * and -EIO if the data can't be decompressed into exactly one cluster.
 */
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t coffset = cluster_offset & s->cluster_offset_mask;
    int nb_csectors, sector_offset, csize;
    int ret;

    if (s->cluster_cache_offset == coffset) {
        /* The cache already contains this cluster */
        return 0;
    }

    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
    /* The compressed data may start in the middle of the first sector */
    sector_offset = coffset & 511;
    csize = nb_csectors * 512 - sector_offset;

    BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data,
                    nb_csectors);
    if (ret < 0) {
        return ret;
    }

    if (decompress_buffer(s->cluster_cache, s->cluster_size,
                          s->cluster_data + sector_offset, csize) < 0) {
        return -EIO;
    }

    s->cluster_cache_offset = coffset;
    return 0;
}
K
Kevin Wolf 已提交
1434 1435 1436 1437 1438 1439 1440

/*
 * Discards as many of the nb_clusters clusters starting at offset as can be
 * handled at once, i.e. all of them that are covered by the same L2 table.
 *
 * type is handed on to qcow2_free_any_clusters() for every cluster that is
 * discarded. full_discard selects what a discarded entry becomes; see the
 * comment inside the loop.
 *
 * Returns the number of clusters that were processed (including ones that
 * were skipped because nothing had to be done), or -errno if the L2 table
 * can't be loaded.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
                             uint64_t nb_clusters, enum qcow2_discard_type type,
                             bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Don't go beyond the end of this L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        bool nothing_to_do = false;

        /*
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         *
         * If full_discard is true, the sector should not read back as zeroes,
         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
        case QCOW2_CLUSTER_UNALLOCATED:
            nothing_to_do = full_discard || !bs->backing;
            break;

        case QCOW2_CLUSTER_ZERO:
            nothing_to_do = !full_discard;
            break;

        case QCOW2_CLUSTER_NORMAL:
        case QCOW2_CLUSTER_COMPRESSED:
            break;

        default:
            abort();
        }

        if (nothing_to_do) {
            continue;
        }

        /* Rewrite the L2 entry first... */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (!full_discard && s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* ...and only then drop the reference to the old cluster */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Discards all clusters that lie completely inside the nb_sectors sectors
 * starting at the guest byte offset 'offset'. The start is rounded up and
 * the end is rounded down to cluster boundaries, so partially covered
 * clusters at either end are left untouched.
 *
 * type and full_discard are handed on unchanged to discard_single_l2(),
 * which is called once per affected L2 table.
 *
 * While the loop runs, s->cache_discards is set; qcow2_process_discards() is
 * called with the final result once it has been cleared again.
 *
 * Returns 0 on success (including the case that no complete cluster is
 * covered) and -errno if processing one of the L2 tables fails.
 */
int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset;
    uint64_t nb_clusters;
    int ret;

    /* Widen before shifting: in plain int arithmetic the byte count
     * overflows as soon as the request covers 2 GiB or more */
    end_offset = offset + ((uint64_t)nb_sectors << BDRV_SECTOR_BITS);

    /* Round start up and end down */
    offset = align_offset(offset, s->cluster_size);
    end_offset = start_of_cluster(s, end_offset);

    if (offset > end_offset) {
        return 0;
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        /* 64-bit product: the number of clusters in one L2 table times the
         * cluster size doesn't necessarily fit into an int */
        offset += (uint64_t)ret * s->cluster_size;
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}
K
Kevin Wolf 已提交
1555 1556 1557 1558 1559 1560 1561

/*
 * Marks as many of the nb_clusters clusters starting at offset as zero
 * clusters as can be handled at once, i.e. all of them that are covered by
 * the same L2 table.
 *
 * A compressed cluster is freed and its entry is replaced by a bare
 * QCOW_OFLAG_ZERO; any other entry keeps its contents and just gets
 * QCOW_OFLAG_ZERO set in addition.
 *
 * Returns the number of clusters that were zeroed, or -errno if the L2 table
 * can't be loaded.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
                          uint64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Don't go beyond the end of this L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t *slot = &l2_table[l2_index + i];
        uint64_t old_entry = be64_to_cpu(*slot);

        /* The table is about to be modified in either case */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

        if (old_entry & QCOW_OFLAG_COMPRESSED) {
            *slot = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_entry, 1, QCOW2_DISCARD_REQUEST);
        } else {
            *slot |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Marks the clusters covering nb_sectors sectors from the guest byte offset
 * 'offset' as zero clusters. The length is rounded up to whole clusters by
 * size_to_clusters(); zero_single_l2() is called once per affected L2 table.
 *
 * While the loop runs, s->cache_discards is set; qcow2_process_discards() is
 * called with the final result once it has been cleared again.
 *
 * Returns 0 on success, -ENOTSUP for images older than version 3 (which have
 * no zero flag) and -errno if processing one of the L2 tables fails.
 */
int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t nb_clusters;
    int ret;

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Widen before shifting: in plain int arithmetic the byte count
     * overflows as soon as the request covers 2 GiB or more */
    nb_clusters = size_to_clusters(s, (uint64_t)nb_sectors << BDRV_SECTOR_BITS);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = zero_single_l2(bs, offset, nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        /* 64-bit product: since the length is rounded up to clusters, the
         * int product can reach 2^31 (e.g. 1024 clusters of 2 MiB) */
        offset += (uint64_t)ret * s->cluster_size;
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}
M
Max Reitz 已提交
1632 1633 1634 1635 1636

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
1637 1638 1639
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
M
Max Reitz 已提交
1640 1641
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
1642
                                      int l1_size, int64_t *visited_l1_entries,
1643
                                      int64_t l1_entries,
1644 1645
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
M
Max Reitz 已提交
1646
{
1647
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
1648 1649 1650 1651 1652 1653 1654 1655
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
K
Kevin Wolf 已提交
1656
        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
1657 1658 1659
        if (l2_table == NULL) {
            return -ENOMEM;
        }
M
Max Reitz 已提交
1660 1661 1662 1663 1664
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
1665
        uint64_t l2_refcount;
M
Max Reitz 已提交
1666 1667 1668

        if (!l2_offset) {
            /* unallocated */
1669 1670
            (*visited_l1_entries)++;
            if (status_cb) {
1671
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1672
            }
M
Max Reitz 已提交
1673 1674 1675
            continue;
        }

1676 1677 1678 1679 1680 1681 1682 1683
        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

M
Max Reitz 已提交
1684 1685 1686 1687 1688 1689
        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
K
Kevin Wolf 已提交
1690 1691
            ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
M
Max Reitz 已提交
1692 1693 1694 1695 1696
        }
        if (ret < 0) {
            goto fail;
        }

1697 1698 1699
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
1700 1701 1702
            goto fail;
        }

M
Max Reitz 已提交
1703 1704
        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1705
            int64_t offset = l2_entry & L2E_OFFSET_MASK;
M
Max Reitz 已提交
1706
            int cluster_type = qcow2_get_cluster_type(l2_entry);
1707
            bool preallocated = offset != 0;
M
Max Reitz 已提交
1708

1709
            if (cluster_type != QCOW2_CLUSTER_ZERO) {
M
Max Reitz 已提交
1710 1711 1712
                continue;
            }

1713
            if (!preallocated) {
1714
                if (!bs->backing) {
M
Max Reitz 已提交
1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }
1727 1728 1729 1730 1731

                if (l2_refcount > 1) {
                    /* For shared L2 tables, set the refcount accordingly (it is
                     * already 1 and needs to be l2_refcount) */
                    ret = qcow2_update_cluster_refcount(bs,
1732 1733
                            offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
1734 1735 1736 1737 1738 1739 1740
                            QCOW2_DISCARD_OTHER);
                    if (ret < 0) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_OTHER);
                        goto fail;
                    }
                }
M
Max Reitz 已提交
1741 1742
            }

1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755
            if (offset_into_cluster(s, offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                        "%#" PRIx64 " unaligned (L2 offset: %#"
                                        PRIx64 ", L2 index: %#x)", offset,
                                        l2_offset, j);
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                ret = -EIO;
                goto fail;
            }

M
Max Reitz 已提交
1756
            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
M
Max Reitz 已提交
1757
            if (ret < 0) {
1758 1759 1760 1761
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
M
Max Reitz 已提交
1762 1763 1764
                goto fail;
            }

K
Kevin Wolf 已提交
1765
            ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
1766
                                    s->cluster_sectors, 0);
M
Max Reitz 已提交
1767
            if (ret < 0) {
1768 1769 1770 1771
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
M
Max Reitz 已提交
1772 1773 1774
                goto fail;
            }

1775 1776 1777 1778
            if (l2_refcount == 1) {
                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            } else {
                l2_table[j] = cpu_to_be64(offset);
1779
            }
1780
            l2_dirty = true;
M
Max Reitz 已提交
1781 1782 1783 1784
        }

        if (is_active_l1) {
            if (l2_dirty) {
1785
                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
M
Max Reitz 已提交
1786 1787
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
1788
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
M
Max Reitz 已提交
1789 1790
        } else {
            if (l2_dirty) {
M
Max Reitz 已提交
1791 1792
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
M
Max Reitz 已提交
1793 1794 1795 1796 1797
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

K
Kevin Wolf 已提交
1798 1799
                ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
M
Max Reitz 已提交
1800 1801 1802 1803 1804
                if (ret < 0) {
                    goto fail;
                }
            }
        }
1805 1806 1807

        (*visited_l1_entries)++;
        if (status_cb) {
1808
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1809
        }
M
Max Reitz 已提交
1810 1811 1812 1813 1814 1815 1816 1817 1818
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
1819
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
M
Max Reitz 已提交
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
1831
int qcow2_expand_zero_clusters(BlockDriverState *bs,
1832 1833
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
M
Max Reitz 已提交
1834
{
1835
    BDRVQcow2State *s = bs->opaque;
M
Max Reitz 已提交
1836
    uint64_t *l1_table = NULL;
1837
    int64_t l1_entries = 0, visited_l1_entries = 0;
M
Max Reitz 已提交
1838 1839 1840
    int ret;
    int i, j;

1841 1842 1843 1844 1845 1846 1847
    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

M
Max Reitz 已提交
1848
    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
1849
                                     &visited_l1_entries, l1_entries,
1850
                                     status_cb, cb_opaque);
M
Max Reitz 已提交
1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

K
Kevin Wolf 已提交
1873 1874 1875
        ret = bdrv_read(bs->file->bs,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
M
Max Reitz 已提交
1876 1877 1878 1879 1880 1881 1882 1883 1884
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
1885
                                         &visited_l1_entries, l1_entries,
1886
                                         status_cb, cb_opaque);
M
Max Reitz 已提交
1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}