/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include <zlib.h>
#include "qemu/aes.h"
#include "migration/migration.h"

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

/* Magic number stored big-endian at offset 0: the bytes 'Q' 'F' 'I' 0xfb */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 1

/* Values of the header's crypt_method field */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/*
 * Top bit of an L2 entry: set when the entry describes a compressed cluster.
 * The constant is unsigned because shifting 1 into the sign bit of a signed
 * long long is undefined behaviour; every user combines it with uint64_t.
 */
#define QCOW_OFLAG_COMPRESSED (1ULL << 63)

/*
 * Image header as read from offset 0 of the image file.  qcow_open()
 * converts the multi-byte fields from big-endian to host order after
 * reading, and qcow_create() converts them to big-endian before writing.
 * The field order and types define the layout, so they must not change.
 */
typedef struct QCowHeader {
    uint32_t magic;                 /* must equal QCOW_MAGIC */
    uint32_t version;               /* must equal QCOW_VERSION */
    uint64_t backing_file_offset;   /* file offset of the backing file name; 0 = none */
    uint32_t backing_file_size;     /* length in bytes of the backing file name */
    uint32_t mtime;                 /* not read or written by the code in this file */
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;           /* cluster size is 1 << cluster_bits bytes */
    uint8_t l2_bits;                /* an L2 table holds 1 << l2_bits entries */
    uint32_t crypt_method;          /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint64_t l1_table_offset;       /* file offset of the L1 table */
} QCowHeader;

#define L2_CACHE_SIZE 16

typedef struct BDRVQcowState {
    int cluster_bits;
    int cluster_size;
    int cluster_sectors;
    int l2_bits;
    int l2_size;
    int l1_size;
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset;
    uint64_t *l1_table;
    uint64_t *l2_cache;
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
    uint8_t *cluster_cache;
    uint8_t *cluster_data;
    uint64_t cluster_cache_offset;
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header;
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
K
Kevin Wolf 已提交
77
    CoMutex lock;
K
Kevin Wolf 已提交
78
    Error *migration_blocker;
B
bellard 已提交
79 80
} BDRVQcowState;

81
static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
B
bellard 已提交
82 83 84 85

static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    const QCowHeader *cow_header = (const void *)buf;
86

B
bellard 已提交
87 88
    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
89
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
B
bellard 已提交
90 91 92 93 94
        return 100;
    else
        return 0;
}

M
Max Reitz 已提交
95 96
/*
 * Open a QCOW v1 image: read and validate the header, load the L1 table,
 * allocate the L2 and cluster caches, read the backing file name and
 * register a migration blocker.
 *
 * Returns 0 on success or a negative errno value.  On failure every buffer
 * allocated here is released again.  'options' and 'flags' are not used.
 */
static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVQcowState *s = bs->opaque;
    int len, i, shift, ret;
    uint64_t l1_entries;
    QCowHeader header;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        goto fail;
    }
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be32_to_cpus(&header.mtime);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);

    if (header.magic != QCOW_MAGIC) {
        ret = -EMEDIUMTYPE;
        goto fail;
    }
    if (header.version != QCOW_VERSION) {
        char version[64];
        snprintf(version, sizeof(version), "QCOW version %d", header.version);
        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
            bs->device_name, "qcow", version);
        ret = -ENOTSUP;
        goto fail;
    }

    if (header.size <= 1) {
        ret = -EINVAL;
        goto fail;
    }
    /* Clusters must be between 512 bytes and 64 KB.  Without the upper
     * bound the shifts below overflow and the cache allocations are
     * controlled by the image. */
    if (header.cluster_bits < 9 || header.cluster_bits > 16) {
        ret = -EINVAL;
        goto fail;
    }
    /* l2_bits counts 8-byte entries; keep an L2 table between 512 bytes
     * and 64 KB as well. */
    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
        ret = -EINVAL;
        goto fail;
    }
    if (header.crypt_method > QCOW_CRYPT_AES) {
        ret = -EINVAL;
        goto fail;
    }
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        bs->encrypted = 1;
    }
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = header.l2_bits;
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;

    /* read the level 1 table */
    shift = s->cluster_bits + s->l2_bits;
    if (header.size > UINT64_MAX - (1ULL << shift)) {
        /* rounding up would wrap around */
        ret = -EINVAL;
        goto fail;
    }
    l1_entries = (header.size + (1ULL << shift) - 1) >> shift;
    if (l1_entries > INT_MAX / sizeof(uint64_t)) {
        /* table size in bytes would not fit in an int */
        ret = -EINVAL;
        goto fail;
    }
    s->l1_size = l1_entries;

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));

    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
               s->l1_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }
    /* alloc L2 cache */
    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    s->cluster_cache = g_malloc(s->cluster_size);
    s->cluster_data = g_malloc(s->cluster_size);
    s->cluster_cache_offset = -1;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        /* Check the unsigned header value before narrowing it: a size of
         * 2^31 or more would become a negative 'len', pass a signed
         * "len > 1023" test and index bs->backing_file out of bounds. */
        if (header.backing_file_size > 1023) {
            ret = -EINVAL;
            goto fail;
        }
        len = header.backing_file_size;
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                   bs->backing_file, len);
        if (ret < 0) {
            goto fail;
        }
        bs->backing_file[len] = '\0';
    }

    /* Disable migration when qcow images are used */
    error_set(&s->migration_blocker,
              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
              "qcow", bs->device_name, "live migration");
    migrate_add_blocker(s->migration_blocker);

    qemu_co_mutex_init(&s->lock);
    return 0;

 fail:
    /* Clear the pointers so the state never refers to freed memory */
    g_free(s->l1_table);
    s->l1_table = NULL;
    g_free(s->l2_cache);
    s->l2_cache = NULL;
    g_free(s->cluster_cache);
    s->cluster_cache = NULL;
    g_free(s->cluster_data);
    s->cluster_data = NULL;
    return ret;
}

/*
 * Reopen hook.  QCOW keeps no state that depends on the open flags, so
 * the prepare step is a stub that always reports success.
 */
static int qcow_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    const int success = 0;

    return success;
}

B
bellard 已提交
210 211 212 213 214
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;
215

B
bellard 已提交
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for(i = 0;i < len;i++) {
        keybuf[i] = key[i];
    }
    s->crypt_method = s->crypt_method_header;

    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
        return -1;
    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
        return -1;
    return 0;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported.

   Each 512-byte sector is processed on its own with AES-CBC.  The IV is
   the little-endian sector number in the low 8 bytes and zero in the
   high 8 bytes.  'enc' is passed through to AES_cbc_encrypt and selects
   encryption or decryption.  's' is not used. */
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for (i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}

/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 *
 * Any I/O error also yields 0, so callers only have to test for zero.
 * Otherwise the return value is the L2 entry for the guest 'offset':
 * a file offset, possibly carrying QCOW_OFLAG_COMPRESSED and a
 * compressed size in its upper bits.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int allocate,
                                   int compressed_size,
                                   int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int min_index, i, j, l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
    uint32_t min_count;
    int new_l2_table;
    int64_t file_len;

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    l2_offset = s->l1_table[l1_index];
    new_l2_table = 0;
    if (!l2_offset) {
        if (!allocate)
            return 0;
        /* allocate a new l2 entry */
        file_len = bdrv_getlength(bs->file);
        if (file_len < 0) {
            /* do not turn an error code into a bogus file offset */
            return 0;
        }
        l2_offset = file_len;
        /* round to cluster size */
        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
        if (bdrv_pwrite_sync(bs->file,
                s->l1_table_offset + l1_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
        new_l2_table = 1;
    }
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == s->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++s->l2_cache_counts[i] == 0xffffffff) {
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    s->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = s->l2_cache + (i << s->l2_bits);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < min_count) {
            min_count = s->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = s->l2_cache + (min_index << s->l2_bits);
    /* The slot is about to be overwritten.  Mark it empty first so that a
     * failed load does not leave clobbered data tagged with the offset of
     * the table it used to hold. */
    s->l2_cache_offsets[min_index] = 0;
    s->l2_cache_counts[min_index] = 0;
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
                s->l2_size * sizeof(uint64_t)) < 0)
            return 0;
    } else {
        if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
    }
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;
 found:
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (!cluster_offset ||
        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
        if (!allocate)
            return 0;
        /* allocate a new cluster */
        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
            (n_end - n_start) < s->cluster_sectors) {
            /* if the cluster is already compressed, we must
               decompress it in the case it is not completely
               overwritten */
            if (decompress_cluster(bs, cluster_offset) < 0)
                return 0;
            file_len = bdrv_getlength(bs->file);
            if (file_len < 0) {
                return 0;
            }
            cluster_offset = file_len;
            cluster_offset = (cluster_offset + s->cluster_size - 1) &
                ~(s->cluster_size - 1);
            /* write the cluster content */
            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
                s->cluster_size)
                return 0;
        } else {
            file_len = bdrv_getlength(bs->file);
            if (file_len < 0) {
                return 0;
            }
            cluster_offset = file_len;
            if (allocate == 1) {
                /* round to cluster size */
                cluster_offset = (cluster_offset + s->cluster_size - 1) &
                    ~(s->cluster_size - 1);
                /* The result is deliberately not checked, as in the
                 * original: the data write that follows extends the file
                 * as well.  NOTE(review): confirm this is intended. */
                bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
                /* if encrypted, we must initialize the cluster
                   content which won't be written */
                if (s->crypt_method &&
                    (n_end - n_start) < s->cluster_sectors) {
                    uint64_t start_sect;
                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
                    memset(s->cluster_data + 512, 0x00, 512);
                    for(i = 0; i < s->cluster_sectors; i++) {
                        if (i < n_start || i >= n_end) {
                            encrypt_sectors(s, start_sect + i,
                                            s->cluster_data,
                                            s->cluster_data + 512, 1, 1,
                                            &s->aes_encrypt_key);
                            if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
                                            s->cluster_data, 512) != 512)
                                return 0;
                        }
                    }
                }
            } else if (allocate == 2) {
                cluster_offset |= QCOW_OFLAG_COMPRESSED |
                    (uint64_t)compressed_size << (63 - s->cluster_bits);
            }
        }
        /* update L2 table */
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
        if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
    }
    return cluster_offset;
}

/*
 * Report the allocation status of the sectors starting at 'sector_num'.
 *
 * '*pnum' is set to the number of sectors, at most 'nb_sectors', that
 * remain in the cluster containing 'sector_num'.  Returns 0 when the
 * cluster is not allocated, BDRV_BLOCK_DATA when it is compressed or the
 * image is encrypted, and otherwise BDRV_BLOCK_DATA combined with
 * BDRV_BLOCK_OFFSET_VALID and the byte offset of the data in the file.
 */
static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n;
    uint64_t cluster_offset;

    /* the lookup goes through the shared L2 cache */
    qemu_co_mutex_lock(&s->lock);
    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
    qemu_co_mutex_unlock(&s->lock);

    index_in_cluster = sector_num & (s->cluster_sectors - 1);
    n = s->cluster_sectors - index_in_cluster;
    if (n > nb_sectors) {
        n = nb_sectors;
    }
    *pnum = n;

    if (!cluster_offset) {
        return 0;
    }
    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) {
        return BDRV_BLOCK_DATA;
    }
    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
}

/*
 * Inflate the raw deflate stream in 'buf' (buf_size bytes) into 'out_buf'.
 *
 * Succeeds (returns 0) only when inflate() ends with Z_STREAM_END or
 * Z_BUF_ERROR and exactly out_buf_size bytes were produced; every other
 * outcome returns -1.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream stream;
    int status;
    int produced;

    memset(&stream, 0, sizeof(stream));
    stream.next_in = (uint8_t *)buf;
    stream.avail_in = buf_size;
    stream.next_out = out_buf;
    stream.avail_out = out_buf_size;

    /* negative window bits: raw deflate data without a zlib header */
    if (inflateInit2(&stream, -12) != Z_OK) {
        return -1;
    }

    status = inflate(&stream, Z_FINISH);
    produced = stream.next_out - out_buf;
    inflateEnd(&stream);

    if (status != Z_STREAM_END && status != Z_BUF_ERROR) {
        return -1;
    }
    if (produced != out_buf_size) {
        return -1;
    }
    return 0;
}
450

451
static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
B
bellard 已提交
452
{
453
    BDRVQcowState *s = bs->opaque;
B
bellard 已提交
454 455 456 457 458 459 460
    int ret, csize;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
461
        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
462
        if (ret != csize)
B
bellard 已提交
463 464 465 466 467 468 469 470 471 472
            return -1;
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

473
/*
 * Read 'nb_sectors' sectors starting at 'sector_num' into 'qiov'.
 *
 * The request is processed one cluster at a time.  Unallocated clusters
 * are read from the backing image, or zero-filled when there is none;
 * compressed clusters are served from the decompression cache; all other
 * clusters are read from the image file and decrypted when a key is set.
 * A multi-element 'qiov' goes through a linear bounce buffer.  The state
 * lock is dropped around the actual I/O.
 *
 * Returns 0 on success or a negative errno value.
 */
static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    int ret = 0, n;
    uint64_t cluster_offset;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_blockalign(bs, qiov->size);
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {
        /* prepare next request */
        cluster_offset = get_cluster_offset(bs, sector_num << 9,
                                                 0, 0, 0, 0);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }

        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                hd_iov.iov_base = (void *)buf;
                hd_iov.iov_len = n * 512;
                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_readv(bs->backing_hd, sector_num,
                                    n, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* add AIO support for compressed blocks ? */
            if (decompress_cluster(bs, cluster_offset) < 0) {
                goto fail;
            }
            memcpy(buf,
                   s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            /* a plain cluster must start on a sector boundary */
            if ((cluster_offset & 511) != 0) {
                goto fail;
            }
            hd_iov.iov_base = (void *)buf;
            hd_iov.iov_len = n * 512;
            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file,
                                (cluster_offset >> 9) + index_in_cluster,
                                n, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                break;
            }
            if (s->crypt_method) {
                encrypt_sectors(s, sector_num, buf, buf,
                                n, 0,
                                &s->aes_decrypt_key);
            }
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }

done:
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
        qemu_vfree(orig_buf);
    }

    return ret;

fail:
    ret = -EIO;
    goto done;
}

/*
 * Write 'nb_sectors' sectors from 'qiov' starting at 'sector_num'.
 *
 * The request is processed one cluster at a time.  Each cluster is
 * allocated as a normal cluster if necessary, and the data is encrypted
 * into a scratch buffer first when a key is set.  A multi-element 'qiov'
 * is linearised into a bounce buffer.  The state lock is dropped around
 * the actual I/O.
 *
 * Returns 0 on success or a negative errno value.
 */
static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    uint64_t cluster_offset;
    const uint8_t *src_buf;
    int ret = 0, n;
    uint8_t *cluster_data = NULL;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;

    s->cluster_cache_offset = -1; /* disable compressed cache */

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_blockalign(bs, qiov->size);
        qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {

        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
                                            index_in_cluster,
                                            index_in_cluster + n);
        /* zero means the allocation failed; an unaligned offset is invalid */
        if (!cluster_offset || (cluster_offset & 511) != 0) {
            ret = -EIO;
            break;
        }
        if (s->crypt_method) {
            if (!cluster_data) {
                cluster_data = g_malloc0(s->cluster_size);
            }
            encrypt_sectors(s, sector_num, cluster_data, buf,
                            n, 1, &s->aes_encrypt_key);
            src_buf = cluster_data;
        } else {
            src_buf = buf;
        }

        hd_iov.iov_base = (void *)src_buf;
        hd_iov.iov_len = n * 512;
        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_writev(bs->file,
                             (cluster_offset >> 9) + index_in_cluster,
                             n, &hd_qiov);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            break;
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_vfree(orig_buf);
    }
    g_free(cluster_data);

    return ret;
}

/*
 * Release everything qcow_open() set up: the L1 table, the L2 cache,
 * both cluster buffers and the migration blocker.
 */
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    g_free(s->l1_table);
    g_free(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
}

/*
 * Create a new, empty QCOW v1 image at 'filename'.
 *
 * Recognised options are BLOCK_OPT_SIZE, BLOCK_OPT_BACKING_FILE and
 * BLOCK_OPT_ENCRYPT.  The image consists of the header, the optional
 * backing file name and a zero-filled L1 table.  With a backing file,
 * 512-byte clusters are used; otherwise 4 KB clusters.
 *
 * Returns 0 on success or a negative errno value.  A short write is
 * reported as -EIO, and a backing file name longer than 1023 bytes is
 * rejected with -EINVAL.
 */
static int qcow_create(const char *filename, QEMUOptionParameter *options,
                       Error **errp)
{
    int header_size, backing_filename_len, l1_size, shift, i;
    QCowHeader header;
    uint8_t *tmp = NULL;
    int64_t total_size = 0;
    const char *backing_file = NULL;
    int flags = 0;
    Error *local_err = NULL;
    int ret;
    BlockDriverState *qcow_bs;

    /* Read out options */
    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / 512;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
        }
        options++;
    }

    /* The open path uses a limit of 1023 bytes for the backing file name;
     * refuse to create an image with a longer one. */
    if (backing_file && strcmp(backing_file, "fat:") &&
        strlen(backing_file) > 1023) {
        return -EINVAL;
    }

    /* NOTE(review): 'options' was advanced to the list terminator by the
     * loop above, so the callee sees an empty option list.  This is kept
     * from the original; confirm it is intended. */
    ret = bdrv_create_file(filename, options, &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
        return ret;
    }

    ret = bdrv_file_open(&qcow_bs, filename, NULL, NULL, BDRV_O_RDWR,
                         &local_err);
    if (ret < 0) {
        qerror_report_err(local_err);
        error_free(local_err);
        return ret;
    }

    ret = bdrv_truncate(qcow_bs, 0);
    if (ret < 0) {
        goto exit;
    }

    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size * 512);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        if (strcmp(backing_file, "fat:")) {
            header.backing_file_offset = cpu_to_be64(header_size);
            backing_filename_len = strlen(backing_file);
            header.backing_file_size = cpu_to_be32(backing_filename_len);
            header_size += backing_filename_len;
        } else {
            /* special backing file for vvfat */
            backing_file = NULL;
        }
        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
                                    unmodified sectors */
        header.l2_bits = 12; /* 32 KB L2 tables */
    } else {
        header.cluster_bits = 12; /* 4 KB clusters */
        header.l2_bits = 9; /* 4 KB L2 tables */
    }
    /* the L1 table starts on an 8-byte boundary after the name */
    header_size = (header_size + 7) & ~7;
    shift = header.cluster_bits + header.l2_bits;
    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;

    header.l1_table_offset = cpu_to_be64(header_size);
    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }

    /* write all the data */
    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
    if (ret != sizeof(header)) {
        if (ret >= 0) {
            ret = -EIO; /* a short write must not look like success */
        }
        goto exit;
    }

    if (backing_file) {
        ret = bdrv_pwrite(qcow_bs, sizeof(header),
            backing_file, backing_filename_len);
        if (ret != backing_filename_len) {
            if (ret >= 0) {
                ret = -EIO;
            }
            goto exit;
        }
    }

    /* write the empty L1 table one zeroed sector at a time */
    tmp = g_malloc0(BDRV_SECTOR_SIZE);
    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
        BDRV_SECTOR_SIZE); i++) {
        ret = bdrv_pwrite(qcow_bs, header_size +
            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
        if (ret != BDRV_SECTOR_SIZE) {
            if (ret >= 0) {
                ret = -EIO;
            }
            goto exit;
        }
    }

    ret = 0;
exit:
    g_free(tmp);
    bdrv_unref(qcow_bs);
    return ret;
}

/*
 * Discard all allocated clusters: zero the L1 table in memory and on
 * disk, truncate the image file right after the L1 table and reset the
 * caches.
 *
 * Returns 0 on success or the negative error code of the failing step.
 */
static int qcow_make_empty(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
                           l1_length);
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0) {
        return ret;
    }

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
    /* The decompression cache is keyed by file offset.  After truncation
     * a new cluster can land on the same offset, so drop the cached one. */
    s->cluster_cache_offset = -1;

    return 0;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
796
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
B
bellard 已提交
797
                                 const uint8_t *buf, int nb_sectors)
B
bellard 已提交
798 799 800 801 802 803 804
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
    if (nb_sectors != s->cluster_sectors) {
        ret = -EINVAL;

        /* Zero-pad last write if image size is not cluster aligned */
        if (sector_num + nb_sectors == bs->total_sectors &&
            nb_sectors < s->cluster_sectors) {
            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
            memset(pad_buf, 0, s->cluster_size);
            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
            ret = qcow_write_compressed(bs, sector_num,
                                        pad_buf, s->cluster_sectors);
            qemu_vfree(pad_buf);
        }
        return ret;
    }
B
bellard 已提交
820

821
    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
B
bellard 已提交
822 823 824 825

    /* best compression, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
826
                       Z_DEFLATED, -12,
B
bellard 已提交
827 828
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
829 830
        ret = -EINVAL;
        goto fail;
B
bellard 已提交
831 832 833 834 835 836 837 838 839 840
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
841 842
        ret = -EINVAL;
        goto fail;
B
bellard 已提交
843 844 845 846 847 848 849
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
850 851 852 853
        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
B
bellard 已提交
854
    } else {
855
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
B
bellard 已提交
856
                                            out_len, 0, 0);
857 858 859 860 861
        if (cluster_offset == 0) {
            ret = -EIO;
            goto fail;
        }

B
bellard 已提交
862
        cluster_offset &= s->cluster_offset_mask;
863 864 865
        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
B
bellard 已提交
866 867
        }
    }
868

869 870
    ret = 0;
fail:
871
    g_free(out_buf);
872
    return ret;
B
bellard 已提交
873 874
}

B
bellard 已提交
875 876 877 878 879 880 881
/* Fill in 'bdi' with the image's cluster size.  Always returns 0. */
static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    const BDRVQcowState *state = bs->opaque;

    bdi->cluster_size = state->cluster_size;

    return 0;
}

882 883

static QEMUOptionParameter qcow_create_options[] = {
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_ENCRYPT,
        .type = OPT_FLAG,
        .help = "Encrypt the image"
    },
899 900 901
    { NULL }
};

902
static BlockDriver bdrv_qcow = {
903 904 905 906 907
    .format_name	= "qcow",
    .instance_size	= sizeof(BDRVQcowState),
    .bdrv_probe		= qcow_probe,
    .bdrv_open		= qcow_open,
    .bdrv_close		= qcow_close,
J
Jeff Cody 已提交
908
    .bdrv_reopen_prepare = qcow_reopen_prepare,
909
    .bdrv_create	= qcow_create,
910
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
911 912 913

    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
914
    .bdrv_co_get_block_status   = qcow_co_get_block_status,
915 916 917 918 919

    .bdrv_set_key           = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_write_compressed  = qcow_write_compressed,
    .bdrv_get_info          = qcow_get_info,
920 921

    .create_options = qcow_create_options,
B
bellard 已提交
922
};
923 924 925 926 927 928 929

/* Register the qcow driver description with the block layer. */
static void bdrv_qcow_init(void)
{
    bdrv_register(&bdrv_qcow);
}

/* NOTE(review): block_init is defined outside this file; presumably it
 * arranges for bdrv_qcow_init to run during block-layer start-up --
 * confirm against qemu/module.h. */
block_init(bdrv_qcow_init);