/*
 * Block driver for the VMDK format
 *
 * Copyright (c) 2004 Fabrice Bellard
 * Copyright (c) 2005 Filip Navara
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

P
Peter Maydell 已提交
26
#include "qemu/osdep.h"
27
#include "qapi/error.h"
28
#include "block/block_int.h"
29
#include "sysemu/block-backend.h"
30
#include "qapi/qmp/qerror.h"
31
#include "qemu/error-report.h"
32
#include "qemu/module.h"
33
#include "qemu/bswap.h"
34
#include "migration/migration.h"
35
#include "qemu/cutils.h"
S
Stefan Weil 已提交
36
#include <zlib.h>
B
bellard 已提交
37 38 39

/*
 * Magic numbers found in the first four bytes of a sparse extent when read
 * as a big-endian 32-bit value: "COWD" for VMDK3 and "KDMV" for VMDK4.
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

/* Value of VMDK4Header.compressAlgorithm that marks a deflate extent */
#define VMDK4_COMPRESSION_DEFLATE 1

/* Bits of VMDK4Header.flags */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
/* When set, rgd_offset is used as the location of the backup L1 table */
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)

/* gd_offset value meaning "the real header is the footer at end of file" */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

/*
 * NOTE(review): presumably the grain table entry value that denotes a
 * zeroed grain; its users are outside this part of the file.
 */
#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

/* Name of the "zeroed_grain" option string */
#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"

B
bellard 已提交
60 61 62 63 64 65 66 67 68 69 70
/*
 * Header of a VMDK3 ("COWD") sparse extent.  It is read from the file
 * directly after the 4-byte magic, and its fields are converted with
 * le32_to_cpu() before use.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* extent size, in sectors */
    uint32_t granularity;       /* cluster size, in sectors */
    uint32_t l1dir_offset;      /* position of the L1 table, in sectors */
    uint32_t l1dir_size;        /* number of entries in the L1 table */
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
B
bellard 已提交
72 73 74 75

/*
 * Header of a VMDK4 ("KDMV") sparse extent.  It is read from the file
 * directly after the 4-byte magic; the same layout is also found in the
 * footer of images whose gd_offset is VMDK4_GD_AT_END.  Multi-byte fields
 * are converted with le16/le32/le64_to_cpu() before use.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;             /* VMDK4_FLAG_* bits */
    uint64_t capacity;          /* extent size, in sectors */
    uint64_t granularity;       /* cluster (grain) size, in sectors */
    uint64_t desc_offset;       /* embedded descriptor position, in sectors */
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    uint64_t rgd_offset;        /* backup L1 table position, in sectors */
    uint64_t gd_offset;         /* L1 table position, in sectors */
    uint64_t grain_offset;      /* file must hold at least this many sectors */
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm; /* VMDK4_COMPRESSION_DEFLATE if compressed */
} QEMU_PACKED VMDK4Header;
B
bellard 已提交
89 90 91

#define L2_CACHE_SIZE 16

F
Fam Zheng 已提交
92
typedef struct VmdkExtent {
93
    BdrvChild *file;
F
Fam Zheng 已提交
94
    bool flat;
F
Fam Zheng 已提交
95 96
    bool compressed;
    bool has_marker;
97 98
    bool has_zero_grain;
    int version;
F
Fam Zheng 已提交
99 100
    int64_t sectors;
    int64_t end_sector;
101
    int64_t flat_start_offset;
B
bellard 已提交
102
    int64_t l1_table_offset;
103
    int64_t l1_backup_table_offset;
B
bellard 已提交
104
    uint32_t *l1_table;
105
    uint32_t *l1_backup_table;
B
bellard 已提交
106 107 108 109 110 111 112 113
    unsigned int l1_size;
    uint32_t l1_entry_sectors;

    unsigned int l2_size;
    uint32_t *l2_cache;
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

114
    int64_t cluster_sectors;
F
Fam Zheng 已提交
115
    int64_t next_cluster_sector;
F
Fam Zheng 已提交
116
    char *type;
F
Fam Zheng 已提交
117 118 119
} VmdkExtent;

typedef struct BDRVVmdkState {
120
    CoMutex lock;
121
    uint64_t desc_offset;
122
    bool cid_updated;
123
    bool cid_checked;
F
Fam Zheng 已提交
124
    uint32_t cid;
125
    uint32_t parent_cid;
F
Fam Zheng 已提交
126 127 128
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
K
Kevin Wolf 已提交
129
    Error *migration_blocker;
F
Fam Zheng 已提交
130
    char *create_type;
B
bellard 已提交
131 132
} BDRVVmdkState;

133 134 135 136 137
/*
 * Bookkeeping for one L1/L2 lookup.
 * NOTE(review): the producers and consumers of this structure are outside
 * this part of the file; field meanings below are taken from their names.
 */
typedef struct VmdkMetaData {
    unsigned int l1_index;      /* index into the L1 table */
    unsigned int l2_index;      /* index into the L2 table */
    unsigned int l2_offset;     /* location of the L2 table */
    int valid;                  /* non-zero when the fields above are set */
    uint32_t *l2_cache_entry;   /* cached L2 entry, if any */
} VmdkMetaData;

F
Fam Zheng 已提交
141 142 143 144
/*
 * Packed 12-byte marker followed by a variable amount of payload.
 * The payload is declared as a C99 flexible array member, not as a
 * zero-length array, so sizeof() still covers only the fixed part.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
F
Fam Zheng 已提交
146

147 148 149 150 151 152 153
/* Values of the `type` field of a marker sector */
enum {
    MARKER_END_OF_STREAM   = 0,     /* expected in the last sector */
    MARKER_GRAIN_TABLE     = 1,
    MARKER_GRAIN_DIRECTORY = 2,
    MARKER_FOOTER          = 3,     /* sector preceding the footer header */
};

B
bellard 已提交
154 155 156 157
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    uint32_t magic;

F
Fam Zheng 已提交
158
    if (buf_size < 4) {
B
bellard 已提交
159
        return 0;
F
Fam Zheng 已提交
160
    }
B
bellard 已提交
161 162
    magic = be32_to_cpu(*(uint32_t *)buf);
    if (magic == VMDK3_MAGIC ||
163
        magic == VMDK4_MAGIC) {
B
bellard 已提交
164
        return 100;
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
    } else {
        const char *p = (const char *)buf;
        const char *end = p + buf_size;
        while (p < end) {
            if (*p == '#') {
                /* skip comment line */
                while (p < end && *p != '\n') {
                    p++;
                }
                p++;
                continue;
            }
            if (*p == ' ') {
                while (p < end && *p == ' ') {
                    p++;
                }
                /* skip '\r' if windows line endings used. */
                if (p < end && *p == '\r') {
                    p++;
                }
                /* only accept blank lines before 'version=' line */
                if (p == end || *p != '\n') {
                    return 0;
                }
                p++;
                continue;
            }
            if (end - p >= strlen("version=X\n")) {
                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
                    return 100;
                }
            }
            if (end - p >= strlen("version=X\r\n")) {
                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
                    return 100;
                }
            }
            return 0;
        }
B
bellard 已提交
206
        return 0;
207
    }
B
bellard 已提交
208 209
}

210
#define SECTOR_SIZE 512
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
214

F
Fam Zheng 已提交
215 216 217 218
/*
 * Release every extent of @bs: the L1, backup L1 and L2 cache tables, the
 * type string, and the child reference unless the extent shares bs->file.
 * The extent array itself is freed as well, and the state is reset so that
 * a repeated call is harmless.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *e;

    for (i = 0; i < s->num_extents; i++) {
        e = &s->extents[i];
        g_free(e->l1_table);
        g_free(e->l2_cache);
        g_free(e->l1_backup_table);
        g_free(e->type);
        /* bs->file is not owned by the extent, so it is not unref'ed here */
        if (e->file != bs->file) {
            bdrv_unref_child(bs, e->file);
        }
    }
    g_free(s->extents);
    s->extents = NULL;
    s->num_extents = 0;
}

234 235 236 237 238 239 240 241
/*
 * Drop the most recently added extent by shrinking the extent array by one
 * entry.  Nothing the entry points to (tables, type string, child) is
 * released here; only the array slot goes away.  Does nothing when there
 * are no extents.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents == 0) {
        return;
    }
    s->num_extents--;
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
}

245
/*
 * Read a content ID from the descriptor of @bs.
 *
 * @parent: non-zero to look up "parentCID", zero to look up "CID"
 *
 * Returns the hexadecimal value following the key, 0xffffffff if the key is
 * missing or its value cannot be parsed, and 0 if the descriptor cannot be
 * read.
 */
static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
{
    char *desc;
    uint32_t cid = 0xffffffff;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        g_free(desc);
        return 0;
    }

    /* sizeof() counts the NUL, which makes the skip below cover the '=' */
    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    /* make sure the string functions below stay inside the buffer */
    desc[DESC_SIZE - 1] = '\0';
    p_name = strstr(desc, cid_str);
    if (p_name != NULL) {
        p_name += cid_str_size;
        sscanf(p_name, "%" SCNx32, &cid);
    }

    g_free(desc);
    return cid;
}

/*
 * Rewrite the "CID" value in the descriptor of @bs with @cid.
 *
 * The descriptor text from "parentCID" onwards is saved, the new value is
 * printed after the first "CID" key, and the saved tail is appended again
 * before the descriptor is written back synchronously.
 *
 * Returns the result of the descriptor write, a negative value if the
 * descriptor cannot be read, or -EINVAL if it has no "parentCID" key.
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
    int ret = 0;

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
        ret = -EINVAL;
        goto out;
    }

    /* keep everything from "parentCID" on; snprintf below truncates desc */
    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        /* sizeof() counts the NUL, so this also steps over the '=' */
        p_name += sizeof("CID");
        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);

out:
    g_free(desc);
    g_free(tmp_desc);
    return ret;
}

/*
 * Check that the parent CID recorded for @bs matches the CID of its backing
 * file.  The comparison is skipped when it already succeeded once or when
 * there is no backing file.
 *
 * Returns 1 if the CID is valid, 0 otherwise.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t cur_pcid;

    if (!s->cid_checked && bs->backing) {
        BlockDriverState *p_bs = bs->backing->bs;

        if (p_bs->drv != bs->drv) {
            /* vmdk_read_cid() interprets p_bs->opaque as BDRVVmdkState, so
             * it must only be used on a node opened by this driver.  A
             * backing file in another format has no CID, which makes the
             * overlay's parent CID invalid. */
            return 0;
        }

        cur_pcid = vmdk_read_cid(p_bs, 0);
        if (s->parent_cid != cur_pcid) {
            /* CID not valid */
            return 0;
        }
    }
    s->cid_checked = true;
    /* CID valid */
    return 1;
}

K
Kevin Wolf 已提交
336
/* We have nothing to do for VMDK reopen, stubs just return success */
J
Jeff Cody 已提交
337 338 339 340 341
static int vmdk_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    assert(state != NULL);
    assert(state->bs != NULL);
K
Kevin Wolf 已提交
342
    return 0;
J
Jeff Cody 已提交
343 344
}

345
/*
 * Look for a parentFileNameHint="..." entry in the descriptor of @bs and,
 * if present, copy the quoted name into bs->backing_file.
 *
 * Returns 0 on success (including when there is no hint), a negative value
 * if the descriptor cannot be read, or -EINVAL if the hint is malformed or
 * does not fit into bs->backing_file.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    /* one spare zero byte keeps the descriptor NUL-terminated */
    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;

        p_name += strlen("parentFileNameHint");
        /* The key must be followed by '=' and '"'; do not step over the
         * terminating NUL of a descriptor that ends right after the key. */
        if (p_name[0] == '\0' || p_name[1] == '\0') {
            ret = -EINVAL;
            goto out;
        }
        p_name += 2;

        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            ret = -EINVAL;
            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
            ret = -EINVAL;
            goto out;
        }

        /* the size argument limits the copy to the text before the quote */
        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

out:
    g_free(desc);
    return ret;
}

F
Fam Zheng 已提交
382 383
/* Create and append extent to the extent array. Return the added VmdkExtent
 * address. return NULL if allocation failed. */
384
static int vmdk_add_extent(BlockDriverState *bs,
385
                           BdrvChild *file, bool flat, int64_t sectors,
F
Fam Zheng 已提交
386 387
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
388
                           int l2_size, uint64_t cluster_sectors,
F
Fam Zheng 已提交
389 390
                           VmdkExtent **new_extent,
                           Error **errp)
F
Fam Zheng 已提交
391 392 393
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
394
    int64_t nb_sectors;
F
Fam Zheng 已提交
395

396 397
    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
F
Fam Zheng 已提交
398 399
        error_setg(errp, "Invalid granularity, image may be corrupt");
        return -EFBIG;
400
    }
401 402 403 404 405
    if (l1_size > 512 * 1024 * 1024) {
        /* Although with big capacity and small l1_entry_sectors, we can get a
         * big l1_size, we don't want unbounded value to allocate the table.
         * Limit it to 512M, which is 16PB for default cluster and L2 table
         * size */
F
Fam Zheng 已提交
406
        error_setg(errp, "L1 size too big");
407 408
        return -EFBIG;
    }
409

410
    nb_sectors = bdrv_nb_sectors(file->bs);
411 412
    if (nb_sectors < 0) {
        return nb_sectors;
F
Fam Zheng 已提交
413 414
    }

415
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
F
Fam Zheng 已提交
416 417 418 419 420 421 422 423 424 425 426 427
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(VmdkExtent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
428
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
429
    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
F
Fam Zheng 已提交
430 431 432 433 434 435 436

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
        extent->end_sector = extent->sectors;
    }
    bs->total_sectors = extent->end_sector;
437 438 439 440
    if (new_extent) {
        *new_extent = extent;
    }
    return 0;
F
Fam Zheng 已提交
441 442
}

F
Fam Zheng 已提交
443 444
/*
 * Load the L1 table of @extent, and its backup L1 table if the extent has
 * one, converting the entries to host byte order.  The L2 cache is then
 * allocated.
 *
 * Returns 0 on success.  On failure a negative errno value is returned and
 * any table allocated here is freed again; @errp is set for read errors
 * but not for -ENOMEM.
 */
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                            Error **errp)
{
    int ret;
    size_t l1_size;
    unsigned int i;

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
    extent->l1_table = g_try_malloc(l1_size);
    if (l1_size && extent->l1_table == NULL) {
        return -ENOMEM;
    }

    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
                     l1_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read l1 table from extent '%s'",
                         extent->file->bs->filename);
        goto fail_l1;
    }
    for (i = 0; i < extent->l1_size; i++) {
        le32_to_cpus(&extent->l1_table[i]);
    }

    if (extent->l1_backup_table_offset) {
        extent->l1_backup_table = g_try_malloc(l1_size);
        if (l1_size && extent->l1_backup_table == NULL) {
            ret = -ENOMEM;
            goto fail_l1;
        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
                         l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Could not read l1 backup table from extent '%s'",
                             extent->file->bs->filename);
            goto fail_l1b;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
    return 0;

    /* clear the pointers so the extent never holds freed memory */
 fail_l1b:
    g_free(extent->l1_backup_table);
    extent->l1_backup_table = NULL;
 fail_l1:
    g_free(extent->l1_table);
    extent->l1_table = NULL;
    return ret;
}

F
Fam Zheng 已提交
502
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
503
                                 BdrvChild *file,
F
Fam Zheng 已提交
504
                                 int flags, Error **errp)
505 506 507 508 509 510
{
    int ret;
    uint32_t magic;
    VMDK3Header header;
    VmdkExtent *extent;

511
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
512
    if (ret < 0) {
F
Fam Zheng 已提交
513 514
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
515
                         file->bs->filename);
516
        return ret;
517
    }
518 519
    ret = vmdk_add_extent(bs, file, false,
                          le32_to_cpu(header.disk_sectors),
520
                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
521 522 523 524
                          0,
                          le32_to_cpu(header.l1dir_size),
                          4096,
                          le32_to_cpu(header.granularity),
F
Fam Zheng 已提交
525 526
                          &extent,
                          errp);
527 528 529
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
530
    ret = vmdk_init_tables(bs, extent, errp);
531
    if (ret) {
532 533
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
534 535 536 537
    }
    return ret;
}

538
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
539
                               QDict *options, Error **errp);
F
Fam Zheng 已提交
540

541
/*
 * Read descriptor text from @file starting at byte @desc_offset.
 *
 * The amount read is the file length, capped at 1 MiB - 1 bytes.  The
 * result is NUL-terminated at the index given by the read's return value.
 *
 * Returns a g_malloc'ed buffer that callers release with g_free(), or NULL
 * with @errp set if the file length is unavailable, the file is shorter
 * than 4 bytes, or the read fails.
 */
static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
{
    int64_t size;
    char *buf;
    int ret;

    size = bdrv_getlength(file->bs);
    if (size < 0) {
        error_setg_errno(errp, -size, "Could not access file");
        return NULL;
    }

    if (size < 4) {
        /* Both descriptor file and sparse image must be much larger than 4
         * bytes, also callers of vmdk_read_desc want to compare the first 4
         * bytes with VMDK4_MAGIC, let's error out if less is read. */
        error_setg(errp, "File is too small, not a valid image");
        return NULL;
    }

    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
    buf = g_malloc(size + 1);

    ret = bdrv_pread(file, desc_offset, buf, size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read from file");
        g_free(buf);
        return NULL;
    }
    buf[ret] = 0;

    return buf;
}

575
static int vmdk_open_vmdk4(BlockDriverState *bs,
576
                           BdrvChild *file,
K
Kevin Wolf 已提交
577
                           int flags, QDict *options, Error **errp)
578 579 580 581 582 583
{
    int ret;
    uint32_t magic;
    uint32_t l1_size, l1_entry_sectors;
    VMDK4Header header;
    VmdkExtent *extent;
F
Fam Zheng 已提交
584
    BDRVVmdkState *s = bs->opaque;
585
    int64_t l1_backup_offset = 0;
586
    bool compressed;
587

588
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
589
    if (ret < 0) {
F
Fam Zheng 已提交
590 591
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
592
                         file->bs->filename);
P
Paolo Bonzini 已提交
593
        return -EINVAL;
594
    }
595
    if (header.capacity == 0) {
596
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
597
        if (desc_offset) {
598
            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
599 600 601
            if (!buf) {
                return -EINVAL;
            }
K
Kevin Wolf 已提交
602
            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
603 604
            g_free(buf);
            return ret;
605
        }
F
Fam Zheng 已提交
606
    }
607

F
Fam Zheng 已提交
608 609 610 611
    if (!s->create_type) {
        s->create_type = g_strdup("monolithicSparse");
    }

612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
        /*
         * The footer takes precedence over the header, so read it in. The
         * footer starts at offset -1024 from the end: One sector for the
         * footer, and another one for the end-of-stream marker.
         */
        struct {
            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED footer_marker;

            uint32_t magic;
            VMDK4Header header;
            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];

            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

638
        ret = bdrv_pread(file,
K
Kevin Wolf 已提交
639
            bs->file->bs->total_sectors * 512 - 1536,
640 641
            &footer, sizeof(footer));
        if (ret < 0) {
642
            error_setg_errno(errp, -ret, "Failed to read footer");
643 644 645 646 647 648 649 650 651 652 653
            return ret;
        }

        /* Some sanity checks for the footer */
        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
            le32_to_cpu(footer.footer_marker.size) != 0  ||
            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
            le64_to_cpu(footer.eos_marker.val) != 0  ||
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
654
            error_setg(errp, "Invalid footer");
655 656 657 658 659 660
            return -EINVAL;
        }

        header = footer.header;
    }

661 662
    compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
663
    if (le32_to_cpu(header.version) > 3) {
664 665
        error_setg(errp, "Unsupported VMDK version %" PRIu32,
                   le32_to_cpu(header.version));
666
        return -ENOTSUP;
667 668
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
               !compressed) {
669 670 671 672 673 674
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
         * information. So we are safe to enable read only. */
        error_setg(errp, "VMDK version 3 must be read only");
        return -EINVAL;
675 676
    }

677
    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
P
Paolo Bonzini 已提交
678
        error_setg(errp, "L2 table size too big");
679 680 681
        return -EINVAL;
    }

682
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
683
                        * le64_to_cpu(header.granularity);
684
    if (l1_entry_sectors == 0) {
685
        error_setg(errp, "L1 entry size is invalid");
686 687
        return -EINVAL;
    }
688 689
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                / l1_entry_sectors;
690 691 692
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
693
    if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
694 695 696
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
697 698 699
        return -EINVAL;
    }

700
    ret = vmdk_add_extent(bs, file, false,
701 702
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
703
                          l1_backup_offset,
704
                          l1_size,
705
                          le32_to_cpu(header.num_gtes_per_gt),
706
                          le64_to_cpu(header.granularity),
F
Fam Zheng 已提交
707 708
                          &extent,
                          errp);
709 710 711
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
712 713
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
714 715 716 717
    if (extent->compressed) {
        g_free(s->create_type);
        s->create_type = g_strdup("streamOptimized");
    }
F
Fam Zheng 已提交
718
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
719 720
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
F
Fam Zheng 已提交
721
    ret = vmdk_init_tables(bs, extent, errp);
722
    if (ret) {
723 724
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
725 726 727 728
    }
    return ret;
}

729 730 731 732 733 734 735 736 737
/* find an option value out of descriptor file */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    char *opt_pos, *opt_end;
    const char *end = desc + strlen(desc);

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
F
Fam Zheng 已提交
738
        return VMDK_ERROR;
739 740 741 742
    }
    /* Skip "=\"" following opt_name */
    opt_pos += strlen(opt_name) + 2;
    if (opt_pos >= end) {
F
Fam Zheng 已提交
743
        return VMDK_ERROR;
744 745 746 747 748 749
    }
    opt_end = opt_pos;
    while (opt_end < end && *opt_end != '"') {
        opt_end++;
    }
    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
F
Fam Zheng 已提交
750
        return VMDK_ERROR;
751 752
    }
    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
F
Fam Zheng 已提交
753
    return VMDK_OK;
754 755
}

756
/* Open an extent file and append to bs array */
757
static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
K
Kevin Wolf 已提交
758
                            char *buf, QDict *options, Error **errp)
759 760 761
{
    uint32_t magic;

762
    magic = ldl_be_p(buf);
763 764
    switch (magic) {
        case VMDK3_MAGIC:
F
Fam Zheng 已提交
765
            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
766 767
            break;
        case VMDK4_MAGIC:
K
Kevin Wolf 已提交
768
            return vmdk_open_vmdk4(bs, file, flags, options, errp);
769 770
            break;
        default:
P
Paolo Bonzini 已提交
771 772
            error_setg(errp, "Image not in VMDK format");
            return -EINVAL;
773 774 775 776
            break;
    }
}

777 778 779 780 781 782 783 784 785 786 787
/*
 * Return a pointer to the character following the first '\n' in @s, or to
 * the terminating NUL of @s if it contains no newline.
 */
static const char *next_line(const char *s)
{
    const char *newline = strchr(s, '\n');

    if (newline != NULL) {
        return newline + 1;
    }
    return s + strlen(s);
}

788
/*
 * Scan descriptor text for extent lines, open each extent and append it to
 * the extent array of @bs.
 *
 * @desc:           NUL-terminated descriptor text
 * @desc_file_path: path of the descriptor file; relative extent names are
 *                  combined with it
 * @options:        options dictionary passed on when opening the
 *                  "extents.N" children
 *
 * Lines that do not look like an "RW" extent line are skipped.  A line that
 * starts like one but has the wrong number of fields for its type is
 * reported as invalid.
 *
 * Returns 0 on success or a negative errno value with @errp set.
 */
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                              const char *desc_file_path, QDict *options,
                              Error **errp)
{
    int ret;
    int matches;
    char access[11];
    char type[11];
    char fname[512];
    const char *p, *np;
    int64_t sectors = 0;
    int64_t flat_offset;
    char *extent_path;
    BdrvChild *extent_file;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;
    char extent_opt_prefix[32];
    Error *local_err = NULL;

    for (p = desc; *p; p = next_line(p)) {
        /* parse extent line in one of below formats:
         *
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
         * RW [size in sectors] SPARSE "file-name.vmdk"
         * RW [size in sectors] VMFS "file-name.vmdk"
         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
         */
        flat_offset = -1;
        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                         access, &sectors, type, fname, &flat_offset);
        if (matches < 4 || strcmp(access, "RW")) {
            continue;
        } else if (!strcmp(type, "FLAT")) {
            /* FLAT needs the trailing offset field */
            if (matches != 5 || flat_offset < 0) {
                goto invalid;
            }
        } else if (!strcmp(type, "VMFS")) {
            /* VMFS must not have an offset field; it starts at 0 */
            if (matches == 4) {
                flat_offset = 0;
            } else {
                goto invalid;
            }
        } else if (matches != 4) {
            goto invalid;
        }

        if (sectors <= 0 ||
            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
            (strcmp(access, "RW"))) {
            continue;
        }

        if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
            !desc_file_path[0])
        {
            error_setg(errp, "Cannot use relative extent paths with VMDK "
                       "descriptor file '%s'", bs->file->bs->filename);
            return -EINVAL;
        }

        extent_path = g_malloc0(PATH_MAX);
        path_combine(extent_path, PATH_MAX, desc_file_path, fname);

        ret = snprintf(extent_opt_prefix, sizeof(extent_opt_prefix),
                       "extents.%d", s->num_extents);
        assert(ret < sizeof(extent_opt_prefix));

        extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
                                      bs, &child_file, false, &local_err);
        g_free(extent_path);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* save to extents array */
        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
            /* FLAT extent */

            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            /* the offset in the descriptor is given in sectors */
            extent->flat_start_offset = flat_offset << 9;
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
            char *buf = vmdk_read_desc(extent_file, 0, errp);
            if (!buf) {
                ret = -EINVAL;
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
                                       options, errp);
            }
            g_free(buf);
            if (ret) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            /* the open routine appended the extent as the last entry */
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
            bdrv_unref_child(bs, extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
    }
    return 0;

invalid:
    /* report only the offending line, without its trailing newline */
    np = next_line(p);
    assert(np != p);
    if (np[-1] == '\n') {
        np--;
    }
    error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
    return -EINVAL;
}

908
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
909
                               QDict *options, Error **errp)
910 911 912 913 914 915
{
    int ret;
    char ct[128];
    BDRVVmdkState *s = bs->opaque;

    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
P
Paolo Bonzini 已提交
916 917
        error_setg(errp, "invalid VMDK image descriptor");
        ret = -EINVAL;
918
        goto exit;
919
    }
F
Fam Zheng 已提交
920
    if (strcmp(ct, "monolithicFlat") &&
P
Paolo Bonzini 已提交
921
        strcmp(ct, "vmfs") &&
F
Fam Zheng 已提交
922
        strcmp(ct, "vmfsSparse") &&
923
        strcmp(ct, "twoGbMaxExtentSparse") &&
F
Fam Zheng 已提交
924
        strcmp(ct, "twoGbMaxExtentFlat")) {
F
Fam Zheng 已提交
925
        error_setg(errp, "Unsupported image type '%s'", ct);
926 927
        ret = -ENOTSUP;
        goto exit;
928
    }
F
Fam Zheng 已提交
929
    s->create_type = g_strdup(ct);
930
    s->desc_offset = 0;
K
Kevin Wolf 已提交
931 932
    ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options,
                             errp);
933 934
exit:
    return ret;
935 936
}

M
Max Reitz 已提交
937 938
/*
 * Driver open entry point.
 *
 * Reads the start of bs->file and dispatches on its first four bytes (read
 * big-endian): a VMDK3/VMDK4 magic means a binary sparse image, anything else
 * is treated as a text descriptor.  After the extents are opened, the parent
 * image is opened (if any), both CIDs are cached, the state lock is
 * initialised and a migration blocker is installed.
 *
 * Returns 0 on success.  On failure returns the error from the failing step
 * after releasing create_type and all extents opened so far.
 */
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;
    char *desc;
    int ret;

    desc = vmdk_read_desc(bs->file, 0, errp);
    if (!desc) {
        return -EINVAL;
    }

    magic = ldl_be_p(desc);
    if (magic == VMDK3_MAGIC || magic == VMDK4_MAGIC) {
        ret = vmdk_open_sparse(bs, bs->file, flags, desc, options, errp);
        /* set unconditionally, i.e. also when vmdk_open_sparse() failed */
        s->desc_offset = 0x200;
    } else {
        ret = vmdk_open_desc_file(bs, flags, desc, options, errp);
    }

    if (!ret) {
        /* try to open parent images, if exist */
        ret = vmdk_parent_open(bs);
    }
    if (ret) {
        g_free(desc);
        g_free(s->create_type);
        s->create_type = NULL;
        vmdk_free_extents(bs);
        return ret;
    }

    s->cid = vmdk_read_cid(bs, 0);
    s->parent_cid = vmdk_read_cid(bs, 1);
    qemu_co_mutex_init(&s->lock);

    /* Disable migration when VMDK images are used */
    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    migrate_add_blocker(s->migration_blocker);

    g_free(desc);
    return 0;
}

991

992
/*
 * Raise bs->bl.pwrite_zeroes_alignment to the largest cluster size (in
 * bytes) found among the non-flat extents.  Flat extents are ignored.
 */
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVVmdkState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        if (s->extents[idx].flat) {
            continue;
        }
        bs->bl.pwrite_zeroes_alignment =
            MAX(bs->bl.pwrite_zeroes_alignment,
                s->extents[idx].cluster_sectors << BDRV_SECTOR_BITS);
    }
}

F
Fam Zheng 已提交
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
/**
 * get_whole_cluster
 *
 * Copy backing file's cluster that covers @sector_num, otherwise write zero,
 * to the cluster at @cluster_sector_num.
 *
 * If @skip_start_sector < @skip_end_sector, the relative range
 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
 * it for call to write user data in the request.
 */
F
Fam Zheng 已提交
1016
static int get_whole_cluster(BlockDriverState *bs,
F
Fam Zheng 已提交
1017
                             VmdkExtent *extent,
1018 1019 1020 1021
                             uint64_t cluster_offset,
                             uint64_t offset,
                             uint64_t skip_start_bytes,
                             uint64_t skip_end_bytes)
1022
{
1023
    int ret = VMDK_OK;
F
Fam Zheng 已提交
1024 1025 1026 1027 1028
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
1029
    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
F
Fam Zheng 已提交
1030
    whole_grain = qemu_blockalign(bs, cluster_bytes);
1031

1032
    if (!bs->backing) {
1033 1034
        memset(whole_grain, 0, skip_start_bytes);
        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1035 1036
    }

1037
    assert(skip_end_bytes <= cluster_bytes);
1038 1039
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
1040
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
F
Fam Zheng 已提交
1041 1042 1043
        ret = VMDK_ERROR;
        goto exit;
    }
1044

F
Fam Zheng 已提交
1045
    /* Read backing data before skip range */
1046
    if (skip_start_bytes > 0) {
1047
        if (bs->backing) {
1048
            ret = bdrv_pread(bs->backing, offset, whole_grain,
1049
                             skip_start_bytes);
F
Fam Zheng 已提交
1050 1051 1052 1053 1054
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
1055
        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
1056
                          skip_start_bytes);
K
Kevin Wolf 已提交
1057
        if (ret < 0) {
1058 1059
            ret = VMDK_ERROR;
            goto exit;
K
Kevin Wolf 已提交
1060
        }
F
Fam Zheng 已提交
1061 1062
    }
    /* Read backing data after skip range */
1063
    if (skip_end_bytes < cluster_bytes) {
1064
        if (bs->backing) {
1065
            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
1066 1067
                             whole_grain + skip_end_bytes,
                             cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1068 1069 1070 1071 1072
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
1073
        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
1074 1075
                          whole_grain + skip_end_bytes,
                          cluster_bytes - skip_end_bytes);
K
Kevin Wolf 已提交
1076
        if (ret < 0) {
1077 1078
            ret = VMDK_ERROR;
            goto exit;
1079 1080
        }
    }
F
Fam Zheng 已提交
1081

1082
    ret = VMDK_OK;
1083 1084 1085
exit:
    qemu_vfree(whole_grain);
    return ret;
1086 1087
}

F
Fam Zheng 已提交
1088 1089
/*
 * Store the grain table entry @offset (converted to little-endian) at the
 * L2 position described by @m_data, synchronously.
 *
 * If the extent has a backup L1 table, the same entry is also written to the
 * backup L2 table; note that @m_data->l2_offset is overwritten with the
 * backup table's offset in that case.  Finally the cached copy of the entry
 * is refreshed when @m_data->l2_cache_entry is set.
 *
 * Returns: VMDK_OK on success, VMDK_ERROR if either write fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    int64_t entry_pos;

    offset = cpu_to_le32(offset);

    /* update L2 table */
    entry_pos = ((int64_t)m_data->l2_offset * 512)
                + (m_data->l2_index * sizeof(offset));
    if (bdrv_pwrite_sync(extent->file, entry_pos,
                         &offset, sizeof(offset)) < 0) {
        return VMDK_ERROR;
    }

    /* update backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        entry_pos = ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(offset));
        if (bdrv_pwrite_sync(extent->file, entry_pos,
                             &offset, sizeof(offset)) < 0) {
            return VMDK_ERROR;
        }
    }

    /* keep the in-memory L2 cache coherent with what was written */
    if (m_data->l2_cache_entry) {
        *m_data->l2_cache_entry = offset;
    }

    return VMDK_OK;
}

F
Fam Zheng 已提交
1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
/**
 * get_cluster_offset
 *
 * Look up cluster offset in extent file by sector number, and store in
 * @cluster_offset.
 *
 * For flat extents, the start offset as parsed from the description file is
 * returned.
 *
 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
 * offset for a new cluster and update L2 cache. If there is a backing file,
 * COW is done before returning; otherwise, zeroes are written to the allocated
 * cluster. Both COW and zero writing skips the sector range
 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
 * has new data to write there.
 *
 * Returns: VMDK_OK if cluster exists and mapped in the image.
 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
 *          VMDK_ERROR if failed.
 */
1136
static int get_cluster_offset(BlockDriverState *bs,
F
Fam Zheng 已提交
1137 1138 1139 1140 1141
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
1142 1143
                              uint64_t skip_start_bytes,
                              uint64_t skip_end_bytes)
B
bellard 已提交
1144 1145 1146
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
1147
    uint32_t min_count, *l2_table;
1148
    bool zeroed = false;
F
Fam Zheng 已提交
1149
    int64_t ret;
1150
    int64_t cluster_sector;
1151

F
Fam Zheng 已提交
1152
    if (m_data) {
1153
        m_data->valid = 0;
F
Fam Zheng 已提交
1154
    }
1155
    if (extent->flat) {
1156
        *cluster_offset = extent->flat_start_offset;
F
Fam Zheng 已提交
1157
        return VMDK_OK;
1158
    }
1159

F
Fam Zheng 已提交
1160
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
F
Fam Zheng 已提交
1161 1162
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
F
Fam Zheng 已提交
1163
        return VMDK_ERROR;
F
Fam Zheng 已提交
1164 1165 1166
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
F
Fam Zheng 已提交
1167
        return VMDK_UNALLOC;
F
Fam Zheng 已提交
1168
    }
1169
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1170
        if (l2_offset == extent->l2_cache_offsets[i]) {
B
bellard 已提交
1171
            /* increment the hit count */
F
Fam Zheng 已提交
1172
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1173
                for (j = 0; j < L2_CACHE_SIZE; j++) {
F
Fam Zheng 已提交
1174
                    extent->l2_cache_counts[j] >>= 1;
B
bellard 已提交
1175 1176
                }
            }
F
Fam Zheng 已提交
1177
            l2_table = extent->l2_cache + (i * extent->l2_size);
B
bellard 已提交
1178 1179 1180 1181 1182 1183
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
1184
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1185 1186
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
B
bellard 已提交
1187 1188 1189
            min_index = i;
        }
    }
F
Fam Zheng 已提交
1190
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
1191
    if (bdrv_pread(extent->file,
F
Fam Zheng 已提交
1192 1193 1194 1195
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
F
Fam Zheng 已提交
1196
        return VMDK_ERROR;
F
Fam Zheng 已提交
1197
    }
1198

F
Fam Zheng 已提交
1199 1200
    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
B
bellard 已提交
1201
 found:
F
Fam Zheng 已提交
1202
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
F
Fam Zheng 已提交
1203
    cluster_sector = le32_to_cpu(l2_table[l2_index]);
1204

F
Fam Zheng 已提交
1205
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
1206 1207 1208
        zeroed = true;
    }

F
Fam Zheng 已提交
1209
    if (!cluster_sector || zeroed) {
1210
        if (!allocate) {
1211
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1212
        }
1213

F
Fam Zheng 已提交
1214 1215
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;
1216 1217 1218 1219 1220 1221

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
1222 1223
        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
                                offset, skip_start_bytes, skip_end_bytes);
F
Fam Zheng 已提交
1224 1225
        if (ret) {
            return ret;
1226
        }
1227 1228 1229 1230 1231 1232 1233
        if (m_data) {
            m_data->valid = 1;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->l2_cache_entry = &l2_table[l2_index];
        }
1234
    }
F
Fam Zheng 已提交
1235
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
F
Fam Zheng 已提交
1236
    return VMDK_OK;
B
bellard 已提交
1237 1238
}

F
Fam Zheng 已提交
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
/*
 * Return the first extent, scanning forward from @start_hint (or from the
 * first extent when @start_hint is NULL), whose end_sector lies beyond
 * @sector_num.  Returns NULL when the scan runs off the end of the array.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *cur;

    for (cur = start_hint ? start_hint : &s->extents[0];
         cur < &s->extents[s->num_extents];
         cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}

1256 1257 1258
/*
 * Translate the guest byte @offset into a byte offset inside its cluster:
 * subtract the byte position at which @extent begins, then take the
 * remainder modulo the extent's cluster size in bytes.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    uint64_t extent_begin_offset =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
    uint64_t extent_relative_offset = offset - extent_begin_offset;

    return extent_relative_offset % cluster_size;
}

1268 1269 1270
/*
 * Sector-granularity wrapper around vmdk_find_offset_in_cluster(): returns
 * the index of @sector_num within its cluster, counted in sectors.
 */
static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
                                                  int64_t sector_num)
{
    return vmdk_find_offset_in_cluster(extent,
                                       sector_num * BDRV_SECTOR_SIZE)
           / BDRV_SECTOR_SIZE;
}

1276
/*
 * Report the allocation status of the cluster containing @sector_num.
 *
 * The lookup is done with get_cluster_offset() (no allocation) under the
 * state lock.  *@pnum receives the number of sectors, starting at
 * @sector_num and capped at @nb_sectors, that remain in that cluster.
 *
 * Returns: -EIO if no extent covers @sector_num (consistent with the read
 *          and write paths) or the lookup failed; 0 for an unmapped cluster;
 *          BDRV_BLOCK_ZERO for a zero grain; BDRV_BLOCK_DATA for a mapped
 *          cluster, with BDRV_BLOCK_OFFSET_VALID and the host offset added
 *          for uncompressed extents and *@file set to the extent's node.
 */
static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
    BDRVVmdkState *s = bs->opaque;
    int64_t index_in_cluster, n, ret;
    uint64_t offset;
    VmdkExtent *extent;

    extent = find_extent(s, sector_num, NULL);
    if (!extent) {
        /*
         * Returning 0 here would claim "unallocated" while leaving *pnum
         * unwritten; report an error instead.
         */
        return -EIO;
    }
    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL,
                             sector_num * 512, false, &offset,
                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
    switch (ret) {
    case VMDK_ERROR:
        ret = -EIO;
        break;
    case VMDK_UNALLOC:
        ret = 0;
        break;
    case VMDK_ZEROED:
        ret = BDRV_BLOCK_ZERO;
        break;
    case VMDK_OK:
        ret = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            /* host offset is only reported for uncompressed extents */
            ret |= BDRV_BLOCK_OFFSET_VALID;
            ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS))
                    & BDRV_BLOCK_OFFSET_MASK;
        }
        *file = extent->file->bs;
        break;
    }

    /* status is valid up to the end of this cluster at most */
    n = extent->cluster_sectors - index_in_cluster;
    if (n > nb_sectors) {
        n = nb_sectors;
    }
    *pnum = n;
    return ret;
}

1324
/*
 * Write @n_bytes taken from @qiov (starting at @qiov_offset) into @extent at
 * @cluster_offset + @offset_in_cluster.
 *
 * For compressed extents the payload is deflated with zlib compress() and
 * written behind a grain marker whose lba (the guest sector of @offset) and
 * size fields are stored little-endian, matching how vmdk_read_extent()
 * decodes the marker.  Compressed extents without markers are rejected.
 *
 * extent->next_cluster_sector is advanced to the sector after the written
 * range (unconditionally for compressed extents, only forward for others);
 * this happens even if the write itself failed.
 *
 * Returns: 0 on success, -EINVAL if the extent is compressed without marker
 *          or compression fails, otherwise the negative error from
 *          bdrv_co_pwritev().
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            uint64_t qiov_offset, uint64_t n_bytes,
                            uint64_t offset)
{
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
    QEMUIOVector local_qiov;
    struct iovec iov;
    int64_t write_offset;
    int64_t write_end_sector;

    if (extent->compressed) {
        void *plain_data;

        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        /* twice the cluster size leaves room for incompressible input */
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));

        /* linearise the request so zlib can consume it */
        plain_data = g_malloc(n_bytes);
        qemu_iovec_to_buf(qiov, qiov_offset, plain_data, n_bytes);
        ret = compress(data->data, &buf_len, plain_data, n_bytes);
        g_free(plain_data);

        if (ret != Z_OK || buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }

        /* marker fields are little-endian on disk */
        data->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
        data->size = cpu_to_le32(buf_len);

        n_bytes = buf_len + sizeof(VmdkGrainMarker);
        iov = (struct iovec) {
            .iov_base   = data,
            .iov_len    = n_bytes,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    } else {
        qemu_iovec_init(&local_qiov, qiov->niov);
        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
    }

    write_offset = cluster_offset + offset_in_cluster;
    ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes,
                          &local_qiov, 0);

    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);

    if (extent->compressed) {
        extent->next_cluster_sector = write_end_sector;
    } else {
        extent->next_cluster_sector = MAX(extent->next_cluster_sector,
                                          write_end_sector);
    }

    if (ret < 0) {
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
    if (!extent->compressed) {
        /* only the non-compressed path owns a heap-allocated iovec array */
        qemu_iovec_destroy(&local_qiov);
    }
    return ret;
}

/*
 * Read @bytes from @extent at @cluster_offset + @offset_in_cluster into
 * @qiov.
 *
 * Uncompressed extents are read directly.  For compressed extents two
 * clusters' worth of raw data is read at @cluster_offset (marker plus
 * compressed payload may exceed one cluster), the payload is inflated with
 * zlib uncompress() into a one-cluster buffer and the requested slice is
 * copied out.  When the extent uses grain markers the payload length comes
 * from the marker's little-endian size field; it is rejected if it is zero
 * or would extend past the data that was actually read.
 *
 * Returns: 0 on success, a negative errno from the underlying read, or
 *          -EINVAL for a bad marker size, a decompression failure, or a
 *          request that does not fit the decompressed data.
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            int bytes)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    size_t avail_bytes;
    VmdkGrainMarker *marker;
    uLongf buf_len;


    if (!extent->compressed) {
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
        if (ret < 0) {
            return ret;
        }
        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    ret = bdrv_pread(extent->file,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }
    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        marker = (VmdkGrainMarker *)cluster_buf;
        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }
    /*
     * The payload starts after the marker header (if any), so only the
     * remainder of cluster_buf may be handed to uncompress().
     */
    avail_bytes = buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || data_len > avail_bytes) {
        ret = -EINVAL;
        goto out;
    }
    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;

    }
    /* buf_len now holds the decompressed size */
    if (offset_in_cluster < 0 ||
            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}

1461 1462 1463
/*
 * Read @bytes starting at guest byte @offset into @qiov.
 *
 * The request is processed one cluster at a time under the state lock.  For
 * each piece the cluster is looked up without allocating:
 *   - lookup error            -> fail with -EIO;
 *   - mapped                  -> read from the extent via vmdk_read_extent();
 *   - unmapped, with backing  -> read from the backing file (after checking
 *                                the parent CID), unless it is a zero grain;
 *   - otherwise               -> the destination range is zero-filled.
 *
 * Returns: 0 on success; -EIO if no extent covers the offset or the cluster
 *          lookup failed; -EINVAL if the parent CID is invalid; or the error
 *          from the underlying read.
 */
static int coroutine_fn
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
               QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n_bytes, offset_in_cluster;
    VmdkExtent *extent = NULL;
    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
    uint64_t bytes_done = 0;

    qemu_iovec_init(&local_qiov, qiov->niov);
    qemu_co_mutex_lock(&s->lock);

    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            ret = -EIO;
            goto fail;
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 offset, false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            /*
             * A failed lookup is not the same as "unallocated": do not
             * silently substitute backing data or zeroes for it.
             */
            ret = -EIO;
            goto fail;
        }
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);

        /* never cross a cluster boundary in one step */
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
                    ret = -EINVAL;
                    goto fail;
                }

                qemu_iovec_reset(&local_qiov);
                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
                                     &local_qiov, 0);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            }
        } else {
            qemu_iovec_reset(&local_qiov);
            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
                                   &local_qiov, n_bytes);
            if (ret) {
                goto fail;
            }
        }
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
    }

    ret = 0;
fail:
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

F
Fam Zheng 已提交
1531 1532 1533
/**
 * vmdk_write:
 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1534 1535 1536 1537
 *                if possible, otherwise return -ENOTSUP.
 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
 *                with each cluster. By dry run we can find if the zero write
 *                is possible without modifying image data.
F
Fam Zheng 已提交
1538 1539 1540
 *
 * Returns: error code with 0 for success.
 */
1541 1542 1543
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                       uint64_t bytes, QEMUIOVector *qiov,
                       bool zeroed, bool zero_dry_run)
B
bellard 已提交
1544
{
1545
    BDRVVmdkState *s = bs->opaque;
F
Fam Zheng 已提交
1546
    VmdkExtent *extent = NULL;
F
Fam Zheng 已提交
1547
    int ret;
1548
    int64_t offset_in_cluster, n_bytes;
1549
    uint64_t cluster_offset;
1550
    uint64_t bytes_done = 0;
F
Fam Zheng 已提交
1551
    VmdkMetaData m_data;
1552

1553 1554
    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
        error_report("Wrong offset: offset=0x%" PRIx64
1555
                     " total_sectors=0x%" PRIx64,
1556
                     offset, bs->total_sectors);
1557
        return -EIO;
1558 1559
    }

1560 1561
    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
F
Fam Zheng 已提交
1562 1563 1564
        if (!extent) {
            return -EIO;
        }
1565 1566 1567 1568 1569
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1570
                                 !(extent->compressed || zeroed),
1571 1572
                                 &cluster_offset, offset_in_cluster,
                                 offset_in_cluster + n_bytes);
F
Fam Zheng 已提交
1573
        if (extent->compressed) {
F
Fam Zheng 已提交
1574
            if (ret == VMDK_OK) {
F
Fam Zheng 已提交
1575
                /* Refuse write to allocated cluster for streamOptimized */
F
Fam Zheng 已提交
1576 1577
                error_report("Could not write to allocated cluster"
                              " for streamOptimized");
F
Fam Zheng 已提交
1578 1579 1580
                return -EIO;
            } else {
                /* allocate */
1581
                ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1582
                                         true, &cluster_offset, 0, 0);
F
Fam Zheng 已提交
1583 1584
            }
        }
F
Fam Zheng 已提交
1585
        if (ret == VMDK_ERROR) {
1586
            return -EINVAL;
F
Fam Zheng 已提交
1587
        }
F
Fam Zheng 已提交
1588 1589 1590
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
1591 1592 1593
                    offset_in_cluster == 0 &&
                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
F
Fam Zheng 已提交
1594 1595
                if (!zero_dry_run) {
                    /* update L2 tables */
F
Fam Zheng 已提交
1596 1597
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
F
Fam Zheng 已提交
1598 1599 1600 1601 1602 1603 1604
                        return -EIO;
                    }
                }
            } else {
                return -ENOTSUP;
            }
        } else {
1605 1606
            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
                                    qiov, bytes_done, n_bytes, offset);
F
Fam Zheng 已提交
1607 1608 1609 1610 1611
            if (ret) {
                return ret;
            }
            if (m_data.valid) {
                /* update L2 tables */
F
Fam Zheng 已提交
1612 1613 1614
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
F
Fam Zheng 已提交
1615 1616
                    return -EIO;
                }
F
Fam Zheng 已提交
1617
            }
1618
        }
1619 1620 1621
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
1622

F
Fam Zheng 已提交
1623 1624
        /* update CID on the first write every time the virtual disk is
         * opened */
1625
        if (!s->cid_updated) {
F
Fam Zheng 已提交
1626
            ret = vmdk_write_cid(bs, g_random_int());
K
Kevin Wolf 已提交
1627 1628 1629
            if (ret < 0) {
                return ret;
            }
1630
            s->cid_updated = true;
1631
        }
1632 1633
    }
    return 0;
B
bellard 已提交
1634 1635
}

1636 1637 1638
/*
 * Coroutine write entry point: performs a plain data write through
 * vmdk_pwritev() (zeroed and dry-run both false) while holding the state
 * lock, and returns its result.  @flags is not used.
 */
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    int status;

    qemu_co_mutex_lock(&s->lock);
    status = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&s->lock);

    return status;
}

1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671
/*
 * Argument/result bundle handed from vmdk_write_compressed() to the
 * coroutine entry point vmdk_co_write_compressed().
 */
typedef struct VmdkWriteCompressedCo {
    BlockDriverState *bs;       /* device to write to */
    int64_t sector_num;         /* first sector of the request */
    const uint8_t *buf;         /* source data, nb_sectors sectors long */
    int nb_sectors;             /* request length in sectors */
    int ret;                    /* -EINPROGRESS until the coroutine stores
                                 * the result of vmdk_pwritev() here */
} VmdkWriteCompressedCo;

static void vmdk_co_write_compressed(void *opaque)
{
    VmdkWriteCompressedCo *co = opaque;
    QEMUIOVector local_qiov;
    uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE;
    uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE;

    struct iovec iov = (struct iovec) {
        .iov_base   = (uint8_t*) co->buf,
        .iov_len    = bytes,
    };
    qemu_iovec_init_external(&local_qiov, &iov, 1);

    co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false);
}

1672 1673 1674 1675 1676 1677
/*
 * Synchronous compressed-write entry point.
 *
 * Only images consisting of exactly one compressed extent are supported;
 * anything else yields -ENOTSUP.  The write itself runs in a coroutine
 * (vmdk_co_write_compressed); this function polls the node's AioContext
 * until the coroutine has replaced the -EINPROGRESS placeholder with the
 * final result, which is then returned.
 */
static int vmdk_write_compressed(BlockDriverState *bs,
                                 int64_t sector_num,
                                 const uint8_t *buf,
                                 int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkWriteCompressedCo data;
    AioContext *aio_context;
    Coroutine *co;

    if (s->num_extents != 1 || !s->extents[0].compressed) {
        return -ENOTSUP;
    }

    aio_context = bdrv_get_aio_context(bs);

    data.bs         = bs;
    data.sector_num = sector_num;
    data.buf        = buf;
    data.nb_sectors = nb_sectors;
    data.ret        = -EINPROGRESS;

    co = qemu_coroutine_create(vmdk_co_write_compressed, &data);
    qemu_coroutine_enter(co);

    /* wait for the coroutine to publish its result */
    while (data.ret == -EINPROGRESS) {
        aio_poll(aio_context, true);
    }
    return data.ret;
}

1700 1701 1702 1703
/*
 * Coroutine write-zeroes entry point.
 *
 * Under the state lock, the request is first run through vmdk_pwritev() as a
 * dry run; only if that reports success is it repeated for real.  This keeps
 * a request that cannot be fully served (e.g. not cluster aligned) from
 * modifying part of the image before failing.  @flags is not used.
 *
 * Returns the result of the dry run if it failed, otherwise the result of
 * the real zero write.
 */
static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int bytes,
                                              BdrvRequestFlags flags)
{
    BDRVVmdkState *s = bs->opaque;
    int status;

    qemu_co_mutex_lock(&s->lock);

    /* probe first: zeroed == true, zero_dry_run == true */
    status = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (status == 0) {
        /* probe succeeded, now actually update the image */
        status = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }

    qemu_co_mutex_unlock(&s->lock);
    return status;
}

1719
static int vmdk_create_extent(const char *filename, int64_t filesize,
1720
                              bool flat, bool compress, bool zeroed_grain,
1721
                              QemuOpts *opts, Error **errp)
1722
{
F
Fam Zheng 已提交
1723
    int ret, i;
1724
    BlockBackend *blk = NULL;
1725
    VMDK4Header header;
F
Fam Zheng 已提交
1726
    Error *local_err = NULL;
1727 1728 1729
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
    uint32_t *gd_buf = NULL;
    int gd_buf_size;
1730

1731
    ret = bdrv_create_file(filename, opts, &local_err);
1732 1733 1734
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
1735
    }
1736

1737
    blk = blk_new_open(filename, NULL, NULL,
1738
                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
1739
    if (blk == NULL) {
1740
        error_propagate(errp, local_err);
1741
        ret = -EIO;
1742 1743 1744
        goto exit;
    }

1745 1746
    blk_set_allow_write_beyond_eof(blk, true);

F
Fam Zheng 已提交
1747
    if (flat) {
1748
        ret = blk_truncate(blk, filesize);
F
Fam Zheng 已提交
1749
        if (ret < 0) {
1750
            error_setg_errno(errp, -ret, "Could not truncate file");
F
Fam Zheng 已提交
1751 1752
        }
        goto exit;
1753
    }
1754 1755
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
1756 1757 1758 1759 1760 1761 1762
    if (compress) {
        header.version = 3;
    } else if (zeroed_grain) {
        header.version = 2;
    } else {
        header.version = 1;
    }
F
Fam Zheng 已提交
1763
    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1764 1765
                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1766
    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1767
    header.capacity = filesize / BDRV_SECTOR_SIZE;
A
Alexander Graf 已提交
1768
    header.granularity = 128;
1769
    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
1770

1771 1772 1773 1774 1775
    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                           BDRV_SECTOR_SIZE);
    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
1776 1777 1778 1779

    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
1780
    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
1781
    header.grain_offset =
1782 1783
        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                 header.granularity);
A
Alexander Graf 已提交
1784 1785 1786 1787 1788
    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
1789
    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
1790 1791 1792 1793 1794
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);
1795
    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1796 1797 1798 1799 1800

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;
1801 1802

    /* write all the data */
1803
    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
1804
    if (ret < 0) {
1805
        error_setg(errp, QERR_IO_ERROR);
1806 1807
        goto exit;
    }
1808
    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
1809
    if (ret < 0) {
1810
        error_setg(errp, QERR_IO_ERROR);
1811 1812
        goto exit;
    }
1813

1814
    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9);
1815
    if (ret < 0) {
1816
        error_setg_errno(errp, -ret, "Could not truncate file");
1817 1818
        goto exit;
    }
1819 1820

    /* write grain directory */
1821 1822 1823
    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
    gd_buf = g_malloc0(gd_buf_size);
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
1824
         i < gt_count; i++, tmp += gt_size) {
1825 1826
        gd_buf[i] = cpu_to_le32(tmp);
    }
1827
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
1828
                     gd_buf, gd_buf_size, 0);
1829
    if (ret < 0) {
1830
        error_setg(errp, QERR_IO_ERROR);
1831
        goto exit;
1832
    }
1833

1834
    /* write backup grain directory */
1835
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
1836
         i < gt_count; i++, tmp += gt_size) {
1837 1838
        gd_buf[i] = cpu_to_le32(tmp);
    }
1839
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
1840
                     gd_buf, gd_buf_size, 0);
1841
    if (ret < 0) {
1842
        error_setg(errp, QERR_IO_ERROR);
1843
        goto exit;
1844
    }
1845

F
Fam Zheng 已提交
1846
    ret = 0;
1847
exit:
1848 1849
    if (blk) {
        blk_unref(blk);
1850 1851
    }
    g_free(gd_buf);
F
Fam Zheng 已提交
1852 1853 1854 1855
    return ret;
}

/*
 * Split @filename into three pieces, each stored in a caller-provided
 * buffer of @buf_len bytes:
 *
 *   @path    - everything up to and including the last separator, or an
 *              empty string when no separator is present.  The separator is
 *              the last '/', else the last '\\', else the last ':'.
 *   @prefix  - the part after the separator up to (not including) the last
 *              '.', or the whole remainder when there is no '.'.
 *   @postfix - the last '.' and everything after it, or an empty string.
 *
 * Returns VMDK_OK on success.  Returns VMDK_ERROR and sets @errp when
 * @filename is NULL or empty, or when the directory part or the base name
 * does not fit into @buf_len bytes.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        p++;
        if (p - filename >= buf_len) {
            /* Callers report errp to the user, so never fail silently. */
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        /* Size p - filename + 1 copies the directory part plus the NUL. */
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if (q - p >= buf_len) {
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}

1895
/*
 * Create a VMDK image named @filename according to @opts.
 *
 * Options read (and removed) from @opts: size, adapter type, backing file,
 * hardware version, compat6, subformat and zeroed-grain.
 *
 * One extent is created per loop iteration with vmdk_create_extent(); for
 * the twoGbMaxExtent* subformats each extent is at most 0x80000000 bytes
 * and is named "<prefix>-[fs]NNN<postfix>", a monolithic flat image uses
 * "<prefix>-flat<postfix>", and any other subformat uses @filename itself.
 * The text descriptor is then written either at offset 0x200 of the single
 * sparse extent, or as a separate file @filename that is truncated to the
 * descriptor length.
 *
 * Returns 0 on success or a negative errno value, reporting the failure
 * through @errp.
 */
static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
{
    int idx = 0;
    BlockBackend *new_blk = NULL;
    Error *local_err = NULL;
    char *desc = NULL;
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
    char *hw_version = NULL;
    char *fmt = NULL;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
    char *path = g_malloc0(PATH_MAX);
    char *prefix = g_malloc0(PATH_MAX);
    char *postfix = g_malloc0(PATH_MAX);
    char *desc_line = g_malloc0(BUF_SIZE);
    char *ext_filename = g_malloc0(PATH_MAX);
    char *desc_filename = g_malloc0(PATH_MAX);
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
    const char *desc_extent_line;
    char *parent_desc_line = g_malloc0(BUF_SIZE);
    uint32_t parent_cid = 0xffffffff;
    uint32_t number_heads = 16;
    bool zeroed_grain = false;
    uint32_t desc_offset = 0, desc_len;
    const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%" PRIx32 "\n"
        "parentCID=%" PRIx32 "\n"
        "createType=\"%s\"\n"
        "%s"
        "\n"
        "# Extent description\n"
        "%s"
        "\n"
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"%s\"\n";

    ext_desc_lines = g_string_new(NULL);

    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
        ret = -EINVAL;
        goto exit;
    }
    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (!hw_version) {
        /*
         * Like adapter_type and fmt below, tolerate an absent option instead
         * of handing NULL to strcmp(); "undefined" is the declared default.
         */
        hw_version = g_strdup("undefined");
    }
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
        if (strcmp(hw_version, "undefined")) {
            error_setg(errp,
                       "compat6 cannot be enabled with hwversion set");
            ret = -EINVAL;
            goto exit;
        }
        g_free(hw_version);
        hw_version = g_strdup("6");
    }
    if (strcmp(hw_version, "undefined") == 0) {
        g_free(hw_version);
        hw_version = g_strdup("4");
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
        zeroed_grain = true;
    }

    if (!adapter_type) {
        adapter_type = g_strdup("ide");
    } else if (strcmp(adapter_type, "ide") &&
               strcmp(adapter_type, "buslogic") &&
               strcmp(adapter_type, "lsilogic") &&
               strcmp(adapter_type, "legacyESX")) {
        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
        ret = -EINVAL;
        goto exit;
    }
    if (strcmp(adapter_type, "ide") != 0) {
        /* that's the number of heads with which vmware operates when
           creating, exporting, etc. vmdk files with a non-ide adapter type */
        number_heads = 255;
    }
    if (!fmt) {
        /* Default format to monolithicSparse */
        fmt = g_strdup("monolithicSparse");
    } else if (strcmp(fmt, "monolithicFlat") &&
               strcmp(fmt, "monolithicSparse") &&
               strcmp(fmt, "twoGbMaxExtentSparse") &&
               strcmp(fmt, "twoGbMaxExtentFlat") &&
               strcmp(fmt, "streamOptimized")) {
        error_setg(errp, "Unknown subformat: '%s'", fmt);
        ret = -EINVAL;
        goto exit;
    }
    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
              strcmp(fmt, "twoGbMaxExtentSparse"));
    flat = !(strcmp(fmt, "monolithicFlat") &&
             strcmp(fmt, "twoGbMaxExtentFlat"));
    compress = !strcmp(fmt, "streamOptimized");
    if (flat) {
        desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n";
    } else {
        desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n";
    }
    if (flat && backing_file) {
        error_setg(errp, "Flat image can't have backing file");
        ret = -ENOTSUP;
        goto exit;
    }
    if (flat && zeroed_grain) {
        error_setg(errp, "Flat image can't enable zeroed grain");
        ret = -ENOTSUP;
        goto exit;
    }
    if (backing_file) {
        BlockBackend *blk;
        char *full_backing = g_new0(char, PATH_MAX);
        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                     full_backing, PATH_MAX,
                                                     &local_err);
        if (local_err) {
            g_free(full_backing);
            error_propagate(errp, local_err);
            ret = -ENOENT;
            goto exit;
        }

        blk = blk_new_open(full_backing, NULL, NULL,
                           BDRV_O_NO_BACKING, errp);
        g_free(full_backing);
        if (blk == NULL) {
            ret = -EIO;
            goto exit;
        }
        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
            /* Report why we fail; errp was previously left unset here. */
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(blk)->drv->format_name);
            blk_unref(blk);
            ret = -EINVAL;
            goto exit;
        }
        parent_cid = vmdk_read_cid(blk_bs(blk), 0);
        blk_unref(blk);
        snprintf(parent_desc_line, BUF_SIZE,
                "parentFileNameHint=\"%s\"", backing_file);
    }

    /* Create extents */
    filesize = total_size;
    while (filesize > 0) {
        int64_t size = filesize;

        if (split && size > split_size) {
            size = split_size;
        }
        if (split) {
            snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s",
                    prefix, flat ? 'f' : 's', ++idx, postfix);
        } else if (flat) {
            snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix);
        } else {
            snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix);
        }
        snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename);

        /* Keep the extent's own error code rather than a blanket -EINVAL. */
        ret = vmdk_create_extent(ext_filename, size,
                                 flat, compress, zeroed_grain, opts, errp);
        if (ret < 0) {
            goto exit;
        }
        filesize -= size;

        /* Format description line */
        snprintf(desc_line, BUF_SIZE,
                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
        g_string_append(ext_desc_lines, desc_line);
    }
    /* generate descriptor file */
    desc = g_strdup_printf(desc_template,
                           g_random_int(),
                           parent_cid,
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
                           hw_version,
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
                           adapter_type);
    desc_len = strlen(desc);
    /* the descriptor offset = 0x200 */
    if (!split && !flat) {
        /* The descriptor is embedded in the single sparse extent. */
        desc_offset = 0x200;
    } else {
        /* The descriptor lives in its own file, created here. */
        ret = bdrv_create_file(filename, opts, &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            goto exit;
        }
    }

    new_blk = blk_new_open(filename, NULL, NULL,
                           BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
    if (new_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto exit;
    }

    blk_set_allow_write_beyond_eof(new_blk, true);

    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
    }
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
        ret = blk_truncate(new_blk, desc_len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not truncate file");
            goto exit;
        }
    }
    /*
     * Only "ret < 0" is treated as failure above, so a non-negative result
     * from the write must not leak out as this function's return value.
     */
    ret = 0;
exit:
    if (new_blk) {
        blk_unref(new_blk);
    }
    g_free(adapter_type);
    g_free(backing_file);
    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
    g_free(prefix);
    g_free(postfix);
    g_free(desc_line);
    g_free(ext_filename);
    g_free(desc_filename);
    g_free(parent_desc_line);
    g_string_free(ext_desc_lines, true);
    return ret;
}

B
bellard 已提交
2148
/*
 * Release everything the driver state of @bs holds: the extents, the
 * create-type string, and the migration blocker (which is first removed
 * via migrate_del_blocker() and then freed).
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;

    vmdk_free_extents(bs);
    g_free(state->create_type);

    /* Unregister the blocker before destroying the Error object. */
    migrate_del_blocker(state->migration_blocker);
    error_free(state->migration_blocker);
}

P
Paolo Bonzini 已提交
2159
/*
 * Flush the backing file of every extent of @bs.
 *
 * All extents are flushed even when one of them fails.  Returns 0 when every
 * flush succeeded, otherwise the negative result of the last failing flush.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int result = 0;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        int rc = bdrv_co_flush(s->extents[idx].file->bs);

        if (rc < 0) {
            /* Remember the failure but keep flushing the other extents. */
            result = rc;
        }
    }
    return result;
}

2174 2175 2176 2177 2178 2179 2180
/*
 * Sum the allocated size of bs->file and of every extent file that is not
 * bs->file itself.
 *
 * Returns the total, or the first negative value reported by
 * bdrv_get_allocated_file_size().
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int64_t total;
    int idx;

    total = bdrv_get_allocated_file_size(bs->file->bs);
    if (total < 0) {
        return total;
    }

    for (idx = 0; idx < s->num_extents; idx++) {
        int64_t extent_size;

        /* bs->file has already been counted above. */
        if (s->extents[idx].file == bs->file) {
            continue;
        }

        extent_size = bdrv_get_allocated_file_size(s->extents[idx].file->bs);
        if (extent_size < 0) {
            return extent_size;
        }
        total += extent_size;
    }
    return total;
}
2197

F
Fam Zheng 已提交
2198 2199 2200 2201 2202 2203 2204 2205 2206
/*
 * Returns 0 as soon as a flat extent is found whose underlying file reports
 * no zero initialisation through bdrv_has_zero_init(); returns 1 otherwise.
 * Non-flat extents are not queried.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int idx;

    for (idx = 0; idx < s->num_extents; idx++) {
        if (s->extents[idx].flat &&
            !bdrv_has_zero_init(s->extents[idx].file->bs)) {
            return 0;
        }
    }
    return 1;
}

F
Fam Zheng 已提交
2215 2216 2217 2218 2219
/*
 * Build a newly allocated ImageInfo describing @extent.
 *
 * File name and format are duplicated strings; sizes are converted from
 * sectors to bytes.  The cluster size is flagged as present only for
 * extents that are not flat, and the compressed flag is flagged as present
 * only when it is set.  All remaining fields stay zeroed.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    ImageInfo *info = g_new0(ImageInfo, 1);

    info->filename = g_strdup(extent->file->bs->filename);
    info->format = g_strdup(extent->type);
    info->virtual_size = extent->sectors * BDRV_SECTOR_SIZE;

    info->compressed = extent->compressed;
    info->has_compressed = extent->compressed;

    info->cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    info->has_cluster_size = !extent->flat;

    return info;
}

2232 2233 2234 2235 2236 2237
/*
 * Consistency check: walk the image one cluster at a time and verify that
 * every sector belongs to an extent, that its cluster offset can be looked
 * up, and that an allocated cluster does not start at or beyond the end of
 * the extent file.
 *
 * Repair is not implemented: any non-zero @fix yields -ENOTSUP.
 *
 * Returns 0 both when the image is clean and when a problem was found; in
 * the latter case a message is printed to stderr, the walk stops and
 * result->corruptions is incremented.  A negative errno value is returned
 * when the length of an extent file cannot be determined.
 */
static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
                      BdrvCheckMode fix)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
    int64_t total_sectors = bdrv_nb_sectors(bs);
    int ret;
    uint64_t cluster_offset;

    if (fix) {
        return -ENOTSUP;
    }

    for (;;) {
        if (sector_num >= total_sectors) {
            return 0;
        }
        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            fprintf(stderr,
                    "ERROR: could not find extent for sector %" PRId64 "\n",
                    sector_num);
            break;
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
                    PRId64 "\n", sector_num);
            break;
        }
        if (ret == VMDK_OK) {
            int64_t extent_len = bdrv_getlength(extent->file->bs);

            /*
             * A negative length is an error code.  Compared directly with
             * the unsigned cluster_offset it would be converted to a huge
             * value and the EOF test below could never trigger.
             */
            if (extent_len < 0) {
                fprintf(stderr,
                        "ERROR: could not get extent file length for sector %"
                        PRId64 "\n", sector_num);
                return extent_len;
            }
            if (cluster_offset >= (uint64_t)extent_len) {
                fprintf(stderr,
                        "ERROR: cluster offset for sector %"
                        PRId64 " points after EOF\n", sector_num);
                break;
            }
        }
        /*
         * NOTE(review): this assumes extent->cluster_sectors is non-zero,
         * otherwise the walk would not advance - TODO confirm for flat
         * extents.
         */
        sector_num += extent->cluster_sectors;
    }

    result->corruptions++;
    return 0;
}

F
Fam Zheng 已提交
2281 2282 2283 2284 2285 2286 2287 2288
/*
 * Build the VMDK-specific image information for @bs.
 *
 * The returned, newly allocated ImageInfoSpecific is of kind
 * IMAGE_INFO_SPECIFIC_KIND_VMDK and carries a copy of the create type, the
 * CID, the parent CID and a linked list with one ImageInfo per extent, in
 * extent order.
 */
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
    ImageInfoSpecificVmdk *vmdk_info = g_new0(ImageInfoSpecificVmdk, 1);
    ImageInfoList **tail = &vmdk_info->extents;
    int idx;

    vmdk_info->create_type = g_strdup(s->create_type);
    vmdk_info->cid = s->cid;
    vmdk_info->parent_cid = s->parent_cid;

    spec_info->type = IMAGE_INFO_SPECIFIC_KIND_VMDK;
    spec_info->u.vmdk.data = vmdk_info;

    /* Append one list node per extent, tracking the tail link. */
    for (idx = 0; idx < s->num_extents; idx++) {
        ImageInfoList *node = g_new0(ImageInfoList, 1);

        node->value = vmdk_get_extent_info(&s->extents[idx]);
        node->next = NULL;
        *tail = node;
        tail = &node->next;
    }

    return spec_info;
}

2312 2313 2314 2315 2316 2317 2318
/*
 * Tell whether two extents are of the same kind: same flat flag, same
 * compressed flag and, unless they are flat, the same cluster size.
 */
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
{
    if (a->flat != b->flat) {
        return false;
    }
    if (a->compressed != b->compressed) {
        return false;
    }
    /* The cluster size is only compared for extents that are not flat. */
    return a->flat || a->cluster_sectors == b->cluster_sectors;
}

F
Fam Zheng 已提交
2319 2320 2321 2322 2323
/*
 * Fill @bdi from the first extent of @bs.
 *
 * Returns -ENOTSUP when any other extent differs in kind from the first one
 * (see vmdk_extents_type_eq()).  Otherwise stores whether compressed writes
 * are needed and, for a non-flat first extent, the cluster size in bytes,
 * and returns 0.  At least one extent must exist.
 */
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVVmdkState *s = bs->opaque;
    const VmdkExtent *first;
    int idx;

    assert(s->num_extents);
    first = &s->extents[0];

    /* Mixed extent kinds cannot be described by a single info record. */
    for (idx = 1; idx < s->num_extents; idx++) {
        if (!vmdk_extents_type_eq(first, &s->extents[idx])) {
            return -ENOTSUP;
        }
    }

    bdi->needs_compressed_writes = first->compressed;
    if (!first->flat) {
        bdi->cluster_size = first->cluster_sectors << BDRV_SECTOR_BITS;
    }
    return 0;
}

2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363
/*
 * Creation options accepted by the VMDK driver.  compat6 defaults to "off"
 * and hwversion to "undefined"; the other options declare no default here.
 */
static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
    .desc = {
        {
            /* virtual disk size */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size",
        },
        {
            /* emulated adapter type */
            .name = BLOCK_OPT_ADAPTER_TYPE,
            .type = QEMU_OPT_STRING,
            .help = "Virtual adapter type, can be one of "
                    "ide (default), lsilogic, buslogic or legacyESX",
        },
        {
            /* base image */
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image",
        },
        {
            /* version 6 compatibility switch */
            .name = BLOCK_OPT_COMPAT6,
            .type = QEMU_OPT_BOOL,
            .help = "VMDK version 6 image",
            .def_value_str = "off",
        },
        {
            /* virtual hardware version */
            .name = BLOCK_OPT_HWVERSION,
            .type = QEMU_OPT_STRING,
            .help = "VMDK hardware version",
            .def_value_str = "undefined",
        },
        {
            /* subformat (create type) */
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help = "VMDK flat extent format, can be one of "
                    "{monolithicSparse (default) | monolithicFlat | "
                    "twoGbMaxExtentSparse | twoGbMaxExtentFlat | "
                    "streamOptimized} ",
        },
        {
            /* zeroed-grain support */
            .name = BLOCK_OPT_ZEROED_GRAIN,
            .type = QEMU_OPT_BOOL,
            .help = "Enable efficient zero writes "
                    "using the zeroed-grain GTE feature",
        },
        { /* end of list */ }
    }
};

2387
static BlockDriver bdrv_vmdk = {
F
Fam Zheng 已提交
2388 2389 2390 2391
    .format_name                  = "vmdk",
    .instance_size                = sizeof(BDRVVmdkState),
    .bdrv_probe                   = vmdk_probe,
    .bdrv_open                    = vmdk_open,
2392
    .bdrv_check                   = vmdk_check,
F
Fam Zheng 已提交
2393
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
2394
    .bdrv_co_preadv               = vmdk_co_preadv,
2395
    .bdrv_co_pwritev              = vmdk_co_pwritev,
2396
    .bdrv_write_compressed        = vmdk_write_compressed,
2397
    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
F
Fam Zheng 已提交
2398
    .bdrv_close                   = vmdk_close,
C
Chunyan Liu 已提交
2399
    .bdrv_create                  = vmdk_create,
F
Fam Zheng 已提交
2400
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
2401
    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
F
Fam Zheng 已提交
2402 2403
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
F
Fam Zheng 已提交
2404
    .bdrv_get_specific_info       = vmdk_get_specific_info,
2405
    .bdrv_refresh_limits          = vmdk_refresh_limits,
F
Fam Zheng 已提交
2406
    .bdrv_get_info                = vmdk_get_info,
F
Fam Zheng 已提交
2407

2408
    .supports_backing             = true,
2409
    .create_opts                  = &vmdk_create_opts,
B
bellard 已提交
2410
};
2411 2412 2413 2414 2415 2416 2417

/* Register the VMDK driver description (bdrv_vmdk) via bdrv_register(). */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/* Hand bdrv_vmdk_init to block_init() so that it gets run. */
block_init(bdrv_vmdk_init);