/*
 * Block driver for the VMDK format
 *
 * Copyright (c) 2004 Fabrice Bellard
 * Copyright (c) 2005 Filip Navara
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

P
Peter Maydell 已提交
26
#include "qemu/osdep.h"
27
#include "qapi/error.h"
28
#include "block/block_int.h"
29
#include "sysemu/block-backend.h"
30
#include "qapi/qmp/qerror.h"
31
#include "qemu/error-report.h"
32
#include "qemu/module.h"
33
#include "qemu/option.h"
34
#include "qemu/bswap.h"
35
#include "migration/blocker.h"
36
#include "qemu/cutils.h"
S
Stefan Weil 已提交
37
#include <zlib.h>
B
bellard 已提交
38 39 40

/*
 * Image magics, compared against the first four bytes of a file read as a
 * big-endian 32-bit value: "COWD" for VMDK3 and "KDMV" for VMDK4.
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
/* Value of VMDK4Header.compressAlgorithm that marks a compressed extent */
#define VMDK4_COMPRESSION_DEFLATE 1
/* Bits of VMDK4Header.flags */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
/* The header's rgd_offset is used as the backup L1 table offset */
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)
/* gd_offset value meaning that the footer at the end of the file must be read */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"

B
bellard 已提交
61 62 63 64 65 66 67 68 69 70 71
/*
 * On-disk header of a VMDK3 ("COWD") sparse extent.  It is read from the
 * file right after the 4-byte magic and every field is converted with
 * le32_to_cpu() before use.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    /* Size of the extent in sectors */
    uint32_t disk_sectors;
    /* Sectors per cluster */
    uint32_t granularity;
    /* Offset of the L1 table, in 512-byte sectors */
    uint32_t l1dir_offset;
    /* Number of entries in the L1 table */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
B
bellard 已提交
73 74 75 76

/*
 * On-disk header of a VMDK4 ("KDMV") sparse extent.  It is read from the
 * file right after the 4-byte magic (and again from the footer when
 * gd_offset is VMDK4_GD_AT_END); fields are little-endian on disk.
 */
typedef struct {
    uint32_t version;
    /* Combination of the VMDK4_FLAG_* bits */
    uint32_t flags;
    /* Size of the extent in sectors */
    uint64_t capacity;
    /* Sectors per cluster */
    uint64_t granularity;
    /* Offset of the embedded descriptor, in 512-byte sectors */
    uint64_t desc_offset;
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    /* Offset of the backup L1 table in sectors, used with VMDK4_FLAG_RGD */
    uint64_t rgd_offset;
    /* Offset of the L1 table in sectors, or VMDK4_GD_AT_END */
    uint64_t gd_offset;
    /* The file must be at least this many sectors long */
    uint64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    /* VMDK4_COMPRESSION_DEFLATE for compressed extents */
    uint16_t compressAlgorithm;
} QEMU_PACKED VMDK4Header;
B
bellard 已提交
90 91 92

#define L2_CACHE_SIZE 16

F
Fam Zheng 已提交
93
typedef struct VmdkExtent {
94
    BdrvChild *file;
F
Fam Zheng 已提交
95
    bool flat;
F
Fam Zheng 已提交
96 97
    bool compressed;
    bool has_marker;
98 99
    bool has_zero_grain;
    int version;
F
Fam Zheng 已提交
100 101
    int64_t sectors;
    int64_t end_sector;
102
    int64_t flat_start_offset;
B
bellard 已提交
103
    int64_t l1_table_offset;
104
    int64_t l1_backup_table_offset;
B
bellard 已提交
105
    uint32_t *l1_table;
106
    uint32_t *l1_backup_table;
B
bellard 已提交
107 108 109 110 111 112 113 114
    unsigned int l1_size;
    uint32_t l1_entry_sectors;

    unsigned int l2_size;
    uint32_t *l2_cache;
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

115
    int64_t cluster_sectors;
F
Fam Zheng 已提交
116
    int64_t next_cluster_sector;
F
Fam Zheng 已提交
117
    char *type;
F
Fam Zheng 已提交
118 119 120
} VmdkExtent;

typedef struct BDRVVmdkState {
121
    CoMutex lock;
122
    uint64_t desc_offset;
123
    bool cid_updated;
124
    bool cid_checked;
F
Fam Zheng 已提交
125
    uint32_t cid;
126
    uint32_t parent_cid;
F
Fam Zheng 已提交
127 128 129
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
K
Kevin Wolf 已提交
130
    Error *migration_blocker;
F
Fam Zheng 已提交
131
    char *create_type;
B
bellard 已提交
132 133
} BDRVVmdkState;

134 135 136 137 138
/*
 * Location of a grain table entry.
 * NOTE(review): the users of this struct are outside this excerpt; the
 * field meanings are taken from their names only and should be confirmed.
 */
typedef struct VmdkMetaData {
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
    int valid;
    uint32_t *l2_cache_entry;
} VmdkMetaData;

F
Fam Zheng 已提交
142 143 144 145
/*
 * Header that precedes a grain's payload.  The payload follows the fixed
 * fields directly, so it is declared as a C99 flexible array member rather
 * than a zero-length array; sizeof(VmdkGrainMarker) is unaffected.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
F
Fam Zheng 已提交
147

148 149 150 151 152 153 154
/*
 * Values of the 'type' field of a marker.  The footer sanity check in this
 * file expects MARKER_FOOTER in the footer marker and MARKER_END_OF_STREAM
 * in the marker that follows the footer.
 */
enum {
    MARKER_END_OF_STREAM    = 0,
    MARKER_GRAIN_TABLE      = 1,
    MARKER_GRAIN_DIRECTORY  = 2,
    MARKER_FOOTER           = 3,
};

B
bellard 已提交
155 156 157 158
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    uint32_t magic;

F
Fam Zheng 已提交
159
    if (buf_size < 4) {
B
bellard 已提交
160
        return 0;
F
Fam Zheng 已提交
161
    }
B
bellard 已提交
162 163
    magic = be32_to_cpu(*(uint32_t *)buf);
    if (magic == VMDK3_MAGIC ||
164
        magic == VMDK4_MAGIC) {
B
bellard 已提交
165
        return 100;
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
    } else {
        const char *p = (const char *)buf;
        const char *end = p + buf_size;
        while (p < end) {
            if (*p == '#') {
                /* skip comment line */
                while (p < end && *p != '\n') {
                    p++;
                }
                p++;
                continue;
            }
            if (*p == ' ') {
                while (p < end && *p == ' ') {
                    p++;
                }
                /* skip '\r' if windows line endings used. */
                if (p < end && *p == '\r') {
                    p++;
                }
                /* only accept blank lines before 'version=' line */
                if (p == end || *p != '\n') {
                    return 0;
                }
                p++;
                continue;
            }
            if (end - p >= strlen("version=X\n")) {
                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
                    return 100;
                }
            }
            if (end - p >= strlen("version=X\r\n")) {
                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
                    return 100;
                }
            }
            return 0;
        }
B
bellard 已提交
207
        return 0;
208
    }
B
bellard 已提交
209 210
}

211
#define SECTOR_SIZE 512
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
215

F
Fam Zheng 已提交
216 217 218 219
/*
 * Release every extent of @bs: the L1 table, L2 cache, backup L1 table and
 * type string of each extent, then the extent array itself.  An extent's
 * child is unreferenced unless it is bs->file, which is not owned here.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *e;

    for (i = 0; i < s->num_extents; i++) {
        e = &s->extents[i];
        g_free(e->l1_table);
        g_free(e->l2_cache);
        g_free(e->l1_backup_table);
        g_free(e->type);
        if (e->file != bs->file) {
            bdrv_unref_child(bs, e->file);
        }
    }
    g_free(s->extents);
}

235 236 237 238 239 240 241 242
/*
 * Drop the most recently added extent by shrinking the extent array by one
 * entry.  Does nothing when there are no extents.  Memory referenced by
 * the dropped entry is not released here.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents == 0) {
        return;
    }
    s->num_extents--;
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
}

246 247
/*
 * Read the "CID" field (or "parentCID" when @parent is non-zero) from the
 * descriptor stored at s->desc_offset in bs->file.  The value is parsed as
 * hexadecimal.
 *
 * Return -ve errno, or 0 on success and write CID into *pcid.
 */
static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
{
    char *desc;
    uint32_t cid;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    /* sizeof() includes the NUL, so the size also covers the '=' sign */
    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    desc[DESC_SIZE - 1] = '\0';
    p_name = strstr(desc, cid_str);
    if (p_name == NULL) {
        ret = -EINVAL;
        goto out;
    }
    /*
     * If the key is the very last thing in the buffer, skipping it and the
     * separator would leave p_name one past the allocation.
     */
    if ((size_t)(p_name - desc) + cid_str_size >= DESC_SIZE) {
        ret = -EINVAL;
        goto out;
    }
    p_name += cid_str_size;
    if (sscanf(p_name, "%" SCNx32, &cid) != 1) {
        ret = -EINVAL;
        goto out;
    }
    *pcid = cid;
    ret = 0;

out:
    g_free(desc);
    return ret;
}

/*
 * Rewrite the CID value in the descriptor stored at s->desc_offset in
 * bs->file and write the descriptor back synchronously.
 *
 * The descriptor text from "parentCID" onwards is saved, the text after
 * the first "CID" key is overwritten with the new hexadecimal value and a
 * newline, and the saved tail is appended again.
 *
 * Returns a negative errno if the descriptor cannot be read or contains no
 * "parentCID", otherwise the result of the write.
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
    int ret = 0;

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
        ret = -EINVAL;
        goto out;
    }

    /* keep everything from "parentCID" on; it is re-appended below */
    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        /* sizeof() includes the NUL, so this also skips the '=' sign */
        p_name += sizeof("CID");
        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);

out:
    g_free(desc);
    g_free(tmp_desc);
    return ret;
}

/*
 * Check that the parent CID recorded for @bs matches the CID currently
 * stored in its backing file.
 *
 * Returns 1 if the CID is valid (or there is no backing file, or the check
 * already succeeded earlier) and 0 if the backing file's CID cannot be
 * read or differs.  A successful check is remembered in s->cid_checked.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t cur_pcid;

    if (!s->cid_checked && bs->backing) {
        BlockDriverState *p_bs = bs->backing->bs;

        if (vmdk_read_cid(p_bs, 0, &cur_pcid) != 0) {
            /* read failure: report as not valid */
            return 0;
        }
        if (s->parent_cid != cur_pcid) {
            /* CID not valid */
            return 0;
        }
    }
    s->cid_checked = true;
    /* CID valid */
    return 1;
}

K
Kevin Wolf 已提交
348
/* We have nothing to do for VMDK reopen, stubs just return success */
J
Jeff Cody 已提交
349 350 351 352 353
static int vmdk_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    assert(state != NULL);
    assert(state->bs != NULL);
K
Kevin Wolf 已提交
354
    return 0;
J
Jeff Cody 已提交
355 356
}

357
/*
 * Look for parentFileNameHint="..." in the descriptor stored at
 * s->desc_offset in bs->file and, if present, copy the quoted name into
 * bs->backing_file.
 *
 * Returns 0 on success (including when no hint is present), -EINVAL if the
 * hint is malformed or too long for bs->backing_file, or the negative
 * errno from reading the descriptor.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    /* one extra zeroed byte keeps the buffer NUL-terminated after the read */
    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;
        /* key plus the two characters that follow it: =" */
        const size_t skip = sizeof("parentFileNameHint") + 1;

        /*
         * A key sitting at the very end of the buffer would move p_name
         * past the terminating NUL and make strchr() read out of bounds.
         */
        if ((size_t)(p_name - desc) + skip > DESC_SIZE) {
            ret = -EINVAL;
            goto out;
        }
        p_name += skip;
        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            ret = -EINVAL;
            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
            ret = -EINVAL;
            goto out;
        }

        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

out:
    g_free(desc);
    return ret;
}

F
Fam Zheng 已提交
394 395
/* Create and append extent to the extent array. Return the added VmdkExtent
 * address. return NULL if allocation failed. */
396
static int vmdk_add_extent(BlockDriverState *bs,
397
                           BdrvChild *file, bool flat, int64_t sectors,
F
Fam Zheng 已提交
398 399
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
400
                           int l2_size, uint64_t cluster_sectors,
F
Fam Zheng 已提交
401 402
                           VmdkExtent **new_extent,
                           Error **errp)
F
Fam Zheng 已提交
403 404 405
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
406
    int64_t nb_sectors;
F
Fam Zheng 已提交
407

408 409
    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
F
Fam Zheng 已提交
410 411
        error_setg(errp, "Invalid granularity, image may be corrupt");
        return -EFBIG;
412
    }
413 414 415 416 417
    if (l1_size > 512 * 1024 * 1024) {
        /* Although with big capacity and small l1_entry_sectors, we can get a
         * big l1_size, we don't want unbounded value to allocate the table.
         * Limit it to 512M, which is 16PB for default cluster and L2 table
         * size */
F
Fam Zheng 已提交
418
        error_setg(errp, "L1 size too big");
419 420
        return -EFBIG;
    }
421

422
    nb_sectors = bdrv_nb_sectors(file->bs);
423 424
    if (nb_sectors < 0) {
        return nb_sectors;
F
Fam Zheng 已提交
425 426
    }

427
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
F
Fam Zheng 已提交
428 429 430 431 432 433 434 435 436 437 438 439
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(VmdkExtent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
440
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
441
    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
F
Fam Zheng 已提交
442 443 444 445 446 447 448

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
        extent->end_sector = extent->sectors;
    }
    bs->total_sectors = extent->end_sector;
449 450 451 452
    if (new_extent) {
        *new_extent = extent;
    }
    return 0;
F
Fam Zheng 已提交
453 454
}

F
Fam Zheng 已提交
455 456
/*
 * Load the L1 table of @extent (and its backup, when
 * l1_backup_table_offset is non-zero) from the extent file, convert the
 * entries from little-endian to host byte order and allocate the L2 cache.
 *
 * Returns 0 on success, -ENOMEM if a table cannot be allocated, or the
 * negative error from reading a table (in which case @errp is set).  On
 * failure the tables allocated here are freed and the pointers cleared, so
 * the extent is not left with dangling pointers.
 */
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                            Error **errp)
{
    int ret;
    size_t l1_size;
    unsigned int i;

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
    extent->l1_table = g_try_malloc(l1_size);
    if (l1_size && extent->l1_table == NULL) {
        return -ENOMEM;
    }

    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
                     l1_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read l1 table from extent '%s'",
                         extent->file->bs->filename);
        goto fail_l1;
    }
    for (i = 0; i < extent->l1_size; i++) {
        le32_to_cpus(&extent->l1_table[i]);
    }

    if (extent->l1_backup_table_offset) {
        extent->l1_backup_table = g_try_malloc(l1_size);
        if (l1_size && extent->l1_backup_table == NULL) {
            ret = -ENOMEM;
            goto fail_l1;
        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
                         l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Could not read l1 backup table from extent '%s'",
                             extent->file->bs->filename);
            goto fail_l1b;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
    return 0;
 fail_l1b:
    g_free(extent->l1_backup_table);
    extent->l1_backup_table = NULL;
 fail_l1:
    g_free(extent->l1_table);
    extent->l1_table = NULL;
    return ret;
}

F
Fam Zheng 已提交
514
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
515
                                 BdrvChild *file,
F
Fam Zheng 已提交
516
                                 int flags, Error **errp)
517 518 519 520 521 522
{
    int ret;
    uint32_t magic;
    VMDK3Header header;
    VmdkExtent *extent;

523
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
524
    if (ret < 0) {
F
Fam Zheng 已提交
525 526
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
527
                         file->bs->filename);
528
        return ret;
529
    }
530 531
    ret = vmdk_add_extent(bs, file, false,
                          le32_to_cpu(header.disk_sectors),
532
                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
533 534 535 536
                          0,
                          le32_to_cpu(header.l1dir_size),
                          4096,
                          le32_to_cpu(header.granularity),
F
Fam Zheng 已提交
537 538
                          &extent,
                          errp);
539 540 541
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
542
    ret = vmdk_init_tables(bs, extent, errp);
543
    if (ret) {
544 545
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
546 547 548 549
    }
    return ret;
}

550
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
551
                               QDict *options, Error **errp);
F
Fam Zheng 已提交
552

553
/*
 * Read up to 1 MiB - 1 bytes of @file starting at byte offset
 * @desc_offset into a newly allocated, NUL-terminated buffer.
 *
 * Returns the buffer, which the caller must release with g_free(), or
 * NULL with @errp set if the file length cannot be determined, the file
 * is shorter than 4 bytes, or the read fails.
 */
static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
{
    int64_t size;
    char *buf;
    int ret;

    size = bdrv_getlength(file->bs);
    if (size < 0) {
        error_setg_errno(errp, -size, "Could not access file");
        return NULL;
    }

    if (size < 4) {
        /* Both descriptor file and sparse image must be much larger than 4
         * bytes, also callers of vmdk_read_desc want to compare the first 4
         * bytes with VMDK4_MAGIC, let's error out if less is read. */
        error_setg(errp, "File is too small, not a valid image");
        return NULL;
    }

    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
    buf = g_malloc(size + 1);

    ret = bdrv_pread(file, desc_offset, buf, size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read from file");
        g_free(buf);
        return NULL;
    }
    /* terminate the buffer at the value returned by the read */
    buf[ret] = 0;

    return buf;
}

587
static int vmdk_open_vmdk4(BlockDriverState *bs,
588
                           BdrvChild *file,
K
Kevin Wolf 已提交
589
                           int flags, QDict *options, Error **errp)
590 591 592 593 594 595
{
    int ret;
    uint32_t magic;
    uint32_t l1_size, l1_entry_sectors;
    VMDK4Header header;
    VmdkExtent *extent;
F
Fam Zheng 已提交
596
    BDRVVmdkState *s = bs->opaque;
597
    int64_t l1_backup_offset = 0;
598
    bool compressed;
599

600
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
601
    if (ret < 0) {
F
Fam Zheng 已提交
602 603
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
604
                         file->bs->filename);
P
Paolo Bonzini 已提交
605
        return -EINVAL;
606
    }
607
    if (header.capacity == 0) {
608
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
609
        if (desc_offset) {
610
            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
611 612 613
            if (!buf) {
                return -EINVAL;
            }
K
Kevin Wolf 已提交
614
            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
615 616
            g_free(buf);
            return ret;
617
        }
F
Fam Zheng 已提交
618
    }
619

F
Fam Zheng 已提交
620 621 622 623
    if (!s->create_type) {
        s->create_type = g_strdup("monolithicSparse");
    }

624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
        /*
         * The footer takes precedence over the header, so read it in. The
         * footer starts at offset -1024 from the end: One sector for the
         * footer, and another one for the end-of-stream marker.
         */
        struct {
            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED footer_marker;

            uint32_t magic;
            VMDK4Header header;
            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];

            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

650
        ret = bdrv_pread(file,
K
Kevin Wolf 已提交
651
            bs->file->bs->total_sectors * 512 - 1536,
652 653
            &footer, sizeof(footer));
        if (ret < 0) {
654
            error_setg_errno(errp, -ret, "Failed to read footer");
655 656 657 658 659 660 661 662 663 664 665
            return ret;
        }

        /* Some sanity checks for the footer */
        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
            le32_to_cpu(footer.footer_marker.size) != 0  ||
            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
            le64_to_cpu(footer.eos_marker.val) != 0  ||
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
666
            error_setg(errp, "Invalid footer");
667 668 669 670 671 672
            return -EINVAL;
        }

        header = footer.header;
    }

673 674
    compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
675
    if (le32_to_cpu(header.version) > 3) {
676 677
        error_setg(errp, "Unsupported VMDK version %" PRIu32,
                   le32_to_cpu(header.version));
678
        return -ENOTSUP;
679 680
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
               !compressed) {
681 682 683 684 685 686
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
         * information. So we are safe to enable read only. */
        error_setg(errp, "VMDK version 3 must be read only");
        return -EINVAL;
687 688
    }

689
    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
P
Paolo Bonzini 已提交
690
        error_setg(errp, "L2 table size too big");
691 692 693
        return -EINVAL;
    }

694
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
695
                        * le64_to_cpu(header.granularity);
696
    if (l1_entry_sectors == 0) {
697
        error_setg(errp, "L1 entry size is invalid");
698 699
        return -EINVAL;
    }
700 701
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                / l1_entry_sectors;
702 703 704
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
705
    if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
706 707 708
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
709 710 711
        return -EINVAL;
    }

712
    ret = vmdk_add_extent(bs, file, false,
713 714
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
715
                          l1_backup_offset,
716
                          l1_size,
717
                          le32_to_cpu(header.num_gtes_per_gt),
718
                          le64_to_cpu(header.granularity),
F
Fam Zheng 已提交
719 720
                          &extent,
                          errp);
721 722 723
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
724 725
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
726 727 728 729
    if (extent->compressed) {
        g_free(s->create_type);
        s->create_type = g_strdup("streamOptimized");
    }
F
Fam Zheng 已提交
730
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
731 732
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
F
Fam Zheng 已提交
733
    ret = vmdk_init_tables(bs, extent, errp);
734
    if (ret) {
735 736
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
737 738 739 740
    }
    return ret;
}

741 742 743 744 745 746 747 748 749
/* find an option value out of descriptor file */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    char *opt_pos, *opt_end;
    const char *end = desc + strlen(desc);

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
F
Fam Zheng 已提交
750
        return VMDK_ERROR;
751 752 753 754
    }
    /* Skip "=\"" following opt_name */
    opt_pos += strlen(opt_name) + 2;
    if (opt_pos >= end) {
F
Fam Zheng 已提交
755
        return VMDK_ERROR;
756 757 758 759 760 761
    }
    opt_end = opt_pos;
    while (opt_end < end && *opt_end != '"') {
        opt_end++;
    }
    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
F
Fam Zheng 已提交
762
        return VMDK_ERROR;
763 764
    }
    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
F
Fam Zheng 已提交
765
    return VMDK_OK;
766 767
}

768
/* Open an extent file and append to bs array */
769
static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
K
Kevin Wolf 已提交
770
                            char *buf, QDict *options, Error **errp)
771 772 773
{
    uint32_t magic;

774
    magic = ldl_be_p(buf);
775 776
    switch (magic) {
        case VMDK3_MAGIC:
F
Fam Zheng 已提交
777
            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
778 779
            break;
        case VMDK4_MAGIC:
K
Kevin Wolf 已提交
780
            return vmdk_open_vmdk4(bs, file, flags, options, errp);
781 782
            break;
        default:
P
Paolo Bonzini 已提交
783 784
            error_setg(errp, "Image not in VMDK format");
            return -EINVAL;
785 786 787 788
            break;
    }
}

789 790 791 792 793 794 795 796 797 798 799
/*
 * Return a pointer to the character following the first newline in @s,
 * or to the terminating NUL of @s if it contains no newline.
 */
static const char *next_line(const char *s)
{
    const char *newline = strchr(s, '\n');

    if (newline == NULL) {
        return s + strlen(s);
    }
    return newline + 1;
}

800
/*
 * Walk the descriptor text @desc line by line, open every extent it
 * describes and append it to the extent array of @bs.
 *
 * @desc_file_path is the path that relative extent file names are combined
 * with; relative names are rejected if it is empty.  Each extent child is
 * opened with the option prefix "extents.N", N being the index it will get
 * in the extent array.
 *
 * Lines that do not look like an "RW" extent line, or that have a
 * non-positive size or an unknown type, are skipped.
 *
 * Returns 0 on success or a negative errno with @errp set by the failing
 * step where that step reports one.
 */
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                              const char *desc_file_path, QDict *options,
                              Error **errp)
{
    int ret;
    int matches;
    char access[11];
    char type[11];
    char fname[512];
    const char *p, *np;
    int64_t sectors = 0;
    int64_t flat_offset;
    char *extent_path;
    BdrvChild *extent_file;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;
    char extent_opt_prefix[32];
    Error *local_err = NULL;

    for (p = desc; *p; p = next_line(p)) {
        /* parse extent line in one of below formats:
         *
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
         * RW [size in sectors] SPARSE "file-name.vmdk"
         * RW [size in sectors] VMFS "file-name.vmdk"
         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
         */
        flat_offset = -1;
        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                         access, &sectors, type, fname, &flat_offset);
        if (matches < 4 || strcmp(access, "RW")) {
            continue;
        } else if (!strcmp(type, "FLAT")) {
            /* FLAT lines must carry a non-negative offset */
            if (matches != 5 || flat_offset < 0) {
                goto invalid;
            }
        } else if (!strcmp(type, "VMFS")) {
            /* VMFS lines carry no offset; the data starts at 0 */
            if (matches == 4) {
                flat_offset = 0;
            } else {
                goto invalid;
            }
        } else if (matches != 4) {
            goto invalid;
        }

        /* access is known to be "RW" at this point */
        if (sectors <= 0 ||
            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE"))) {
            continue;
        }

        if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
            !desc_file_path[0])
        {
            error_setg(errp, "Cannot use relative extent paths with VMDK "
                       "descriptor file '%s'", bs->file->bs->filename);
            return -EINVAL;
        }

        extent_path = g_malloc0(PATH_MAX);
        path_combine(extent_path, PATH_MAX, desc_file_path, fname);

        ret = snprintf(extent_opt_prefix, sizeof(extent_opt_prefix),
                       "extents.%d", s->num_extents);
        assert(ret < (int)sizeof(extent_opt_prefix));

        extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
                                      bs, &child_file, false, &local_err);
        g_free(extent_path);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* save to extents array */
        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
            /* FLAT extent */

            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            /* the offset in the descriptor is in sectors */
            extent->flat_start_offset = flat_offset << 9;
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
            char *buf = vmdk_read_desc(extent_file, 0, errp);
            if (!buf) {
                ret = -EINVAL;
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
                                       options, errp);
            }
            g_free(buf);
            if (ret) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            /* the opener appended the new extent at the end of the array */
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
            bdrv_unref_child(bs, extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
    }
    return 0;

invalid:
    /* quote the offending line, without its trailing newline */
    np = next_line(p);
    assert(np != p);
    if (np[-1] == '\n') {
        np--;
    }
    error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
    return -EINVAL;
}

920
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
921
                               QDict *options, Error **errp)
922 923 924 925 926 927
{
    int ret;
    char ct[128];
    BDRVVmdkState *s = bs->opaque;

    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
P
Paolo Bonzini 已提交
928 929
        error_setg(errp, "invalid VMDK image descriptor");
        ret = -EINVAL;
930
        goto exit;
931
    }
F
Fam Zheng 已提交
932
    if (strcmp(ct, "monolithicFlat") &&
P
Paolo Bonzini 已提交
933
        strcmp(ct, "vmfs") &&
F
Fam Zheng 已提交
934
        strcmp(ct, "vmfsSparse") &&
935
        strcmp(ct, "twoGbMaxExtentSparse") &&
F
Fam Zheng 已提交
936
        strcmp(ct, "twoGbMaxExtentFlat")) {
F
Fam Zheng 已提交
937
        error_setg(errp, "Unsupported image type '%s'", ct);
938 939
        ret = -ENOTSUP;
        goto exit;
940
    }
F
Fam Zheng 已提交
941
    s->create_type = g_strdup(ct);
942
    s->desc_offset = 0;
K
Kevin Wolf 已提交
943 944
    ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options,
                             errp);
945 946
exit:
    return ret;
947 948
}

M
Max Reitz 已提交
949 950
/*
 * Driver open callback.
 *
 * Opens the "file" child, reads the leading descriptor/header bytes and
 * dispatches on the big-endian magic: VMDK3/VMDK4 magics are opened as a
 * sparse extent, anything else is treated as a text descriptor file.  After
 * that the parent image is opened (if any), both CIDs are read, the state
 * lock is initialised and a migration blocker is registered.
 *
 * Returns 0 on success or a negative errno.  On failure after the descriptor
 * has been read, the create type and any extents added so far are released.
 */
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVVmdkState *state = bs->opaque;
    Error *blocker_err = NULL;
    char *desc;
    uint32_t signature;
    int rc;

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (bs->file == NULL) {
        return -EINVAL;
    }

    desc = vmdk_read_desc(bs->file, 0, errp);
    if (desc == NULL) {
        return -EINVAL;
    }

    signature = ldl_be_p(desc);
    if (signature == VMDK3_MAGIC || signature == VMDK4_MAGIC) {
        rc = vmdk_open_sparse(bs, bs->file, flags, desc, options,
                              errp);
        /* Recorded regardless of the result, exactly like the open call. */
        state->desc_offset = 0x200;
    } else {
        rc = vmdk_open_desc_file(bs, flags, desc, options, errp);
    }
    if (rc) {
        goto fail;
    }

    /* try to open parent images, if exist, then load both content IDs */
    rc = vmdk_parent_open(bs);
    if (!rc) {
        rc = vmdk_read_cid(bs, 0, &state->cid);
    }
    if (!rc) {
        rc = vmdk_read_cid(bs, 1, &state->parent_cid);
    }
    if (rc) {
        goto fail;
    }
    qemu_co_mutex_init(&state->lock);

    /* Disable migration when VMDK images are used */
    error_setg(&state->migration_blocker, "The vmdk format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    rc = migrate_add_blocker(state->migration_blocker, &blocker_err);
    if (blocker_err) {
        error_propagate(errp, blocker_err);
        error_free(state->migration_blocker);
        goto fail;
    }

    g_free(desc);
    return 0;

fail:
    g_free(desc);
    g_free(state->create_type);
    state->create_type = NULL;
    vmdk_free_extents(bs);
    return rc;
}

1022

1023
/*
 * Raise bs->bl.pwrite_zeroes_alignment to the largest cluster size (in
 * bytes) found among the non-flat extents.  Flat extents are ignored.
 */
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        if (state->extents[idx].flat) {
            continue;
        }
        bs->bl.pwrite_zeroes_alignment =
            MAX(bs->bl.pwrite_zeroes_alignment,
                state->extents[idx].cluster_sectors << BDRV_SECTOR_BITS);
    }
}

F
Fam Zheng 已提交
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046
/**
 * get_whole_cluster
 *
 * Copy backing file's cluster that covers @sector_num, otherwise write zero,
 * to the cluster at @cluster_sector_num.
 *
 * If @skip_start_sector < @skip_end_sector, the relative range
 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
 * it for call to write user data in the request.
 */
F
Fam Zheng 已提交
1047
static int get_whole_cluster(BlockDriverState *bs,
F
Fam Zheng 已提交
1048
                             VmdkExtent *extent,
1049 1050 1051 1052
                             uint64_t cluster_offset,
                             uint64_t offset,
                             uint64_t skip_start_bytes,
                             uint64_t skip_end_bytes)
1053
{
1054
    int ret = VMDK_OK;
F
Fam Zheng 已提交
1055 1056 1057 1058 1059
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
1060
    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
F
Fam Zheng 已提交
1061
    whole_grain = qemu_blockalign(bs, cluster_bytes);
1062

1063
    if (!bs->backing) {
1064 1065
        memset(whole_grain, 0, skip_start_bytes);
        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1066 1067
    }

1068
    assert(skip_end_bytes <= cluster_bytes);
1069 1070
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
1071
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
F
Fam Zheng 已提交
1072 1073 1074
        ret = VMDK_ERROR;
        goto exit;
    }
1075

F
Fam Zheng 已提交
1076
    /* Read backing data before skip range */
1077
    if (skip_start_bytes > 0) {
1078
        if (bs->backing) {
M
Max Reitz 已提交
1079 1080
            /* qcow2 emits this on bs->file instead of bs->backing */
            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1081
            ret = bdrv_pread(bs->backing, offset, whole_grain,
1082
                             skip_start_bytes);
F
Fam Zheng 已提交
1083 1084 1085 1086 1087
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
M
Max Reitz 已提交
1088
        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1089
        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
1090
                          skip_start_bytes);
K
Kevin Wolf 已提交
1091
        if (ret < 0) {
1092 1093
            ret = VMDK_ERROR;
            goto exit;
K
Kevin Wolf 已提交
1094
        }
F
Fam Zheng 已提交
1095 1096
    }
    /* Read backing data after skip range */
1097
    if (skip_end_bytes < cluster_bytes) {
1098
        if (bs->backing) {
M
Max Reitz 已提交
1099 1100
            /* qcow2 emits this on bs->file instead of bs->backing */
            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1101
            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
1102 1103
                             whole_grain + skip_end_bytes,
                             cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1104 1105 1106 1107 1108
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
M
Max Reitz 已提交
1109
        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1110
        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
1111 1112
                          whole_grain + skip_end_bytes,
                          cluster_bytes - skip_end_bytes);
K
Kevin Wolf 已提交
1113
        if (ret < 0) {
1114 1115
            ret = VMDK_ERROR;
            goto exit;
1116 1117
        }
    }
F
Fam Zheng 已提交
1118

1119
    ret = VMDK_OK;
1120 1121 1122
exit:
    qemu_vfree(whole_grain);
    return ret;
1123 1124
}

F
Fam Zheng 已提交
1125 1126
/*
 * Store the grain table entry @offset (given in CPU byte order) in the L2
 * table slot described by @m_data, writing it to the extent file in
 * little-endian form.
 *
 * If the extent has a backup L1 table, the same entry is also written to the
 * backup L2 table; note that this overwrites m_data->l2_offset with the
 * backup table's location.  Once the writes have succeeded, the cached copy
 * of the entry (if m_data->l2_cache_entry is set) is updated too.
 *
 * Returns: VMDK_OK on success, VMDK_ERROR if a synchronous write fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    const uint32_t gte = cpu_to_le32(offset);
    int64_t entry_pos;

    /* update L2 table */
    BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE);
    entry_pos = ((int64_t)m_data->l2_offset * 512)
                + (m_data->l2_index * sizeof(gte));
    if (bdrv_pwrite_sync(extent->file, entry_pos, &gte, sizeof(gte)) < 0) {
        return VMDK_ERROR;
    }

    /* update backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        entry_pos = ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(gte));
        if (bdrv_pwrite_sync(extent->file, entry_pos, &gte,
                             sizeof(gte)) < 0) {
            return VMDK_ERROR;
        }
    }

    /* The cache holds entries in on-disk (little-endian) byte order */
    if (m_data->l2_cache_entry) {
        *m_data->l2_cache_entry = gte;
    }

    return VMDK_OK;
}

F
Fam Zheng 已提交
1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
/**
 * get_cluster_offset
 *
 * Look up cluster offset in extent file by sector number, and store in
 * @cluster_offset.
 *
 * For flat extents, the start offset as parsed from the description file is
 * returned.
 *
 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
 * offset for a new cluster and update L2 cache. If there is a backing file,
 * COW is done before returning; otherwise, zeroes are written to the allocated
 * cluster. Both COW and zero writing skips the sector range
 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
 * has new data to write there.
 *
 * Returns: VMDK_OK if cluster exists and mapped in the image.
 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
 *          VMDK_ERROR if failed.
 */
1174
static int get_cluster_offset(BlockDriverState *bs,
F
Fam Zheng 已提交
1175 1176 1177 1178 1179
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
1180 1181
                              uint64_t skip_start_bytes,
                              uint64_t skip_end_bytes)
B
bellard 已提交
1182 1183 1184
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
1185
    uint32_t min_count, *l2_table;
1186
    bool zeroed = false;
F
Fam Zheng 已提交
1187
    int64_t ret;
1188
    int64_t cluster_sector;
1189

F
Fam Zheng 已提交
1190
    if (m_data) {
1191
        m_data->valid = 0;
F
Fam Zheng 已提交
1192
    }
1193
    if (extent->flat) {
1194
        *cluster_offset = extent->flat_start_offset;
F
Fam Zheng 已提交
1195
        return VMDK_OK;
1196
    }
1197

F
Fam Zheng 已提交
1198
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
F
Fam Zheng 已提交
1199 1200
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
F
Fam Zheng 已提交
1201
        return VMDK_ERROR;
F
Fam Zheng 已提交
1202 1203 1204
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
F
Fam Zheng 已提交
1205
        return VMDK_UNALLOC;
F
Fam Zheng 已提交
1206
    }
1207
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1208
        if (l2_offset == extent->l2_cache_offsets[i]) {
B
bellard 已提交
1209
            /* increment the hit count */
F
Fam Zheng 已提交
1210
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1211
                for (j = 0; j < L2_CACHE_SIZE; j++) {
F
Fam Zheng 已提交
1212
                    extent->l2_cache_counts[j] >>= 1;
B
bellard 已提交
1213 1214
                }
            }
F
Fam Zheng 已提交
1215
            l2_table = extent->l2_cache + (i * extent->l2_size);
B
bellard 已提交
1216 1217 1218 1219 1220 1221
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
1222
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1223 1224
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
B
bellard 已提交
1225 1226 1227
            min_index = i;
        }
    }
F
Fam Zheng 已提交
1228
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
M
Max Reitz 已提交
1229
    BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
1230
    if (bdrv_pread(extent->file,
F
Fam Zheng 已提交
1231 1232 1233 1234
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
F
Fam Zheng 已提交
1235
        return VMDK_ERROR;
F
Fam Zheng 已提交
1236
    }
1237

F
Fam Zheng 已提交
1238 1239
    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
B
bellard 已提交
1240
 found:
F
Fam Zheng 已提交
1241
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
F
Fam Zheng 已提交
1242
    cluster_sector = le32_to_cpu(l2_table[l2_index]);
1243

F
Fam Zheng 已提交
1244
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
1245 1246 1247
        zeroed = true;
    }

F
Fam Zheng 已提交
1248
    if (!cluster_sector || zeroed) {
1249
        if (!allocate) {
1250
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1251
        }
1252

F
Fam Zheng 已提交
1253 1254
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;
1255 1256 1257 1258 1259 1260

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
1261 1262
        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
                                offset, skip_start_bytes, skip_end_bytes);
F
Fam Zheng 已提交
1263 1264
        if (ret) {
            return ret;
1265
        }
1266 1267 1268 1269 1270 1271 1272
        if (m_data) {
            m_data->valid = 1;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->l2_cache_entry = &l2_table[l2_index];
        }
1273
    }
F
Fam Zheng 已提交
1274
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
F
Fam Zheng 已提交
1275
    return VMDK_OK;
B
bellard 已提交
1276 1277
}

F
Fam Zheng 已提交
1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294
/*
 * Return the first extent, scanning forward from @start_hint (or from the
 * first extent when @start_hint is NULL), whose end_sector lies beyond
 * @sector_num.  Returns NULL when no such extent exists.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *const past_last = &s->extents[s->num_extents];
    VmdkExtent *candidate;

    for (candidate = start_hint ? start_hint : &s->extents[0];
         candidate < past_last;
         candidate++) {
        if (sector_num < candidate->end_sector) {
            return candidate;
        }
    }
    return NULL;
}

1295 1296 1297
/*
 * Return the position of guest byte offset @offset inside its cluster,
 * i.e. the offset relative to the start of @extent modulo the extent's
 * cluster size in bytes.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    const uint64_t cluster_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    const uint64_t extent_start =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
    const uint64_t offset_in_extent = offset - extent_start;

    return offset_in_extent % cluster_bytes;
}

1307 1308 1309 1310 1311
/*
 * Block-status callback: report the allocation state of the cluster that
 * contains @offset.
 *
 * The lookup result is translated as follows: VMDK_ERROR -> -EIO,
 * VMDK_UNALLOC -> 0, VMDK_ZEROED -> BDRV_BLOCK_ZERO and VMDK_OK ->
 * BDRV_BLOCK_DATA.  For mapped clusters *file is set to the extent's node
 * and, unless the extent is compressed, *map receives the host offset and
 * BDRV_BLOCK_OFFSET_VALID is added.  *pnum is limited to the rest of the
 * current cluster and to @bytes.
 *
 * Returns -EIO if no extent covers @offset.
 */
static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    BDRVVmdkState *state = bs->opaque;
    VmdkExtent *extent;
    uint64_t host_offset;
    int64_t in_cluster, left_in_cluster, status;

    extent = find_extent(state, offset >> BDRV_SECTOR_BITS, NULL);
    if (extent == NULL) {
        return -EIO;
    }

    /* The lookup may load an L2 table into the cache, so take the lock */
    qemu_co_mutex_lock(&state->lock);
    status = get_cluster_offset(bs, extent, NULL, offset, false, &host_offset,
                                0, 0);
    qemu_co_mutex_unlock(&state->lock);

    in_cluster = vmdk_find_offset_in_cluster(extent, offset);

    if (status == VMDK_ERROR) {
        status = -EIO;
    } else if (status == VMDK_UNALLOC) {
        status = 0;
    } else if (status == VMDK_ZEROED) {
        status = BDRV_BLOCK_ZERO;
    } else if (status == VMDK_OK) {
        status = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            status |= BDRV_BLOCK_OFFSET_VALID;
            *map = host_offset + in_cluster;
        }
        *file = extent->file->bs;
    }

    left_in_cluster = extent->cluster_sectors * BDRV_SECTOR_SIZE - in_cluster;
    *pnum = MIN(left_in_cluster, bytes);
    return status;
}

1353
/*
 * Write @n_bytes of guest data, taken from @qiov starting at @qiov_offset,
 * to @extent at @cluster_offset + @offset_in_cluster.
 *
 * For compressed extents the data is deflated with zlib and written behind a
 * grain marker carrying the guest LBA (derived from @offset) and the
 * compressed size; such extents must have markers, otherwise -EINVAL is
 * returned.  For other extents the data is written as is.
 *
 * extent->next_cluster_sector is moved to the sector following the written
 * range (for uncompressed extents only if that is further than before); this
 * happens even when the write itself failed.
 *
 * Returns 0 on success, -EINVAL on compression problems, or the negative
 * error of the write.
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            uint64_t qiov_offset, uint64_t n_bytes,
                            uint64_t offset)
{
    VmdkGrainMarker *grain = NULL;
    QEMUIOVector write_qiov;
    struct iovec grain_iov;
    uLongf deflated_len;
    int64_t write_pos;
    int64_t end_sector;
    int ret;

    if (!extent->compressed) {
        qemu_iovec_init(&write_qiov, qiov->niov);
        qemu_iovec_concat(&write_qiov, qiov, qiov_offset, n_bytes);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO);
    } else {
        void *plain;

        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        /* Room for twice the cluster size, in case deflate expands data */
        deflated_len = (extent->cluster_sectors << 9) * 2;
        grain = g_malloc(deflated_len + sizeof(VmdkGrainMarker));

        /* Linearise the request so that zlib can consume it in one go */
        plain = g_malloc(n_bytes);
        qemu_iovec_to_buf(qiov, qiov_offset, plain, n_bytes);
        ret = compress(grain->data, &deflated_len, plain, n_bytes);
        g_free(plain);

        if (ret != Z_OK || deflated_len == 0) {
            ret = -EINVAL;
            goto out;
        }

        grain->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
        grain->size = cpu_to_le32(deflated_len);

        /* From here on the request is the marker plus its payload */
        n_bytes = deflated_len + sizeof(VmdkGrainMarker);
        grain_iov = (struct iovec) {
            .iov_base   = grain,
            .iov_len    = n_bytes,
        };
        qemu_iovec_init_external(&write_qiov, &grain_iov, 1);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
    }

    write_pos = cluster_offset + offset_in_cluster;
    ret = bdrv_co_pwritev(extent->file, write_pos, n_bytes,
                          &write_qiov, 0);

    end_sector = DIV_ROUND_UP(write_pos + n_bytes, BDRV_SECTOR_SIZE);
    if (!extent->compressed) {
        extent->next_cluster_sector = MAX(extent->next_cluster_sector,
                                          end_sector);
    } else {
        extent->next_cluster_sector = end_sector;
    }

    if (ret >= 0) {
        ret = 0;
    }
 out:
    g_free(grain);
    /* Only the uncompressed path owns a heap-allocated iovec array */
    if (!extent->compressed) {
        qemu_iovec_destroy(&write_qiov);
    }
    return ret;
}

/*
 * Read @bytes bytes located @offset_in_cluster bytes into the cluster stored
 * at @cluster_offset in @extent, and place them at the start of @qiov.
 *
 * Uncompressed extents are read directly into @qiov.  For compressed extents
 * two clusters' worth of raw data are read, the payload (preceded by a grain
 * marker if the extent has markers) is inflated with zlib, and the requested
 * slice of the result is copied into @qiov.
 *
 * Returns 0 on success, a negative errno if reading fails, or -EINVAL if the
 * compressed data is malformed (zero or oversized length, inflate failure,
 * or the request reaching beyond the inflated data).
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            int bytes)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    uint32_t max_data_len;
    VmdkGrainMarker *marker;
    uLongf buf_len;


    if (!extent->compressed) {
        BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO);
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
        if (ret < 0) {
            return ret;
        }
        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_pread(extent->file,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }
    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        marker = (VmdkGrainMarker *)cluster_buf;
        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }
    /*
     * The payload may start behind a marker header, so fewer than buf_bytes
     * bytes of it are available in cluster_buf.  Bound the on-disk length by
     * what was actually read to keep uncompress() inside the buffer.
     */
    max_data_len = buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || data_len > max_data_len) {
        ret = -EINVAL;
        goto out;
    }
    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;

    }
    /* buf_len now holds the number of inflated bytes */
    if (offset_in_cluster < 0 ||
            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}

1496 1497 1498
/*
 * Read @bytes bytes of guest data starting at @offset into @qiov.
 *
 * The request is processed cluster by cluster under the state lock.  Mapped
 * clusters are read from their extent; unmapped clusters are read from the
 * backing file if there is one (after validating its CID) and are zero-filled
 * otherwise; clusters marked as zeroed are always zero-filled.
 *
 * Returns 0 on success.  Returns -EIO if no extent covers the offset or if
 * the cluster lookup itself fails, -EINVAL if the backing file's CID is
 * invalid, or the error of the underlying read.
 */
static int coroutine_fn
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
               QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n_bytes, offset_in_cluster;
    VmdkExtent *extent = NULL;
    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
    uint64_t bytes_done = 0;

    qemu_iovec_init(&local_qiov, qiov->niov);
    qemu_co_mutex_lock(&s->lock);

    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            ret = -EIO;
            goto fail;
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 offset, false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            /*
             * A failed lookup (e.g. the L2 table could not be read) says
             * nothing about the cluster being unallocated; do not paper over
             * it with zeroes or backing data.
             */
            ret = -EIO;
            goto fail;
        }
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);

        /* Never cross a cluster boundary within one iteration */
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
                    ret = -EINVAL;
                    goto fail;
                }

                qemu_iovec_reset(&local_qiov);
                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

                /* qcow2 emits this on bs->file instead of bs->backing */
                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
                                     &local_qiov, 0);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            }
        } else {
            qemu_iovec_reset(&local_qiov);
            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
                                   &local_qiov, n_bytes);
            if (ret) {
                goto fail;
            }
        }
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
    }

    ret = 0;
fail:
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

F
Fam Zheng 已提交
1568 1569 1570
/**
 * vmdk_write:
 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1571 1572 1573 1574
 *                if possible, otherwise return -ENOTSUP.
 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
 *                with each cluster. By dry run we can find if the zero write
 *                is possible without modifying image data.
F
Fam Zheng 已提交
1575 1576 1577
 *
 * Returns: error code with 0 for success.
 */
1578 1579 1580
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                       uint64_t bytes, QEMUIOVector *qiov,
                       bool zeroed, bool zero_dry_run)
B
bellard 已提交
1581
{
1582
    BDRVVmdkState *s = bs->opaque;
F
Fam Zheng 已提交
1583
    VmdkExtent *extent = NULL;
F
Fam Zheng 已提交
1584
    int ret;
1585
    int64_t offset_in_cluster, n_bytes;
1586
    uint64_t cluster_offset;
1587
    uint64_t bytes_done = 0;
F
Fam Zheng 已提交
1588
    VmdkMetaData m_data;
1589

1590 1591
    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
        error_report("Wrong offset: offset=0x%" PRIx64
1592
                     " total_sectors=0x%" PRIx64,
1593
                     offset, bs->total_sectors);
1594
        return -EIO;
1595 1596
    }

1597 1598
    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
F
Fam Zheng 已提交
1599 1600 1601
        if (!extent) {
            return -EIO;
        }
1602 1603 1604 1605 1606
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1607
                                 !(extent->compressed || zeroed),
1608 1609
                                 &cluster_offset, offset_in_cluster,
                                 offset_in_cluster + n_bytes);
F
Fam Zheng 已提交
1610
        if (extent->compressed) {
F
Fam Zheng 已提交
1611
            if (ret == VMDK_OK) {
F
Fam Zheng 已提交
1612
                /* Refuse write to allocated cluster for streamOptimized */
F
Fam Zheng 已提交
1613 1614
                error_report("Could not write to allocated cluster"
                              " for streamOptimized");
F
Fam Zheng 已提交
1615 1616 1617
                return -EIO;
            } else {
                /* allocate */
1618
                ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1619
                                         true, &cluster_offset, 0, 0);
F
Fam Zheng 已提交
1620 1621
            }
        }
F
Fam Zheng 已提交
1622
        if (ret == VMDK_ERROR) {
1623
            return -EINVAL;
F
Fam Zheng 已提交
1624
        }
F
Fam Zheng 已提交
1625 1626 1627
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
1628 1629 1630
                    offset_in_cluster == 0 &&
                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
F
Fam Zheng 已提交
1631 1632
                if (!zero_dry_run) {
                    /* update L2 tables */
F
Fam Zheng 已提交
1633 1634
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
F
Fam Zheng 已提交
1635 1636 1637 1638 1639 1640 1641
                        return -EIO;
                    }
                }
            } else {
                return -ENOTSUP;
            }
        } else {
1642 1643
            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
                                    qiov, bytes_done, n_bytes, offset);
F
Fam Zheng 已提交
1644 1645 1646 1647 1648
            if (ret) {
                return ret;
            }
            if (m_data.valid) {
                /* update L2 tables */
F
Fam Zheng 已提交
1649 1650 1651
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
F
Fam Zheng 已提交
1652 1653
                    return -EIO;
                }
F
Fam Zheng 已提交
1654
            }
1655
        }
1656 1657 1658
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
1659

F
Fam Zheng 已提交
1660 1661
        /* update CID on the first write every time the virtual disk is
         * opened */
1662
        if (!s->cid_updated) {
F
Fam Zheng 已提交
1663
            ret = vmdk_write_cid(bs, g_random_int());
K
Kevin Wolf 已提交
1664 1665 1666
            if (ret < 0) {
                return ret;
            }
1667
            s->cid_updated = true;
1668
        }
1669 1670
    }
    return 0;
B
bellard 已提交
1671 1672
}

1673 1674 1675
/*
 * Write callback: perform a regular (non-zero, non-dry-run) write through
 * vmdk_pwritev() while holding the state lock.  @flags is not used.
 */
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *state = bs->opaque;
    int result;

    qemu_co_mutex_lock(&state->lock);
    result = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&state->lock);

    return result;
}

1685 1686 1687
/*
 * Compressed-write callback.  Simply forwards to vmdk_co_pwritev() with no
 * flags; whether the data ends up compressed is decided per extent further
 * down the write path.
 */
static int coroutine_fn
vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                           uint64_t bytes, QEMUIOVector *qiov)
{
    int result = vmdk_co_pwritev(bs, offset, bytes, qiov, 0);

    return result;
}

1692 1693 1694 1695
/*
 * Write-zeroes callback.
 *
 * Under the state lock, first runs vmdk_pwritev() as a dry run to find out
 * whether the whole range can be expressed as zeroed grains; only if that
 * succeeds is the request repeated for real.  @flags is not used.
 *
 * Returns 0 on success or the error of whichever pass failed.
 */
static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int bytes,
                                              BdrvRequestFlags flags)
{
    BDRVVmdkState *state = bs->opaque;
    int result;

    qemu_co_mutex_lock(&state->lock);
    /* write zeroes could fail if sectors not aligned to cluster, test it with
     * dry_run == true before really updating image */
    result = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (result == 0) {
        result = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }
    qemu_co_mutex_unlock(&state->lock);

    return result;
}

1711
static int vmdk_create_extent(const char *filename, int64_t filesize,
1712
                              bool flat, bool compress, bool zeroed_grain,
1713
                              QemuOpts *opts, Error **errp)
1714
{
F
Fam Zheng 已提交
1715
    int ret, i;
1716
    BlockBackend *blk = NULL;
1717
    VMDK4Header header;
F
Fam Zheng 已提交
1718
    Error *local_err = NULL;
1719 1720 1721
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
    uint32_t *gd_buf = NULL;
    int gd_buf_size;
1722

1723
    ret = bdrv_create_file(filename, opts, &local_err);
1724 1725 1726
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
1727
    }
1728

1729
    blk = blk_new_open(filename, NULL, NULL,
1730 1731
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                       &local_err);
1732
    if (blk == NULL) {
1733
        error_propagate(errp, local_err);
1734
        ret = -EIO;
1735 1736 1737
        goto exit;
    }

1738 1739
    blk_set_allow_write_beyond_eof(blk, true);

F
Fam Zheng 已提交
1740
    if (flat) {
1741
        ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
F
Fam Zheng 已提交
1742
        goto exit;
1743
    }
1744 1745
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
1746 1747 1748 1749 1750 1751 1752
    if (compress) {
        header.version = 3;
    } else if (zeroed_grain) {
        header.version = 2;
    } else {
        header.version = 1;
    }
F
Fam Zheng 已提交
1753
    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1754 1755
                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1756
    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1757
    header.capacity = filesize / BDRV_SECTOR_SIZE;
A
Alexander Graf 已提交
1758
    header.granularity = 128;
1759
    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
1760

1761 1762 1763 1764 1765
    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                           BDRV_SECTOR_SIZE);
    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
1766 1767 1768 1769

    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
1770
    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
1771
    header.grain_offset =
1772 1773
        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                 header.granularity);
A
Alexander Graf 已提交
1774 1775 1776 1777 1778
    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
1779
    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
1780 1781 1782 1783 1784
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);
1785
    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1786 1787 1788 1789 1790

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;
1791 1792

    /* write all the data */
1793
    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
1794
    if (ret < 0) {
1795
        error_setg(errp, QERR_IO_ERROR);
1796 1797
        goto exit;
    }
1798
    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
1799
    if (ret < 0) {
1800
        error_setg(errp, QERR_IO_ERROR);
1801 1802
        goto exit;
    }
1803

1804 1805
    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9,
                       PREALLOC_MODE_OFF, errp);
1806 1807 1808
    if (ret < 0) {
        goto exit;
    }
1809 1810

    /* write grain directory */
1811 1812 1813
    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
    gd_buf = g_malloc0(gd_buf_size);
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
1814
         i < gt_count; i++, tmp += gt_size) {
1815 1816
        gd_buf[i] = cpu_to_le32(tmp);
    }
1817
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
1818
                     gd_buf, gd_buf_size, 0);
1819
    if (ret < 0) {
1820
        error_setg(errp, QERR_IO_ERROR);
1821
        goto exit;
1822
    }
1823

1824
    /* write backup grain directory */
1825
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
1826
         i < gt_count; i++, tmp += gt_size) {
1827 1828
        gd_buf[i] = cpu_to_le32(tmp);
    }
1829
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
1830
                     gd_buf, gd_buf_size, 0);
1831
    if (ret < 0) {
1832
        error_setg(errp, QERR_IO_ERROR);
1833
        goto exit;
1834
    }
1835

F
Fam Zheng 已提交
1836
    ret = 0;
1837
exit:
1838 1839
    if (blk) {
        blk_unref(blk);
1840 1841
    }
    g_free(gd_buf);
F
Fam Zheng 已提交
1842 1843 1844 1845
    return ret;
}

/*
 * Split @filename into directory part, base name and extension.
 *
 * @path:    receives everything up to and including the last '/', or if
 *           there is none the last '\\', or if there is none the last ':';
 *           empty string when no separator is found
 * @prefix:  receives the remainder up to (not including) its last '.'
 * @postfix: receives the remainder from its last '.' on (including the
 *           dot); empty string when there is no '.'
 * @buf_len: size of each of the three output buffers
 * @errp:    receives the error on failure
 *
 * Returns VMDK_OK on success.  Returns VMDK_ERROR and sets @errp when
 * @filename is NULL/empty or when the path or prefix part does not fit
 * into @buf_len bytes.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        p++;
        if (p - filename >= buf_len) {
            /* every failure path must set errp, the caller relies on it */
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        /* size argument includes room for the terminating NUL */
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if (q - p >= buf_len) {
            error_setg(errp, "File name prefix is too long");
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}

1885 1886
/*
 * Create a VMDK image from creation options.
 *
 * @filename: name of the image (descriptor) file
 * @opts:     creation options; the size, adapter type, backing file,
 *            hwversion, compat6, subformat and zeroed-grain options are
 *            read (and removed) here, the rest is passed on to
 *            bdrv_create_file()/vmdk_create_extent()
 * @errp:     receives the error on failure
 *
 * The function creates one extent, or for the twoGbMaxExtent* subformats
 * one extent per 2 GiB, builds the text descriptor and writes it either at
 * offset 0x200 of the single sparse extent (monolithicSparse and
 * streamOptimized) or into a separately created file named @filename.
 *
 * Returns 0 on success, a negative value on failure.
 */
static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts,
                                            Error **errp)
{
    int idx = 0;
    BlockBackend *new_blk = NULL;
    Error *local_err = NULL;
    char *desc = NULL;
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
    char *hw_version = NULL;
    char *fmt = NULL;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
    char *path = g_malloc0(PATH_MAX);
    char *prefix = g_malloc0(PATH_MAX);
    char *postfix = g_malloc0(PATH_MAX);
    char *desc_line = g_malloc0(BUF_SIZE);
    char *ext_filename = g_malloc0(PATH_MAX);
    char *desc_filename = g_malloc0(PATH_MAX);
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
    const char *desc_extent_line;
    char *parent_desc_line = g_malloc0(BUF_SIZE);
    uint32_t parent_cid = 0xffffffff;
    uint32_t number_heads = 16;
    bool zeroed_grain = false;
    uint32_t desc_offset = 0, desc_len;
    const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%" PRIx32 "\n"
        "parentCID=%" PRIx32 "\n"
        "createType=\"%s\"\n"
        "%s"
        "\n"
        "# Extent description\n"
        "%s"
        "\n"
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"%s\"\n";

    ext_desc_lines = g_string_new(NULL);

    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
        ret = -EINVAL;
        goto exit;
    }
    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
        /* "undefined" is the value hw_version has when the user gave none */
        if (strcmp(hw_version, "undefined")) {
            error_setg(errp,
                       "compat6 cannot be enabled with hwversion set");
            ret = -EINVAL;
            goto exit;
        }
        g_free(hw_version);
        hw_version = g_strdup("6");
    }
    if (strcmp(hw_version, "undefined") == 0) {
        g_free(hw_version);
        hw_version = g_strdup("4");
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
        zeroed_grain = true;
    }

    if (!adapter_type) {
        adapter_type = g_strdup("ide");
    } else if (strcmp(adapter_type, "ide") &&
               strcmp(adapter_type, "buslogic") &&
               strcmp(adapter_type, "lsilogic") &&
               strcmp(adapter_type, "legacyESX")) {
        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
        ret = -EINVAL;
        goto exit;
    }
    if (strcmp(adapter_type, "ide") != 0) {
        /* that's the number of heads with which vmware operates when
           creating, exporting, etc. vmdk files with a non-ide adapter type */
        number_heads = 255;
    }
    if (!fmt) {
        /* Default format to monolithicSparse */
        fmt = g_strdup("monolithicSparse");
    } else if (strcmp(fmt, "monolithicFlat") &&
               strcmp(fmt, "monolithicSparse") &&
               strcmp(fmt, "twoGbMaxExtentSparse") &&
               strcmp(fmt, "twoGbMaxExtentFlat") &&
               strcmp(fmt, "streamOptimized")) {
        error_setg(errp, "Unknown subformat: '%s'", fmt);
        ret = -EINVAL;
        goto exit;
    }
    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
              strcmp(fmt, "twoGbMaxExtentSparse"));
    flat = !(strcmp(fmt, "monolithicFlat") &&
             strcmp(fmt, "twoGbMaxExtentFlat"));
    compress = !strcmp(fmt, "streamOptimized");
    if (flat) {
        desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n";
    } else {
        desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n";
    }
    if (flat && backing_file) {
        error_setg(errp, "Flat image can't have backing file");
        ret = -ENOTSUP;
        goto exit;
    }
    if (flat && zeroed_grain) {
        error_setg(errp, "Flat image can't enable zeroed grain");
        ret = -ENOTSUP;
        goto exit;
    }
    if (backing_file) {
        BlockBackend *blk;
        char *full_backing = g_new0(char, PATH_MAX);
        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                     full_backing, PATH_MAX,
                                                     &local_err);
        if (local_err) {
            g_free(full_backing);
            error_propagate(errp, local_err);
            ret = -ENOENT;
            goto exit;
        }

        blk = blk_new_open(full_backing, NULL, NULL,
                           BDRV_O_NO_BACKING, errp);
        g_free(full_backing);
        if (blk == NULL) {
            ret = -EIO;
            goto exit;
        }
        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
            /* report why we fail instead of returning a bare error code */
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(blk)->drv->format_name);
            blk_unref(blk);
            ret = -EINVAL;
            goto exit;
        }
        ret = vmdk_read_cid(blk_bs(blk), 0, &parent_cid);
        blk_unref(blk);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
            goto exit;
        }
        snprintf(parent_desc_line, BUF_SIZE,
                "parentFileNameHint=\"%s\"", backing_file);
    }

    /* Create extents */
    filesize = total_size;
    while (filesize > 0) {
        int64_t size = filesize;

        if (split && size > split_size) {
            size = split_size;
        }
        if (split) {
            snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s",
                    prefix, flat ? 'f' : 's', ++idx, postfix);
        } else if (flat) {
            snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix);
        } else {
            snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix);
        }
        snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename);

        if (vmdk_create_extent(ext_filename, size,
                               flat, compress, zeroed_grain, opts, errp)) {
            ret = -EINVAL;
            goto exit;
        }
        filesize -= size;

        /* Format description line */
        snprintf(desc_line, BUF_SIZE,
                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
        g_string_append(ext_desc_lines, desc_line);
    }
    /* generate descriptor file */
    desc = g_strdup_printf(desc_template,
                           g_random_int(),
                           parent_cid,
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
                           hw_version,
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
                           adapter_type);
    desc_len = strlen(desc);
    /* the descriptor offset = 0x200 */
    if (!split && !flat) {
        /* descriptor is embedded in the extent that was created above */
        desc_offset = 0x200;
    } else {
        /* descriptor lives in its own file, which does not exist yet */
        ret = bdrv_create_file(filename, opts, &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            goto exit;
        }
    }

    new_blk = blk_new_open(filename, NULL, NULL,
                           BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                           &local_err);
    if (new_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto exit;
    }

    blk_set_allow_write_beyond_eof(new_blk, true);

    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
    }
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
        ret = blk_truncate(new_blk, desc_len, PREALLOC_MODE_OFF, errp);
        if (ret < 0) {
            goto exit;
        }
    }
    /* success: do not hand a leftover blk_pwrite() result to the caller */
    ret = 0;
exit:
    if (new_blk) {
        blk_unref(new_blk);
    }
    g_free(adapter_type);
    g_free(backing_file);
    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
    g_free(prefix);
    g_free(postfix);
    g_free(desc_line);
    g_free(ext_filename);
    g_free(desc_filename);
    g_free(parent_desc_line);
    g_string_free(ext_desc_lines, true);
    return ret;
}

B
bellard 已提交
2140
/*
 * Close callback: releases the per-image state held in bs->opaque.
 *
 * Frees the extents and the create_type string, then removes the migration
 * blocker and frees its Error object.
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    vmdk_free_extents(bs);
    g_free(s->create_type);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
}

P
Paolo Bonzini 已提交
2151
/*
 * Flush every extent file of the image.
 *
 * A failing extent does not stop the loop; all extents are still flushed.
 * Returns 0 if every flush succeeded, otherwise the error code of the last
 * extent that failed.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    int i, err;
    int ret = 0;

    for (i = 0; i < s->num_extents; i++) {
        err = bdrv_co_flush(s->extents[i].file->bs);
        if (err < 0) {
            ret = err;
        }
    }
    return ret;
}

2166 2167 2168 2169 2170 2171 2172
/*
 * Return the allocated size of the whole image.
 *
 * This is the allocated size of bs->file plus that of every extent whose
 * file is not bs->file itself (so that file is not counted twice).
 * The first negative value returned by bdrv_get_allocated_file_size() is
 * returned unchanged.
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    int i;
    int64_t ret = 0;
    int64_t r;
    BDRVVmdkState *s = bs->opaque;

    ret = bdrv_get_allocated_file_size(bs->file->bs);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < s->num_extents; i++) {
        if (s->extents[i].file == bs->file) {
            continue;
        }
        r = bdrv_get_allocated_file_size(s->extents[i].file->bs);
        if (r < 0) {
            return r;
        }
        ret += r;
    }
    return ret;
}
2189

F
Fam Zheng 已提交
2190 2191 2192 2193 2194 2195 2196 2197 2198
/*
 * Tell whether a freshly created image reads back as zeroes.
 *
 * Returns 0 as soon as one flat extent sits on a file for which
 * bdrv_has_zero_init() reports false; returns 1 otherwise.  Non-flat
 * extents are not inspected.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;

    /* If has a flat extent and its underlying storage doesn't have zero init,
     * return 0. */
    for (i = 0; i < s->num_extents; i++) {
        if (s->extents[i].flat) {
            if (!bdrv_has_zero_init(s->extents[i].file->bs)) {
                return 0;
            }
        }
    }
    return 1;
}

F
Fam Zheng 已提交
2207 2208 2209 2210 2211
/*
 * Build an ImageInfo describing one extent.
 *
 * The returned object is newly allocated; its filename and format strings
 * are copies.  "compressed" is only flagged as present when the extent is
 * compressed, and "cluster_size" only for non-flat extents.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    ImageInfo *info = g_new0(ImageInfo, 1);

    *info = (ImageInfo){
        .filename         = g_strdup(extent->file->bs->filename),
        .format           = g_strdup(extent->type),
        .virtual_size     = extent->sectors * BDRV_SECTOR_SIZE,
        .compressed       = extent->compressed,
        .has_compressed   = extent->compressed,
        .cluster_size     = extent->cluster_sectors * BDRV_SECTOR_SIZE,
        .has_cluster_size = !extent->flat,
    };

    return info;
}

2224 2225 2226
/*
 * Consistency check of the image (read-only).
 *
 * Walks the virtual disk one cluster at a time and verifies that an extent
 * exists for every position, that the cluster lookup does not fail and that
 * every cluster for which the lookup returns VMDK_OK lies inside its extent
 * file.
 *
 * @fix: any repair mode is rejected with -ENOTSUP
 *
 * Returns 0 when the end of the disk is reached without a problem.  On the
 * first problem a message is printed to stderr, result->corruptions is
 * incremented and the error value is returned.
 */
static int coroutine_fn vmdk_co_check(BlockDriverState *bs,
                                      BdrvCheckResult *result,
                                      BdrvCheckMode fix)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
    int64_t total_sectors = bdrv_nb_sectors(bs);
    int ret;
    uint64_t cluster_offset;

    if (fix) {
        return -ENOTSUP;
    }

    for (;;) {
        if (sector_num >= total_sectors) {
            return 0;
        }
        extent = find_extent(s, sector_num, extent);
        if (!extent) {
            fprintf(stderr,
                    "ERROR: could not find extent for sector %" PRId64 "\n",
                    sector_num);
            ret = -EINVAL;
            break;
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
                    PRId64 "\n", sector_num);
            break;
        }
        if (ret == VMDK_OK) {
            int64_t extent_len = bdrv_getlength(extent->file->bs);
            if (extent_len < 0) {
                fprintf(stderr,
                        "ERROR: could not get extent file length for sector %"
                        PRId64 "\n", sector_num);
                ret = extent_len;
                break;
            }
            if (cluster_offset >= extent_len) {
                fprintf(stderr,
                        "ERROR: cluster offset for sector %"
                        PRId64 " points after EOF\n", sector_num);
                ret = -EINVAL;
                break;
            }
        }
        sector_num += extent->cluster_sectors;
    }

    /* only reached through one of the "break" statements above */
    result->corruptions++;
    return ret;
}

F
Fam Zheng 已提交
2284 2285 2286 2287 2288 2289 2290 2291
/*
 * Build the format specific image information.
 *
 * The returned, newly allocated object carries a copy of the create type,
 * the CID and parent CID of the image, and a list with one ImageInfo per
 * extent, in extent order.
 */
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
    ImageInfoList **next;

    *spec_info = (ImageInfoSpecific){
        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
        .u = {
            .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1),
        },
    };

    *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) {
        .create_type = g_strdup(s->create_type),
        .cid = s->cid,
        .parent_cid = s->parent_cid,
    };

    /* "next" always points at the link field where the next node goes */
    next = &spec_info->u.vmdk.data->extents;
    for (i = 0; i < s->num_extents; i++) {
        *next = g_new0(ImageInfoList, 1);
        (*next)->value = vmdk_get_extent_info(&s->extents[i]);
        (*next)->next = NULL;
        next = &(*next)->next;
    }

    return spec_info;
}

2315 2316 2317 2318 2319 2320 2321
/*
 * Compare the "type" of two extents.
 *
 * Two extents are of equal type when their flat and compressed attributes
 * match and, for non-flat extents, their cluster size matches as well.
 */
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
{
    if (a->flat != b->flat) {
        return false;
    }
    if (a->compressed != b->compressed) {
        return false;
    }
    if (a->flat) {
        /* cluster size is irrelevant for flat extents */
        return true;
    }
    return a->cluster_sectors == b->cluster_sectors;
}

F
Fam Zheng 已提交
2322 2323 2324 2325 2326
/*
 * Fill in driver information for the image.
 *
 * The image must have at least one extent (asserted).  When the extents are
 * not all of the same type (see vmdk_extents_type_eq()) no single answer
 * exists and -ENOTSUP is returned.  Otherwise the values of the first
 * extent are reported; cluster_size is only written for non-flat extents.
 *
 * Returns 0 on success.
 */
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    assert(s->num_extents);

    /* See if we have multiple extents but they have different cases */
    for (i = 1; i < s->num_extents; i++) {
        if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) {
            return -ENOTSUP;
        }
    }
    bdi->needs_compressed_writes = s->extents[0].compressed;
    if (!s->extents[0].flat) {
        bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS;
    }
    return 0;
}

2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366
/*
 * Options understood when creating an image.
 *
 * compat6 defaults to "off" and hwversion to the placeholder "undefined";
 * the other options have no default value in this table.
 */
static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_ADAPTER_TYPE,
            .type = QEMU_OPT_STRING,
            .help = "Virtual adapter type, can be one of "
                    "ide (default), lsilogic, buslogic or legacyESX"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_COMPAT6,
            .type = QEMU_OPT_BOOL,
            .help = "VMDK version 6 image",
            .def_value_str = "off"
        },
        {
            .name = BLOCK_OPT_HWVERSION,
            .type = QEMU_OPT_STRING,
            .help = "VMDK hardware version",
            .def_value_str = "undefined"
        },
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "VMDK flat extent format, can be one of "
                "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
        },
        {
            .name = BLOCK_OPT_ZEROED_GRAIN,
            .type = QEMU_OPT_BOOL,
            .help = "Enable efficient zero writes "
                    "using the zeroed-grain GTE feature"
        },
        { /* end of list */ }
    }
};

2390
static BlockDriver bdrv_vmdk = {
F
Fam Zheng 已提交
2391 2392 2393 2394
    .format_name                  = "vmdk",
    .instance_size                = sizeof(BDRVVmdkState),
    .bdrv_probe                   = vmdk_probe,
    .bdrv_open                    = vmdk_open,
2395
    .bdrv_co_check                = vmdk_co_check,
F
Fam Zheng 已提交
2396
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
2397
    .bdrv_child_perm              = bdrv_format_default_perms,
2398
    .bdrv_co_preadv               = vmdk_co_preadv,
2399
    .bdrv_co_pwritev              = vmdk_co_pwritev,
2400
    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
2401
    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
F
Fam Zheng 已提交
2402
    .bdrv_close                   = vmdk_close,
2403
    .bdrv_co_create_opts          = vmdk_co_create_opts,
F
Fam Zheng 已提交
2404
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
2405
    .bdrv_co_block_status         = vmdk_co_block_status,
F
Fam Zheng 已提交
2406 2407
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
F
Fam Zheng 已提交
2408
    .bdrv_get_specific_info       = vmdk_get_specific_info,
2409
    .bdrv_refresh_limits          = vmdk_refresh_limits,
F
Fam Zheng 已提交
2410
    .bdrv_get_info                = vmdk_get_info,
F
Fam Zheng 已提交
2411

2412
    .supports_backing             = true,
2413
    .create_opts                  = &vmdk_create_opts,
B
bellard 已提交
2414
};
2415 2416 2417 2418 2419 2420 2421

/* Register the VMDK block driver described by bdrv_vmdk. */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/* NOTE(review): block_init() is defined outside this file; presumably it
 * arranges for bdrv_vmdk_init() to run during block layer start-up --
 * confirm against its definition. */
block_init(bdrv_vmdk_init);