vmdk.c 96.0 KB
Newer Older
B
bellard 已提交
1 2
/*
 * Block driver for the VMDK format
3
 *
B
bellard 已提交
4
 * Copyright (c) 2004 Fabrice Bellard
5
 * Copyright (c) 2005 Filip Navara
6
 *
B
bellard 已提交
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
25

P
Peter Maydell 已提交
26
#include "qemu/osdep.h"
27
#include "qapi/error.h"
28
#include "block/block_int.h"
29
#include "sysemu/block-backend.h"
30
#include "qapi/qmp/qdict.h"
31
#include "qapi/qmp/qerror.h"
32
#include "qemu/error-report.h"
33
#include "qemu/module.h"
34
#include "qemu/option.h"
35
#include "qemu/bswap.h"
36
#include "migration/blocker.h"
37
#include "qemu/cutils.h"
S
Stefan Weil 已提交
38
#include <zlib.h>
B
bellard 已提交
39 40 41

/*
 * Magic numbers of the two sparse extent formats, expressed as the value
 * obtained when the first four bytes of the file are read big-endian
 * (see vmdk_probe()).
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

/* NOTE(review): presumably a value of VMDK4Header.compressAlgorithm */
#define VMDK4_COMPRESSION_DEFLATE 1

/*
 * Flag bits; the prefix suggests they apply to VMDK4Header.flags, but the
 * code that tests them is outside this part of the file.
 */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)

/*
 * VMDK4Header.gd_offset value for which vmdk_open_vmdk4() reads the footer
 * at the end of the file instead of trusting the header.
 */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

#define VMDK_EXTENT_MAX_SECTORS (1ULL << 32)

#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"

B
bellard 已提交
64 65 66 67 68 69 70 71 72 73 74
/*
 * On-disk header of a VMFS sparse (VMDK3) extent. vmdk_open_vmfs_sparse()
 * reads it from the byte offset just past the 32-bit magic and converts the
 * fields it uses from little-endian.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    /* Passed to vmdk_add_extent() as the number of sectors in the extent */
    uint32_t disk_sectors;
    /* Passed to vmdk_add_extent() as the cluster size in sectors */
    uint32_t granularity;
    /* L1 table location in 512-byte sectors (shifted left by 9 on use) */
    uint32_t l1dir_offset;
    /* Number of L1 table entries */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
B
bellard 已提交
76 77 78 79

/*
 * On-disk header of a VMDK4 sparse extent. vmdk_open_vmdk4() reads it from
 * the byte offset just past the 32-bit magic; multi-byte fields are
 * converted from little-endian where they are used.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint64_t capacity;
    uint64_t granularity;
    /* Location of the embedded descriptor in 512-byte sectors (0: none) */
    uint64_t desc_offset;
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    uint64_t rgd_offset;
    /* VMDK4_GD_AT_END means the real values live in the footer */
    uint64_t gd_offset;
    uint64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm;
} QEMU_PACKED VMDK4Header;
B
bellard 已提交
93

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
/*
 * Constant header of a seSparse extent, read from byte offset 0 of the
 * extent file by vmdk_open_se_sparse(). The structure is exactly one
 * 512-byte sector: 26 64-bit fields followed by 304 bytes of padding.
 * Fields are stored little-endian on disk and are validated by
 * check_se_sparse_const_header().
 */
typedef struct VMDKSESparseConstHeader {
    uint64_t magic;
    uint64_t version;
    uint64_t capacity;
    uint64_t grain_size;
    uint64_t grain_table_size;
    uint64_t flags;
    /* The four reserved fields must all be zero */
    uint64_t reserved1;
    uint64_t reserved2;
    uint64_t reserved3;
    uint64_t reserved4;
    /* In sectors: multiplied by SECTOR_SIZE before the volatile header read */
    uint64_t volatile_header_offset;
    uint64_t volatile_header_size;
    uint64_t journal_header_offset;
    uint64_t journal_header_size;
    uint64_t journal_offset;
    uint64_t journal_size;
    /* In sectors: both are multiplied by SECTOR_SIZE when the L1 is set up */
    uint64_t grain_dir_offset;
    uint64_t grain_dir_size;
    uint64_t grain_tables_offset;
    uint64_t grain_tables_size;
    uint64_t free_bitmap_offset;
    uint64_t free_bitmap_size;
    uint64_t backmap_offset;
    uint64_t backmap_size;
    uint64_t grains_offset;
    uint64_t grains_size;
    /* Must be all zero */
    uint8_t pad[304];
} QEMU_PACKED VMDKSESparseConstHeader;

/*
 * Volatile header of a seSparse extent, one 512-byte sector located at
 * VMDKSESparseConstHeader.volatile_header_offset. Fields are stored
 * little-endian on disk and are validated by
 * check_se_sparse_volatile_header().
 */
typedef struct VMDKSESparseVolatileHeader {
    uint64_t magic;
    uint64_t free_gt_number;
    uint64_t next_txn_seq_number;
    /* Non-zero makes the open fail: journal replay is not supported */
    uint64_t replay_journal;
    /* Must be all zero */
    uint8_t pad[480];
} QEMU_PACKED VMDKSESparseVolatileHeader;

B
bellard 已提交
132 133
/*
 * Number of L2 tables each extent's l2_cache has room for; also the length
 * of the l2_cache_offsets[] and l2_cache_counts[] arrays in VmdkExtent.
 */
#define L2_CACHE_SIZE 16

F
Fam Zheng 已提交
134
typedef struct VmdkExtent {
135
    BdrvChild *file;
F
Fam Zheng 已提交
136
    bool flat;
F
Fam Zheng 已提交
137 138
    bool compressed;
    bool has_marker;
139
    bool has_zero_grain;
140 141 142 143
    bool sesparse;
    uint64_t sesparse_l2_tables_offset;
    uint64_t sesparse_clusters_offset;
    int32_t entry_size;
144
    int version;
F
Fam Zheng 已提交
145 146
    int64_t sectors;
    int64_t end_sector;
147
    int64_t flat_start_offset;
B
bellard 已提交
148
    int64_t l1_table_offset;
149
    int64_t l1_backup_table_offset;
150
    void *l1_table;
151
    uint32_t *l1_backup_table;
B
bellard 已提交
152 153 154 155
    unsigned int l1_size;
    uint32_t l1_entry_sectors;

    unsigned int l2_size;
156
    void *l2_cache;
B
bellard 已提交
157 158 159
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

160
    int64_t cluster_sectors;
F
Fam Zheng 已提交
161
    int64_t next_cluster_sector;
F
Fam Zheng 已提交
162
    char *type;
F
Fam Zheng 已提交
163 164 165
} VmdkExtent;

typedef struct BDRVVmdkState {
166
    CoMutex lock;
167
    uint64_t desc_offset;
168
    bool cid_updated;
169
    bool cid_checked;
F
Fam Zheng 已提交
170
    uint32_t cid;
171
    uint32_t parent_cid;
F
Fam Zheng 已提交
172 173 174
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
K
Kevin Wolf 已提交
175
    Error *migration_blocker;
F
Fam Zheng 已提交
176
    char *create_type;
B
bellard 已提交
177 178
} BDRVVmdkState;

179 180 181 182
/*
 * Position of a cluster's entry within the L1/L2 tables.
 * NOTE(review): the producers and consumers of this structure are outside
 * this part of the file; field meanings are inferred from their names.
 */
typedef struct VmdkMetaData {
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
    bool new_allocation;
    uint32_t *l2_cache_entry;
} VmdkMetaData;

F
Fam Zheng 已提交
187 188 189
/*
 * Packed 12-byte marker header followed by a variable amount of data.
 * NOTE(review): presumably the on-disk grain marker of compressed extents;
 * its users are outside this part of the file.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
F
Fam Zheng 已提交
192

193 194 195 196 197 198 199
/*
 * Marker type codes.
 * NOTE(review): presumably the values of the "type" field of the markers
 * found in stream-optimized extents; the users are outside this part of
 * the file.
 */
enum {
    MARKER_END_OF_STREAM    = 0,
    MARKER_GRAIN_TABLE      = 1,
    MARKER_GRAIN_DIRECTORY  = 2,
    MARKER_FOOTER           = 3,
};

B
bellard 已提交
200 201 202 203
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    uint32_t magic;

F
Fam Zheng 已提交
204
    if (buf_size < 4) {
B
bellard 已提交
205
        return 0;
F
Fam Zheng 已提交
206
    }
B
bellard 已提交
207 208
    magic = be32_to_cpu(*(uint32_t *)buf);
    if (magic == VMDK3_MAGIC ||
209
        magic == VMDK4_MAGIC) {
B
bellard 已提交
210
        return 100;
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
    } else {
        const char *p = (const char *)buf;
        const char *end = p + buf_size;
        while (p < end) {
            if (*p == '#') {
                /* skip comment line */
                while (p < end && *p != '\n') {
                    p++;
                }
                p++;
                continue;
            }
            if (*p == ' ') {
                while (p < end && *p == ' ') {
                    p++;
                }
                /* skip '\r' if windows line endings used. */
                if (p < end && *p == '\r') {
                    p++;
                }
                /* only accept blank lines before 'version=' line */
                if (p == end || *p != '\n') {
                    return 0;
                }
                p++;
                continue;
            }
            if (end - p >= strlen("version=X\n")) {
                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
240 241
                    strncmp("version=2\n", p, strlen("version=2\n")) == 0 ||
                    strncmp("version=3\n", p, strlen("version=3\n")) == 0) {
242 243 244 245 246
                    return 100;
                }
            }
            if (end - p >= strlen("version=X\r\n")) {
                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
247 248
                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0 ||
                    strncmp("version=3\r\n", p, strlen("version=3\r\n")) == 0) {
249 250 251 252 253
                    return 100;
                }
            }
            return 0;
        }
B
bellard 已提交
254
        return 0;
255
    }
B
bellard 已提交
256 257
}

258
/* Sector size in bytes used for all on-disk offsets and sizes */
#define SECTOR_SIZE 512
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
262

F
Fam Zheng 已提交
263 264 265 266
/*
 * Release every extent of @bs: the L1 table, its backup, the L2 cache and
 * the type string of each extent, plus the child reference unless the
 * extent shares bs->file. Finally the extent array itself is freed and the
 * state is reset so that it describes an image without extents.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *e;

    for (i = 0; i < s->num_extents; i++) {
        e = &s->extents[i];
        g_free(e->l1_table);
        g_free(e->l2_cache);
        g_free(e->l1_backup_table);
        g_free(e->type);
        /* bs->file is owned by the block layer, not by the extent */
        if (e->file != bs->file) {
            bdrv_unref_child(bs, e->file);
        }
    }
    g_free(s->extents);
    /* do not leave a dangling array pointer and a stale count behind */
    s->extents = NULL;
    s->num_extents = 0;
}

282 283 284 285 286 287 288 289
/*
 * Remove the most recently added extent from the extent array of @bs by
 * shrinking the array by one element. Nothing the extent points to is
 * released here. Does nothing if there are no extents.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents == 0) {
        return;
    }
    s->num_extents--;
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
}

293 294
/*
 * Read a content ID from the descriptor of @bs.
 *
 * @parent: non-zero to read "parentCID", zero to read "CID"
 * @pcid:   receives the value, parsed as hexadecimal
 *
 * Return -ve errno, or 0 on success and write CID into *pcid. -EINVAL is
 * returned when the key is missing or is not followed by a hex number.
 */
static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
{
    char *desc;
    uint32_t cid;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    /*
     * sizeof() counts the terminating NUL, so advancing by cid_str_size
     * also steps over the character that follows the key (the '=').
     */
    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    desc[DESC_SIZE - 1] = '\0';
    p_name = strstr(desc, cid_str);
    if (p_name == NULL) {
        ret = -EINVAL;
        goto out;
    }
    /*
     * If the text ends right after the key there is no value to parse, and
     * skipping one more character could leave the buffer.
     */
    if (p_name[cid_str_size - 1] == '\0') {
        ret = -EINVAL;
        goto out;
    }
    p_name += cid_str_size;
    if (sscanf(p_name, "%" SCNx32, &cid) != 1) {
        ret = -EINVAL;
        goto out;
    }
    *pcid = cid;
    ret = 0;

out:
    g_free(desc);
    return ret;
}

/*
 * Store @cid as the new "CID" value in the descriptor of @bs.
 *
 * The descriptor text starting at "parentCID" is saved first. The new value
 * followed by a newline is then printed right after the first "CID" key,
 * and the saved text is appended again, so whatever stood between the old
 * value and "parentCID" is replaced.
 *
 * Returns a negative errno if the descriptor cannot be read, -EINVAL if it
 * has no "parentCID" entry, otherwise the result of bdrv_pwrite_sync().
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
    int ret = 0;

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
        ret = -EINVAL;
        goto out;
    }

    /* keep a copy of everything from "parentCID" to the end */
    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        /* sizeof() includes the NUL, which accounts for the '=' */
        p_name += sizeof("CID");
        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);

out:
    g_free(desc);
    g_free(tmp_desc);
    return ret;
}

/*
 * Check that the parent CID recorded for @bs matches the CID of its backing
 * file. The check is skipped when it already succeeded once or when there
 * is no backing file.
 *
 * Returns 1 if the CID is valid (and remembers that in s->cid_checked),
 * 0 if the backing file is not a vmdk image, its CID cannot be read, or
 * the CIDs differ.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t cur_pcid;

    if (!s->cid_checked && bs->backing) {
        BlockDriverState *p_bs = bs->backing->bs;

        if (strcmp(p_bs->drv->format_name, "vmdk")) {
            /* Backing file is not in vmdk format, so it does not have
             * a CID, which makes the overlay's parent CID invalid */
            return 0;
        }

        if (vmdk_read_cid(p_bs, 0, &cur_pcid) != 0) {
            /* read failure: report as not valid */
            return 0;
        }
        if (s->parent_cid != cur_pcid) {
            /* CID not valid */
            return 0;
        }
    }
    s->cid_checked = true;
    /* CID valid */
    return 1;
}

K
Kevin Wolf 已提交
401
/* We have nothing to do for VMDK reopen, stubs just return success */
J
Jeff Cody 已提交
402 403 404 405 406
static int vmdk_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    assert(state != NULL);
    assert(state->bs != NULL);
K
Kevin Wolf 已提交
407
    return 0;
J
Jeff Cody 已提交
408 409
}

410
/*
 * Look for a parentFileNameHint="..." entry in the descriptor of @bs. If
 * one exists, the quoted path is stored in bs->auto_backing_file and
 * bs->backing_file, and bs->backing_format is set to "vmdk".
 *
 * Returns 0 on success, also when the descriptor has no such entry.
 * Returns a negative errno if the descriptor cannot be read, or -EINVAL if
 * the entry is truncated, lacks the closing quote, or names a path that
 * does not fit into bs->auto_backing_file.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    static const char hint_key[] = "parentFileNameHint";
    char *p_name;
    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    /* the extra zeroed byte keeps the text terminated after a full read */
    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
    ret = 0;

    p_name = strstr(desc, hint_key);
    if (p_name != NULL) {
        char *end_name;

        /*
         * Step over the key, then over the '=' and the opening quote. Both
         * characters must be present, otherwise the pointer would be moved
         * past the terminator and strchr() would scan foreign memory.
         */
        p_name += strlen(hint_key);
        if (p_name[0] == '\0' || p_name[1] == '\0') {
            ret = -EINVAL;
            goto out;
        }
        p_name += 2;

        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            ret = -EINVAL;
            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->auto_backing_file) - 1) {
            ret = -EINVAL;
            goto out;
        }

        /* size is path length + 1, so pstrcpy() stops before the quote */
        pstrcpy(bs->auto_backing_file, end_name - p_name + 1, p_name);
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
                "vmdk");
    }

out:
    g_free(desc);
    return ret;
}

F
Fam Zheng 已提交
451 452
/* Create and append extent to the extent array. Return the added VmdkExtent
 * address. return NULL if allocation failed. */
453
static int vmdk_add_extent(BlockDriverState *bs,
454
                           BdrvChild *file, bool flat, int64_t sectors,
F
Fam Zheng 已提交
455 456
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
457
                           int l2_size, uint64_t cluster_sectors,
F
Fam Zheng 已提交
458 459
                           VmdkExtent **new_extent,
                           Error **errp)
F
Fam Zheng 已提交
460 461 462
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
463
    int64_t nb_sectors;
F
Fam Zheng 已提交
464

465 466
    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
F
Fam Zheng 已提交
467 468
        error_setg(errp, "Invalid granularity, image may be corrupt");
        return -EFBIG;
469
    }
470
    if (l1_size > 32 * 1024 * 1024) {
471 472
        /*
         * Although with big capacity and small l1_entry_sectors, we can get a
473
         * big l1_size, we don't want unbounded value to allocate the table.
474 475 476 477 478 479
         * Limit it to 32M, which is enough to store:
         *     8TB  - for both VMDK3 & VMDK4 with
         *            minimal cluster size: 512B
         *            minimal L2 table size: 512 entries
         *            8 TB is still more than the maximal value supported for
         *            VMDK3 & VMDK4 which is 2TB.
480 481 482 483 484
         *     64TB - for "ESXi seSparse Extent"
         *            minimal cluster size: 512B (default is 4KB)
         *            L2 table size: 4096 entries (const).
         *            64TB is more than the maximal value supported for
         *            seSparse VMDKs (which is slightly less than 64TB)
485
         */
F
Fam Zheng 已提交
486
        error_setg(errp, "L1 size too big");
487 488
        return -EFBIG;
    }
489

490
    nb_sectors = bdrv_nb_sectors(file->bs);
491 492
    if (nb_sectors < 0) {
        return nb_sectors;
F
Fam Zheng 已提交
493 494
    }

495
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
F
Fam Zheng 已提交
496 497 498 499 500 501 502 503 504 505 506 507
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(VmdkExtent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
508
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
509
    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
510
    extent->entry_size = sizeof(uint32_t);
F
Fam Zheng 已提交
511 512 513 514 515 516 517

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
        extent->end_sector = extent->sectors;
    }
    bs->total_sectors = extent->end_sector;
518 519 520 521
    if (new_extent) {
        *new_extent = extent;
    }
    return 0;
F
Fam Zheng 已提交
522 523
}

F
Fam Zheng 已提交
524 525
/*
 * Load the L1 table of @extent (and its backup copy, if the extent has
 * one) from the extent file, convert the entries to host byte order and
 * allocate the L2 cache.
 *
 * Returns 0 on success. Returns -ENOMEM if a table cannot be allocated, or
 * the negative errno of a failed read, in which case @errp is set. On
 * failure the tables allocated here are freed again and the pointers in
 * @extent are reset to NULL.
 */
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                            Error **errp)
{
    int ret;
    size_t l1_size;
    int i;

    /* read the L1 table */
    l1_size = (size_t)extent->l1_size * extent->entry_size;
    extent->l1_table = g_try_malloc(l1_size);
    if (l1_size && extent->l1_table == NULL) {
        return -ENOMEM;
    }

    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
                     l1_size);
    if (ret < 0) {
        bdrv_refresh_filename(extent->file->bs);
        error_setg_errno(errp, -ret,
                         "Could not read l1 table from extent '%s'",
                         extent->file->bs->filename);
        goto fail_l1;
    }
    for (i = 0; i < extent->l1_size; i++) {
        if (extent->entry_size == sizeof(uint64_t)) {
            le64_to_cpus((uint64_t *)extent->l1_table + i);
        } else {
            assert(extent->entry_size == sizeof(uint32_t));
            le32_to_cpus((uint32_t *)extent->l1_table + i);
        }
    }

    if (extent->l1_backup_table_offset) {
        /* the backup table is handled as 32-bit entries only */
        assert(!extent->sesparse);
        extent->l1_backup_table = g_try_malloc(l1_size);
        if (l1_size && extent->l1_backup_table == NULL) {
            ret = -ENOMEM;
            goto fail_l1;
        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
                         l1_size);
        if (ret < 0) {
            bdrv_refresh_filename(extent->file->bs);
            error_setg_errno(errp, -ret,
                             "Could not read l1 backup table from extent '%s'",
                             extent->file->bs->filename);
            goto fail_l1b;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        g_malloc(extent->entry_size * extent->l2_size * L2_CACHE_SIZE);
    return 0;
 fail_l1b:
    g_free(extent->l1_backup_table);
    /* the extent must not keep pointers to freed tables */
    extent->l1_backup_table = NULL;
 fail_l1:
    g_free(extent->l1_table);
    extent->l1_table = NULL;
    return ret;
}

F
Fam Zheng 已提交
591
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
592
                                 BdrvChild *file,
F
Fam Zheng 已提交
593
                                 int flags, Error **errp)
594 595 596 597 598 599
{
    int ret;
    uint32_t magic;
    VMDK3Header header;
    VmdkExtent *extent;

600
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
601
    if (ret < 0) {
602
        bdrv_refresh_filename(file->bs);
F
Fam Zheng 已提交
603 604
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
605
                         file->bs->filename);
606
        return ret;
607
    }
608 609
    ret = vmdk_add_extent(bs, file, false,
                          le32_to_cpu(header.disk_sectors),
610
                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
611 612 613 614
                          0,
                          le32_to_cpu(header.l1dir_size),
                          4096,
                          le32_to_cpu(header.granularity),
F
Fam Zheng 已提交
615 616
                          &extent,
                          errp);
617 618 619
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
620
    ret = vmdk_init_tables(bs, extent, errp);
621
    if (ret) {
622 623
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
624 625 626 627
    }
    return ret;
}

628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
/*
 * Expected values of the "magic" field of the seSparse constant and
 * volatile headers, compared after the field has been converted from
 * little-endian.
 */
#define SESPARSE_CONST_HEADER_MAGIC UINT64_C(0x00000000cafebabe)
#define SESPARSE_VOLATILE_HEADER_MAGIC UINT64_C(0x00000000cafecafe)

/*
 * Convert the seSparse constant header in @header from little-endian to
 * host byte order, in place, and validate it.
 *
 * Strict checks - format not officially documented
 *
 * Returns 0 if the header is acceptable. Otherwise @errp is set and
 * -EINVAL (bad magic) or -ENOTSUP (unsupported version, grain size, grain
 * table size, flags, reserved fields or non-zero padding) is returned.
 */
static int check_se_sparse_const_header(VMDKSESparseConstHeader *header,
                                        Error **errp)
{
    header->magic = le64_to_cpu(header->magic);
    header->version = le64_to_cpu(header->version);
    /*
     * capacity is consumed by vmdk_open_se_sparse() and therefore has to be
     * in host byte order like every other field.
     */
    header->capacity = le64_to_cpu(header->capacity);
    header->grain_size = le64_to_cpu(header->grain_size);
    header->grain_table_size = le64_to_cpu(header->grain_table_size);
    header->flags = le64_to_cpu(header->flags);
    header->reserved1 = le64_to_cpu(header->reserved1);
    header->reserved2 = le64_to_cpu(header->reserved2);
    header->reserved3 = le64_to_cpu(header->reserved3);
    header->reserved4 = le64_to_cpu(header->reserved4);

    header->volatile_header_offset =
        le64_to_cpu(header->volatile_header_offset);
    header->volatile_header_size = le64_to_cpu(header->volatile_header_size);

    header->journal_header_offset = le64_to_cpu(header->journal_header_offset);
    header->journal_header_size = le64_to_cpu(header->journal_header_size);

    header->journal_offset = le64_to_cpu(header->journal_offset);
    header->journal_size = le64_to_cpu(header->journal_size);

    header->grain_dir_offset = le64_to_cpu(header->grain_dir_offset);
    header->grain_dir_size = le64_to_cpu(header->grain_dir_size);

    header->grain_tables_offset = le64_to_cpu(header->grain_tables_offset);
    header->grain_tables_size = le64_to_cpu(header->grain_tables_size);

    header->free_bitmap_offset = le64_to_cpu(header->free_bitmap_offset);
    header->free_bitmap_size = le64_to_cpu(header->free_bitmap_size);

    header->backmap_offset = le64_to_cpu(header->backmap_offset);
    header->backmap_size = le64_to_cpu(header->backmap_size);

    header->grains_offset = le64_to_cpu(header->grains_offset);
    header->grains_size = le64_to_cpu(header->grains_size);

    if (header->magic != SESPARSE_CONST_HEADER_MAGIC) {
        error_setg(errp, "Bad const header magic: 0x%016" PRIx64,
                   header->magic);
        return -EINVAL;
    }

    if (header->version != 0x0000000200000001) {
        error_setg(errp, "Unsupported version: 0x%016" PRIx64,
                   header->version);
        return -ENOTSUP;
    }

    if (header->grain_size != 8) {
        error_setg(errp, "Unsupported grain size: %" PRIu64,
                   header->grain_size);
        return -ENOTSUP;
    }

    if (header->grain_table_size != 64) {
        error_setg(errp, "Unsupported grain table size: %" PRIu64,
                   header->grain_table_size);
        return -ENOTSUP;
    }

    if (header->flags != 0) {
        error_setg(errp, "Unsupported flags: 0x%016" PRIx64,
                   header->flags);
        return -ENOTSUP;
    }

    if (header->reserved1 != 0 || header->reserved2 != 0 ||
        header->reserved3 != 0 || header->reserved4 != 0) {
        error_setg(errp, "Unsupported reserved bits:"
                   " 0x%016" PRIx64 " 0x%016" PRIx64
                   " 0x%016" PRIx64 " 0x%016" PRIx64,
                   header->reserved1, header->reserved2,
                   header->reserved3, header->reserved4);
        return -ENOTSUP;
    }

    /* check that padding is 0 */
    if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
        error_setg(errp, "Unsupported non-zero const header padding");
        return -ENOTSUP;
    }

    return 0;
}

/*
 * Convert the seSparse volatile header in @header from little-endian to
 * host byte order, in place, and validate it.
 *
 * Returns 0 if the header is acceptable. Otherwise @errp is set and
 * -EINVAL (bad magic) or -ENOTSUP (journal needs replaying, or the padding
 * is not zero) is returned. The checks run in that order and the first
 * failing one decides the result.
 */
static int check_se_sparse_volatile_header(VMDKSESparseVolatileHeader *header,
                                           Error **errp)
{
    int status = 0;

    header->magic = le64_to_cpu(header->magic);
    header->free_gt_number = le64_to_cpu(header->free_gt_number);
    header->next_txn_seq_number = le64_to_cpu(header->next_txn_seq_number);
    header->replay_journal = le64_to_cpu(header->replay_journal);

    if (header->magic != SESPARSE_VOLATILE_HEADER_MAGIC) {
        error_setg(errp, "Bad volatile header magic: 0x%016" PRIx64,
                   header->magic);
        status = -EINVAL;
    } else if (header->replay_journal) {
        error_setg(errp, "Image is dirty, Replaying journal not supported");
        status = -ENOTSUP;
    } else if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
        /* the padding has to be all zero */
        error_setg(errp, "Unsupported non-zero volatile header padding");
        status = -ENOTSUP;
    }

    return status;
}

/*
 * Open a seSparse extent stored in @file and add it to @bs.
 *
 * The node is switched to read-only if possible, because writing seSparse
 * images is not supported. The constant header (sector 0) and the volatile
 * header are read and validated, the extent is registered with 64-bit
 * table entries, and its L1 table is loaded. @flags is not used.
 *
 * Returns 0 on success or a negative errno with @errp set by the failing
 * step. If loading the tables fails, the extent added here is removed from
 * the extent array again.
 */
static int vmdk_open_se_sparse(BlockDriverState *bs,
                               BdrvChild *file,
                               int flags, Error **errp)
{
    int ret;
    VMDKSESparseConstHeader const_header;
    VMDKSESparseVolatileHeader volatile_header;
    VmdkExtent *extent;
    /* table entries are 64 bit wide; this many of them fit in one sector */
    const uint64_t entries_per_sector = SECTOR_SIZE / sizeof(uint64_t);

    ret = bdrv_apply_auto_read_only(bs,
            "No write support for seSparse images available", errp);
    if (ret < 0) {
        return ret;
    }

    assert(sizeof(const_header) == SECTOR_SIZE);

    ret = bdrv_pread(file, 0, &const_header, sizeof(const_header));
    if (ret < 0) {
        bdrv_refresh_filename(file->bs);
        error_setg_errno(errp, -ret,
                         "Could not read const header from file '%s'",
                         file->bs->filename);
        return ret;
    }

    /* check const header */
    ret = check_se_sparse_const_header(&const_header, errp);
    if (ret < 0) {
        return ret;
    }

    assert(sizeof(volatile_header) == SECTOR_SIZE);

    ret = bdrv_pread(file,
                     const_header.volatile_header_offset * SECTOR_SIZE,
                     &volatile_header, sizeof(volatile_header));
    if (ret < 0) {
        bdrv_refresh_filename(file->bs);
        error_setg_errno(errp, -ret,
                         "Could not read volatile header from file '%s'",
                         file->bs->filename);
        return ret;
    }

    /* check volatile header */
    ret = check_se_sparse_volatile_header(&volatile_header, errp);
    if (ret < 0) {
        return ret;
    }

    /*
     * vmdk_add_extent() takes the number of L1 entries as uint32_t. Reject
     * directory sizes that do not fit instead of letting the value be
     * truncated, which would slip past the size limit enforced there.
     */
    if (const_header.grain_dir_size > UINT32_MAX / entries_per_sector) {
        error_setg(errp, "L1 size too big");
        return -EFBIG;
    }

    ret = vmdk_add_extent(bs, file, false,
                          const_header.capacity,
                          const_header.grain_dir_offset * SECTOR_SIZE,
                          0,
                          const_header.grain_dir_size * entries_per_sector,
                          const_header.grain_table_size * entries_per_sector,
                          const_header.grain_size,
                          &extent,
                          errp);
    if (ret < 0) {
        return ret;
    }

    extent->sesparse = true;
    extent->sesparse_l2_tables_offset = const_header.grain_tables_offset;
    extent->sesparse_clusters_offset = const_header.grains_offset;
    /* must be set before vmdk_init_tables() sizes and converts the L1 */
    extent->entry_size = sizeof(uint64_t);

    ret = vmdk_init_tables(bs, extent, errp);
    if (ret) {
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
    }

    return ret;
}

827
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
828
                               QDict *options, Error **errp);
F
Fam Zheng 已提交
829

830
/*
 * Read the descriptor text of @file, starting at byte @desc_offset.
 *
 * The number of bytes read is the length of the file, capped at 1 MiB - 1.
 *
 * Returns a newly allocated, NUL-terminated buffer that the caller must
 * release with g_free(), or NULL with @errp set if the file length cannot
 * be determined, the file is shorter than four bytes, or the read fails.
 */
static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
{
    int64_t size;
    char *buf;
    int ret;

    size = bdrv_getlength(file->bs);
    if (size < 0) {
        error_setg_errno(errp, -size, "Could not access file");
        return NULL;
    }

    if (size < 4) {
        /* Both descriptor file and sparse image must be much larger than 4
         * bytes, also callers of vmdk_read_desc want to compare the first 4
         * bytes with VMDK4_MAGIC, let's error out if less is read. */
        error_setg(errp, "File is too small, not a valid image");
        return NULL;
    }

    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
    /* one extra byte for the terminator written below */
    buf = g_malloc(size + 1);

    ret = bdrv_pread(file, desc_offset, buf, size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read from file");
        g_free(buf);
        return NULL;
    }
    buf[ret] = 0;

    return buf;
}

864
static int vmdk_open_vmdk4(BlockDriverState *bs,
865
                           BdrvChild *file,
K
Kevin Wolf 已提交
866
                           int flags, QDict *options, Error **errp)
867 868 869 870 871 872
{
    int ret;
    uint32_t magic;
    uint32_t l1_size, l1_entry_sectors;
    VMDK4Header header;
    VmdkExtent *extent;
F
Fam Zheng 已提交
873
    BDRVVmdkState *s = bs->opaque;
874
    int64_t l1_backup_offset = 0;
875
    bool compressed;
876

877
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
878
    if (ret < 0) {
879
        bdrv_refresh_filename(file->bs);
F
Fam Zheng 已提交
880 881
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
882
                         file->bs->filename);
P
Paolo Bonzini 已提交
883
        return -EINVAL;
884
    }
885
    if (header.capacity == 0) {
886
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
887
        if (desc_offset) {
888
            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
889 890 891
            if (!buf) {
                return -EINVAL;
            }
K
Kevin Wolf 已提交
892
            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
893 894
            g_free(buf);
            return ret;
895
        }
F
Fam Zheng 已提交
896
    }
897

F
Fam Zheng 已提交
898 899 900 901
    if (!s->create_type) {
        s->create_type = g_strdup("monolithicSparse");
    }

902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
        /*
         * The footer takes precedence over the header, so read it in. The
         * footer starts at offset -1024 from the end: One sector for the
         * footer, and another one for the end-of-stream marker.
         */
        struct {
            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED footer_marker;

            uint32_t magic;
            VMDK4Header header;
            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];

            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

928
        ret = bdrv_pread(file,
K
Kevin Wolf 已提交
929
            bs->file->bs->total_sectors * 512 - 1536,
930 931
            &footer, sizeof(footer));
        if (ret < 0) {
932
            error_setg_errno(errp, -ret, "Failed to read footer");
933 934 935 936 937 938 939 940 941 942 943
            return ret;
        }

        /* Some sanity checks for the footer */
        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
            le32_to_cpu(footer.footer_marker.size) != 0  ||
            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
            le64_to_cpu(footer.eos_marker.val) != 0  ||
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
944
            error_setg(errp, "Invalid footer");
945 946 947 948 949 950
            return -EINVAL;
        }

        header = footer.header;
    }

951 952
    compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
953
    if (le32_to_cpu(header.version) > 3) {
954 955
        error_setg(errp, "Unsupported VMDK version %" PRIu32,
                   le32_to_cpu(header.version));
956
        return -ENOTSUP;
957 958
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
               !compressed) {
959 960 961 962 963 964
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
         * information. So we are safe to enable read only. */
        error_setg(errp, "VMDK version 3 must be read only");
        return -EINVAL;
965 966
    }

967
    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
P
Paolo Bonzini 已提交
968
        error_setg(errp, "L2 table size too big");
969 970 971
        return -EINVAL;
    }

972
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
973
                        * le64_to_cpu(header.granularity);
974
    if (l1_entry_sectors == 0) {
975
        error_setg(errp, "L1 entry size is invalid");
976 977
        return -EINVAL;
    }
978 979
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                / l1_entry_sectors;
980 981 982
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
983
    if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
984 985 986
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
987 988 989
        return -EINVAL;
    }

990
    ret = vmdk_add_extent(bs, file, false,
991 992
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
993
                          l1_backup_offset,
994
                          l1_size,
995
                          le32_to_cpu(header.num_gtes_per_gt),
996
                          le64_to_cpu(header.granularity),
F
Fam Zheng 已提交
997 998
                          &extent,
                          errp);
999 1000 1001
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
1002 1003
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
1004 1005 1006 1007
    if (extent->compressed) {
        g_free(s->create_type);
        s->create_type = g_strdup("streamOptimized");
    }
F
Fam Zheng 已提交
1008
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
1009 1010
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
F
Fam Zheng 已提交
1011
    ret = vmdk_init_tables(bs, extent, errp);
1012
    if (ret) {
1013 1014
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
1015 1016 1017 1018
    }
    return ret;
}

1019 1020 1021 1022 1023 1024 1025 1026 1027
/* find an option value out of descriptor file */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    char *opt_pos, *opt_end;
    const char *end = desc + strlen(desc);

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
F
Fam Zheng 已提交
1028
        return VMDK_ERROR;
1029 1030 1031 1032
    }
    /* Skip "=\"" following opt_name */
    opt_pos += strlen(opt_name) + 2;
    if (opt_pos >= end) {
F
Fam Zheng 已提交
1033
        return VMDK_ERROR;
1034 1035 1036 1037 1038 1039
    }
    opt_end = opt_pos;
    while (opt_end < end && *opt_end != '"') {
        opt_end++;
    }
    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
F
Fam Zheng 已提交
1040
        return VMDK_ERROR;
1041 1042
    }
    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
F
Fam Zheng 已提交
1043
    return VMDK_OK;
1044 1045
}

1046
/* Open an extent file and append to bs array */
1047
static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
K
Kevin Wolf 已提交
1048
                            char *buf, QDict *options, Error **errp)
1049 1050 1051
{
    uint32_t magic;

1052
    magic = ldl_be_p(buf);
1053 1054
    switch (magic) {
        case VMDK3_MAGIC:
F
Fam Zheng 已提交
1055
            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
1056 1057
            break;
        case VMDK4_MAGIC:
K
Kevin Wolf 已提交
1058
            return vmdk_open_vmdk4(bs, file, flags, options, errp);
1059 1060
            break;
        default:
P
Paolo Bonzini 已提交
1061 1062
            error_setg(errp, "Image not in VMDK format");
            return -EINVAL;
1063 1064 1065 1066
            break;
    }
}

1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
/*
 * Return a pointer to the character following the first '\n' in @s, or to
 * the terminating NUL of @s when it contains no newline.
 */
static const char *next_line(const char *s)
{
    const char *newline = strchr(s, '\n');

    if (newline != NULL) {
        return newline + 1;
    }
    return s + strlen(s);
}

1078
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
1079
                              QDict *options, Error **errp)
1080 1081
{
    int ret;
1082
    int matches;
1083 1084 1085
    char access[11];
    char type[11];
    char fname[512];
1086
    const char *p, *np;
1087 1088
    int64_t sectors = 0;
    int64_t flat_offset;
1089
    char *desc_file_dir = NULL;
1090
    char *extent_path;
1091
    BdrvChild *extent_file;
F
Fam Zheng 已提交
1092 1093
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;
K
Kevin Wolf 已提交
1094
    char extent_opt_prefix[32];
1095
    Error *local_err = NULL;
1096

1097
    for (p = desc; *p; p = next_line(p)) {
1098 1099
        /* parse extent line in one of below formats:
         *
1100 1101
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
         * RW [size in sectors] SPARSE "file-name.vmdk"
1102 1103
         * RW [size in sectors] VMFS "file-name.vmdk"
         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
1104
         * RW [size in sectors] SESPARSE "file-name.vmdk"
1105 1106
         */
        flat_offset = -1;
1107 1108 1109
        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                         access, &sectors, type, fname, &flat_offset);
        if (matches < 4 || strcmp(access, "RW")) {
1110
            continue;
1111
        } else if (!strcmp(type, "FLAT")) {
1112
            if (matches != 5 || flat_offset < 0) {
1113
                goto invalid;
1114
            }
F
Fam Zheng 已提交
1115
        } else if (!strcmp(type, "VMFS")) {
1116
            if (matches == 4) {
1117 1118
                flat_offset = 0;
            } else {
1119
                goto invalid;
1120
            }
1121
        } else if (matches != 4) {
1122
            goto invalid;
1123 1124 1125
        }

        if (sectors <= 0 ||
F
Fam Zheng 已提交
1126
            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
1127 1128
             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE") &&
             strcmp(type, "SESPARSE")) ||
1129
            (strcmp(access, "RW"))) {
1130
            continue;
1131 1132
        }

1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
        if (path_is_absolute(fname)) {
            extent_path = g_strdup(fname);
        } else {
            if (!desc_file_dir) {
                desc_file_dir = bdrv_dirname(bs->file->bs, errp);
                if (!desc_file_dir) {
                    bdrv_refresh_filename(bs->file->bs);
                    error_prepend(errp, "Cannot use relative paths with VMDK "
                                  "descriptor file '%s': ",
                                  bs->file->bs->filename);
                    ret = -EINVAL;
                    goto out;
                }
            }
1147

1148 1149
            extent_path = g_strconcat(desc_file_dir, fname, NULL);
        }
K
Kevin Wolf 已提交
1150 1151 1152 1153

        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
        assert(ret < 32);

1154
        extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
1155
                                      bs, &child_file, 0, false, &local_err);
1156
        g_free(extent_path);
1157 1158
        if (local_err) {
            error_propagate(errp, local_err);
1159 1160
            ret = -EINVAL;
            goto out;
1161 1162
        }

1163
        /* save to extents array */
P
Paolo Bonzini 已提交
1164
        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
1165 1166
            /* FLAT extent */

1167
            ret = vmdk_add_extent(bs, extent_file, true, sectors,
F
Fam Zheng 已提交
1168
                            0, 0, 0, 0, 0, &extent, errp);
1169
            if (ret < 0) {
1170
                bdrv_unref_child(bs, extent_file);
1171
                goto out;
1172
            }
F
Fam Zheng 已提交
1173
            extent->flat_start_offset = flat_offset << 9;
F
Fam Zheng 已提交
1174 1175
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
1176
            char *buf = vmdk_read_desc(extent_file, 0, errp);
1177 1178 1179
            if (!buf) {
                ret = -EINVAL;
            } else {
K
Kevin Wolf 已提交
1180 1181
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
                                       options, errp);
1182
            }
1183
            g_free(buf);
1184
            if (ret) {
1185
                bdrv_unref_child(bs, extent_file);
1186
                goto out;
1187
            }
F
Fam Zheng 已提交
1188
            extent = &s->extents[s->num_extents - 1];
1189 1190 1191 1192
        } else if (!strcmp(type, "SESPARSE")) {
            ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
            if (ret) {
                bdrv_unref_child(bs, extent_file);
1193
                goto out;
1194 1195
            }
            extent = &s->extents[s->num_extents - 1];
1196
        } else {
F
Fam Zheng 已提交
1197
            error_setg(errp, "Unsupported extent type '%s'", type);
1198
            bdrv_unref_child(bs, extent_file);
1199 1200
            ret = -ENOTSUP;
            goto out;
1201
        }
F
Fam Zheng 已提交
1202
        extent->type = g_strdup(type);
1203
    }
1204 1205 1206

    ret = 0;
    goto out;
1207 1208 1209 1210 1211 1212 1213 1214

invalid:
    np = next_line(p);
    assert(np != p);
    if (np[-1] == '\n') {
        np--;
    }
    error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
1215 1216 1217 1218 1219
    ret = -EINVAL;

out:
    g_free(desc_file_dir);
    return ret;
1220 1221
}

1222
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
1223
                               QDict *options, Error **errp)
1224 1225 1226 1227 1228 1229
{
    int ret;
    char ct[128];
    BDRVVmdkState *s = bs->opaque;

    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
P
Paolo Bonzini 已提交
1230 1231
        error_setg(errp, "invalid VMDK image descriptor");
        ret = -EINVAL;
1232
        goto exit;
1233
    }
F
Fam Zheng 已提交
1234
    if (strcmp(ct, "monolithicFlat") &&
P
Paolo Bonzini 已提交
1235
        strcmp(ct, "vmfs") &&
F
Fam Zheng 已提交
1236
        strcmp(ct, "vmfsSparse") &&
1237
        strcmp(ct, "seSparse") &&
1238
        strcmp(ct, "twoGbMaxExtentSparse") &&
F
Fam Zheng 已提交
1239
        strcmp(ct, "twoGbMaxExtentFlat")) {
F
Fam Zheng 已提交
1240
        error_setg(errp, "Unsupported image type '%s'", ct);
1241 1242
        ret = -ENOTSUP;
        goto exit;
1243
    }
F
Fam Zheng 已提交
1244
    s->create_type = g_strdup(ct);
1245
    s->desc_offset = 0;
1246
    ret = vmdk_parse_extents(buf, bs, options, errp);
1247 1248
exit:
    return ret;
1249 1250
}

M
Max Reitz 已提交
1251 1252
/*
 * Open a VMDK image on @bs.
 *
 * Opens the "file" child, reads the start of it and dispatches on the first
 * 4 bytes: a VMDK3/VMDK4 magic is opened as a sparse image, anything else is
 * treated as a text descriptor. It then opens the parent image if there is
 * one, reads the CID and parent CID, initialises the state lock and
 * registers a migration blocker.
 *
 * Returns 0 on success, a negative errno value on failure. On failure after
 * the descriptor was read, the create type and all extents are released.
 */
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    char *buf;
    int ret;
    BDRVVmdkState *s = bs->opaque;
    uint32_t magic;
    Error *local_err = NULL;

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, 0,
                               false, errp);
    if (!bs->file) {
        return -EINVAL;
    }

    buf = vmdk_read_desc(bs->file, 0, errp);
    if (!buf) {
        return -EINVAL;
    }

    magic = ldl_be_p(buf);
    switch (magic) {
        case VMDK3_MAGIC:
        case VMDK4_MAGIC:
            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options,
                                   errp);
            s->desc_offset = 0x200;
            break;
        default:
            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
            break;
    }
    if (ret) {
        goto fail;
    }

    /* try to open parent images, if exist */
    ret = vmdk_parent_open(bs);
    if (ret) {
        goto fail;
    }
    ret = vmdk_read_cid(bs, 0, &s->cid);
    if (ret) {
        goto fail;
    }
    ret = vmdk_read_cid(bs, 1, &s->parent_cid);
    if (ret) {
        goto fail;
    }
    qemu_co_mutex_init(&s->lock);

    /* Disable migration when VMDK images are used */
    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    ret = migrate_add_blocker(s->migration_blocker, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        error_free(s->migration_blocker);
        /* Do not leave a dangling pointer behind in the driver state */
        s->migration_blocker = NULL;
        goto fail;
    }

    g_free(buf);
    return 0;

fail:
    g_free(buf);
    g_free(s->create_type);
    s->create_type = NULL;
    vmdk_free_extents(bs);
    return ret;
}

1324

1325
/*
 * Raise bs->bl.pwrite_zeroes_alignment to the largest cluster size, in
 * bytes, among the non-flat extents of @bs. Flat extents are ignored.
 * @errp is not used.
 */
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVVmdkState *s = bs->opaque;
    int i;

    for (i = 0; i < s->num_extents; i++) {
        if (!s->extents[i].flat) {
            bs->bl.pwrite_zeroes_alignment =
                MAX(bs->bl.pwrite_zeroes_alignment,
                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
        }
    }
}

F
Fam Zheng 已提交
1339 1340 1341 1342
/**
 * get_whole_cluster
 *
 * Copy backing file's cluster that covers @sector_num, otherwise write zero,
1343 1344 1345
 * to the cluster at @cluster_sector_num. If @zeroed is true, we're overwriting
 * a zeroed cluster in the current layer and must not copy data from the
 * backing file.
F
Fam Zheng 已提交
1346 1347 1348 1349 1350
 *
 * If @skip_start_sector < @skip_end_sector, the relative range
 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
 * it for call to write user data in the request.
 */
F
Fam Zheng 已提交
1351
static int get_whole_cluster(BlockDriverState *bs,
F
Fam Zheng 已提交
1352
                             VmdkExtent *extent,
1353 1354 1355
                             uint64_t cluster_offset,
                             uint64_t offset,
                             uint64_t skip_start_bytes,
1356 1357
                             uint64_t skip_end_bytes,
                             bool zeroed)
1358
{
1359
    int ret = VMDK_OK;
F
Fam Zheng 已提交
1360 1361
    int64_t cluster_bytes;
    uint8_t *whole_grain;
1362
    bool copy_from_backing;
F
Fam Zheng 已提交
1363 1364 1365

    /* For COW, align request sector_num to cluster start */
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
1366
    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
F
Fam Zheng 已提交
1367
    whole_grain = qemu_blockalign(bs, cluster_bytes);
1368
    copy_from_backing = bs->backing && !zeroed;
1369

1370
    if (!copy_from_backing) {
1371 1372
        memset(whole_grain, 0, skip_start_bytes);
        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1373 1374
    }

1375
    assert(skip_end_bytes <= cluster_bytes);
1376 1377
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
1378
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
F
Fam Zheng 已提交
1379 1380 1381
        ret = VMDK_ERROR;
        goto exit;
    }
1382

F
Fam Zheng 已提交
1383
    /* Read backing data before skip range */
1384
    if (skip_start_bytes > 0) {
1385
        if (copy_from_backing) {
M
Max Reitz 已提交
1386 1387
            /* qcow2 emits this on bs->file instead of bs->backing */
            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1388
            ret = bdrv_pread(bs->backing, offset, whole_grain,
1389
                             skip_start_bytes);
F
Fam Zheng 已提交
1390 1391 1392 1393 1394
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
M
Max Reitz 已提交
1395
        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1396
        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
1397
                          skip_start_bytes);
K
Kevin Wolf 已提交
1398
        if (ret < 0) {
1399 1400
            ret = VMDK_ERROR;
            goto exit;
K
Kevin Wolf 已提交
1401
        }
F
Fam Zheng 已提交
1402 1403
    }
    /* Read backing data after skip range */
1404
    if (skip_end_bytes < cluster_bytes) {
1405
        if (copy_from_backing) {
M
Max Reitz 已提交
1406 1407
            /* qcow2 emits this on bs->file instead of bs->backing */
            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1408
            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
1409 1410
                             whole_grain + skip_end_bytes,
                             cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1411 1412 1413 1414 1415
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
M
Max Reitz 已提交
1416
        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1417
        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
1418 1419
                          whole_grain + skip_end_bytes,
                          cluster_bytes - skip_end_bytes);
K
Kevin Wolf 已提交
1420
        if (ret < 0) {
1421 1422
            ret = VMDK_ERROR;
            goto exit;
1423 1424
        }
    }
F
Fam Zheng 已提交
1425

1426
    ret = VMDK_OK;
1427 1428 1429
exit:
    qemu_vfree(whole_grain);
    return ret;
1430 1431
}

F
Fam Zheng 已提交
1432 1433
/*
 * Store grain offset @offset in the L2 entry described by @m_data.
 *
 * The value is converted to little-endian and written to extent->file at
 * sector m_data->l2_offset, entry m_data->l2_index. If the extent has a
 * backup L1 table (l1_backup_table_offset != 0), the same entry is written
 * to the backup L2 table as well; in that case m_data->l2_offset is
 * overwritten with the backup table's location. The file is then flushed
 * and, if m_data->l2_cache_entry is set, the cached entry is updated with
 * the little-endian value.
 *
 * Returns VMDK_OK on success, VMDK_ERROR if a write or the flush fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    offset = cpu_to_le32(offset);
    /* update L2 table */
    BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE);
    if (bdrv_pwrite(extent->file,
                ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(offset)),
                &offset, sizeof(offset)) < 0) {
        return VMDK_ERROR;
    }
    /* update backup L2 table */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        if (bdrv_pwrite(extent->file,
                    ((int64_t)m_data->l2_offset * 512)
                        + (m_data->l2_index * sizeof(offset)),
                    &offset, sizeof(offset)) < 0) {
            return VMDK_ERROR;
        }
    }
    if (bdrv_flush(extent->file->bs) < 0) {
        return VMDK_ERROR;
    }
    if (m_data->l2_cache_entry) {
        *m_data->l2_cache_entry = offset;
    }

    return VMDK_OK;
}

F
Fam Zheng 已提交
1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483
/**
 * get_cluster_offset
 *
 * Look up cluster offset in extent file by sector number, and store in
 * @cluster_offset.
 *
 * For flat extents, the start offset as parsed from the description file is
 * returned.
 *
 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
 * offset for a new cluster and update L2 cache. If there is a backing file,
 * COW is done before returning; otherwise, zeroes are written to the allocated
 * cluster. Both COW and zero writing skips the sector range
 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
 * has new data to write there.
 *
 * Returns: VMDK_OK if cluster exists and mapped in the image.
 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
 *          VMDK_ERROR if failed.
 */
1484
static int get_cluster_offset(BlockDriverState *bs,
F
Fam Zheng 已提交
1485 1486 1487 1488 1489
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
1490 1491
                              uint64_t skip_start_bytes,
                              uint64_t skip_end_bytes)
B
bellard 已提交
1492 1493 1494
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
1495 1496
    uint32_t min_count;
    void *l2_table;
1497
    bool zeroed = false;
F
Fam Zheng 已提交
1498
    int64_t ret;
1499
    int64_t cluster_sector;
1500
    unsigned int l2_size_bytes = extent->l2_size * extent->entry_size;
1501

F
Fam Zheng 已提交
1502
    if (m_data) {
1503
        m_data->new_allocation = false;
F
Fam Zheng 已提交
1504
    }
1505
    if (extent->flat) {
1506
        *cluster_offset = extent->flat_start_offset;
F
Fam Zheng 已提交
1507
        return VMDK_OK;
1508
    }
1509

F
Fam Zheng 已提交
1510
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
F
Fam Zheng 已提交
1511 1512
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
F
Fam Zheng 已提交
1513
        return VMDK_ERROR;
F
Fam Zheng 已提交
1514
    }
1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544
    if (extent->sesparse) {
        uint64_t l2_offset_u64;

        assert(extent->entry_size == sizeof(uint64_t));

        l2_offset_u64 = ((uint64_t *)extent->l1_table)[l1_index];
        if (l2_offset_u64 == 0) {
            l2_offset = 0;
        } else if ((l2_offset_u64 & 0xffffffff00000000) != 0x1000000000000000) {
            /*
             * Top most nibble is 0x1 if grain table is allocated.
             * strict check - top most 4 bytes must be 0x10000000 since max
             * supported size is 64TB for disk - so no more than 64TB / 16MB
             * grain directories which is smaller than uint32,
             * where 16MB is the only supported default grain table coverage.
             */
            return VMDK_ERROR;
        } else {
            l2_offset_u64 = l2_offset_u64 & 0x00000000ffffffff;
            l2_offset_u64 = extent->sesparse_l2_tables_offset +
                l2_offset_u64 * l2_size_bytes / SECTOR_SIZE;
            if (l2_offset_u64 > 0x00000000ffffffff) {
                return VMDK_ERROR;
            }
            l2_offset = (unsigned int)(l2_offset_u64);
        }
    } else {
        assert(extent->entry_size == sizeof(uint32_t));
        l2_offset = ((uint32_t *)extent->l1_table)[l1_index];
    }
F
Fam Zheng 已提交
1545
    if (!l2_offset) {
F
Fam Zheng 已提交
1546
        return VMDK_UNALLOC;
F
Fam Zheng 已提交
1547
    }
1548
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1549
        if (l2_offset == extent->l2_cache_offsets[i]) {
B
bellard 已提交
1550
            /* increment the hit count */
F
Fam Zheng 已提交
1551
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1552
                for (j = 0; j < L2_CACHE_SIZE; j++) {
F
Fam Zheng 已提交
1553
                    extent->l2_cache_counts[j] >>= 1;
B
bellard 已提交
1554 1555
                }
            }
1556
            l2_table = (char *)extent->l2_cache + (i * l2_size_bytes);
B
bellard 已提交
1557 1558 1559 1560 1561 1562
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
1563
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1564 1565
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
B
bellard 已提交
1566 1567 1568
            min_index = i;
        }
    }
1569
    l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes);
M
Max Reitz 已提交
1570
    BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
1571
    if (bdrv_pread(extent->file,
F
Fam Zheng 已提交
1572 1573
                (int64_t)l2_offset * 512,
                l2_table,
1574 1575
                l2_size_bytes
            ) != l2_size_bytes) {
F
Fam Zheng 已提交
1576
        return VMDK_ERROR;
F
Fam Zheng 已提交
1577
    }
1578

F
Fam Zheng 已提交
1579 1580
    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
B
bellard 已提交
1581
 found:
F
Fam Zheng 已提交
1582
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
K
Kevin Wolf 已提交
1583 1584 1585 1586 1587 1588
    if (m_data) {
        m_data->l1_index = l1_index;
        m_data->l2_index = l2_index;
        m_data->l2_offset = l2_offset;
        m_data->l2_cache_entry = ((uint32_t *)l2_table) + l2_index;
    }
1589

1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620
    if (extent->sesparse) {
        cluster_sector = le64_to_cpu(((uint64_t *)l2_table)[l2_index]);
        switch (cluster_sector & 0xf000000000000000) {
        case 0x0000000000000000:
            /* unallocated grain */
            if (cluster_sector != 0) {
                return VMDK_ERROR;
            }
            break;
        case 0x1000000000000000:
            /* scsi-unmapped grain - fallthrough */
        case 0x2000000000000000:
            /* zero grain */
            zeroed = true;
            break;
        case 0x3000000000000000:
            /* allocated grain */
            cluster_sector = (((cluster_sector & 0x0fff000000000000) >> 48) |
                              ((cluster_sector & 0x0000ffffffffffff) << 12));
            cluster_sector = extent->sesparse_clusters_offset +
                cluster_sector * extent->cluster_sectors;
            break;
        default:
            return VMDK_ERROR;
        }
    } else {
        cluster_sector = le32_to_cpu(((uint32_t *)l2_table)[l2_index]);

        if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
            zeroed = true;
        }
1621 1622
    }

F
Fam Zheng 已提交
1623
    if (!cluster_sector || zeroed) {
1624
        if (!allocate) {
1625
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1626
        }
1627
        assert(!extent->sesparse);
1628

1629 1630 1631 1632
        if (extent->next_cluster_sector >= VMDK_EXTENT_MAX_SECTORS) {
            return VMDK_ERROR;
        }

F
Fam Zheng 已提交
1633 1634
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;
1635 1636 1637 1638 1639 1640

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
1641
        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
1642 1643
                                offset, skip_start_bytes, skip_end_bytes,
                                zeroed);
F
Fam Zheng 已提交
1644 1645
        if (ret) {
            return ret;
1646
        }
1647
        if (m_data) {
1648
            m_data->new_allocation = true;
1649
        }
1650
    }
F
Fam Zheng 已提交
1651
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
F
Fam Zheng 已提交
1652
    return VMDK_OK;
B
bellard 已提交
1653 1654
}

F
Fam Zheng 已提交
1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671
/*
 * Locate the extent that holds guest sector @sector_num.
 *
 * The scan starts at @start_hint, or at the first extent when @start_hint
 * is NULL, and walks forward through s->extents.  The first extent whose
 * end_sector is greater than @sector_num is returned; NULL is returned when
 * the scan runs off the end of the extent array.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *cur;

    for (cur = start_hint ? start_hint : &s->extents[0];
         cur < &s->extents[s->num_extents];
         cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}

1672 1673 1674
/*
 * Translate the guest byte @offset into a byte offset inside its cluster.
 *
 * The extent's first byte is derived from end_sector - sectors; @offset is
 * made relative to that and then reduced modulo the cluster size in bytes.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    const uint64_t bytes_per_cluster =
        extent->cluster_sectors * BDRV_SECTOR_SIZE;
    const uint64_t extent_start =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
    const uint64_t relative = offset - extent_start;

    return relative % bytes_per_cluster;
}

1684 1685 1686 1687 1688
/*
 * Report the allocation status of the guest range starting at @offset.
 *
 * The cluster containing @offset is looked up (without allocating) under
 * s->lock and its state is translated into BDRV_BLOCK_* flags:
 *   VMDK_ERROR   -> -EIO
 *   VMDK_UNALLOC -> 0
 *   VMDK_ZEROED  -> BDRV_BLOCK_ZERO
 *   VMDK_OK      -> BDRV_BLOCK_DATA; for uncompressed extents the host offset
 *                   is stored in *map (BDRV_BLOCK_OFFSET_VALID), plus
 *                   BDRV_BLOCK_RECURSE for flat extents; *file is set to the
 *                   extent's node.
 *
 * *pnum is the number of bytes, capped at @bytes, from @offset to the end of
 * the containing cluster.  @want_zero is not consulted.
 *
 * Returns -EIO when no extent covers @offset.
 */
static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = find_extent(s, offset >> BDRV_SECTOR_BITS, NULL);
    uint64_t cluster_offset;
    int64_t in_cluster, remaining, status;

    if (extent == NULL) {
        return -EIO;
    }

    qemu_co_mutex_lock(&s->lock);
    status = get_cluster_offset(bs, extent, NULL, offset, false,
                                &cluster_offset, 0, 0);
    qemu_co_mutex_unlock(&s->lock);

    in_cluster = vmdk_find_offset_in_cluster(extent, offset);

    if (status == VMDK_ERROR) {
        status = -EIO;
    } else if (status == VMDK_UNALLOC) {
        status = 0;
    } else if (status == VMDK_ZEROED) {
        status = BDRV_BLOCK_ZERO;
    } else if (status == VMDK_OK) {
        status = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            /* Only uncompressed data has a meaningful host offset. */
            status |= BDRV_BLOCK_OFFSET_VALID;
            *map = cluster_offset + in_cluster;
            if (extent->flat) {
                status |= BDRV_BLOCK_RECURSE;
            }
        }
        *file = extent->file->bs;
    }

    /* Never report past the end of the current cluster. */
    remaining = extent->cluster_sectors * BDRV_SECTOR_SIZE - in_cluster;
    *pnum = MIN(remaining, bytes);
    return status;
}

1733
/*
 * Write @n_bytes taken from @qiov (starting at @qiov_offset) into @extent at
 * host position @cluster_offset + @offset_in_cluster.
 *
 * Uncompressed extents: the requested slice of @qiov is written as is.
 *
 * Compressed extents: the request must start at the beginning of a cluster
 * and cover a whole cluster, or a shorter run that ends exactly at the end
 * of the extent, and the extent must use grain markers.  The payload is
 * deflated with zlib compress() and written behind a VmdkGrainMarker that
 * records the guest LBA (@offset in sectors) and the compressed size.
 *
 * After the write has been issued, extent->next_cluster_sector is moved to
 * the sector following the written data (unconditionally for compressed
 * extents, only forward for uncompressed ones), regardless of whether the
 * write succeeded.
 *
 * Returns 0 on success, -EINVAL for an unsupported compressed request or a
 * compression failure, or the negative error from bdrv_co_pwritev().
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            uint64_t qiov_offset, uint64_t n_bytes,
                            uint64_t offset)
{
    VmdkGrainMarker *grain = NULL;
    QEMUIOVector write_qiov;
    uLongf deflated_len;
    int64_t dst_offset;
    int64_t end_sector;
    int ret;

    if (!extent->compressed) {
        qemu_iovec_init(&write_qiov, qiov->niov);
        qemu_iovec_concat(&write_qiov, qiov, qiov_offset, n_bytes);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO);
    } else {
        void *plain;

        /* Only whole clusters, and only with grain markers */
        if (offset_in_cluster ||
            n_bytes > (extent->cluster_sectors * SECTOR_SIZE) ||
            (n_bytes < (extent->cluster_sectors * SECTOR_SIZE) &&
             offset + n_bytes != extent->end_sector * SECTOR_SIZE) ||
            !extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }

        /* Output buffer: room for twice the cluster size plus the marker */
        deflated_len = (extent->cluster_sectors << 9) * 2;
        grain = g_malloc(deflated_len + sizeof(VmdkGrainMarker));

        /* Linearize the guest data before handing it to zlib */
        plain = g_malloc(n_bytes);
        qemu_iovec_to_buf(qiov, qiov_offset, plain, n_bytes);
        ret = compress(grain->data, &deflated_len, plain, n_bytes);
        g_free(plain);

        if (ret != Z_OK || deflated_len == 0) {
            ret = -EINVAL;
            goto out;
        }

        grain->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
        grain->size = cpu_to_le32(deflated_len);

        /* From here on n_bytes is the on-disk size: marker + payload */
        n_bytes = deflated_len + sizeof(VmdkGrainMarker);
        qemu_iovec_init_buf(&write_qiov, grain, n_bytes);

        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
    }

    dst_offset = cluster_offset + offset_in_cluster;
    ret = bdrv_co_pwritev(extent->file, dst_offset, n_bytes,
                          &write_qiov, 0);

    end_sector = DIV_ROUND_UP(dst_offset + n_bytes, BDRV_SECTOR_SIZE);
    extent->next_cluster_sector = extent->compressed
        ? end_sector
        : MAX(extent->next_cluster_sector, end_sector);

    if (ret >= 0) {
        ret = 0;
    }
 out:
    g_free(grain);
    /* The compressed path uses a buf-backed qiov that needs no teardown */
    if (!extent->compressed) {
        qemu_iovec_destroy(&write_qiov);
    }
    return ret;
}

/*
 * Read @bytes from @extent into @qiov.
 *
 * Uncompressed extents are read directly from host position
 * @cluster_offset + @offset_in_cluster.
 *
 * Compressed extents: two clusters' worth of bytes are read from
 * @cluster_offset (grain marker plus compressed data may exceed one
 * cluster), the payload is inflated with zlib uncompress() and the requested
 * window [@offset_in_cluster, @offset_in_cluster + @bytes) of the inflated
 * data is copied into @qiov.
 *
 * Returns 0 on success, a negative error from the underlying read, or
 * -EINVAL when the on-disk grain is malformed (zero or oversized compressed
 * length, inflate failure, or the requested window lies outside the
 * inflated data).
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            int bytes)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    size_t payload_room;
    uLongf buf_len;

    if (!extent->compressed) {
        BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO);
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
        if (ret < 0) {
            return ret;
        }
        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_pread(extent->file,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }
    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        VmdkGrainMarker *marker = (VmdkGrainMarker *)cluster_buf;

        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }
    /*
     * The compressed payload starts after the marker header (if any), so
     * only the remainder of cluster_buf is available to it.  Bounding
     * data_len by the full buffer size would let uncompress() read past the
     * end of cluster_buf for a crafted marker->size.
     */
    payload_room = buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || data_len > payload_room) {
        ret = -EINVAL;
        goto out;
    }
    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;

    }
    /* buf_len now holds the inflated size; the window must lie inside it */
    if (offset_in_cluster < 0 ||
            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}

1881 1882 1883
/*
 * Read @bytes starting at guest @offset into @qiov.
 *
 * The request is processed one cluster-bounded chunk at a time while
 * holding s->lock.  For each chunk:
 *   - an allocated cluster (VMDK_OK) is read from the extent;
 *   - otherwise, if a backing file exists and the cluster is not marked
 *     zeroed, the chunk is read from the backing file after validating the
 *     parent CID;
 *   - otherwise the chunk is filled with zeroes.
 *
 * NOTE(review): any lookup result other than VMDK_OK, including VMDK_ERROR,
 * takes the backing/zero path here - confirm that this is intended.
 *
 * Returns 0 on success, -EIO when no extent covers the offset, -EINVAL when
 * the parent CID is not valid, or the error of the failing read.
 * @flags is not consulted.
 */
static int coroutine_fn
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
               QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    QEMUIOVector chunk_qiov;
    uint64_t cluster_offset;
    uint64_t chunk, in_cluster;
    uint64_t done = 0;
    int ret;

    qemu_iovec_init(&chunk_qiov, qiov->niov);
    qemu_co_mutex_lock(&s->lock);

    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (extent == NULL) {
            ret = -EIO;
            goto out;
        }

        ret = get_cluster_offset(bs, extent, NULL,
                                 offset, false, &cluster_offset, 0, 0);
        in_cluster = vmdk_find_offset_in_cluster(extent, offset);

        /* Never cross a cluster boundary within one iteration */
        chunk = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                           - in_cluster);

        if (ret == VMDK_OK) {
            qemu_iovec_reset(&chunk_qiov);
            qemu_iovec_concat(&chunk_qiov, qiov, done, chunk);

            ret = vmdk_read_extent(extent, cluster_offset, in_cluster,
                                   &chunk_qiov, chunk);
            if (ret) {
                goto out;
            }
        } else if (bs->backing && ret != VMDK_ZEROED) {
            /* Not allocated here: fall through to the parent image */
            if (!vmdk_is_cid_valid(bs)) {
                ret = -EINVAL;
                goto out;
            }

            qemu_iovec_reset(&chunk_qiov);
            qemu_iovec_concat(&chunk_qiov, qiov, done, chunk);

            /* qcow2 emits this on bs->file instead of bs->backing */
            BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
            ret = bdrv_co_preadv(bs->backing, offset, chunk,
                                 &chunk_qiov, 0);
            if (ret < 0) {
                goto out;
            }
        } else {
            qemu_iovec_memset(qiov, done, 0, chunk);
        }

        bytes -= chunk;
        offset += chunk;
        done += chunk;
    }

    ret = 0;
out:
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&chunk_qiov);

    return ret;
}

F
Fam Zheng 已提交
1953 1954 1955
/**
 * vmdk_write:
 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1956 1957 1958 1959
 *                if possible, otherwise return -ENOTSUP.
 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
 *                with each cluster. By dry run we can find if the zero write
 *                is possible without modifying image data.
F
Fam Zheng 已提交
1960 1961 1962
 *
 * Returns: error code with 0 for success.
 */
1963 1964 1965
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                       uint64_t bytes, QEMUIOVector *qiov,
                       bool zeroed, bool zero_dry_run)
B
bellard 已提交
1966
{
1967
    BDRVVmdkState *s = bs->opaque;
F
Fam Zheng 已提交
1968
    VmdkExtent *extent = NULL;
F
Fam Zheng 已提交
1969
    int ret;
1970
    int64_t offset_in_cluster, n_bytes;
1971
    uint64_t cluster_offset;
1972
    uint64_t bytes_done = 0;
F
Fam Zheng 已提交
1973
    VmdkMetaData m_data;
1974

1975 1976
    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
        error_report("Wrong offset: offset=0x%" PRIx64
1977
                     " total_sectors=0x%" PRIx64,
1978
                     offset, bs->total_sectors);
1979
        return -EIO;
1980 1981
    }

1982 1983
    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
F
Fam Zheng 已提交
1984 1985 1986
        if (!extent) {
            return -EIO;
        }
1987 1988 1989
        if (extent->sesparse) {
            return -ENOTSUP;
        }
1990 1991 1992 1993 1994
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1995
                                 !(extent->compressed || zeroed),
1996 1997
                                 &cluster_offset, offset_in_cluster,
                                 offset_in_cluster + n_bytes);
F
Fam Zheng 已提交
1998
        if (extent->compressed) {
F
Fam Zheng 已提交
1999
            if (ret == VMDK_OK) {
F
Fam Zheng 已提交
2000
                /* Refuse write to allocated cluster for streamOptimized */
F
Fam Zheng 已提交
2001 2002
                error_report("Could not write to allocated cluster"
                              " for streamOptimized");
F
Fam Zheng 已提交
2003
                return -EIO;
K
Kevin Wolf 已提交
2004
            } else if (!zeroed) {
F
Fam Zheng 已提交
2005
                /* allocate */
2006
                ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
2007
                                         true, &cluster_offset, 0, 0);
F
Fam Zheng 已提交
2008 2009
            }
        }
F
Fam Zheng 已提交
2010
        if (ret == VMDK_ERROR) {
2011
            return -EINVAL;
F
Fam Zheng 已提交
2012
        }
F
Fam Zheng 已提交
2013 2014 2015
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
2016 2017 2018
                    offset_in_cluster == 0 &&
                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
2019
                if (!zero_dry_run && ret != VMDK_ZEROED) {
F
Fam Zheng 已提交
2020
                    /* update L2 tables */
F
Fam Zheng 已提交
2021 2022
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
F
Fam Zheng 已提交
2023 2024 2025 2026 2027 2028 2029
                        return -EIO;
                    }
                }
            } else {
                return -ENOTSUP;
            }
        } else {
2030 2031
            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
                                    qiov, bytes_done, n_bytes, offset);
F
Fam Zheng 已提交
2032 2033 2034
            if (ret) {
                return ret;
            }
2035
            if (m_data.new_allocation) {
F
Fam Zheng 已提交
2036
                /* update L2 tables */
F
Fam Zheng 已提交
2037 2038 2039
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
F
Fam Zheng 已提交
2040 2041
                    return -EIO;
                }
F
Fam Zheng 已提交
2042
            }
2043
        }
2044 2045 2046
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
2047

F
Fam Zheng 已提交
2048 2049
        /* update CID on the first write every time the virtual disk is
         * opened */
2050
        if (!s->cid_updated) {
F
Fam Zheng 已提交
2051
            ret = vmdk_write_cid(bs, g_random_int());
K
Kevin Wolf 已提交
2052 2053 2054
            if (ret < 0) {
                return ret;
            }
2055
            s->cid_updated = true;
2056
        }
2057 2058
    }
    return 0;
B
bellard 已提交
2059 2060
}

2061 2062 2063
/*
 * Write entry point: performs a regular (non-zero, non-dry-run) write via
 * vmdk_pwritev() while holding s->lock.  @flags is not consulted.
 *
 * Returns the result of vmdk_pwritev().
 */
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *state = bs->opaque;
    int result;

    qemu_co_mutex_lock(&state->lock);
    result = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&state->lock);

    return result;
}

2073 2074 2075
/*
 * Compressed-write entry point.
 *
 * A request with @bytes == 0 is the caller's end-of-file signal: every
 * extent file is then resized so that its length is a multiple of the
 * sector size.  Any other request is forwarded to vmdk_co_pwritev().
 *
 * Returns 0 on success or a negative error from bdrv_getlength(),
 * bdrv_truncate() or vmdk_co_pwritev().
 */
static int coroutine_fn
vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                           uint64_t bytes, QEMUIOVector *qiov)
{
    BDRVVmdkState *s;
    int idx;

    if (bytes != 0) {
        return vmdk_co_pwritev(bs, offset, bytes, qiov, 0);
    }

    /* The caller will write bytes 0 to signal EOF.
     * When receive it, we align EOF to a sector boundary. */
    s = bs->opaque;
    for (idx = 0; idx < s->num_extents; idx++) {
        int64_t aligned;
        int rc;

        aligned = bdrv_getlength(s->extents[idx].file->bs);
        if (aligned < 0) {
            return aligned;
        }
        aligned = QEMU_ALIGN_UP(aligned, BDRV_SECTOR_SIZE);

        rc = bdrv_truncate(s->extents[idx].file, aligned, false,
                           PREALLOC_MODE_OFF, 0, NULL);
        if (rc < 0) {
            return rc;
        }
    }
    return 0;
}

2101 2102 2103 2104
/*
 * Zero-write entry point.
 *
 * Under s->lock the request is first run through vmdk_pwritev() as a dry
 * run; the real zero write is issued only when the dry run reports success.
 * @flags is not consulted.
 *
 * Returns the result of the dry run if it fails, otherwise the result of
 * the real zero write.
 */
static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int bytes,
                                              BdrvRequestFlags flags)
{
    BDRVVmdkState *state = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&state->lock);
    /* write zeroes could fail if sectors not aligned to cluster, test it with
     * dry_run == true before really updating image */
    rc = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (rc == 0) {
        rc = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }
    qemu_co_mutex_unlock(&state->lock);

    return rc;
}

F
Fam Zheng 已提交
2120 2121 2122 2123
/*
 * Initialise the extent file behind @blk.
 *
 * Flat extents are simply resized to @filesize.  For sparse extents a VMDK4
 * header is written (version 3 with compression, 2 with zeroed grains,
 * 1 otherwise), the file is resized up to the first grain, and the
 * redundant and the primary grain directories are written, each entry
 * pointing at its grain table.
 *
 * Returns 0 on success or a negative errno; on failure @errp is set.
 */
static int vmdk_init_extent(BlockBackend *blk,
                            int64_t filesize, bool flat,
                            bool compress, bool zeroed_grain,
                            Error **errp)
{
    int ret, i;
    VMDK4Header header;
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
    uint32_t *gd_buf = NULL;
    int gd_buf_size;

    if (flat) {
        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
        goto exit;
    }
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
    if (compress) {
        header.version = 3;
    } else if (zeroed_grain) {
        header.version = 2;
    } else {
        header.version = 1;
    }
    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
    header.capacity = filesize / BDRV_SECTOR_SIZE;
    header.granularity = 128;
    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;

    /* Metadata geometry, all in sectors except grains/gt_count */
    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                           BDRV_SECTOR_SIZE);
    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);

    /* Layout: header, descriptor, redundant GD + GTs, GD + GTs, grains */
    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
    header.grain_offset =
        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                 header.granularity);
    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);
    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;

    /* write all the data */
    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }
    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }

    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
                       PREALLOC_MODE_OFF, 0, errp);
    if (ret < 0) {
        goto exit;
    }

    /* write grain directory */
    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
    gd_buf = g_malloc0(gd_buf_size);
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        goto exit;
    }

    /* write backup grain directory */
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
         i < gt_count; i++, tmp += gt_size) {
        gd_buf[i] = cpu_to_le32(tmp);
    }
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
                     gd_buf, gd_buf_size, 0);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
        /* Propagate the failure instead of falling through to ret = 0 */
        goto exit;
    }

    ret = 0;
exit:
    g_free(gd_buf);
    return ret;
}

/*
 * Create the file @filename with @opts, open it read-write as a protocol
 * node and initialise it as a VMDK extent via vmdk_init_extent().
 *
 * On success the opened BlockBackend is stored in *@pbb when @pbb is
 * non-NULL (the caller then owns the reference); with a NULL @pbb the
 * backend is released before returning.
 *
 * On failure the backend is always released and *@pbb is left untouched, so
 * a caller never receives a backend for an extent that could not be
 * initialised.
 *
 * Returns 0 on success or a negative errno; on failure @errp is set.
 */
static int vmdk_create_extent(const char *filename, int64_t filesize,
                              bool flat, bool compress, bool zeroed_grain,
                              BlockBackend **pbb,
                              QemuOpts *opts, Error **errp)
{
    int ret;
    BlockBackend *blk = NULL;
    Error *local_err = NULL;

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
    }

    blk = blk_new_open(filename, NULL, NULL,
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                       &local_err);
    if (blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto exit;
    }

    blk_set_allow_write_beyond_eof(blk, true);

    ret = vmdk_init_extent(blk, filesize, flat, compress, zeroed_grain, errp);
exit:
    if (blk) {
        if (pbb && ret >= 0) {
            /* Success: hand the reference over to the caller */
            *pbb = blk;
        } else {
            /* Failure, or the caller does not want the backend */
            blk_unref(blk);
            blk = NULL;
        }
    }
    return ret;
}

/*
 * Split @filename into a directory part, a base name and an extension.
 *
 * @path receives everything up to and including the last '/', or failing
 * that the last '\\', or failing that the last ':'; it is empty when none of
 * these occurs.  The remainder is split at its last '.': @prefix receives
 * the part before it and @postfix the part from the '.' onwards (empty when
 * there is no '.').
 *
 * @buf_len is the size of each of the three output buffers.
 *
 * Returns VMDK_OK on success.  Returns VMDK_ERROR and sets @errp when
 * @filename is NULL or empty, or when the directory part or the base name
 * does not fit into @buf_len bytes.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        p++;
        if ((size_t)(p - filename) >= buf_len) {
            /* Every failure path must report through errp */
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        /* Copy the directory part including the trailing separator */
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if ((size_t)(q - p) >= buf_len) {
            error_setg(errp, "Filename is too long");
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}

2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345
/*
 * Callback used by vmdk_co_do_create() to obtain the BlockBackend for each
 * file of the image.
 *
 * idx == 0: get or create the descriptor file (also the image file if in a
 *           non-split format).
 * idx >= 1: get the n-th extent if in a split subformat
 *
 * vmdk_co_do_create() also calls the callback once with size == -1 and a
 * NULL errp after all extents have been created, to check whether excess
 * extents exist; a non-NULL return is then treated as an error.
 *
 * Returns the BlockBackend, or NULL on failure.
 */
typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
                                               int idx,
                                               bool flat,
                                               bool split,
                                               bool compress,
                                               bool zeroed_grain,
                                               void *opaque,
                                               Error **errp);

/*
 * Append one extent line to the descriptor text in @desc.
 *
 * @extent_line_fmt is a printf-style format that takes the extent size in
 * sectors (@size rounded up to whole sectors) followed by the base name of
 * @filename.
 */
static void vmdk_desc_add_extent(GString *desc,
                                 const char *extent_line_fmt,
                                 int64_t size, const char *filename)
{
    /* Only the file name goes into the descriptor, not the directory */
    char *leaf = g_path_get_basename(filename);

    g_string_append_printf(desc, extent_line_fmt,
                           DIV_ROUND_UP(size, BDRV_SECTOR_SIZE), leaf);

    g_free(leaf);
}

/*
 * Create a VMDK image of @size bytes.
 *
 * @subformat selects the layout: flat and/or split into 2 GB extents, or
 * compressed (streamOptimized).  The files themselves are obtained through
 * @extent_fn (called with @opaque): index 0 yields the descriptor file,
 * which for monolithic sparse images is also the single data extent, and
 * indexes >= 1 yield the data extents.  After the last extent, @extent_fn
 * is probed once more with size -1 to detect unused extents.
 *
 * When @backing_file is given it must be a VMDK image; its CID becomes the
 * parentCID and its name is recorded as parentFileNameHint.  Flat images
 * accept neither a backing file nor @zeroed_grain.
 *
 * @hw_version defaults to "4"; @compat6 forces "6" and cannot be combined
 * with an explicit @hw_version.
 *
 * The descriptor is written at offset 0x200 for monolithic sparse images
 * and at offset 0 (with the file truncated to the descriptor length)
 * otherwise.
 *
 * Returns 0 on success or a negative errno; on failure @errp is set.
 */
static int coroutine_fn vmdk_co_do_create(int64_t size,
                                          BlockdevVmdkSubformat subformat,
                                          BlockdevVmdkAdapterType adapter_type,
                                          const char *backing_file,
                                          const char *hw_version,
                                          bool compat6,
                                          bool zeroed_grain,
                                          vmdk_create_extent_fn extent_fn,
                                          void *opaque,
                                          Error **errp)
{
    int extent_idx;
    BlockBackend *blk = NULL;
    BlockBackend *extent_blk;
    Error *local_err = NULL;
    char *desc = NULL;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
    int64_t extent_size;
    int64_t created_size = 0;
    const char *extent_line_fmt;
    /* Allocated on demand so that long backing file names are not truncated */
    char *parent_desc_line = NULL;
    uint32_t parent_cid = 0xffffffff;
    uint32_t number_heads = 16;
    uint32_t desc_offset = 0, desc_len;
    const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%" PRIx32 "\n"
        "parentCID=%" PRIx32 "\n"
        "createType=\"%s\"\n"
        "%s"
        "\n"
        "# Extent description\n"
        "%s"
        "\n"
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"%s\"\n";

    ext_desc_lines = g_string_new(NULL);

    /* Read out options */
    if (compat6) {
        if (hw_version) {
            error_setg(errp,
                       "compat6 cannot be enabled with hwversion set");
            ret = -EINVAL;
            goto exit;
        }
        hw_version = "6";
    }
    if (!hw_version) {
        hw_version = "4";
    }

    if (adapter_type != BLOCKDEV_VMDK_ADAPTER_TYPE_IDE) {
        /* that's the number of heads with which vmware operates when
           creating, exporting, etc. vmdk files with a non-ide adapter type */
        number_heads = 255;
    }
    split = (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT) ||
            (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTSPARSE);
    flat = (subformat == BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICFLAT) ||
           (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT);
    compress = subformat == BLOCKDEV_VMDK_SUBFORMAT_STREAMOPTIMIZED;

    if (flat) {
        extent_line_fmt = "RW %" PRId64 " FLAT \"%s\" 0\n";
    } else {
        extent_line_fmt = "RW %" PRId64 " SPARSE \"%s\"\n";
    }
    if (flat && backing_file) {
        error_setg(errp, "Flat image can't have backing file");
        ret = -ENOTSUP;
        goto exit;
    }
    if (flat && zeroed_grain) {
        error_setg(errp, "Flat image can't enable zeroed grain");
        ret = -ENOTSUP;
        goto exit;
    }

    /* Create extents */
    if (split) {
        extent_size = split_size;
    } else {
        extent_size = size;
    }
    /* Monolithic sparse: the descriptor file is also the only data extent */
    if (!split && !flat) {
        created_size = extent_size;
    } else {
        created_size = 0;
    }
    /* Get the descriptor file BDS */
    blk = extent_fn(created_size, 0, flat, split, compress, zeroed_grain,
                    opaque, errp);
    if (!blk) {
        ret = -EIO;
        goto exit;
    }
    if (!split && !flat) {
        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, created_size,
                             blk_bs(blk)->filename);
    }

    if (backing_file) {
        BlockBackend *backing;
        char *full_backing =
            bdrv_get_full_backing_filename_from_filename(blk_bs(blk)->filename,
                                                         backing_file,
                                                         &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -ENOENT;
            goto exit;
        }
        assert(full_backing);

        backing = blk_new_open(full_backing, NULL, NULL,
                               BDRV_O_NO_BACKING, errp);
        g_free(full_backing);
        if (backing == NULL) {
            ret = -EIO;
            goto exit;
        }
        if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(backing)->drv->format_name);
            blk_unref(backing);
            ret = -EINVAL;
            goto exit;
        }
        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
        blk_unref(backing);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
            goto exit;
        }
        /*
         * Build the hint dynamically: a fixed-size buffer would silently
         * truncate a long name and drop the closing quote, producing a
         * malformed descriptor.
         */
        parent_desc_line = g_strdup_printf("parentFileNameHint=\"%s\"",
                                           backing_file);
    }
    extent_idx = 1;
    while (created_size < size) {
        int64_t cur_size = MIN(size - created_size, extent_size);
        extent_blk = extent_fn(cur_size, extent_idx, flat, split, compress,
                               zeroed_grain, opaque, errp);
        if (!extent_blk) {
            ret = -EINVAL;
            goto exit;
        }
        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, cur_size,
                             blk_bs(extent_blk)->filename);
        created_size += cur_size;
        extent_idx++;
        blk_unref(extent_blk);
    }

    /* Check whether we got excess extents */
    extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
                           opaque, NULL);
    if (extent_blk) {
        blk_unref(extent_blk);
        error_setg(errp, "List of extents contains unused extents");
        ret = -EINVAL;
        goto exit;
    }

    /* generate descriptor file */
    desc = g_strdup_printf(desc_template,
                           g_random_int(),
                           parent_cid,
                           BlockdevVmdkSubformat_str(subformat),
                           parent_desc_line ? parent_desc_line : "",
                           ext_desc_lines->str,
                           hw_version,
                           size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
                           BlockdevVmdkAdapterType_str(adapter_type));
    desc_len = strlen(desc);
    /* the descriptor offset = 0x200 */
    if (!split && !flat) {
        desc_offset = 0x200;
    }

    ret = blk_pwrite(blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
    }
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp);
        if (ret < 0) {
            goto exit;
        }
    }
    ret = 0;
exit:
    if (blk) {
        blk_unref(blk);
    }
    g_free(desc);
    g_free(parent_desc_line);
    g_string_free(ext_desc_lines, true);
    return ret;
}
2552

2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570
/*
 * State handed to vmdk_co_create_opts_cb() through its opaque pointer.
 * The callback builds every file name as path + prefix [+ suffix] + postfix.
 */
typedef struct {
    char *path;     /* prepended to each generated file name */
    char *prefix;   /* start of the generated file name */
    char *postfix;  /* end of the generated file name */
    QemuOpts *opts; /* passed on to vmdk_create_extent() */
} VMDKCreateOptsData;

/*
 * Extent-creation callback used by vmdk_co_create_opts().
 *
 * Derives the file name of extent @idx from the path/prefix/postfix stored in
 * @opaque (a VMDKCreateOptsData) and creates that file through
 * vmdk_create_extent():
 *
 *   idx == 0           -> "<path><prefix><postfix>"
 *   split              -> "<path><prefix>-f<NNN><postfix>" for flat extents,
 *                         "<path><prefix>-s<NNN><postfix>" otherwise
 *   otherwise (idx 1)  -> "<path><prefix>-flat<postfix>"
 *
 * A @size of -1 signals that no further extent is wanted; nothing is created
 * and NULL is returned (the caller must pass a NULL @errp in that case).
 *
 * Returns the BlockBackend filled in by vmdk_create_extent(), or NULL if it
 * did not provide one.  Errors are reported through @errp.
 */
static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
                                            bool flat, bool split, bool compress,
                                            bool zeroed_grain, void *opaque,
                                            Error **errp)
{
    BlockBackend *blk = NULL;
    VMDKCreateOptsData *data = opaque;
    char *ext_filename = NULL;
    char *rel_filename = NULL;

    /* We're done, don't create excess extents. */
    if (size == -1) {
        assert(errp == NULL);
        return NULL;
    }

    if (idx == 0) {
        rel_filename = g_strdup_printf("%s%s", data->prefix, data->postfix);
    } else if (split) {
        rel_filename = g_strdup_printf("%s-%c%03d%s",
                                       data->prefix,
                                       flat ? 'f' : 's', idx, data->postfix);
    } else {
        assert(idx == 1);
        rel_filename = g_strdup_printf("%s-flat%s", data->prefix, data->postfix);
    }

    ext_filename = g_strdup_printf("%s%s", data->path, rel_filename);
    g_free(rel_filename);

    /*
     * Success and failure share the same cleanup and both return blk as
     * vmdk_create_extent() left it, so the status code itself is not needed
     * here; any error is reported through errp.
     */
    (void)vmdk_create_extent(ext_filename, size,
                             flat, compress, zeroed_grain, &blk, data->opts,
                             errp);

    g_free(ext_filename);
    return blk;
}
2601

2602 2603 2604
/*
 * Legacy (QemuOpts based) image creation entry point.
 *
 * Splits @filename into path/prefix/postfix, reads the creation options out
 * of @opts (removing them from @opts as they are read), converts the adapter
 * type and subformat strings to their enum values and hands everything to
 * vmdk_co_do_create(), with vmdk_co_create_opts_cb() creating the individual
 * extent files.
 *
 * Returns the result of vmdk_co_do_create(), or -EINVAL (with @errp set) if
 * the file name cannot be decomposed or an option value cannot be parsed.
 */
static int coroutine_fn vmdk_co_create_opts(BlockDriver *drv,
                                            const char *filename,
                                            QemuOpts *opts,
                                            Error **errp)
{
    Error *local_err = NULL;
    int64_t total_size = 0;
    char *adapter_type = NULL;
    BlockdevVmdkAdapterType adapter_type_enum;
    char *backing_file = NULL;
    char *hw_version = NULL;
    char *fmt = NULL;
    BlockdevVmdkSubformat subformat;
    int ret = 0;
    char *path = g_malloc0(PATH_MAX);
    char *prefix = g_malloc0(PATH_MAX);
    char *postfix = g_malloc0(PATH_MAX);
    bool zeroed_grain;
    bool compat6;
    VMDKCreateOptsData data;

    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
        ret = -EINVAL;
        goto exit;
    }

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    compat6 = qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false);
    /*
     * "undefined" is the default value of the hwversion option; translate it
     * to NULL for vmdk_co_do_create().  The NULL guard keeps strcmp() safe
     * should the option ever come back without a value.
     */
    if (hw_version && strcmp(hw_version, "undefined") == 0) {
        g_free(hw_version);
        hw_version = NULL;
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    zeroed_grain = qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false);

    if (adapter_type) {
        adapter_type_enum = qapi_enum_parse(&BlockdevVmdkAdapterType_lookup,
                                            adapter_type,
                                            BLOCKDEV_VMDK_ADAPTER_TYPE_IDE,
                                            &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto exit;
        }
    } else {
        /* Default adapter type is IDE */
        adapter_type_enum = BLOCKDEV_VMDK_ADAPTER_TYPE_IDE;
    }

    if (!fmt) {
        /* Default format to monolithicSparse */
        subformat = BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE;
    } else {
        subformat = qapi_enum_parse(&BlockdevVmdkSubformat_lookup,
                                    fmt,
                                    BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE,
                                    &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto exit;
        }
    }

    /* Everything the per-extent callback needs to name and create files */
    data = (VMDKCreateOptsData){
        .prefix = prefix,
        .postfix = postfix,
        .path = path,
        .opts = opts,
    };
    ret = vmdk_co_do_create(total_size, subformat, adapter_type_enum,
                            backing_file, hw_version, compat6, zeroed_grain,
                            vmdk_co_create_opts_cb, &data, errp);

exit:
    /* g_free(NULL) is a no-op, so early exits need no special casing */
    g_free(adapter_type);
    g_free(backing_file);
    g_free(hw_version);
    g_free(fmt);
    g_free(path);
    g_free(prefix);
    g_free(postfix);
    return ret;
}

/*
 * Extent callback used by vmdk_co_create() (blockdev-create path).
 *
 * Opens the node that backs extent @idx: opts->file for @idx == 0, otherwise
 * entry number (@idx - 1) of the opts->extents list.  The node is wrapped in
 * a new BlockBackend and, unless @size is -1, initialized as a VMDK extent
 * with vmdk_init_extent().
 *
 * @opaque is the BlockdevCreateOptionsVmdk of the create request.
 *
 * Returns the BlockBackend for the extent, or NULL on failure with @errp set.
 */
static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
                                       bool flat, bool split, bool compress,
                                       bool zeroed_grain, void *opaque,
                                       Error **errp)
{
    int ret;
    BlockDriverState *bs;
    BlockBackend *blk;
    BlockdevCreateOptionsVmdk *opts = opaque;

    if (idx == 0) {
        bs = bdrv_open_blockdev_ref(opts->file, errp);
    } else {
        int i;
        BlockdevRefList *list = opts->extents;

        /* Advance to list entry number idx - 1 */
        for (i = 1; i < idx; i++) {
            if (!list || !list->next) {
                error_setg(errp, "Extent [%d] not specified", i);
                return NULL;
            }
            list = list->next;
        }
        if (!list) {
            error_setg(errp, "Extent [%d] not specified", idx - 1);
            return NULL;
        }
        bs = bdrv_open_blockdev_ref(list->value, errp);
    }
    if (!bs) {
        return NULL;
    }

    blk = blk_new_with_bs(bs,
                          BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
                          BLK_PERM_ALL, errp);
    if (!blk) {
        /*
         * No BlockBackend took over the node, so the reference obtained from
         * bdrv_open_blockdev_ref() is still ours; drop it to avoid a leak.
         */
        bdrv_unref(bs);
        return NULL;
    }
    blk_set_allow_write_beyond_eof(blk, true);
    /* blk now keeps the node alive; release the reference from opening it */
    bdrv_unref(bs);

    /* size == -1: only open the extent, do not write any VMDK metadata */
    if (size != -1) {
        ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
        if (ret) {
            blk_unref(blk);
            blk = NULL;
        }
    }
    return blk;
}

/*
 * blockdev-create entry point: validate the VMDK specific options in
 * @create_options and create the image through vmdk_co_do_create(), using
 * vmdk_co_create_cb() to open the extents.
 *
 * Returns -EINVAL (with @errp set) if the size is not a multiple of the
 * sector size, otherwise the result of vmdk_co_do_create().
 */
static int coroutine_fn vmdk_co_create(BlockdevCreateOptions *create_options,
                                       Error **errp)
{
    BlockdevCreateOptionsVmdk *vmdk_opts = &create_options->u.vmdk;

    /* Validate options */
    if (!QEMU_IS_ALIGNED(vmdk_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of 512 bytes");
        return -EINVAL;
    }

    return vmdk_co_do_create(vmdk_opts->size,
                             vmdk_opts->subformat,
                             vmdk_opts->adapter_type,
                             vmdk_opts->backing_file,
                             vmdk_opts->hwversion,
                             false,
                             vmdk_opts->zeroed_grain,
                             vmdk_co_create_cb,
                             vmdk_opts, errp);
}

B
bellard 已提交
2780
/*
 * Release everything the driver state of @bs holds: the extents, the
 * create-type string and the migration blocker.
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;

    vmdk_free_extents(bs);
    g_free(state->create_type);

    /* Unregister the blocker before freeing the Error that describes it */
    migrate_del_blocker(state->migration_blocker);
    error_free(state->migration_blocker);
}

P
Paolo Bonzini 已提交
2791
/*
 * Flush the file of every extent of @bs.
 *
 * All extents are flushed even if one of them fails.  Returns 0 if every
 * flush succeeded, otherwise the error code of the last flush that failed.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int result = 0;
    int idx = 0;

    while (idx < state->num_extents) {
        int rc = bdrv_co_flush(state->extents[idx].file->bs);

        if (rc < 0) {
            /* remember the failure but keep flushing the remaining extents */
            result = rc;
        }
        idx++;
    }

    return result;
}

2806 2807 2808 2809 2810 2811 2812
/*
 * Sum the allocated file size of bs->file and of every extent that lives in
 * a different file.
 *
 * Returns the total, or the first negative value reported by
 * bdrv_get_allocated_file_size().
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int64_t total;
    int idx;

    total = bdrv_get_allocated_file_size(bs->file->bs);
    if (total < 0) {
        return total;
    }

    for (idx = 0; idx < state->num_extents; idx++) {
        int64_t extent_size;

        /* Extents stored in bs->file itself are already accounted for */
        if (state->extents[idx].file != bs->file) {
            extent_size =
                bdrv_get_allocated_file_size(state->extents[idx].file->bs);
            if (extent_size < 0) {
                return extent_size;
            }
            total += extent_size;
        }
    }

    return total;
}
2829

F
Fam Zheng 已提交
2830 2831 2832 2833 2834 2835 2836 2837 2838
/*
 * Report whether the image has zero init.
 *
 * Returns 0 as soon as a flat extent is found whose underlying node reports
 * no zero init through bdrv_has_zero_init(); returns 1 otherwise.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        const VmdkExtent *extent = &state->extents[idx];

        /* Only flat extents depend on what the underlying storage does */
        if (extent->flat && !bdrv_has_zero_init(extent->file->bs)) {
            return 0;
        }
    }

    return 1;
}

F
Fam Zheng 已提交
2847 2848 2849 2850
/*
 * Build a newly allocated ImageInfo describing @extent: file name, extent
 * type, virtual size, compression flag and (for non-flat extents) cluster
 * size.  Fields not set here stay zero.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    ImageInfo *info;

    /* Make sure the node's filename is current before copying it */
    bdrv_refresh_filename(extent->file->bs);

    info = g_new0(ImageInfo, 1);
    info->filename = g_strdup(extent->file->bs->filename);
    info->format = g_strdup(extent->type);
    info->virtual_size = extent->sectors * BDRV_SECTOR_SIZE;
    info->compressed = extent->compressed;
    info->has_compressed = extent->compressed;
    info->cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    info->has_cluster_size = !extent->flat;

    return info;
}

2865 2866 2867
/*
 * Consistency check: walk the image one cluster at a time and verify that
 * every sector belongs to an extent, that its cluster offset can be looked
 * up, and that an allocated cluster does not point past the end of the
 * extent file.
 *
 * Repairing is not supported: any non-zero @fix yields -ENOTSUP.
 *
 * Returns 0 if the whole image was walked without a problem.  On the first
 * problem a message is printed to stderr, result->corruptions is incremented
 * and a negative value is returned.
 */
static int coroutine_fn vmdk_co_check(BlockDriverState *bs,
                                      BdrvCheckResult *result,
                                      BdrvCheckMode fix)
{
    BDRVVmdkState *state = bs->opaque;
    VmdkExtent *cur_extent = NULL;
    int64_t sector = 0;
    int64_t nb_sectors = bdrv_nb_sectors(bs);
    uint64_t cluster_offset;
    int ret;

    if (fix) {
        return -ENOTSUP;
    }

    while (sector < nb_sectors) {
        cur_extent = find_extent(state, sector, cur_extent);
        if (!cur_extent) {
            fprintf(stderr,
                    "ERROR: could not find extent for sector %" PRId64 "\n",
                    sector);
            ret = -EINVAL;
            goto corrupted;
        }

        ret = get_cluster_offset(bs, cur_extent, NULL,
                                 sector << BDRV_SECTOR_BITS,
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
                    PRId64 "\n", sector);
            goto corrupted;
        }

        if (ret == VMDK_OK) {
            /* The lookup produced an offset: it must lie inside the file */
            int64_t extent_len = bdrv_getlength(cur_extent->file->bs);

            if (extent_len < 0) {
                fprintf(stderr,
                        "ERROR: could not get extent file length for sector %"
                        PRId64 "\n", sector);
                ret = extent_len;
                goto corrupted;
            }
            if (cluster_offset >= extent_len) {
                fprintf(stderr,
                        "ERROR: cluster offset for sector %"
                        PRId64 " points after EOF\n", sector);
                ret = -EINVAL;
                goto corrupted;
            }
        }

        sector += cur_extent->cluster_sectors;
    }

    /* Reached the end of the image without finding a problem */
    return 0;

corrupted:
    result->corruptions++;
    return ret;
}

2925 2926
/*
 * Build the VMDK specific image information for @bs: create type, CID,
 * parent CID and one ImageInfo per extent (in extent order).
 *
 * Returns a newly allocated ImageInfoSpecific.  @errp is not used.
 */
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs,
                                                 Error **errp)
{
    BDRVVmdkState *state = bs->opaque;
    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
    ImageInfoSpecificVmdk *vmdk_info = g_new0(ImageInfoSpecificVmdk, 1);
    ImageInfoList **tail;
    int idx;

    spec_info->type = IMAGE_INFO_SPECIFIC_KIND_VMDK;
    spec_info->u.vmdk.data = vmdk_info;

    vmdk_info->create_type = g_strdup(state->create_type);
    vmdk_info->cid = state->cid;
    vmdk_info->parent_cid = state->parent_cid;

    /* Append one list node per extent, keeping a pointer to the tail link */
    tail = &vmdk_info->extents;
    for (idx = 0; idx < state->num_extents; idx++) {
        /* g_new0() zero-fills the node, which leaves its next link NULL */
        ImageInfoList *node = g_new0(ImageInfoList, 1);

        node->value = vmdk_get_extent_info(&state->extents[idx]);
        *tail = node;
        tail = &node->next;
    }

    return spec_info;
}

2957 2958 2959 2960 2961 2962 2963
/*
 * Return true if extents @a and @b are of the same kind: same flat flag,
 * same compressed flag and, for non-flat extents, the same cluster size.
 */
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
{
    if (a->flat != b->flat) {
        return false;
    }
    if (a->compressed != b->compressed) {
        return false;
    }
    if (a->flat) {
        /* cluster size is not compared for flat extents */
        return true;
    }
    return a->cluster_sectors == b->cluster_sectors;
}

F
Fam Zheng 已提交
2964 2965 2966 2967 2968
/*
 * Fill @bdi from the first extent of @bs.
 *
 * All extents must be of the same kind (see vmdk_extents_type_eq());
 * otherwise -ENOTSUP is returned and @bdi is left untouched.  On success
 * needs_compressed_writes is set and, for non-flat extents, cluster_size;
 * the function then returns 0.
 */
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVVmdkState *state = bs->opaque;
    const VmdkExtent *first;
    int idx;

    assert(state->num_extents);
    first = &state->extents[0];

    /* Mixed extent kinds cannot be described by a single BlockDriverInfo */
    for (idx = 1; idx < state->num_extents; idx++) {
        if (!vmdk_extents_type_eq(first, &state->extents[idx])) {
            return -ENOTSUP;
        }
    }

    bdi->needs_compressed_writes = first->compressed;
    if (!first->flat) {
        bdi->cluster_size = first->cluster_sectors << BDRV_SECTOR_BITS;
    }

    return 0;
}

2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999
/*
 * Store the options of the children of @bs in @target: always "file" and,
 * when @backing_overridden is set, "backing" (a null entry if there is no
 * backing node).
 */
static void vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
                                      bool backing_overridden)
{
    /* No children but file and backing can be explicitly specified (TODO) */
    qdict_put(target, "file",
              qobject_ref(bs->file->bs->full_open_options));

    if (!backing_overridden) {
        return;
    }

    if (!bs->backing) {
        qdict_put_null(target, "backing");
        return;
    }

    qdict_put(target, "backing",
              qobject_ref(bs->backing->bs->full_open_options));
}

3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025
/* Options accepted when creating a VMDK image through QemuOpts */
static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
    .desc = {
        /* Size of the virtual disk */
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        /* Adapter type string */
        {
            .name = BLOCK_OPT_ADAPTER_TYPE,
            .type = QEMU_OPT_STRING,
            .help = "Virtual adapter type, can be one of "
                    "ide (default), lsilogic, buslogic or legacyESX"
        },
        /* Optional base image */
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        /* Version 6 compatibility switch, off unless requested */
        {
            .name = BLOCK_OPT_COMPAT6,
            .type = QEMU_OPT_BOOL,
            .help = "VMDK version 6 image",
            .def_value_str = "off"
        },
        /* Hardware version; "undefined" is the value used when none is set */
        {
            .name = BLOCK_OPT_HWVERSION,
            .type = QEMU_OPT_STRING,
            .help = "VMDK hardware version",
            .def_value_str = "undefined"
        },
        /* Subformat of the image */
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "VMDK flat extent format, can be one of "
                "{monolithicSparse (default) | monolithicFlat | "
                "twoGbMaxExtentSparse | twoGbMaxExtentFlat | "
                "streamOptimized} "
        },
        /* Zeroed-grain support */
        {
            .name = BLOCK_OPT_ZEROED_GRAIN,
            .type = QEMU_OPT_BOOL,
            .help = "Enable efficient zero writes "
                    "using the zeroed-grain GTE feature"
        },
        { /* end of list */ }
    }
};

3049
static BlockDriver bdrv_vmdk = {
F
Fam Zheng 已提交
3050 3051 3052 3053
    .format_name                  = "vmdk",
    .instance_size                = sizeof(BDRVVmdkState),
    .bdrv_probe                   = vmdk_probe,
    .bdrv_open                    = vmdk_open,
3054
    .bdrv_co_check                = vmdk_co_check,
F
Fam Zheng 已提交
3055
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
3056
    .bdrv_child_perm              = bdrv_format_default_perms,
3057
    .bdrv_co_preadv               = vmdk_co_preadv,
3058
    .bdrv_co_pwritev              = vmdk_co_pwritev,
3059
    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
3060
    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
F
Fam Zheng 已提交
3061
    .bdrv_close                   = vmdk_close,
3062
    .bdrv_co_create_opts          = vmdk_co_create_opts,
3063
    .bdrv_co_create               = vmdk_co_create,
F
Fam Zheng 已提交
3064
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
3065
    .bdrv_co_block_status         = vmdk_co_block_status,
F
Fam Zheng 已提交
3066 3067
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
F
Fam Zheng 已提交
3068
    .bdrv_get_specific_info       = vmdk_get_specific_info,
3069
    .bdrv_refresh_limits          = vmdk_refresh_limits,
F
Fam Zheng 已提交
3070
    .bdrv_get_info                = vmdk_get_info,
3071
    .bdrv_gather_child_options    = vmdk_gather_child_options,
F
Fam Zheng 已提交
3072

M
Max Reitz 已提交
3073
    .is_format                    = true,
3074
    .supports_backing             = true,
3075
    .create_opts                  = &vmdk_create_opts,
B
bellard 已提交
3076
};
3077 3078 3079 3080 3081 3082 3083

/* Register the VMDK block driver with the block layer. */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/*
 * NOTE(review): block_init() presumably arranges for bdrv_vmdk_init() to run
 * during block module initialization (see qemu/module.h) -- confirm.
 */
block_init(bdrv_vmdk_init);