vmdk.c 72.6 KB
Newer Older
B
bellard 已提交
1 2
/*
 * Block driver for the VMDK format
3
 *
B
bellard 已提交
4
 * Copyright (c) 2004 Fabrice Bellard
5
 * Copyright (c) 2005 Filip Navara
6
 *
B
bellard 已提交
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
25

P
Peter Maydell 已提交
26
#include "qemu/osdep.h"
27
#include "qapi/error.h"
28
#include "block/block_int.h"
29
#include "sysemu/block-backend.h"
30
#include "qapi/qmp/qerror.h"
31
#include "qemu/error-report.h"
32
#include "qemu/module.h"
33
#include "qemu/bswap.h"
34
#include "migration/blocker.h"
35
#include "qemu/cutils.h"
S
Stefan Weil 已提交
36
#include <zlib.h>
B
bellard 已提交
37 38 39

/* Magic numbers found, big-endian, in the first four bytes of an extent */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
/* VMDK4Header.compressAlgorithm value selecting deflate compression */
#define VMDK4_COMPRESSION_DEFLATE 1
/* Bits of VMDK4Header.flags */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)
/* VMDK4Header.gd_offset value meaning "use the footer at the end of file" */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"

B
bellard 已提交
60 61 62 63 64 65 66 67 68 69 70
/*
 * On-disk header of a VMDK3 ("COWD") sparse extent.  It is read from the
 * file immediately after the 4-byte magic and its fields are little-endian.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;
    uint32_t granularity;
    uint32_t l1dir_offset;
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} QEMU_PACKED VMDK3Header;
B
bellard 已提交
72 73 74 75

/*
 * On-disk header of a VMDK4 ("KDMV") sparse extent.  It is read from the
 * file immediately after the 4-byte magic; multi-byte fields are
 * little-endian.  The open code treats capacity and granularity as sector
 * counts and converts the *_offset fields to bytes with "<< 9".
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint64_t capacity;
    uint64_t granularity;
    uint64_t desc_offset;
    uint64_t desc_size;
    /* Number of GrainTableEntries per GrainTable */
    uint32_t num_gtes_per_gt;
    uint64_t rgd_offset;
    uint64_t gd_offset;
    uint64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm;
} QEMU_PACKED VMDK4Header;
B
bellard 已提交
89 90 91

#define L2_CACHE_SIZE 16

F
Fam Zheng 已提交
92
typedef struct VmdkExtent {
93
    BdrvChild *file;
F
Fam Zheng 已提交
94
    bool flat;
F
Fam Zheng 已提交
95 96
    bool compressed;
    bool has_marker;
97 98
    bool has_zero_grain;
    int version;
F
Fam Zheng 已提交
99 100
    int64_t sectors;
    int64_t end_sector;
101
    int64_t flat_start_offset;
B
bellard 已提交
102
    int64_t l1_table_offset;
103
    int64_t l1_backup_table_offset;
B
bellard 已提交
104
    uint32_t *l1_table;
105
    uint32_t *l1_backup_table;
B
bellard 已提交
106 107 108 109 110 111 112 113
    unsigned int l1_size;
    uint32_t l1_entry_sectors;

    unsigned int l2_size;
    uint32_t *l2_cache;
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

114
    int64_t cluster_sectors;
F
Fam Zheng 已提交
115
    int64_t next_cluster_sector;
F
Fam Zheng 已提交
116
    char *type;
F
Fam Zheng 已提交
117 118 119
} VmdkExtent;

typedef struct BDRVVmdkState {
120
    CoMutex lock;
121
    uint64_t desc_offset;
122
    bool cid_updated;
123
    bool cid_checked;
F
Fam Zheng 已提交
124
    uint32_t cid;
125
    uint32_t parent_cid;
F
Fam Zheng 已提交
126 127 128
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
K
Kevin Wolf 已提交
129
    Error *migration_blocker;
F
Fam Zheng 已提交
130
    char *create_type;
B
bellard 已提交
131 132
} BDRVVmdkState;

133 134 135 136 137
/* Bookkeeping for one grain-table entry (L1/L2 position plus cache slot). */
typedef struct VmdkMetaData {
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
    int valid;
    uint32_t *l2_cache_entry;
} VmdkMetaData;

F
Fam Zheng 已提交
141 142 143 144
/*
 * Header preceding a grain's payload.  The payload follows the fixed fields
 * directly, so it is declared as a C99 flexible array member rather than
 * the non-standard zero-length array; sizeof() is unchanged.
 */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];
} QEMU_PACKED VmdkGrainMarker;
F
Fam Zheng 已提交
146

147 148 149 150 151 152 153
/*
 * Values of the "type" field of a marker; the VMDK4 open path checks the
 * footer marker and the end-of-stream marker against these.
 */
enum {
    MARKER_END_OF_STREAM    = 0,
    MARKER_GRAIN_TABLE      = 1,
    MARKER_GRAIN_DIRECTORY  = 2,
    MARKER_FOOTER           = 3,
};

B
bellard 已提交
154 155 156 157
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    uint32_t magic;

F
Fam Zheng 已提交
158
    if (buf_size < 4) {
B
bellard 已提交
159
        return 0;
F
Fam Zheng 已提交
160
    }
B
bellard 已提交
161 162
    magic = be32_to_cpu(*(uint32_t *)buf);
    if (magic == VMDK3_MAGIC ||
163
        magic == VMDK4_MAGIC) {
B
bellard 已提交
164
        return 100;
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
    } else {
        const char *p = (const char *)buf;
        const char *end = p + buf_size;
        while (p < end) {
            if (*p == '#') {
                /* skip comment line */
                while (p < end && *p != '\n') {
                    p++;
                }
                p++;
                continue;
            }
            if (*p == ' ') {
                while (p < end && *p == ' ') {
                    p++;
                }
                /* skip '\r' if windows line endings used. */
                if (p < end && *p == '\r') {
                    p++;
                }
                /* only accept blank lines before 'version=' line */
                if (p == end || *p != '\n') {
                    return 0;
                }
                p++;
                continue;
            }
            if (end - p >= strlen("version=X\n")) {
                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
                    return 100;
                }
            }
            if (end - p >= strlen("version=X\r\n")) {
                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
                    return 100;
                }
            }
            return 0;
        }
B
bellard 已提交
206
        return 0;
207
    }
B
bellard 已提交
208 209
}

210
#define SECTOR_SIZE 512
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
214

F
Fam Zheng 已提交
215 216 217 218
/*
 * Release every extent of @bs: the L1 tables, the L2 cache, the type string
 * and the child reference (unless the extent shares bs->file), then free the
 * extent array itself.
 */
static void vmdk_free_extents(BlockDriverState *bs)
{
    int i;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *e;

    for (i = 0; i < s->num_extents; i++) {
        e = &s->extents[i];
        g_free(e->l1_table);
        g_free(e->l2_cache);
        g_free(e->l1_backup_table);
        g_free(e->type);
        /* bs->file is not owned by the extent, so it is not unreferenced here */
        if (e->file != bs->file) {
            bdrv_unref_child(bs, e->file);
        }
    }
    g_free(s->extents);
}

234 235 236 237 238 239 240 241
/*
 * Drop the most recently added extent by shrinking the extent array by one
 * entry.  Nothing the extent points to is released here.  Does nothing when
 * there are no extents.
 */
static void vmdk_free_last_extent(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents == 0) {
        return;
    }
    s->num_extents--;
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
}

245 246
/*
 * Read a CID from the descriptor stored in bs->file at s->desc_offset.
 * If @parent is non-zero the "parentCID" value is read, otherwise "CID".
 *
 * Return -ve errno, or 0 on success and write CID into *pcid.
 */
static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
{
    char *desc;
    uint32_t cid;
    const char *p_name, *cid_str;
    size_t cid_str_size;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    /* sizeof() counts the NUL, so it also skips the '=' after the key */
    if (parent) {
        cid_str = "parentCID";
        cid_str_size = sizeof("parentCID");
    } else {
        cid_str = "CID";
        cid_str_size = sizeof("CID");
    }

    /* make sure the string functions below stop inside the buffer */
    desc[DESC_SIZE - 1] = '\0';
    p_name = strstr(desc, cid_str);
    if (p_name == NULL) {
        ret = -EINVAL;
        goto out;
    }
    p_name += cid_str_size;
    if (sscanf(p_name, "%" SCNx32, &cid) != 1) {
        ret = -EINVAL;
        goto out;
    }
    *pcid = cid;
    ret = 0;

out:
    g_free(desc);
    return ret;
}

/*
 * Rewrite the "CID" value of the descriptor stored in bs->file at
 * s->desc_offset with @cid.
 *
 * The text from "parentCID" onwards is saved, the new value and a newline
 * are written right after the first "CID" key, and the saved tail is
 * appended again before the whole descriptor is written back synchronously.
 *
 * Returns -EINVAL if the descriptor has no "parentCID" key, otherwise the
 * result of the read or of the final write.
 */
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
{
    char *desc, *tmp_desc;
    char *p_name, *tmp_str;
    BDRVVmdkState *s = bs->opaque;
    int ret = 0;

    desc = g_malloc0(DESC_SIZE);
    tmp_desc = g_malloc0(DESC_SIZE);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }

    desc[DESC_SIZE - 1] = '\0';
    tmp_str = strstr(desc, "parentCID");
    if (tmp_str == NULL) {
        ret = -EINVAL;
        goto out;
    }

    /* keep everything from "parentCID" on; it is re-appended below */
    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
    p_name = strstr(desc, "CID");
    if (p_name != NULL) {
        /* sizeof() counts the NUL, so this also skips the '=' */
        p_name += sizeof("CID");
        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
        pstrcat(desc, DESC_SIZE, tmp_desc);
    }

    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);

out:
    g_free(desc);
    g_free(tmp_desc);
    return ret;
}

/*
 * Check that the CID of the backing image still matches the parent CID
 * recorded for @bs.
 *
 * Returns 1 if the CID is valid (or there is nothing to check) and marks it
 * as checked; returns 0 if the backing image's CID cannot be read or differs
 * from s->parent_cid.
 */
static int vmdk_is_cid_valid(BlockDriverState *bs)
{
    BDRVVmdkState *s = bs->opaque;
    uint32_t cur_pcid;

    if (!s->cid_checked && bs->backing) {
        BlockDriverState *p_bs = bs->backing->bs;

        if (vmdk_read_cid(p_bs, 0, &cur_pcid) != 0) {
            /* read failure: report as not valid */
            return 0;
        }
        if (s->parent_cid != cur_pcid) {
            /* CID not valid */
            return 0;
        }
    }
    s->cid_checked = true;
    /* CID valid */
    return 1;
}

K
Kevin Wolf 已提交
347
/* We have nothing to do for VMDK reopen, stubs just return success */
J
Jeff Cody 已提交
348 349 350 351 352
static int vmdk_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    assert(state != NULL);
    assert(state->bs != NULL);
K
Kevin Wolf 已提交
353
    return 0;
J
Jeff Cody 已提交
354 355
}

356
/*
 * Look for a parentFileNameHint="..." entry in the descriptor of @bs and,
 * if present, copy the quoted name into bs->backing_file.
 *
 * Returns 0 on success (including when no hint is present), a negative
 * errno if the descriptor cannot be read, or -EINVAL if the closing quote
 * is missing or the name does not fit into bs->backing_file.
 */
static int vmdk_parent_open(BlockDriverState *bs)
{
    char *p_name;
    char *desc;
    BDRVVmdkState *s = bs->opaque;
    int ret;

    /* one extra zeroed byte keeps the descriptor NUL-terminated */
    desc = g_malloc0(DESC_SIZE + 1);
    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
    if (ret < 0) {
        goto out;
    }
    ret = 0;

    p_name = strstr(desc, "parentFileNameHint");
    if (p_name != NULL) {
        char *end_name;

        /* sizeof() covers the key and '='; the +1 skips the opening quote */
        p_name += sizeof("parentFileNameHint") + 1;
        end_name = strchr(p_name, '\"');
        if (end_name == NULL) {
            ret = -EINVAL;
            goto out;
        }
        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
            ret = -EINVAL;
            goto out;
        }

        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
    }

out:
    g_free(desc);
    return ret;
}

F
Fam Zheng 已提交
393 394
/* Create and append extent to the extent array. Return the added VmdkExtent
 * address. return NULL if allocation failed. */
395
static int vmdk_add_extent(BlockDriverState *bs,
396
                           BdrvChild *file, bool flat, int64_t sectors,
F
Fam Zheng 已提交
397 398
                           int64_t l1_offset, int64_t l1_backup_offset,
                           uint32_t l1_size,
399
                           int l2_size, uint64_t cluster_sectors,
F
Fam Zheng 已提交
400 401
                           VmdkExtent **new_extent,
                           Error **errp)
F
Fam Zheng 已提交
402 403 404
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
405
    int64_t nb_sectors;
F
Fam Zheng 已提交
406

407 408
    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
F
Fam Zheng 已提交
409 410
        error_setg(errp, "Invalid granularity, image may be corrupt");
        return -EFBIG;
411
    }
412 413 414 415 416
    if (l1_size > 512 * 1024 * 1024) {
        /* Although with big capacity and small l1_entry_sectors, we can get a
         * big l1_size, we don't want unbounded value to allocate the table.
         * Limit it to 512M, which is 16PB for default cluster and L2 table
         * size */
F
Fam Zheng 已提交
417
        error_setg(errp, "L1 size too big");
418 419
        return -EFBIG;
    }
420

421
    nb_sectors = bdrv_nb_sectors(file->bs);
422 423
    if (nb_sectors < 0) {
        return nb_sectors;
F
Fam Zheng 已提交
424 425
    }

426
    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
F
Fam Zheng 已提交
427 428 429 430 431 432 433 434 435 436 437 438
    extent = &s->extents[s->num_extents];
    s->num_extents++;

    memset(extent, 0, sizeof(VmdkExtent));
    extent->file = file;
    extent->flat = flat;
    extent->sectors = sectors;
    extent->l1_table_offset = l1_offset;
    extent->l1_backup_table_offset = l1_backup_offset;
    extent->l1_size = l1_size;
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
439
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
440
    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
F
Fam Zheng 已提交
441 442 443 444 445 446 447

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
    } else {
        extent->end_sector = extent->sectors;
    }
    bs->total_sectors = extent->end_sector;
448 449 450 451
    if (new_extent) {
        *new_extent = extent;
    }
    return 0;
F
Fam Zheng 已提交
452 453
}

F
Fam Zheng 已提交
454 455
/*
 * Load the L1 table of @extent (and its backup copy, if the extent has a
 * backup table offset) from the extent file, convert the entries from
 * little-endian, and allocate the L2 cache.
 *
 * Returns 0 on success, -ENOMEM if a table cannot be allocated, or the
 * negative errno from bdrv_pread() (with @errp set).  On failure the tables
 * allocated here are freed again.
 */
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                            Error **errp)
{
    int ret;
    size_t l1_size;
    int i;

    /* read the L1 table */
    l1_size = extent->l1_size * sizeof(uint32_t);
    extent->l1_table = g_try_malloc(l1_size);
    if (l1_size && extent->l1_table == NULL) {
        return -ENOMEM;
    }

    ret = bdrv_pread(extent->file,
                     extent->l1_table_offset,
                     extent->l1_table,
                     l1_size);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Could not read l1 table from extent '%s'",
                         extent->file->bs->filename);
        goto fail_l1;
    }
    for (i = 0; i < extent->l1_size; i++) {
        le32_to_cpus(&extent->l1_table[i]);
    }

    if (extent->l1_backup_table_offset) {
        extent->l1_backup_table = g_try_malloc(l1_size);
        if (l1_size && extent->l1_backup_table == NULL) {
            ret = -ENOMEM;
            goto fail_l1;
        }
        ret = bdrv_pread(extent->file,
                         extent->l1_backup_table_offset,
                         extent->l1_backup_table,
                         l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Could not read l1 backup table from extent '%s'",
                             extent->file->bs->filename);
            goto fail_l1b;
        }
        for (i = 0; i < extent->l1_size; i++) {
            le32_to_cpus(&extent->l1_backup_table[i]);
        }
    }

    extent->l2_cache =
        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
    return 0;
 fail_l1b:
    g_free(extent->l1_backup_table);
 fail_l1:
    g_free(extent->l1_table);
    return ret;
}

F
Fam Zheng 已提交
513
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
514
                                 BdrvChild *file,
F
Fam Zheng 已提交
515
                                 int flags, Error **errp)
516 517 518 519 520 521
{
    int ret;
    uint32_t magic;
    VMDK3Header header;
    VmdkExtent *extent;

522
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
523
    if (ret < 0) {
F
Fam Zheng 已提交
524 525
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
526
                         file->bs->filename);
527
        return ret;
528
    }
529 530
    ret = vmdk_add_extent(bs, file, false,
                          le32_to_cpu(header.disk_sectors),
531
                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
532 533 534 535
                          0,
                          le32_to_cpu(header.l1dir_size),
                          4096,
                          le32_to_cpu(header.granularity),
F
Fam Zheng 已提交
536 537
                          &extent,
                          errp);
538 539 540
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
541
    ret = vmdk_init_tables(bs, extent, errp);
542
    if (ret) {
543 544
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
545 546 547 548
    }
    return ret;
}

549
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
550
                               QDict *options, Error **errp);
F
Fam Zheng 已提交
551

552
/*
 * Read the contents of @file starting at byte @desc_offset into a newly
 * allocated, NUL-terminated buffer.  The read length is the file length,
 * capped at 1 MiB - 1.
 *
 * Returns the buffer, which the caller releases with g_free(), or NULL with
 * @errp set if the file length cannot be determined, the file is shorter
 * than 4 bytes, or the read fails.
 */
static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
{
    int64_t size;
    char *buf;
    int ret;

    size = bdrv_getlength(file->bs);
    if (size < 0) {
        error_setg_errno(errp, -size, "Could not access file");
        return NULL;
    }

    if (size < 4) {
        /* Both descriptor file and sparse image must be much larger than 4
         * bytes, also callers of vmdk_read_desc want to compare the first 4
         * bytes with VMDK4_MAGIC, let's error out if less is read. */
        error_setg(errp, "File is too small, not a valid image");
        return NULL;
    }

    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
    buf = g_malloc(size + 1);

    ret = bdrv_pread(file, desc_offset, buf, size);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read from file");
        g_free(buf);
        return NULL;
    }
    /* terminate after the bytes actually read */
    buf[ret] = 0;

    return buf;
}

586
static int vmdk_open_vmdk4(BlockDriverState *bs,
587
                           BdrvChild *file,
K
Kevin Wolf 已提交
588
                           int flags, QDict *options, Error **errp)
589 590 591 592 593 594
{
    int ret;
    uint32_t magic;
    uint32_t l1_size, l1_entry_sectors;
    VMDK4Header header;
    VmdkExtent *extent;
F
Fam Zheng 已提交
595
    BDRVVmdkState *s = bs->opaque;
596
    int64_t l1_backup_offset = 0;
597
    bool compressed;
598

599
    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
600
    if (ret < 0) {
F
Fam Zheng 已提交
601 602
        error_setg_errno(errp, -ret,
                         "Could not read header from file '%s'",
603
                         file->bs->filename);
P
Paolo Bonzini 已提交
604
        return -EINVAL;
605
    }
606
    if (header.capacity == 0) {
607
        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
608
        if (desc_offset) {
609
            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
610 611 612
            if (!buf) {
                return -EINVAL;
            }
K
Kevin Wolf 已提交
613
            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
614 615
            g_free(buf);
            return ret;
616
        }
F
Fam Zheng 已提交
617
    }
618

F
Fam Zheng 已提交
619 620 621 622
    if (!s->create_type) {
        s->create_type = g_strdup("monolithicSparse");
    }

623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
        /*
         * The footer takes precedence over the header, so read it in. The
         * footer starts at offset -1024 from the end: One sector for the
         * footer, and another one for the end-of-stream marker.
         */
        struct {
            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED footer_marker;

            uint32_t magic;
            VMDK4Header header;
            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];

            struct {
                uint64_t val;
                uint32_t size;
                uint32_t type;
                uint8_t pad[512 - 16];
            } QEMU_PACKED eos_marker;
        } QEMU_PACKED footer;

649
        ret = bdrv_pread(file,
K
Kevin Wolf 已提交
650
            bs->file->bs->total_sectors * 512 - 1536,
651 652
            &footer, sizeof(footer));
        if (ret < 0) {
653
            error_setg_errno(errp, -ret, "Failed to read footer");
654 655 656 657 658 659 660 661 662 663 664
            return ret;
        }

        /* Some sanity checks for the footer */
        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
            le32_to_cpu(footer.footer_marker.size) != 0  ||
            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
            le64_to_cpu(footer.eos_marker.val) != 0  ||
            le32_to_cpu(footer.eos_marker.size) != 0  ||
            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
        {
665
            error_setg(errp, "Invalid footer");
666 667 668 669 670 671
            return -EINVAL;
        }

        header = footer.header;
    }

672 673
    compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
674
    if (le32_to_cpu(header.version) > 3) {
675 676
        error_setg(errp, "Unsupported VMDK version %" PRIu32,
                   le32_to_cpu(header.version));
677
        return -ENOTSUP;
678 679
    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
               !compressed) {
680 681 682 683 684 685
        /* VMware KB 2064959 explains that version 3 added support for
         * persistent changed block tracking (CBT), and backup software can
         * read it as version=1 if it doesn't care about the changed area
         * information. So we are safe to enable read only. */
        error_setg(errp, "VMDK version 3 must be read only");
        return -EINVAL;
686 687
    }

688
    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
P
Paolo Bonzini 已提交
689
        error_setg(errp, "L2 table size too big");
690 691 692
        return -EINVAL;
    }

693
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
694
                        * le64_to_cpu(header.granularity);
695
    if (l1_entry_sectors == 0) {
696
        error_setg(errp, "L1 entry size is invalid");
697 698
        return -EINVAL;
    }
699 700
    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
                / l1_entry_sectors;
701 702 703
    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
    }
704
    if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
705 706 707
        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
                   (int64_t)(le64_to_cpu(header.grain_offset)
                             * BDRV_SECTOR_SIZE));
708 709 710
        return -EINVAL;
    }

711
    ret = vmdk_add_extent(bs, file, false,
712 713
                          le64_to_cpu(header.capacity),
                          le64_to_cpu(header.gd_offset) << 9,
714
                          l1_backup_offset,
715
                          l1_size,
716
                          le32_to_cpu(header.num_gtes_per_gt),
717
                          le64_to_cpu(header.granularity),
F
Fam Zheng 已提交
718 719
                          &extent,
                          errp);
720 721 722
    if (ret < 0) {
        return ret;
    }
F
Fam Zheng 已提交
723 724
    extent->compressed =
        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
725 726 727 728
    if (extent->compressed) {
        g_free(s->create_type);
        s->create_type = g_strdup("streamOptimized");
    }
F
Fam Zheng 已提交
729
    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
730 731
    extent->version = le32_to_cpu(header.version);
    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
F
Fam Zheng 已提交
732
    ret = vmdk_init_tables(bs, extent, errp);
733
    if (ret) {
734 735
        /* free extent allocated by vmdk_add_extent */
        vmdk_free_last_extent(bs);
736 737 738 739
    }
    return ret;
}

740 741 742 743 744 745 746 747 748
/* find an option value out of descriptor file */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    char *opt_pos, *opt_end;
    const char *end = desc + strlen(desc);

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
F
Fam Zheng 已提交
749
        return VMDK_ERROR;
750 751 752 753
    }
    /* Skip "=\"" following opt_name */
    opt_pos += strlen(opt_name) + 2;
    if (opt_pos >= end) {
F
Fam Zheng 已提交
754
        return VMDK_ERROR;
755 756 757 758 759 760
    }
    opt_end = opt_pos;
    while (opt_end < end && *opt_end != '"') {
        opt_end++;
    }
    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
F
Fam Zheng 已提交
761
        return VMDK_ERROR;
762 763
    }
    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
F
Fam Zheng 已提交
764
    return VMDK_OK;
765 766
}

767
/* Open an extent file and append to bs array */
768
static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
K
Kevin Wolf 已提交
769
                            char *buf, QDict *options, Error **errp)
770 771 772
{
    uint32_t magic;

773
    magic = ldl_be_p(buf);
774 775
    switch (magic) {
        case VMDK3_MAGIC:
F
Fam Zheng 已提交
776
            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
777 778
            break;
        case VMDK4_MAGIC:
K
Kevin Wolf 已提交
779
            return vmdk_open_vmdk4(bs, file, flags, options, errp);
780 781
            break;
        default:
P
Paolo Bonzini 已提交
782 783
            error_setg(errp, "Image not in VMDK format");
            return -EINVAL;
784 785 786 787
            break;
    }
}

788 789 790 791 792 793 794 795 796 797 798
/*
 * Return a pointer to the character following the first '\n' in @s, or to
 * the terminating NUL of @s if it contains no newline.
 */
static const char *next_line(const char *s)
{
    const char *newline = strchr(s, '\n');

    if (newline != NULL) {
        return newline + 1;
    }
    return s + strlen(s);
}

799
/*
 * Parse the extent lines of descriptor text @desc and open every extent.
 *
 * Lines that do not look like an "RW" extent line, that have a
 * non-positive size, or that name an unknown type are skipped.  Each
 * remaining line's file is opened as a child of @bs (relative names are
 * combined with @desc_file_path) and added as a flat or sparse extent.
 *
 * Returns 0 on success or a negative errno; a malformed FLAT/VMFS/other
 * extent line is reported through @errp as "Invalid extent line".
 */
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                              const char *desc_file_path, QDict *options,
                              Error **errp)
{
    int ret;
    int matches;
    char access[11];
    char type[11];
    char fname[512];
    const char *p, *np;
    int64_t sectors = 0;
    int64_t flat_offset;
    char *extent_path;
    BdrvChild *extent_file;
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent;
    char extent_opt_prefix[32];
    Error *local_err = NULL;

    for (p = desc; *p; p = next_line(p)) {
        /* parse extent line in one of below formats:
         *
         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
         * RW [size in sectors] SPARSE "file-name.vmdk"
         * RW [size in sectors] VMFS "file-name.vmdk"
         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
         */
        flat_offset = -1;
        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
                         access, &sectors, type, fname, &flat_offset);
        if (matches < 4 || strcmp(access, "RW")) {
            continue;
        } else if (!strcmp(type, "FLAT")) {
            /* FLAT needs the trailing offset field */
            if (matches != 5 || flat_offset < 0) {
                goto invalid;
            }
        } else if (!strcmp(type, "VMFS")) {
            /* VMFS has no offset field; the data starts at offset 0 */
            if (matches == 4) {
                flat_offset = 0;
            } else {
                goto invalid;
            }
        } else if (matches != 4) {
            goto invalid;
        }

        if (sectors <= 0 ||
            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
            (strcmp(access, "RW"))) {
            continue;
        }

        if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
            !desc_file_path[0])
        {
            error_setg(errp, "Cannot use relative extent paths with VMDK "
                       "descriptor file '%s'", bs->file->bs->filename);
            return -EINVAL;
        }

        extent_path = g_malloc0(PATH_MAX);
        path_combine(extent_path, PATH_MAX, desc_file_path, fname);

        /* option prefix for this extent: "extents.<index>" */
        ret = snprintf(extent_opt_prefix, sizeof(extent_opt_prefix),
                       "extents.%d", s->num_extents);
        assert(ret < sizeof(extent_opt_prefix));

        extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
                                      bs, &child_file, false, &local_err);
        g_free(extent_path);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* save to extents array */
        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
            /* FLAT extent */

            ret = vmdk_add_extent(bs, extent_file, true, sectors,
                            0, 0, 0, 0, 0, &extent, errp);
            if (ret < 0) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            extent->flat_start_offset = flat_offset << 9;
        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
            char *buf = vmdk_read_desc(extent_file, 0, errp);
            if (!buf) {
                ret = -EINVAL;
            } else {
                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
                                       options, errp);
            }
            g_free(buf);
            if (ret) {
                bdrv_unref_child(bs, extent_file);
                return ret;
            }
            /* the sparse open routine appended the extent as the last one */
            extent = &s->extents[s->num_extents - 1];
        } else {
            error_setg(errp, "Unsupported extent type '%s'", type);
            bdrv_unref_child(bs, extent_file);
            return -ENOTSUP;
        }
        extent->type = g_strdup(type);
    }
    return 0;

invalid:
    /* report only the offending line, without its trailing newline */
    np = next_line(p);
    assert(np != p);
    if (np[-1] == '\n') {
        np--;
    }
    error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
    return -EINVAL;
}

919
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
K
Kevin Wolf 已提交
920
                               QDict *options, Error **errp)
921 922 923 924 925 926
{
    int ret;
    char ct[128];
    BDRVVmdkState *s = bs->opaque;

    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
P
Paolo Bonzini 已提交
927 928
        error_setg(errp, "invalid VMDK image descriptor");
        ret = -EINVAL;
929
        goto exit;
930
    }
F
Fam Zheng 已提交
931
    if (strcmp(ct, "monolithicFlat") &&
P
Paolo Bonzini 已提交
932
        strcmp(ct, "vmfs") &&
F
Fam Zheng 已提交
933
        strcmp(ct, "vmfsSparse") &&
934
        strcmp(ct, "twoGbMaxExtentSparse") &&
F
Fam Zheng 已提交
935
        strcmp(ct, "twoGbMaxExtentFlat")) {
F
Fam Zheng 已提交
936
        error_setg(errp, "Unsupported image type '%s'", ct);
937 938
        ret = -ENOTSUP;
        goto exit;
939
    }
F
Fam Zheng 已提交
940
    s->create_type = g_strdup(ct);
941
    s->desc_offset = 0;
K
Kevin Wolf 已提交
942 943
    ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options,
                             errp);
944 945
exit:
    return ret;
946 947
}

M
Max Reitz 已提交
948 949
/*
 * Open a VMDK image.
 *
 * Opens the "file" child, reads the start of it with vmdk_read_desc() and
 * dispatches on the leading big-endian magic: VMDK3/VMDK4 magic means a
 * sparse extent with a binary header, anything else is handed to the text
 * descriptor path.  Afterwards the parent image is opened, both CIDs are
 * read, the state lock is initialised and a migration blocker is registered.
 *
 * On any failure after the descriptor was read, the descriptor buffer, the
 * stored create type and the extents are released before returning.
 */
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVVmdkState *state = bs->opaque;
    Error *blocker_err = NULL;
    uint32_t magic;
    char *desc;
    int rc;

    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
                               false, errp);
    if (bs->file == NULL) {
        return -EINVAL;
    }

    desc = vmdk_read_desc(bs->file, 0, errp);
    if (desc == NULL) {
        return -EINVAL;
    }

    magic = ldl_be_p(desc);
    if (magic == VMDK3_MAGIC || magic == VMDK4_MAGIC) {
        rc = vmdk_open_sparse(bs, bs->file, flags, desc, options, errp);
        /* desc_offset is recorded whether or not the sparse open succeeded */
        state->desc_offset = 0x200;
    } else {
        rc = vmdk_open_desc_file(bs, flags, desc, options, errp);
    }
    if (rc != 0) {
        goto fail;
    }

    /* Open the parent image, if there is one */
    rc = vmdk_parent_open(bs);
    if (rc != 0) {
        goto fail;
    }

    rc = vmdk_read_cid(bs, 0, &state->cid);
    if (rc != 0) {
        goto fail;
    }

    rc = vmdk_read_cid(bs, 1, &state->parent_cid);
    if (rc != 0) {
        goto fail;
    }

    qemu_co_mutex_init(&state->lock);

    /* Live migration is blocked while a VMDK image is in use */
    error_setg(&state->migration_blocker, "The vmdk format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    rc = migrate_add_blocker(state->migration_blocker, &blocker_err);
    if (blocker_err != NULL) {
        error_propagate(errp, blocker_err);
        error_free(state->migration_blocker);
        goto fail;
    }

    g_free(desc);
    return 0;

fail:
    g_free(desc);
    g_free(state->create_type);
    state->create_type = NULL;
    vmdk_free_extents(bs);
    return rc;
}

1021

1022
/*
 * Raise bs->bl.pwrite_zeroes_alignment to the largest cluster size (in
 * bytes) found among the non-flat extents.  Flat extents are ignored.
 */
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        if (state->extents[idx].flat) {
            continue;
        }
        bs->bl.pwrite_zeroes_alignment =
            MAX(bs->bl.pwrite_zeroes_alignment,
                state->extents[idx].cluster_sectors << BDRV_SECTOR_BITS);
    }
}

F
Fam Zheng 已提交
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045
/**
 * get_whole_cluster
 *
 * Copy backing file's cluster that covers @sector_num, otherwise write zero,
 * to the cluster at @cluster_sector_num.
 *
 * If @skip_start_sector < @skip_end_sector, the relative range
 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
 * it for call to write user data in the request.
 */
F
Fam Zheng 已提交
1046
static int get_whole_cluster(BlockDriverState *bs,
F
Fam Zheng 已提交
1047
                             VmdkExtent *extent,
1048 1049 1050 1051
                             uint64_t cluster_offset,
                             uint64_t offset,
                             uint64_t skip_start_bytes,
                             uint64_t skip_end_bytes)
1052
{
1053
    int ret = VMDK_OK;
F
Fam Zheng 已提交
1054 1055 1056 1057 1058
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
1059
    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
F
Fam Zheng 已提交
1060
    whole_grain = qemu_blockalign(bs, cluster_bytes);
1061

1062
    if (!bs->backing) {
1063 1064
        memset(whole_grain, 0, skip_start_bytes);
        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1065 1066
    }

1067
    assert(skip_end_bytes <= cluster_bytes);
1068 1069
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
1070
    if (bs->backing && !vmdk_is_cid_valid(bs)) {
F
Fam Zheng 已提交
1071 1072 1073
        ret = VMDK_ERROR;
        goto exit;
    }
1074

F
Fam Zheng 已提交
1075
    /* Read backing data before skip range */
1076
    if (skip_start_bytes > 0) {
1077
        if (bs->backing) {
1078
            ret = bdrv_pread(bs->backing, offset, whole_grain,
1079
                             skip_start_bytes);
F
Fam Zheng 已提交
1080 1081 1082 1083 1084
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
1085
        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
1086
                          skip_start_bytes);
K
Kevin Wolf 已提交
1087
        if (ret < 0) {
1088 1089
            ret = VMDK_ERROR;
            goto exit;
K
Kevin Wolf 已提交
1090
        }
F
Fam Zheng 已提交
1091 1092
    }
    /* Read backing data after skip range */
1093
    if (skip_end_bytes < cluster_bytes) {
1094
        if (bs->backing) {
1095
            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
1096 1097
                             whole_grain + skip_end_bytes,
                             cluster_bytes - skip_end_bytes);
F
Fam Zheng 已提交
1098 1099 1100 1101 1102
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
1103
        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
1104 1105
                          whole_grain + skip_end_bytes,
                          cluster_bytes - skip_end_bytes);
K
Kevin Wolf 已提交
1106
        if (ret < 0) {
1107 1108
            ret = VMDK_ERROR;
            goto exit;
1109 1110
        }
    }
F
Fam Zheng 已提交
1111

1112
    ret = VMDK_OK;
1113 1114 1115
exit:
    qemu_vfree(whole_grain);
    return ret;
1116 1117
}

F
Fam Zheng 已提交
1118 1119
/*
 * Store grain table entry @offset (converted to little endian) at the L2
 * slot described by @m_data.
 *
 * The entry is written synchronously to the primary L2 table.  If the extent
 * has a backup L1 table, m_data->l2_offset is replaced by the backup table's
 * L2 offset for m_data->l1_index and the entry is written there as well.
 * Finally the cached copy is updated when m_data->l2_cache_entry is set.
 *
 * Returns: VMDK_OK on success, VMDK_ERROR if a write fails.
 */
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    uint32_t gte_le = cpu_to_le32(offset);
    int64_t entry_pos;

    /* Primary L2 table */
    entry_pos = ((int64_t)m_data->l2_offset * 512)
                + (m_data->l2_index * sizeof(gte_le));
    if (bdrv_pwrite_sync(extent->file, entry_pos,
                         &gte_le, sizeof(gte_le)) < 0) {
        return VMDK_ERROR;
    }

    /* Backup L2 table, if the extent has one */
    if (extent->l1_backup_table_offset != 0) {
        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
        entry_pos = ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(gte_le));
        if (bdrv_pwrite_sync(extent->file, entry_pos,
                             &gte_le, sizeof(gte_le)) < 0) {
            return VMDK_ERROR;
        }
    }

    /* Keep the in-memory L2 cache coherent with what is on disk */
    if (m_data->l2_cache_entry != NULL) {
        *m_data->l2_cache_entry = gte_le;
    }

    return VMDK_OK;
}

F
Fam Zheng 已提交
1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165
/**
 * get_cluster_offset
 *
 * Look up cluster offset in extent file by sector number, and store in
 * @cluster_offset.
 *
 * For flat extents, the start offset as parsed from the description file is
 * returned.
 *
 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
 * offset for a new cluster and update L2 cache. If there is a backing file,
 * COW is done before returning; otherwise, zeroes are written to the allocated
 * cluster. Both COW and zero writing skips the sector range
 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
 * has new data to write there.
 *
 * Returns: VMDK_OK if cluster exists and mapped in the image.
 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
 *          VMDK_ERROR if failed.
 */
1166
static int get_cluster_offset(BlockDriverState *bs,
F
Fam Zheng 已提交
1167 1168 1169 1170 1171
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
1172 1173
                              uint64_t skip_start_bytes,
                              uint64_t skip_end_bytes)
B
bellard 已提交
1174 1175 1176
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
1177
    uint32_t min_count, *l2_table;
1178
    bool zeroed = false;
F
Fam Zheng 已提交
1179
    int64_t ret;
1180
    int64_t cluster_sector;
1181

F
Fam Zheng 已提交
1182
    if (m_data) {
1183
        m_data->valid = 0;
F
Fam Zheng 已提交
1184
    }
1185
    if (extent->flat) {
1186
        *cluster_offset = extent->flat_start_offset;
F
Fam Zheng 已提交
1187
        return VMDK_OK;
1188
    }
1189

F
Fam Zheng 已提交
1190
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
F
Fam Zheng 已提交
1191 1192
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
F
Fam Zheng 已提交
1193
        return VMDK_ERROR;
F
Fam Zheng 已提交
1194 1195 1196
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
F
Fam Zheng 已提交
1197
        return VMDK_UNALLOC;
F
Fam Zheng 已提交
1198
    }
1199
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1200
        if (l2_offset == extent->l2_cache_offsets[i]) {
B
bellard 已提交
1201
            /* increment the hit count */
F
Fam Zheng 已提交
1202
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1203
                for (j = 0; j < L2_CACHE_SIZE; j++) {
F
Fam Zheng 已提交
1204
                    extent->l2_cache_counts[j] >>= 1;
B
bellard 已提交
1205 1206
                }
            }
F
Fam Zheng 已提交
1207
            l2_table = extent->l2_cache + (i * extent->l2_size);
B
bellard 已提交
1208 1209 1210 1211 1212 1213
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
1214
    for (i = 0; i < L2_CACHE_SIZE; i++) {
F
Fam Zheng 已提交
1215 1216
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
B
bellard 已提交
1217 1218 1219
            min_index = i;
        }
    }
F
Fam Zheng 已提交
1220
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
1221
    if (bdrv_pread(extent->file,
F
Fam Zheng 已提交
1222 1223 1224 1225
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
F
Fam Zheng 已提交
1226
        return VMDK_ERROR;
F
Fam Zheng 已提交
1227
    }
1228

F
Fam Zheng 已提交
1229 1230
    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
B
bellard 已提交
1231
 found:
F
Fam Zheng 已提交
1232
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
F
Fam Zheng 已提交
1233
    cluster_sector = le32_to_cpu(l2_table[l2_index]);
1234

F
Fam Zheng 已提交
1235
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
1236 1237 1238
        zeroed = true;
    }

F
Fam Zheng 已提交
1239
    if (!cluster_sector || zeroed) {
1240
        if (!allocate) {
1241
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1242
        }
1243

F
Fam Zheng 已提交
1244 1245
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;
1246 1247 1248 1249 1250 1251

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
1252 1253
        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
                                offset, skip_start_bytes, skip_end_bytes);
F
Fam Zheng 已提交
1254 1255
        if (ret) {
            return ret;
1256
        }
1257 1258 1259 1260 1261 1262 1263
        if (m_data) {
            m_data->valid = 1;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->l2_cache_entry = &l2_table[l2_index];
        }
1264
    }
F
Fam Zheng 已提交
1265
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
F
Fam Zheng 已提交
1266
    return VMDK_OK;
B
bellard 已提交
1267 1268
}

F
Fam Zheng 已提交
1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285
/*
 * Return the first extent, starting the scan at @start_hint (or at the first
 * extent when @start_hint is NULL), whose end_sector lies beyond
 * @sector_num.  Returns NULL if no such extent is found.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *const past_last = &s->extents[s->num_extents];
    VmdkExtent *cur;

    for (cur = start_hint ? start_hint : &s->extents[0];
         cur < past_last;
         cur++) {
        if (sector_num < cur->end_sector) {
            return cur;
        }
    }
    return NULL;
}

1286 1287 1288
/*
 * Return the byte position of disk offset @offset inside its cluster: the
 * offset is first made relative to the start of @extent and then reduced
 * modulo the extent's cluster size in bytes.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    const uint64_t bytes_per_cluster =
        extent->cluster_sectors * BDRV_SECTOR_SIZE;
    const uint64_t extent_start =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
    const uint64_t offset_in_extent = offset - extent_start;

    return offset_in_extent % bytes_per_cluster;
}

1298 1299 1300
/*
 * Sector-granularity variant of vmdk_find_offset_in_cluster(): return the
 * index of sector @sector_num inside its cluster.
 */
static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
                                                  int64_t sector_num)
{
    uint64_t byte_pos =
        vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);

    return byte_pos / BDRV_SECTOR_SIZE;
}

1306
/*
 * Report the allocation status of the cluster containing @sector_num.
 *
 * The cluster is looked up (without allocating) under the state lock and the
 * lookup result is translated: VMDK_ERROR -> -EIO, VMDK_UNALLOC -> 0,
 * VMDK_ZEROED -> BDRV_BLOCK_ZERO, VMDK_OK -> BDRV_BLOCK_DATA.  For mapped
 * clusters of non-compressed extents the host offset is included as well,
 * and *file is set to the extent's file for every mapped cluster.
 *
 * *pnum receives the number of sectors up to the end of the cluster, capped
 * at @nb_sectors.  If no extent covers @sector_num, 0 is returned and *pnum
 * is not written.
 */
static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
    BDRVVmdkState *state = bs->opaque;
    VmdkExtent *extent = find_extent(state, sector_num, NULL);
    int64_t sector_in_cluster, remaining, status;
    uint64_t host_offset;

    if (extent == NULL) {
        return 0;
    }

    qemu_co_mutex_lock(&state->lock);
    status = get_cluster_offset(bs, extent, NULL,
                                sector_num * 512, false, &host_offset,
                                0, 0);
    qemu_co_mutex_unlock(&state->lock);

    sector_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);

    /* Any other lookup result is passed through unchanged */
    if (status == VMDK_ERROR) {
        status = -EIO;
    } else if (status == VMDK_UNALLOC) {
        status = 0;
    } else if (status == VMDK_ZEROED) {
        status = BDRV_BLOCK_ZERO;
    } else if (status == VMDK_OK) {
        status = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            status |= BDRV_BLOCK_OFFSET_VALID;
            status |= (host_offset + (sector_in_cluster << BDRV_SECTOR_BITS))
                      & BDRV_BLOCK_OFFSET_MASK;
        }
        *file = extent->file->bs;
    }

    remaining = extent->cluster_sectors - sector_in_cluster;
    if (remaining > nb_sectors) {
        remaining = nb_sectors;
    }
    *pnum = remaining;

    return status;
}

1354
/*
 * Write @n_bytes taken from @qiov at @qiov_offset into @extent at
 * @cluster_offset + @offset_in_cluster.
 *
 * For compressed extents the payload is deflated with zlib compress() and
 * written together with a grain marker whose lba is derived from @offset;
 * this requires the extent to have markers (-EINVAL otherwise, or if
 * compression fails).  For other extents the data is written as is.
 *
 * extent->next_cluster_sector is updated from the end of the write, also
 * when the write itself reports an error: it is set to the end sector for
 * compressed extents and raised to at least the end sector otherwise.
 *
 * Returns: 0 on success, negative errno on failure.
 */
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            uint64_t qiov_offset, uint64_t n_bytes,
                            uint64_t offset)
{
    int ret;
    VmdkGrainMarker *data = NULL;
    uLongf buf_len;
    QEMUIOVector local_qiov;
    struct iovec iov;
    int64_t write_offset;
    int64_t write_end_sector;

    if (extent->compressed) {
        void *plain_data;

        if (!extent->has_marker) {
            ret = -EINVAL;
            goto out;
        }
        /* Output buffer of twice the cluster size, plus the marker header */
        buf_len = (extent->cluster_sectors << 9) * 2;
        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));

        /* Linearize the request payload so compress() can consume it */
        plain_data = g_malloc(n_bytes);
        qemu_iovec_to_buf(qiov, qiov_offset, plain_data, n_bytes);
        ret = compress(data->data, &buf_len, plain_data, n_bytes);
        g_free(plain_data);

        if (ret != Z_OK || buf_len == 0) {
            ret = -EINVAL;
            goto out;
        }

        data->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
        data->size = cpu_to_le32(buf_len);

        /* From here on n_bytes is the on-disk size: marker + deflated data */
        n_bytes = buf_len + sizeof(VmdkGrainMarker);
        iov = (struct iovec) {
            .iov_base   = data,
            .iov_len    = n_bytes,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    } else {
        qemu_iovec_init(&local_qiov, qiov->niov);
        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
    }

    write_offset = cluster_offset + offset_in_cluster;
    ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes,
                          &local_qiov, 0);

    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);

    if (extent->compressed) {
        extent->next_cluster_sector = write_end_sector;
    } else {
        extent->next_cluster_sector = MAX(extent->next_cluster_sector,
                                          write_end_sector);
    }

    if (ret < 0) {
        goto out;
    }
    ret = 0;
 out:
    g_free(data);
    /* local_qiov owns memory only in the non-compressed case */
    if (!extent->compressed) {
        qemu_iovec_destroy(&local_qiov);
    }
    return ret;
}

/*
 * Read @bytes bytes at @offset_in_cluster of the cluster stored at
 * @cluster_offset in @extent into @qiov.
 *
 * Non-compressed extents are read directly.  For compressed extents, two
 * clusters' worth of bytes are read starting at @cluster_offset, the grain
 * is inflated with zlib uncompress() and the requested range is copied out
 * of the inflated data.  If the extent has markers, the compressed length is
 * taken from the grain marker at the start of the buffer.
 *
 * Returns: 0 on success, a negative errno from the read, or -EINVAL if the
 *          compressed length is zero or does not fit in the buffer, if
 *          inflating fails, or if the requested range lies outside the
 *          inflated data.
 */
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
                            int64_t offset_in_cluster, QEMUIOVector *qiov,
                            int bytes)
{
    int ret;
    int cluster_bytes, buf_bytes;
    uint8_t *cluster_buf, *compressed_data;
    uint8_t *uncomp_buf;
    uint32_t data_len;
    int64_t avail_bytes;
    VmdkGrainMarker *marker;
    uLongf buf_len;


    if (!extent->compressed) {
        ret = bdrv_co_preadv(extent->file,
                             cluster_offset + offset_in_cluster, bytes,
                             qiov, 0);
        if (ret < 0) {
            return ret;
        }
        return 0;
    }
    cluster_bytes = extent->cluster_sectors * 512;
    /* Read two clusters in case GrainMarker + compressed data > one cluster */
    buf_bytes = cluster_bytes * 2;
    cluster_buf = g_malloc(buf_bytes);
    uncomp_buf = g_malloc(cluster_bytes);
    ret = bdrv_pread(extent->file,
                cluster_offset,
                cluster_buf, buf_bytes);
    if (ret < 0) {
        goto out;
    }
    compressed_data = cluster_buf;
    buf_len = cluster_bytes;
    data_len = cluster_bytes;
    if (extent->has_marker) {
        marker = (VmdkGrainMarker *)cluster_buf;
        compressed_data = marker->data;
        data_len = le32_to_cpu(marker->size);
    }
    /*
     * The compressed payload may start after a marker header, so only the
     * rest of the buffer is available to it.  Comparing against the full
     * buffer size would let uncompress() read past the end of cluster_buf.
     */
    avail_bytes = (int64_t)buf_bytes - (compressed_data - cluster_buf);
    if (!data_len || avail_bytes <= 0 || data_len > avail_bytes) {
        ret = -EINVAL;
        goto out;
    }
    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
    if (ret != Z_OK) {
        ret = -EINVAL;
        goto out;

    }
    /* buf_len now holds the inflated size; the request must fit inside it */
    if (offset_in_cluster < 0 ||
            offset_in_cluster + bytes > buf_len) {
        ret = -EINVAL;
        goto out;
    }
    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
    ret = 0;

 out:
    g_free(uncomp_buf);
    g_free(cluster_buf);
    return ret;
}

1491 1492 1493
/*
 * Read @bytes bytes at disk offset @offset into @qiov.
 *
 * The request is processed one cluster at a time under the state lock.  For
 * each piece the cluster is looked up without allocating:
 *  - mapped clusters are read from the extent;
 *  - unmapped clusters are read from the backing file if there is one (after
 *    checking the parent CID), otherwise they read as zeroes;
 *  - zeroed grains read as zeroes;
 *  - a lookup error fails the request with -EIO instead of being treated as
 *    an unmapped cluster, which would silently return wrong data.
 *
 * Returns: 0 on success, negative errno on failure.
 */
static int coroutine_fn
vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
               QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n_bytes, offset_in_cluster;
    VmdkExtent *extent = NULL;
    QEMUIOVector local_qiov;
    uint64_t cluster_offset;
    uint64_t bytes_done = 0;

    qemu_iovec_init(&local_qiov, qiov->niov);
    qemu_co_mutex_lock(&s->lock);

    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            ret = -EIO;
            goto fail;
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 offset, false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            /* Metadata lookup failed: do not pretend the cluster is empty */
            ret = -EIO;
            goto fail;
        }
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);

        /* Never cross a cluster boundary within one iteration */
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        if (ret != VMDK_OK) {
            /* if not allocated, try to read from parent image, if exist */
            if (bs->backing && ret != VMDK_ZEROED) {
                if (!vmdk_is_cid_valid(bs)) {
                    ret = -EINVAL;
                    goto fail;
                }

                qemu_iovec_reset(&local_qiov);
                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
                                     &local_qiov, 0);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
            }
        } else {
            qemu_iovec_reset(&local_qiov);
            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);

            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
                                   &local_qiov, n_bytes);
            if (ret) {
                goto fail;
            }
        }
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
    }

    ret = 0;
fail:
    qemu_co_mutex_unlock(&s->lock);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

F
Fam Zheng 已提交
1561 1562 1563
/**
 * vmdk_write:
 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1564 1565 1566 1567
 *                if possible, otherwise return -ENOTSUP.
 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
 *                with each cluster. By dry run we can find if the zero write
 *                is possible without modifying image data.
F
Fam Zheng 已提交
1568 1569 1570
 *
 * Returns: error code with 0 for success.
 */
1571 1572 1573
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                       uint64_t bytes, QEMUIOVector *qiov,
                       bool zeroed, bool zero_dry_run)
B
bellard 已提交
1574
{
1575
    BDRVVmdkState *s = bs->opaque;
F
Fam Zheng 已提交
1576
    VmdkExtent *extent = NULL;
F
Fam Zheng 已提交
1577
    int ret;
1578
    int64_t offset_in_cluster, n_bytes;
1579
    uint64_t cluster_offset;
1580
    uint64_t bytes_done = 0;
F
Fam Zheng 已提交
1581
    VmdkMetaData m_data;
1582

1583 1584
    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
        error_report("Wrong offset: offset=0x%" PRIx64
1585
                     " total_sectors=0x%" PRIx64,
1586
                     offset, bs->total_sectors);
1587
        return -EIO;
1588 1589
    }

1590 1591
    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
F
Fam Zheng 已提交
1592 1593 1594
        if (!extent) {
            return -EIO;
        }
1595 1596 1597 1598 1599
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1600
                                 !(extent->compressed || zeroed),
1601 1602
                                 &cluster_offset, offset_in_cluster,
                                 offset_in_cluster + n_bytes);
F
Fam Zheng 已提交
1603
        if (extent->compressed) {
F
Fam Zheng 已提交
1604
            if (ret == VMDK_OK) {
F
Fam Zheng 已提交
1605
                /* Refuse write to allocated cluster for streamOptimized */
F
Fam Zheng 已提交
1606 1607
                error_report("Could not write to allocated cluster"
                              " for streamOptimized");
F
Fam Zheng 已提交
1608 1609 1610
                return -EIO;
            } else {
                /* allocate */
1611
                ret = get_cluster_offset(bs, extent, &m_data, offset,
F
Fam Zheng 已提交
1612
                                         true, &cluster_offset, 0, 0);
F
Fam Zheng 已提交
1613 1614
            }
        }
F
Fam Zheng 已提交
1615
        if (ret == VMDK_ERROR) {
1616
            return -EINVAL;
F
Fam Zheng 已提交
1617
        }
F
Fam Zheng 已提交
1618 1619 1620
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
1621 1622 1623
                    offset_in_cluster == 0 &&
                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
F
Fam Zheng 已提交
1624 1625
                if (!zero_dry_run) {
                    /* update L2 tables */
F
Fam Zheng 已提交
1626 1627
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
F
Fam Zheng 已提交
1628 1629 1630 1631 1632 1633 1634
                        return -EIO;
                    }
                }
            } else {
                return -ENOTSUP;
            }
        } else {
1635 1636
            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
                                    qiov, bytes_done, n_bytes, offset);
F
Fam Zheng 已提交
1637 1638 1639 1640 1641
            if (ret) {
                return ret;
            }
            if (m_data.valid) {
                /* update L2 tables */
F
Fam Zheng 已提交
1642 1643 1644
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
F
Fam Zheng 已提交
1645 1646
                    return -EIO;
                }
F
Fam Zheng 已提交
1647
            }
1648
        }
1649 1650 1651
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;
1652

F
Fam Zheng 已提交
1653 1654
        /* update CID on the first write every time the virtual disk is
         * opened */
1655
        if (!s->cid_updated) {
F
Fam Zheng 已提交
1656
            ret = vmdk_write_cid(bs, g_random_int());
K
Kevin Wolf 已提交
1657 1658 1659
            if (ret < 0) {
                return ret;
            }
1660
            s->cid_updated = true;
1661
        }
1662 1663
    }
    return 0;
B
bellard 已提交
1664 1665
}

1666 1667 1668
/*
 * Regular data write: runs vmdk_pwritev() with zeroed == false and
 * zero_dry_run == false while holding the state lock.
 */
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                QEMUIOVector *qiov, int flags)
{
    BDRVVmdkState *state = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&state->lock);
    rc = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&state->lock);

    return rc;
}

1678 1679 1680
/*
 * Compressed write entry point: forwards to vmdk_co_pwritev() with flags 0.
 */
static int coroutine_fn
vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                           uint64_t bytes, QEMUIOVector *qiov)
{
    int rc = vmdk_co_pwritev(bs, offset, bytes, qiov, 0);

    return rc;
}

1685 1686 1687 1688
/*
 * Zero a range using vmdk_pwritev() in zeroed mode, under the state lock.
 *
 * A zero write can fail part-way (e.g. when the range is not aligned to
 * clusters), so the request is first run as a dry run and only repeated for
 * real when the dry run succeeded.
 */
static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int bytes,
                                              BdrvRequestFlags flags)
{
    BDRVVmdkState *state = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&state->lock);

    /* Pass 1: dry run, checks every cluster of the range */
    rc = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
    if (rc == 0) {
        /* Pass 2: actually update the image */
        rc = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
    }

    qemu_co_mutex_unlock(&state->lock);
    return rc;
}

1704
static int vmdk_create_extent(const char *filename, int64_t filesize,
1705
                              bool flat, bool compress, bool zeroed_grain,
1706
                              QemuOpts *opts, Error **errp)
1707
{
F
Fam Zheng 已提交
1708
    int ret, i;
1709
    BlockBackend *blk = NULL;
1710
    VMDK4Header header;
F
Fam Zheng 已提交
1711
    Error *local_err = NULL;
1712 1713 1714
    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
    uint32_t *gd_buf = NULL;
    int gd_buf_size;
1715

1716
    ret = bdrv_create_file(filename, opts, &local_err);
1717 1718 1719
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto exit;
1720
    }
1721

1722
    blk = blk_new_open(filename, NULL, NULL,
1723 1724
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                       &local_err);
1725
    if (blk == NULL) {
1726
        error_propagate(errp, local_err);
1727
        ret = -EIO;
1728 1729 1730
        goto exit;
    }

1731 1732
    blk_set_allow_write_beyond_eof(blk, true);

F
Fam Zheng 已提交
1733
    if (flat) {
1734
        ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
F
Fam Zheng 已提交
1735
        goto exit;
1736
    }
1737 1738
    magic = cpu_to_be32(VMDK4_MAGIC);
    memset(&header, 0, sizeof(header));
1739 1740 1741 1742 1743 1744 1745
    if (compress) {
        header.version = 3;
    } else if (zeroed_grain) {
        header.version = 2;
    } else {
        header.version = 1;
    }
F
Fam Zheng 已提交
1746
    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1747 1748
                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1749
    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1750
    header.capacity = filesize / BDRV_SECTOR_SIZE;
A
Alexander Graf 已提交
1751
    header.granularity = 128;
1752
    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
1753

1754 1755 1756 1757 1758
    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                           BDRV_SECTOR_SIZE);
    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
1759 1760 1761 1762

    header.desc_offset = 1;
    header.desc_size = 20;
    header.rgd_offset = header.desc_offset + header.desc_size;
1763
    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
1764
    header.grain_offset =
1765 1766
        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                 header.granularity);
A
Alexander Graf 已提交
1767 1768 1769 1770 1771
    /* swap endianness for all header fields */
    header.version = cpu_to_le32(header.version);
    header.flags = cpu_to_le32(header.flags);
    header.capacity = cpu_to_le64(header.capacity);
    header.granularity = cpu_to_le64(header.granularity);
1772
    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
1773 1774 1775 1776 1777
    header.desc_offset = cpu_to_le64(header.desc_offset);
    header.desc_size = cpu_to_le64(header.desc_size);
    header.rgd_offset = cpu_to_le64(header.rgd_offset);
    header.gd_offset = cpu_to_le64(header.gd_offset);
    header.grain_offset = cpu_to_le64(header.grain_offset);
1778
    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1779 1780 1781 1782 1783

    header.check_bytes[0] = 0xa;
    header.check_bytes[1] = 0x20;
    header.check_bytes[2] = 0xd;
    header.check_bytes[3] = 0xa;
1784 1785

    /* write all the data */
1786
    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
1787
    if (ret < 0) {
1788
        error_setg(errp, QERR_IO_ERROR);
1789 1790
        goto exit;
    }
1791
    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
1792
    if (ret < 0) {
1793
        error_setg(errp, QERR_IO_ERROR);
1794 1795
        goto exit;
    }
1796

1797 1798
    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9,
                       PREALLOC_MODE_OFF, errp);
1799 1800 1801
    if (ret < 0) {
        goto exit;
    }
1802 1803

    /* write grain directory */
1804 1805 1806
    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
    gd_buf = g_malloc0(gd_buf_size);
    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
1807
         i < gt_count; i++, tmp += gt_size) {
1808 1809
        gd_buf[i] = cpu_to_le32(tmp);
    }
1810
    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
1811
                     gd_buf, gd_buf_size, 0);
1812
    if (ret < 0) {
1813
        error_setg(errp, QERR_IO_ERROR);
1814
        goto exit;
1815
    }
1816

1817
    /* write backup grain directory */
1818
    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
1819
         i < gt_count; i++, tmp += gt_size) {
1820 1821
        gd_buf[i] = cpu_to_le32(tmp);
    }
1822
    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
1823
                     gd_buf, gd_buf_size, 0);
1824
    if (ret < 0) {
1825
        error_setg(errp, QERR_IO_ERROR);
1826
        goto exit;
1827
    }
1828

F
Fam Zheng 已提交
1829
    ret = 0;
1830
exit:
1831 1832
    if (blk) {
        blk_unref(blk);
1833 1834
    }
    g_free(gd_buf);
F
Fam Zheng 已提交
1835 1836 1837 1838
    return ret;
}

/*
 * Split @filename into directory, base name and extension.
 *
 * @filename: name to split; must be non-NULL and non-empty
 * @path:     receives everything up to and including the last '/' (or, if
 *            there is none, the last '\\', or failing that the last ':');
 *            empty when no separator is present
 * @prefix:   receives the remaining name up to, but excluding, its last '.'
 * @postfix:  receives the extension starting at that last '.'; empty when
 *            the name contains no '.'
 * @buf_len:  size in bytes of each of the three output buffers
 * @errp:     receives an error description on failure
 *
 * Returns VMDK_OK on success and VMDK_ERROR if no filename was given or a
 * component does not fit into @buf_len bytes.  Every VMDK_ERROR return sets
 * @errp so that callers never fail silently.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
                              char *postfix, size_t buf_len, Error **errp)
{
    const char *p, *q;

    if (filename == NULL || !strlen(filename)) {
        error_setg(errp, "No filename provided");
        return VMDK_ERROR;
    }
    p = strrchr(filename, '/');
    if (p == NULL) {
        p = strrchr(filename, '\\');
    }
    if (p == NULL) {
        p = strrchr(filename, ':');
    }
    if (p != NULL) {
        /* Keep the separator as part of the directory component. */
        p++;
        if (p - filename >= buf_len) {
            error_setg(errp, "Path is too long");
            return VMDK_ERROR;
        }
        /* Size argument is length + 1 to leave room for the terminator. */
        pstrcpy(path, p - filename + 1, filename);
    } else {
        p = filename;
        path[0] = '\0';
    }
    q = strrchr(p, '.');
    if (q == NULL) {
        pstrcpy(prefix, buf_len, p);
        postfix[0] = '\0';
    } else {
        if (q - p >= buf_len) {
            error_setg(errp, "File name is too long");
            return VMDK_ERROR;
        }
        pstrcpy(prefix, q - p + 1, p);
        pstrcpy(postfix, buf_len, q);
    }
    return VMDK_OK;
}

1878
/*
 * Create a VMDK image.
 *
 * @filename: name of the image (descriptor) file
 * @opts:     creation options; size, adapter type, backing file, hardware
 *            version, compat6, subformat and zeroed-grain are consumed here
 * @errp:     receives an error description on failure
 *
 * Defaults: adapter type "ide", subformat "monolithicSparse", hardware
 * version "4" ("6" with compat6).  The geometry uses 16 heads for "ide" and
 * 255 for every other adapter type.  For the twoGbMaxExtent* subformats the
 * data is split into extents of at most 0x80000000 bytes.
 *
 * For the non-split sparse subformats the descriptor is embedded in the
 * single extent file at offset 0x200; otherwise a separate descriptor file
 * is created at @filename.
 *
 * Returns 0 on success and a non-zero (normally negative errno) value on
 * failure.
 */
static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
{
    int idx = 0;
    BlockBackend *new_blk = NULL;
    Error *local_err = NULL;
    char *desc = NULL;
    int64_t total_size = 0, filesize;
    char *adapter_type = NULL;
    char *backing_file = NULL;
    char *hw_version = NULL;
    char *fmt = NULL;
    int ret = 0;
    bool flat, split, compress;
    GString *ext_desc_lines;
    char *path = g_malloc0(PATH_MAX);
    char *prefix = g_malloc0(PATH_MAX);
    char *postfix = g_malloc0(PATH_MAX);
    char *desc_line = g_malloc0(BUF_SIZE);
    char *ext_filename = g_malloc0(PATH_MAX);
    char *desc_filename = g_malloc0(PATH_MAX);
    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
    const char *desc_extent_line;
    char *parent_desc_line = g_malloc0(BUF_SIZE);
    uint32_t parent_cid = 0xffffffff;
    uint32_t number_heads = 16;
    bool zeroed_grain = false;
    uint32_t desc_offset = 0, desc_len;
    const char desc_template[] =
        "# Disk DescriptorFile\n"
        "version=1\n"
        "CID=%" PRIx32 "\n"
        "parentCID=%" PRIx32 "\n"
        "createType=\"%s\"\n"
        "%s"
        "\n"
        "# Extent description\n"
        "%s"
        "\n"
        "# The Disk Data Base\n"
        "#DDB\n"
        "\n"
        "ddb.virtualHWVersion = \"%s\"\n"
        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
        "ddb.geometry.sectors = \"63\"\n"
        "ddb.adapterType = \"%s\"\n";

    ext_desc_lines = g_string_new(NULL);

    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
        ret = -EINVAL;
        goto exit;
    }
    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
        /* compat6 is shorthand for hwversion=6; reject conflicting input */
        if (strcmp(hw_version, "undefined")) {
            error_setg(errp,
                       "compat6 cannot be enabled with hwversion set");
            ret = -EINVAL;
            goto exit;
        }
        g_free(hw_version);
        hw_version = g_strdup("6");
    }
    if (strcmp(hw_version, "undefined") == 0) {
        g_free(hw_version);
        hw_version = g_strdup("4");
    }
    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
        zeroed_grain = true;
    }

    if (!adapter_type) {
        adapter_type = g_strdup("ide");
    } else if (strcmp(adapter_type, "ide") &&
               strcmp(adapter_type, "buslogic") &&
               strcmp(adapter_type, "lsilogic") &&
               strcmp(adapter_type, "legacyESX")) {
        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
        ret = -EINVAL;
        goto exit;
    }
    if (strcmp(adapter_type, "ide") != 0) {
        /* that's the number of heads with which vmware operates when
           creating, exporting, etc. vmdk files with a non-ide adapter type */
        number_heads = 255;
    }
    if (!fmt) {
        /* Default format to monolithicSparse */
        fmt = g_strdup("monolithicSparse");
    } else if (strcmp(fmt, "monolithicFlat") &&
               strcmp(fmt, "monolithicSparse") &&
               strcmp(fmt, "twoGbMaxExtentSparse") &&
               strcmp(fmt, "twoGbMaxExtentFlat") &&
               strcmp(fmt, "streamOptimized")) {
        error_setg(errp, "Unknown subformat: '%s'", fmt);
        ret = -EINVAL;
        goto exit;
    }
    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
              strcmp(fmt, "twoGbMaxExtentSparse"));
    flat = !(strcmp(fmt, "monolithicFlat") &&
             strcmp(fmt, "twoGbMaxExtentFlat"));
    compress = !strcmp(fmt, "streamOptimized");
    if (flat) {
        desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n";
    } else {
        desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n";
    }
    if (flat && backing_file) {
        error_setg(errp, "Flat image can't have backing file");
        ret = -ENOTSUP;
        goto exit;
    }
    if (flat && zeroed_grain) {
        error_setg(errp, "Flat image can't enable zeroed grain");
        ret = -ENOTSUP;
        goto exit;
    }
    if (backing_file) {
        BlockBackend *blk;
        char *full_backing = g_new0(char, PATH_MAX);
        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                     full_backing, PATH_MAX,
                                                     &local_err);
        if (local_err) {
            g_free(full_backing);
            error_propagate(errp, local_err);
            ret = -ENOENT;
            goto exit;
        }

        blk = blk_new_open(full_backing, NULL, NULL,
                           BDRV_O_NO_BACKING, errp);
        g_free(full_backing);
        if (blk == NULL) {
            ret = -EIO;
            goto exit;
        }
        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
            /* Report why we bail out; must happen before dropping blk. */
            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
                       blk_bs(blk)->drv->format_name);
            blk_unref(blk);
            ret = -EINVAL;
            goto exit;
        }
        ret = vmdk_read_cid(blk_bs(blk), 0, &parent_cid);
        blk_unref(blk);
        if (ret) {
            error_setg(errp, "Failed to read parent CID");
            goto exit;
        }
        snprintf(parent_desc_line, BUF_SIZE,
                "parentFileNameHint=\"%s\"", backing_file);
    }

    /* Create extents */
    filesize = total_size;
    while (filesize > 0) {
        int64_t size = filesize;

        if (split && size > split_size) {
            size = split_size;
        }
        if (split) {
            snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s",
                    prefix, flat ? 'f' : 's', ++idx, postfix);
        } else if (flat) {
            snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix);
        } else {
            snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix);
        }
        snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename);

        if (vmdk_create_extent(ext_filename, size,
                               flat, compress, zeroed_grain, opts, errp)) {
            ret = -EINVAL;
            goto exit;
        }
        filesize -= size;

        /* Format description line */
        snprintf(desc_line, BUF_SIZE,
                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
        g_string_append(ext_desc_lines, desc_line);
    }
    /* generate descriptor file */
    desc = g_strdup_printf(desc_template,
                           g_random_int(),
                           parent_cid,
                           fmt,
                           parent_desc_line,
                           ext_desc_lines->str,
                           hw_version,
                           total_size /
                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                           number_heads,
                           adapter_type);
    desc_len = strlen(desc);
    /* the descriptor offset = 0x200 */
    if (!split && !flat) {
        desc_offset = 0x200;
    } else {
        ret = bdrv_create_file(filename, opts, &local_err);
        if (ret < 0) {
            error_propagate(errp, local_err);
            goto exit;
        }
    }

    new_blk = blk_new_open(filename, NULL, NULL,
                           BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                           &local_err);
    if (new_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto exit;
    }

    blk_set_allow_write_beyond_eof(new_blk, true);

    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write description");
        goto exit;
    }
    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
     * for description file */
    if (desc_offset == 0) {
        ret = blk_truncate(new_blk, desc_len, PREALLOC_MODE_OFF, errp);
        if (ret < 0) {
            goto exit;
        }
    }
    /* Do not leak the non-negative result of the write as the return value. */
    ret = 0;
exit:
    if (new_blk) {
        blk_unref(new_blk);
    }
    g_free(adapter_type);
    g_free(backing_file);
    g_free(hw_version);
    g_free(fmt);
    g_free(desc);
    g_free(path);
    g_free(prefix);
    g_free(postfix);
    g_free(desc_line);
    g_free(ext_filename);
    g_free(desc_filename);
    g_free(parent_desc_line);
    g_string_free(ext_desc_lines, true);
    return ret;
}

B
bellard 已提交
2132
/*
 * Close callback: release the extents and the create-type string, then
 * remove and free the migration blocker stored in the driver state.
 */
static void vmdk_close(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;

    vmdk_free_extents(bs);
    g_free(state->create_type);

    /* Unregister the blocker before freeing the Error it points to. */
    migrate_del_blocker(state->migration_blocker);
    error_free(state->migration_blocker);
}

P
Paolo Bonzini 已提交
2143
/*
 * Flush callback: flush every extent's underlying node.
 *
 * All extents are flushed even if one of them fails.  Returns 0 when every
 * flush succeeded, otherwise the error code of the last failing flush.
 */
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int result = 0;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        int rc = bdrv_co_flush(state->extents[idx].file->bs);

        if (rc < 0) {
            result = rc;
        }
    }

    return result;
}

2158 2159 2160 2161 2162 2163 2164
/*
 * Sum the allocated size of the image file and of every extent file.
 *
 * Extents whose file is bs->file itself are skipped so that the image file
 * is not counted twice.  Returns the total, or the first negative value
 * reported by bdrv_get_allocated_file_size().
 */
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int64_t total;
    int idx;

    total = bdrv_get_allocated_file_size(bs->file->bs);
    if (total < 0) {
        return total;
    }

    for (idx = 0; idx < state->num_extents; idx++) {
        int64_t extent_size;

        if (state->extents[idx].file != bs->file) {
            extent_size =
                bdrv_get_allocated_file_size(state->extents[idx].file->bs);
            if (extent_size < 0) {
                return extent_size;
            }
            total += extent_size;
        }
    }

    return total;
}
2181

F
Fam Zheng 已提交
2182 2183 2184 2185 2186 2187 2188 2189 2190
/*
 * Report whether a freshly created image reads back as zeroes.
 *
 * Returns 0 as soon as a flat extent is found whose underlying node does not
 * have zero init, and 1 otherwise.
 */
static int vmdk_has_zero_init(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    int idx;

    for (idx = 0; idx < state->num_extents; idx++) {
        /* Only flat extents are checked against their storage. */
        if (state->extents[idx].flat &&
            !bdrv_has_zero_init(state->extents[idx].file->bs)) {
            return 0;
        }
    }

    return 1;
}

F
Fam Zheng 已提交
2199 2200 2201 2202 2203
/*
 * Build a newly allocated ImageInfo describing one extent.
 *
 * Filename and format strings are duplicated; all fields not set below stay
 * zero-initialised.  The caller owns the returned structure.
 */
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
{
    ImageInfo *info = g_new0(ImageInfo, 1);

    info->filename = g_strdup(extent->file->bs->filename);
    info->format = g_strdup(extent->type);
    info->virtual_size = extent->sectors * BDRV_SECTOR_SIZE;

    /* "compressed" is only reported when it is actually set. */
    info->compressed = extent->compressed;
    info->has_compressed = extent->compressed;

    /* A cluster size is only reported for non-flat extents. */
    info->cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
    info->has_cluster_size = !extent->flat;

    return info;
}

2216 2217 2218 2219 2220 2221
/*
 * Consistency check callback.
 *
 * Walks the image one cluster at a time and verifies that an extent exists
 * for each position, that its cluster offset can be looked up, and that an
 * allocated cluster does not start at or after the end of the extent file.
 * The first problem found is printed to stderr, counted as one corruption in
 * @result and ends the walk.
 *
 * Repairing is not implemented: any non-zero @fix yields -ENOTSUP.
 * Otherwise 0 is returned, whether or not a corruption was found.
 */
static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
                      BdrvCheckMode fix)
{
    BDRVVmdkState *state = bs->opaque;
    VmdkExtent *extent = NULL;
    int64_t sector_num = 0;
    int64_t total_sectors = bdrv_nb_sectors(bs);
    uint64_t cluster_offset;
    int ret;

    if (fix) {
        return -ENOTSUP;
    }

    while (sector_num < total_sectors) {
        extent = find_extent(state, sector_num, extent);
        if (!extent) {
            fprintf(stderr,
                    "ERROR: could not find extent for sector %" PRId64 "\n",
                    sector_num);
            goto corrupted;
        }

        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"
                    PRId64 "\n", sector_num);
            goto corrupted;
        }
        if (ret == VMDK_OK &&
            cluster_offset >= bdrv_getlength(extent->file->bs)) {
            fprintf(stderr,
                    "ERROR: cluster offset for sector %"
                    PRId64 " points after EOF\n", sector_num);
            goto corrupted;
        }

        sector_num += extent->cluster_sectors;
    }

    /* Reached the end of the image without finding a problem. */
    return 0;

corrupted:
    result->corruptions++;
    return 0;
}

F
Fam Zheng 已提交
2265 2266 2267 2268 2269 2270 2271 2272
/*
 * Build the format-specific image information for a VMDK image: create
 * type, CID, parent CID and one ImageInfo list entry per extent, in extent
 * order.  The caller owns the returned structure.
 */
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
{
    BDRVVmdkState *state = bs->opaque;
    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
    ImageInfoSpecificVmdk *vmdk_info = g_new0(ImageInfoSpecificVmdk, 1);
    ImageInfoList **tail;
    int idx;

    spec_info->type = IMAGE_INFO_SPECIFIC_KIND_VMDK;
    spec_info->u.vmdk.data = vmdk_info;

    vmdk_info->create_type = g_strdup(state->create_type);
    vmdk_info->cid = state->cid;
    vmdk_info->parent_cid = state->parent_cid;

    /* Append at the tail so the list keeps the order of the extents. */
    tail = &vmdk_info->extents;
    for (idx = 0; idx < state->num_extents; idx++) {
        ImageInfoList *node = g_new0(ImageInfoList, 1);

        node->value = vmdk_get_extent_info(&state->extents[idx]);
        node->next = NULL;

        *tail = node;
        tail = &node->next;
    }

    return spec_info;
}

2296 2297 2298 2299 2300 2301 2302
/*
 * Two extents are of the same type when they agree on flat and compressed
 * and, for non-flat extents, also on the cluster size.
 */
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
{
    if (a->flat != b->flat) {
        return false;
    }
    if (a->compressed != b->compressed) {
        return false;
    }
    /* The cluster size is irrelevant for flat extents. */
    return a->flat || a->cluster_sectors == b->cluster_sectors;
}

F
Fam Zheng 已提交
2303 2304 2305 2306 2307
/*
 * Fill in driver information for the image.
 *
 * Returns -ENOTSUP when the extents are not all of the same type as the
 * first one.  Otherwise needs_compressed_writes is taken from the first
 * extent and, unless that extent is flat, cluster_size is set from it;
 * the function then returns 0.  At least one extent must exist.
 */
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVVmdkState *state = bs->opaque;
    const VmdkExtent *first;
    int idx;

    assert(state->num_extents);
    first = &state->extents[0];

    /* Mixed extent types cannot be described by a single info record. */
    for (idx = 1; idx < state->num_extents; idx++) {
        if (!vmdk_extents_type_eq(first, &state->extents[idx])) {
            return -ENOTSUP;
        }
    }

    bdi->needs_compressed_writes = first->compressed;
    if (!first->flat) {
        bdi->cluster_size = first->cluster_sectors << BDRV_SECTOR_BITS;
    }

    return 0;
}

2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347
/* Options understood when creating a VMDK image. */
static QemuOptsList vmdk_create_opts = {
    .name = "vmdk-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
    .desc = {
        /* virtual disk size */
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size",
        },
        /* emulated adapter recorded in the descriptor */
        {
            .name = BLOCK_OPT_ADAPTER_TYPE,
            .type = QEMU_OPT_STRING,
            .help = "Virtual adapter type, can be one of "
                    "ide (default), lsilogic, buslogic or legacyESX",
        },
        /* backing (parent) image */
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image",
        },
        /* compat6 switch, off by default */
        {
            .name = BLOCK_OPT_COMPAT6,
            .type = QEMU_OPT_BOOL,
            .help = "VMDK version 6 image",
            .def_value_str = "off",
        },
        /* hardware version; "undefined" marks "not given by the user" */
        {
            .name = BLOCK_OPT_HWVERSION,
            .type = QEMU_OPT_STRING,
            .help = "VMDK hardware version",
            .def_value_str = "undefined",
        },
        /* subformat (create type) */
        {
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "VMDK flat extent format, can be one of "
                "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} ",
        },
        /* zeroed-grain support */
        {
            .name = BLOCK_OPT_ZEROED_GRAIN,
            .type = QEMU_OPT_BOOL,
            .help = "Enable efficient zero writes "
                    "using the zeroed-grain GTE feature",
        },
        { /* end of list */ }
    }
};

2371
static BlockDriver bdrv_vmdk = {
F
Fam Zheng 已提交
2372 2373 2374 2375
    .format_name                  = "vmdk",
    .instance_size                = sizeof(BDRVVmdkState),
    .bdrv_probe                   = vmdk_probe,
    .bdrv_open                    = vmdk_open,
2376
    .bdrv_check                   = vmdk_check,
F
Fam Zheng 已提交
2377
    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
2378
    .bdrv_child_perm              = bdrv_format_default_perms,
2379
    .bdrv_co_preadv               = vmdk_co_preadv,
2380
    .bdrv_co_pwritev              = vmdk_co_pwritev,
2381
    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
2382
    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
F
Fam Zheng 已提交
2383
    .bdrv_close                   = vmdk_close,
C
Chunyan Liu 已提交
2384
    .bdrv_create                  = vmdk_create,
F
Fam Zheng 已提交
2385
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
2386
    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
F
Fam Zheng 已提交
2387 2388
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
F
Fam Zheng 已提交
2389
    .bdrv_get_specific_info       = vmdk_get_specific_info,
2390
    .bdrv_refresh_limits          = vmdk_refresh_limits,
F
Fam Zheng 已提交
2391
    .bdrv_get_info                = vmdk_get_info,
F
Fam Zheng 已提交
2392

2393
    .supports_backing             = true,
2394
    .create_opts                  = &vmdk_create_opts,
B
bellard 已提交
2395
};
2396 2397 2398 2399 2400 2401 2402

/* Register the VMDK driver definition (bdrv_vmdk) via bdrv_register(). */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/* Hand the registration function to the block_init() macro. */
block_init(bdrv_vmdk_init);