/*
 * Block driver for RAW files (posix)
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24

P
Peter Maydell 已提交
25
#include "qemu/osdep.h"
26
#include "qapi/error.h"
27
#include "qemu/cutils.h"
28
#include "qemu/error-report.h"
29
#include "block/block_int.h"
30
#include "qemu/module.h"
31
#include "qemu/option.h"
32
#include "trace.h"
33
#include "block/thread-pool.h"
34
#include "qemu/iov.h"
35
#include "block/raw-aio.h"
36
#include "qapi/qmp/qdict.h"
37
#include "qapi/qmp/qstring.h"
B
bellard 已提交
38

39 40 41
#include "scsi/pr-manager.h"
#include "scsi/constants.h"

42
#if defined(__APPLE__) && (__MACH__)
B
bellard 已提交
43 44 45 46 47 48 49 50
#include <paths.h>
#include <sys/param.h>
#include <IOKit/IOKitLib.h>
#include <IOKit/IOBSD.h>
#include <IOKit/storage/IOMediaBSDClient.h>
#include <IOKit/storage/IOMedia.h>
#include <IOKit/storage/IOCDMedia.h>
//#include <IOKit/storage/IOCDTypes.h>
51
#include <IOKit/storage/IODVDMedia.h>
B
bellard 已提交
52 53 54 55
#include <CoreFoundation/CoreFoundation.h>
#endif

#ifdef __sun__
56
#define _POSIX_PTHREAD_SEMANTICS 1
B
bellard 已提交
57 58
#include <sys/dkio.h>
#endif
B
bellard 已提交
59 60
#ifdef __linux__
#include <sys/ioctl.h>
61
#include <sys/param.h>
62
#include <sys/syscall.h>
B
bellard 已提交
63 64
#include <linux/cdrom.h>
#include <linux/fd.h>
65
#include <linux/fs.h>
66
#include <linux/hdreg.h>
67
#include <scsi/sg.h>
68 69 70
#ifdef __s390__
#include <asm/dasd.h>
#endif
71 72 73
#ifndef FS_NOCOW_FL
#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
#endif
74
#endif
75
#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
76 77
#include <linux/falloc.h>
#endif
A
Aurelien Jarno 已提交
78
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
79
#include <sys/disk.h>
B
blueswir1 已提交
80
#include <sys/cdio.h>
81
#endif
B
bellard 已提交
82

83 84 85 86 87 88
#ifdef __OpenBSD__
#include <sys/ioctl.h>
#include <sys/disklabel.h>
#include <sys/dkio.h>
#endif

89 90 91 92 93 94 95
#ifdef __NetBSD__
#include <sys/ioctl.h>
#include <sys/disklabel.h>
#include <sys/dkio.h>
#include <sys/disk.h>
#endif

96 97 98 99 100
#ifdef __DragonFly__
#include <sys/ioctl.h>
#include <sys/diskslice.h>
#endif

C
Christoph Hellwig 已提交
101 102 103 104
#ifdef CONFIG_XFS
#include <xfs/xfs.h>
#endif

P
pbrook 已提交
105
//#define DEBUG_BLOCK
106 107 108

#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
T
ths 已提交
109
#else
110
# define DEBUG_BLOCK_PRINT 0
T
ths 已提交
111
#endif
112 113 114 115 116 117
/*
 * Debug trace helper: forwards its arguments to printf() only when
 * DEBUG_BLOCK_PRINT is non-zero.  The call is guarded by an ordinary `if`
 * instead of being preprocessed away, so the format string and arguments
 * are still seen by the compiler when debugging is disabled.
 */
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)
T
ths 已提交
118

A
aliguori 已提交
119 120
/* OS X does not have O_DSYNC */
#ifndef O_DSYNC
121
#ifdef O_SYNC
122
#define O_DSYNC O_SYNC
123 124 125
#elif defined(O_FSYNC)
#define O_DSYNC O_FSYNC
#endif
A
aliguori 已提交
126 127
#endif

128 129 130 131 132
/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
#ifndef O_DIRECT
#define O_DIRECT O_DSYNC
#endif

B
bellard 已提交
133 134
#define FTYPE_FILE   0
#define FTYPE_CD     1
B
bellard 已提交
135

136 137
#define MAX_BLOCKSIZE	4096

138 139 140 141 142
/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 * leaving a few more bytes for its future use. */
#define RAW_LOCK_PERM_BASE             100
#define RAW_LOCK_SHARED_BASE           200

B
bellard 已提交
143 144
typedef struct BDRVRawState {
    int fd;
145 146
    int lock_fd;
    bool use_lock;
B
bellard 已提交
147
    int type;
148
    int open_flags;
149 150
    size_t buf_align;

151 152 153 154
    /* The current permissions. */
    uint64_t perm;
    uint64_t shared_perm;

C
Christoph Hellwig 已提交
155
#ifdef CONFIG_XFS
156
    bool is_xfs:1;
C
Christoph Hellwig 已提交
157
#endif
158
    bool has_discard:1;
159
    bool has_write_zeroes:1;
160
    bool discard_zeroes:1;
161
    bool use_linux_aio:1;
162
    bool page_cache_inconsistent:1;
163
    bool has_fallocate;
164
    bool needs_alignment;
165
    bool check_cache_dropped;
166 167

    PRManager *pr_mgr;
B
bellard 已提交
168 169
} BDRVRawState;

J
Jeff Cody 已提交
170 171 172
/*
 * State prepared by raw_reopen_prepare() and consumed by
 * raw_reopen_commit() / raw_reopen_abort() (stored in
 * BDRVReopenState.opaque).
 */
typedef struct BDRVRawReopenState {
    /* Descriptor to switch to on commit; -1 while none is available. */
    int fd;
    /* Host open() flags matching the new block layer flags. */
    int open_flags;
    /* New value of the "x-check-cache-dropped" option. */
    bool check_cache_dropped;
} BDRVRawReopenState;

B
bellard 已提交
176
static int fd_open(BlockDriverState *bs);
177
static int64_t raw_getlength(BlockDriverState *bs);
B
bellard 已提交
178

179 180 181 182 183 184 185 186
typedef struct RawPosixAIOData {
    BlockDriverState *bs;
    int aio_fildes;
    union {
        struct iovec *aio_iov;
        void *aio_ioctl_buf;
    };
    int aio_niov;
P
Paolo Bonzini 已提交
187
    uint64_t aio_nbytes;
188 189 190
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;
    int aio_type;
191 192 193 194 195 196 197 198 199 200
    union {
        struct {
            int aio_fd2;
            off_t aio_offset2;
        };
        struct {
            PreallocMode prealloc;
            Error **errp;
        };
    };
201 202
} RawPosixAIOData;

A
Aurelien Jarno 已提交
203
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
204
static int cdrom_reopen(BlockDriverState *bs);
B
blueswir1 已提交
205 206
#endif

207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
#if defined(__NetBSD__)
/*
 * If *@filename names a block device, replace it with the same path whose
 * last component is prefixed with 'r'.  Any other kind of file is left
 * untouched.
 *
 * The rewritten name lives in a static buffer, so it is overwritten by the
 * next call and the function is not reentrant.
 *
 * Returns 0 on success, -errno if the file cannot be examined, and
 * -ENAMETOOLONG if the rewritten name does not fit in PATH_MAX bytes.
 */
static int raw_normalize_devicepath(const char **filename)
{
    static char namebuf[PATH_MAX];
    const char *dp, *fname;
    struct stat sb;
    int len;

    fname = *filename;
    dp = strrchr(fname, '/');
    if (lstat(fname, &sb) < 0) {
        /* fprintf() and strerror() may modify errno, so latch it first. */
        int saved_errno = errno;

        fprintf(stderr, "%s: stat failed: %s\n",
            fname, strerror(saved_errno));
        return -saved_errno;
    }

    if (!S_ISBLK(sb.st_mode)) {
        return 0;
    }

    if (dp == NULL) {
        len = snprintf(namebuf, PATH_MAX, "r%s", fname);
    } else {
        len = snprintf(namebuf, PATH_MAX, "%.*s/r%s",
            (int)(dp - fname), fname, dp + 1);
    }
    /* Refuse to hand back a silently truncated (i.e. different) path. */
    if (len < 0 || len >= PATH_MAX) {
        return -ENAMETOOLONG;
    }
    fprintf(stderr, "%s is a block device", fname);
    *filename = namebuf;
    fprintf(stderr, ", using %s\n", *filename);

    return 0;
}
#else
/* No device path rewriting is needed on this host: always succeeds. */
static int raw_normalize_devicepath(const char **filename)
{
    return 0;
}
#endif

245 246 247 248
/*
 * Get logical block size via ioctl. On success store it in @sector_size_p.
 *
 * Every ioctl known for the build host is tried in turn; if more than one
 * succeeds, the value reported by the last successful one is kept.
 *
 * Returns 0 if at least one ioctl succeeded.  Otherwise returns -errno,
 * which is -ENOTSUP when no ioctl is available in this build.
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    unsigned int sector_size;
    bool success = false;
    int i;

    /* Reported as-is when the list below is empty. */
    errno = ENOTSUP;
    static const unsigned long ioctl_list[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };

    /* Try a few ioctls to get the right size */
    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
            *sector_size_p = sector_size;
            success = true;
        }
    }

    return success ? 0 : -errno;
}

278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
/**
 * Query the physical block size of the device behind @fd.
 *
 * Returns 0 and fills in @blk_size when the BLKPBSZGET ioctl succeeds,
 * -errno when the ioctl fails, and -ENOTSUP when the ioctl is not
 * available in this build.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifdef BLKPBSZGET
    return ioctl(fd, BLKPBSZGET, blk_size) < 0 ? -errno : 0;
#else
    return -ENOTSUP;
#endif
}

295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
/* Probe whether a read of @len bytes at offset 0 into @buf is accepted.
 *
 * Used to find out which buffer addresses and request lengths an O_DIRECT
 * file descriptor is willing to take.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) < 0) {
#ifdef __linux__
        /* Only EINVAL is taken as a misalignment verdict on Linux; any
         * other failure (e.g. a real I/O error on a failing drive) says
         * nothing about alignment and is therefore not held against it.
         */
        return errno != EINVAL;
#else
        return false;
#endif
    }

    return true;
}

320 321 322 323
/*
 * Determine the alignment constraints for I/O on @fd and store them in
 * bs->bl.request_alignment (request offset/length) and s->buf_align
 * (memory buffer address).
 *
 * When no alignment is needed both are set to 1.  Otherwise the request
 * alignment is taken from the logical block size (or the XFS DIOINFO
 * ioctl) where available, and anything still unknown is guessed by issuing
 * trial reads with increasing power-of-two alignments from 512 up to
 * max(MAX_BLOCKSIZE, page size).
 *
 * Sets @errp if no working alignment could be found.
 */
static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    char *buf;
    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());

    /* For SCSI generic devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
    if (bdrv_is_sg(bs) || !s->needs_alignment) {
        bs->bl.request_alignment = 1;
        s->buf_align = 1;
        return;
    }

    bs->bl.request_alignment = 0;
    s->buf_align = 0;
    /* Let's try to use the logical blocksize for the alignment. */
    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
        bs->bl.request_alignment = 0;
    }
#ifdef CONFIG_XFS
    if (s->is_xfs) {
        struct dioattr da;
        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
            bs->bl.request_alignment = da.d_miniosz;
            /* The kernel returns wrong information for d_mem */
            /* s->buf_align = da.d_mem; */
        }
    }
#endif

    /* If we could not get the sizes so far, we can only guess them */
    if (!s->buf_align) {
        size_t align;
        /*
         * The buffer is twice max_align so that a max_align-sized read
         * still fits when started at any probed offset into it.
         */
        buf = qemu_memalign(max_align, 2 * max_align);
        for (align = 512; align <= max_align; align <<= 1) {
            if (raw_is_io_aligned(fd, buf + align, max_align)) {
                s->buf_align = align;
                break;
            }
        }
        qemu_vfree(buf);
    }

    if (!bs->bl.request_alignment) {
        size_t align;
        /*
         * NOTE(review): s->buf_align is still 0 here if the probe above
         * failed - confirm qemu_memalign() tolerates a zero alignment.
         */
        buf = qemu_memalign(s->buf_align, max_align);
        for (align = 512; align <= max_align; align <<= 1) {
            if (raw_is_io_aligned(fd, buf, align)) {
                bs->bl.request_alignment = align;
                break;
            }
        }
        qemu_vfree(buf);
    }

    if (!s->buf_align || !bs->bl.request_alignment) {
        error_setg(errp, "Could not find working O_DIRECT alignment");
        error_append_hint(errp, "Try cache.direct=off\n");
    }
}

382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
/*
 * Translate block layer flags (@bdrv_flags) into host open() flags.
 *
 * *@open_flags is updated in place: O_BINARY is added, the access mode is
 * replaced by O_RDWR or O_RDONLY depending on BDRV_O_RDWR, and O_DIRECT is
 * added when BDRV_O_NOCACHE is requested.  All other bits already present
 * in *@open_flags are kept.
 */
static void raw_parse_flags(int bdrv_flags, int *open_flags)
{
    int flags;

    assert(open_flags != NULL);

    /* Start from the caller's flags with the access mode cleared. */
    flags = (*open_flags | O_BINARY) & ~O_ACCMODE;
    flags |= (bdrv_flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY;

    /* Bypass the host page cache when caching is disabled. */
    if (bdrv_flags & BDRV_O_NOCACHE) {
        flags |= O_DIRECT;
    }

    *open_flags = flags;
}

401 402 403
/*
 * Filename parser of the driver: delegates to
 * bdrv_parse_filename_strip_prefix() with the "file:" protocol prefix.
 * @errp is not used here.
 */
static void raw_parse_filename(const char *filename, QDict *options,
                               Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "file:", options);
}

407 408 409 410 411 412 413 414 415
/*
 * Runtime options understood by the driver.  They are absorbed from the
 * options QDict in raw_open_common() and raw_reopen_prepare().
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native)",
        },
        {
            .name = "locking",
            .type = QEMU_OPT_STRING,
            .help = "file locking mode (on/off/auto, default: auto)",
        },
        {
            .name = "pr-manager",
            .type = QEMU_OPT_STRING,
            .help = "id of persistent reservation manager object (default: none)",
        },
        {
            .name = "x-check-cache-dropped",
            .type = QEMU_OPT_BOOL,
            .help = "check that page cache was dropped on live migration (default: off)"
        },
        { /* end of list */ }
    },
};

/*
 * Common open path of the driver.
 *
 * Parses the runtime options in @options, opens the file (plus a second
 * descriptor for locking when locking is enabled), validates the file type
 * against @device and fills in the capability fields of the BDRVRawState.
 *
 * @bdrv_flags: block layer BDRV_O_* flags
 * @open_flags: extra host open() flags to start from
 * @device:     true if a character or block device is expected, false if a
 *              regular file is expected (devices are then only tolerated
 *              with a deprecation warning)
 *
 * Returns 0 on success or a negative errno value with @errp set.  On
 * failure any descriptor opened here is closed again and s->fd /
 * s->lock_fd are left at -1.  With BDRV_O_TEMPORARY the file is unlinked
 * before returning, whether or not the open succeeded.
 */
static int raw_open_common(BlockDriverState *bs, QDict *options,
                           int bdrv_flags, int open_flags,
                           bool device, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename = NULL;
    const char *str;
    BlockdevAioOptions aio, aio_default;
    int fd, ret;
    struct stat st;
    OnOffAuto locking;

    /*
     * Mark both descriptors as unopened up front so that the failure path
     * only ever closes what this function opened.
     */
    s->fd = -1;
    s->lock_fd = -1;

    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    filename = qemu_opt_get(opts, "filename");

    ret = raw_normalize_devicepath(&filename);
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Could not normalize device path");
        goto fail;
    }

    /* The "aio" option overrides the default implied by the open flags. */
    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
                  : BLOCKDEV_AIO_OPTIONS_THREADS;
    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
                          qemu_opt_get(opts, "aio"),
                          aio_default, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }
    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);

    locking = qapi_enum_parse(&OnOffAuto_lookup,
                              qemu_opt_get(opts, "locking"),
                              ON_OFF_AUTO_AUTO, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }
    switch (locking) {
    case ON_OFF_AUTO_ON:
        s->use_lock = true;
        if (!qemu_has_ofd_lock()) {
            fprintf(stderr,
                    "File lock requested but OFD locking syscall is "
                    "unavailable, falling back to POSIX file locks.\n"
                    "Due to the implementation, locks can be lost "
                    "unexpectedly.\n");
        }
        break;
    case ON_OFF_AUTO_OFF:
        s->use_lock = false;
        break;
    case ON_OFF_AUTO_AUTO:
        /* Only lock by default when OFD locks are available. */
        s->use_lock = qemu_has_ofd_lock();
        break;
    default:
        abort();
    }

    str = qemu_opt_get(opts, "pr-manager");
    if (str) {
        s->pr_mgr = pr_manager_lookup(str, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
    }

    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
                                               false);

    s->open_flags = open_flags;
    raw_parse_flags(bdrv_flags, &s->open_flags);

    fd = qemu_open(filename, s->open_flags, 0644);
    if (fd < 0) {
        ret = -errno;
        error_setg_errno(errp, errno, "Could not open '%s'", filename);
        if (ret == -EROFS) {
            ret = -EACCES;
        }
        goto fail;
    }
    s->fd = fd;

    if (s->use_lock) {
        fd = qemu_open(filename, s->open_flags);
        if (fd < 0) {
            ret = -errno;
            error_setg_errno(errp, errno, "Could not open '%s' for locking",
                             filename);
            /* s->fd is released by the common failure path. */
            goto fail;
        }
        s->lock_fd = fd;
    }
    s->perm = 0;
    s->shared_perm = BLK_PERM_ALL;

#ifdef CONFIG_LINUX_AIO
     /* Currently Linux does AIO only for files opened with O_DIRECT */
    if (s->use_linux_aio) {
        if (!(s->open_flags & O_DIRECT)) {
            error_setg(errp, "aio=native was specified, but it requires "
                             "cache.direct=on, which was not specified.");
            ret = -EINVAL;
            goto fail;
        }
        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
            error_prepend(errp, "Unable to use native AIO: ");
            /* ret is still 0 from the steps above: report a failure. */
            ret = -EINVAL;
            goto fail;
        }
    }
#else
    if (s->use_linux_aio) {
        error_setg(errp, "aio=native was specified, but is not supported "
                         "in this build.");
        ret = -EINVAL;
        goto fail;
    }
#endif /* !defined(CONFIG_LINUX_AIO) */

    s->has_discard = true;
    s->has_write_zeroes = true;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
        s->needs_alignment = true;
    }

    if (fstat(s->fd, &st) < 0) {
        ret = -errno;
        error_setg_errno(errp, errno, "Could not stat file");
        goto fail;
    }

    if (!device) {
        if (S_ISBLK(st.st_mode)) {
            warn_report("Opening a block device as a file using the '%s' "
                        "driver is deprecated", bs->drv->format_name);
        } else if (S_ISCHR(st.st_mode)) {
            warn_report("Opening a character device as a file using the '%s' "
                        "driver is deprecated", bs->drv->format_name);
        } else if (!S_ISREG(st.st_mode)) {
            error_setg(errp, "A regular file was expected by the '%s' driver, "
                       "but something else was given", bs->drv->format_name);
            ret = -EINVAL;
            goto fail;
        } else {
            s->discard_zeroes = true;
            s->has_fallocate = true;
        }
    } else {
        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
            error_setg(errp, "'%s' driver expects either "
                       "a character or block device", bs->drv->format_name);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (S_ISBLK(st.st_mode)) {
#ifdef BLKDISCARDZEROES
        unsigned int arg;
        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
            s->discard_zeroes = true;
        }
#endif
#ifdef __linux__
        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
         * not rely on the contents of discarded blocks unless using O_DIRECT.
         * Same for BLKZEROOUT.
         */
        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
            s->discard_zeroes = false;
            s->has_write_zeroes = false;
        }
#endif
    }
#ifdef __FreeBSD__
    if (S_ISCHR(st.st_mode)) {
        /*
         * The file is a char device (disk), which on FreeBSD isn't behind
         * a pager, so force all requests to be aligned. This is needed
         * so QEMU makes sure all IO operations on the device are aligned
         * to sector size, or else FreeBSD will reject them with EINVAL.
         */
        s->needs_alignment = true;
    }
#endif

#ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
        s->is_xfs = true;
    }
#endif

    bs->supported_zero_flags = s->discard_zeroes ? BDRV_REQ_MAY_UNMAP : 0;
    ret = 0;
fail:
    if (ret < 0) {
        /* Do not leak the descriptors opened above. */
        if (s->lock_fd != -1) {
            qemu_close(s->lock_fd);
            s->lock_fd = -1;
        }
        if (s->fd != -1) {
            qemu_close(s->fd);
            s->fd = -1;
        }
    }
    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
        unlink(filename);
    }
    qemu_opts_del(opts);
    return ret;
}

M
Max Reitz 已提交
661 662
/*
 * Open callback for regular files: marks the state as FTYPE_FILE and
 * defers to raw_open_common() with no extra open flags and
 * device == false.  Returns whatever raw_open_common() returns.
 */
static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
{
    BDRVRawState *s = bs->opaque;

    s->type = FTYPE_FILE;
    return raw_open_common(bs, options, flags, 0, false, errp);
}

670 671 672 673 674 675 676 677 678 679 680 681 682
/* Phases of a permission lock update, see raw_handle_perm_lock(). */
typedef enum {
    RAW_PL_PREPARE,
    RAW_PL_COMMIT,
    RAW_PL_ABORT,
} RawPermLockOp;

/*
 * Iterate @i over the bit positions 0, 1, ... for as long as the
 * corresponding single-bit mask does not exceed BLK_PERM_ALL.
 * @i is parenthesized in every use so that any lvalue expression
 * (e.g. *p) can be passed.
 */
#define PERM_FOREACH(i) \
    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; (i)++)

/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 * file; if @unlock == true, also unlock the unneeded bytes.
 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 */
683
static int raw_apply_lock_bytes(int fd,
684 685 686 687 688 689 690 691 692 693
                                uint64_t perm_lock_bits,
                                uint64_t shared_perm_lock_bits,
                                bool unlock, Error **errp)
{
    int ret;
    int i;

    PERM_FOREACH(i) {
        int off = RAW_LOCK_PERM_BASE + i;
        if (perm_lock_bits & (1ULL << i)) {
694
            ret = qemu_lock_fd(fd, off, 1, false);
695 696 697 698 699
            if (ret) {
                error_setg(errp, "Failed to lock byte %d", off);
                return ret;
            }
        } else if (unlock) {
700
            ret = qemu_unlock_fd(fd, off, 1);
701 702 703 704 705 706 707 708 709
            if (ret) {
                error_setg(errp, "Failed to unlock byte %d", off);
                return ret;
            }
        }
    }
    PERM_FOREACH(i) {
        int off = RAW_LOCK_SHARED_BASE + i;
        if (shared_perm_lock_bits & (1ULL << i)) {
710
            ret = qemu_lock_fd(fd, off, 1, false);
711 712 713 714 715
            if (ret) {
                error_setg(errp, "Failed to lock byte %d", off);
                return ret;
            }
        } else if (unlock) {
716
            ret = qemu_unlock_fd(fd, off, 1);
717 718 719 720 721 722 723 724 725 726
            if (ret) {
                error_setg(errp, "Failed to unlock byte %d", off);
                return ret;
            }
        }
    }
    return 0;
}

/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
727
static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
728 729 730 731 732 733 734 735 736
                                Error **errp)
{
    int ret;
    int i;

    PERM_FOREACH(i) {
        int off = RAW_LOCK_SHARED_BASE + i;
        uint64_t p = 1ULL << i;
        if (perm & p) {
737
            ret = qemu_lock_fd_test(fd, off, 1, true);
738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
            if (ret) {
                char *perm_name = bdrv_perm_names(p);
                error_setg(errp,
                           "Failed to get \"%s\" lock",
                           perm_name);
                g_free(perm_name);
                error_append_hint(errp,
                                  "Is another process using the image?\n");
                return ret;
            }
        }
    }
    PERM_FOREACH(i) {
        int off = RAW_LOCK_PERM_BASE + i;
        uint64_t p = 1ULL << i;
        if (!(shared_perm & p)) {
754
            ret = qemu_lock_fd_test(fd, off, 1, true);
755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
            if (ret) {
                char *perm_name = bdrv_perm_names(p);
                error_setg(errp,
                           "Failed to get shared \"%s\" lock",
                           perm_name);
                g_free(perm_name);
                error_append_hint(errp,
                                  "Is another process using the image?\n");
                return ret;
            }
        }
    }
    return 0;
}

/*
 * Update the byte locks on s->lock_fd for a permission change.
 *
 * RAW_PL_PREPARE: additionally lock the bytes needed for @new_perm /
 *                 @new_shared (nothing is unlocked yet) and check for
 *                 conflicts; on failure the locks are rolled back to the
 *                 current permissions and the error is returned.
 * RAW_PL_COMMIT:  lock exactly the bytes for @new_perm / @new_shared and
 *                 unlock all others.
 * RAW_PL_ABORT:   restore the locks for the current s->perm /
 *                 s->shared_perm.
 *
 * Does nothing (returns 0) when locking is disabled or the node is
 * inactive.  Errors during COMMIT/ABORT are reported but not returned.
 */
static int raw_handle_perm_lock(BlockDriverState *bs,
                                RawPermLockOp op,
                                uint64_t new_perm, uint64_t new_shared,
                                Error **errp)
{
    BDRVRawState *s = bs->opaque;
    int ret = 0;
    Error *local_err = NULL;

    if (!s->use_lock) {
        return 0;
    }

    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
        return 0;
    }

    assert(s->lock_fd > 0);

    switch (op) {
    case RAW_PL_PREPARE:
        /* Hold the union of old and new locks until commit or abort. */
        ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm,
                                   ~s->shared_perm | ~new_shared,
                                   false, errp);
        if (!ret) {
            ret = raw_check_lock_bytes(s->lock_fd, new_perm, new_shared, errp);
            if (!ret) {
                return 0;
            }
        }
        op = RAW_PL_ABORT;
        /* fall through to unlock bytes. */
    case RAW_PL_ABORT:
        raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm,
                             true, &local_err);
        if (local_err) {
            /* Theoretically the above call only unlocks bytes and it cannot
             * fail. Something weird happened, report it.
             */
            error_report_err(local_err);
        }
        break;
    case RAW_PL_COMMIT:
        raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared,
                             true, &local_err);
        if (local_err) {
            /* Theoretically the above call only unlocks bytes and it cannot
             * fail. Something weird happened, report it.
             */
            error_report_err(local_err);
        }
        break;
    }
    return ret;
}

J
Jeff Cody 已提交
826 827 828 829
/*
 * Prepare a reopen: allocate the BDRVRawReopenState, parse the new runtime
 * options and obtain a descriptor opened with the new flags.
 *
 * If the old and new open flags differ only in bits that can be changed on
 * an open descriptor, the current descriptor is duplicated and the flags
 * are applied with fcntl_setfl(); otherwise, or if that fails, the file is
 * opened again by name.  The new descriptor must also pass alignment
 * probing.
 *
 * Returns 0 on success, a negative errno value with @errp set otherwise.
 * state->opaque stays allocated in both cases and is released by
 * raw_reopen_commit() or raw_reopen_abort().
 */
static int raw_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    BDRVRawState *s;
    BDRVRawReopenState *rs;
    QemuOpts *opts;
    int ret = 0;
    int fcntl_flags;
    Error *local_err = NULL;

    assert(state != NULL);
    assert(state->bs != NULL);

    s = state->bs->opaque;

    state->opaque = g_new0(BDRVRawReopenState, 1);
    rs = state->opaque;
    rs->fd = -1;

    /* Handle options changes */
    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, state->options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto out;
    }

    rs->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
                                                s->check_cache_dropped);

    if (s->type == FTYPE_CD) {
        rs->open_flags |= O_NONBLOCK;
    }

    raw_parse_flags(state->flags, &rs->open_flags);

    /* Flags that are ignored when deciding whether the fd can be reused. */
    fcntl_flags = O_APPEND | O_NONBLOCK;
#ifdef O_NOATIME
    fcntl_flags |= O_NOATIME;
#endif

#ifdef O_ASYNC
    /* Not all operating systems have O_ASYNC, and those that don't
     * will not let us track the state into rs->open_flags (typically
     * you achieve the same effect with an ioctl, for example I_SETSIG
     * on Solaris). But we do not use O_ASYNC, so that's fine.
     */
    assert((s->open_flags & O_ASYNC) == 0);
#endif

    if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
        /* dup the original fd */
        rs->fd = qemu_dup(s->fd);
        if (rs->fd >= 0) {
            ret = fcntl_setfl(rs->fd, rs->open_flags);
            if (ret) {
                qemu_close(rs->fd);
                rs->fd = -1;
            }
        }
    }

    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
    if (rs->fd == -1) {
        const char *normalized_filename = state->bs->filename;
        ret = raw_normalize_devicepath(&normalized_filename);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not normalize device path");
        } else {
            assert(!(rs->open_flags & O_CREAT));
            rs->fd = qemu_open(normalized_filename, rs->open_flags);
            if (rs->fd == -1) {
                /* Latch errno so the real cause is returned to the caller. */
                ret = -errno;
                error_setg_errno(errp, -ret, "Could not reopen file");
            }
        }
    }

    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
     * alignment with the new fd. */
    if (rs->fd != -1) {
        raw_probe_alignment(state->bs, rs->fd, &local_err);
        if (local_err) {
            qemu_close(rs->fd);
            rs->fd = -1;
            error_propagate(errp, local_err);
            ret = -EINVAL;
        }
    }

out:
    qemu_opts_del(opts);
    return ret;
}

/*
 * Commit a prepared reopen: take over the new option value, open flags and
 * descriptor from the BDRVRawReopenState, close the old descriptor and
 * free the reopen state.
 */
static void raw_reopen_commit(BDRVReopenState *state)
{
    BDRVRawReopenState *rs = state->opaque;
    BDRVRawState *s = state->bs->opaque;

    s->check_cache_dropped = rs->check_cache_dropped;
    s->open_flags = rs->open_flags;

    /* The old descriptor is replaced by the one prepared earlier. */
    qemu_close(s->fd);
    s->fd = rs->fd;

    g_free(state->opaque);
    state->opaque = NULL;
}


/*
 * Abort a reopen: close the descriptor prepared by raw_reopen_prepare(),
 * if any, and free the reopen state.  The current BDRVRawState is left
 * untouched.
 */
static void raw_reopen_abort(BDRVReopenState *state)
{
    BDRVRawReopenState *rs = state->opaque;

     /* nothing to do if NULL, we didn't get far enough */
    if (rs == NULL) {
        return;
    }

    if (rs->fd >= 0) {
        qemu_close(rs->fd);
        rs->fd = -1;
    }
    g_free(state->opaque);
    state->opaque = NULL;
}

954
/*
 * Query the maximum transfer size of the device behind @fd with the
 * BLKSECTGET ioctl.
 *
 * For SCSI generic nodes (bs->sg) the ioctl result is taken as a byte
 * count in an int.  For all other nodes it is taken as a sector count and
 * converted to bytes.
 *
 * Returns the limit in bytes, -errno if the ioctl fails, or -ENOSYS if
 * BLKSECTGET is not available in this build.
 */
static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
{
#ifdef BLKSECTGET
    int max_bytes = 0;
    /*
     * The sector count is an unsigned 16-bit quantity.  Reading it into a
     * signed short would turn counts above 32767 negative and make the
     * shift below undefined.
     */
    unsigned short max_sectors = 0;

    if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
        return max_bytes;
    } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
        return max_sectors << BDRV_SECTOR_BITS;
    } else {
        return -errno;
    }
#else
    return -ENOSYS;
#endif
}

971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988
/*
 * Read the queue/max_segments sysfs attribute of the block device
 * identified by @st->st_rdev.
 *
 * Returns the parsed value when the attribute holds a decimal number
 * followed by a newline, 0 when a number was parsed but is not followed by
 * a newline, a negative errno on open/read/parse failure (-EIO for an empty
 * file), or -ENOTSUP when not built with CONFIG_LINUX.
 */
static int hdev_get_max_segments(const struct stat *st)
{
#ifdef CONFIG_LINUX
    char text[32];
    const char *tail;
    long segments;
    int sysfs_fd;
    int rc;
    char *path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                 major(st->st_rdev), minor(st->st_rdev));

    sysfs_fd = open(path, O_RDONLY);
    if (sysfs_fd == -1) {
        rc = -errno;
        g_free(path);
        return rc;
    }

    /* Retry the read for as long as it is interrupted by a signal. */
    for (;;) {
        rc = read(sysfs_fd, text, sizeof(text) - 1);
        if (rc != -1 || errno != EINTR) {
            break;
        }
    }

    if (rc < 0) {
        rc = -errno;
    } else if (rc == 0) {
        rc = -EIO;
    } else {
        text[rc] = 0;
        /* The attribute ends in '\n'; pass 'tail' so that is accepted. */
        rc = qemu_strtol(text, &tail, 10, &segments);
        if (rc == 0 && tail && *tail == '\n') {
            rc = segments;
        }
    }

    close(sysfs_fd);
    g_free(path);
    return rc;
#else
    return -ENOTSUP;
#endif
}

1016
/*
 * Refresh the block limits of @bs.
 *
 * For block and character devices, max_transfer is derived from the
 * device's maximum transfer length and from its maximum segment count
 * (one page per segment).  Buffer alignment limits are taken from the
 * result of raw_probe_alignment(), which may set @errp.
 */
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;

    if (!fstat(s->fd, &st)) {
        if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
            int ret = hdev_get_max_transfer_length(bs, s->fd);
            if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
                bs->bl.max_transfer = pow2floor(ret);
            }
            ret = hdev_get_max_segments(&st);
            if (ret > 0) {
                int seg_bytes = ret * getpagesize();

                /*
                 * A max_transfer of 0 means "no limit".  A plain MIN()
                 * would keep that 0 whenever the transfer-length probe
                 * above failed and so discard the segment limit, hence
                 * MIN_NON_ZERO().
                 */
                bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                                   seg_bytes);
            }
        }
    }

    raw_probe_alignment(bs, s->fd, errp);
    bs->bl.min_mem_alignment = s->buf_align;
    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
}
B
bellard 已提交
1039

1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.
 * Returns -1 when BIODASDINFO2 is not available at build time.
 */
static int check_for_dasd(int fd)
{
    int rc = -1;
#ifdef BIODASDINFO2
    struct dasd_information2_t dasd_info = {0};

    rc = ioctl(fd, BIODASDINFO2, &dasd_info);
#endif
    return rc;
}

/**
 * Probe the logical and physical block size of @bs.
 * Only attempted when the DASD check on the file descriptor succeeds;
 * otherwise -ENOTSUP is returned.
 * On success, the sizes are stored in @bsz and zero is returned.
 * On failure, a negative errno is returned.
 */
static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BDRVRawState *s = bs->opaque;
    int rc;

    if (check_for_dasd(s->fd) < 0) {
        return -ENOTSUP;
    }

    rc = probe_logical_blocksize(s->fd, &bsz->log);
    return rc < 0 ? rc : probe_physical_blocksize(s->fd, &bsz->phys);
}

/**
 * Probe the geometry (cylinders, heads, sectors) of @bs.
 * On success, store it in @geo and return 0.
 * On failure, return a negative errno; always -ENOTSUP on non-Linux hosts.
 * (Allows block driver to assign default geometry values that guest sees)
 */
static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
#ifdef __linux__
    BDRVRawState *s = bs->opaque;
    struct hd_geometry hw_geo = {0};

    /* Geometry is only queried when the DASD check succeeds. */
    if (check_for_dasd(s->fd) < 0) {
        return -ENOTSUP;
    }
    if (ioctl(s->fd, HDIO_GETGEO, &hw_geo) < 0) {
        return -errno;
    }
    /*
     * HDIO_GETGEO may report success while leaving the geometry zeroed
     * (e.g. certain multipath setups).
     */
    if (hw_geo.heads == 0 || hw_geo.sectors == 0 || hw_geo.cylinders == 0) {
        return -ENOTSUP;
    }
    /* A non-zero start means a partition: do not return a geometry. */
    if (hw_geo.start != 0) {
        return -ENOTSUP;
    }

    geo->cylinders = hw_geo.cylinders;
    geo->heads = hw_geo.heads;
    geo->sectors = hw_geo.sectors;
    return 0;
#else /* __linux__ */
    return -ENOTSUP;
#endif
}

1113 1114 1115 1116 1117 1118 1119 1120 1121
/*
 * Execute the ioctl described by @aiocb.
 * Returns 0 unless ioctl() returns -1, in which case -errno is returned.
 */
static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
{
    if (ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd,
              aiocb->aio_ioctl_buf) == -1) {
        return -errno;
    }
    return 0;
}

/*
 * Flush the file behind @aiocb with qemu_fdatasync().
 *
 * Returns 0 on success and -errno on failure.  Once a flush has failed on a
 * file that is not opened with O_DIRECT, every later call returns -EIO.
 */
static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
{
    BDRVRawState *s = aiocb->bs->opaque;

    if (s->page_cache_inconsistent) {
        return -EIO;
    }

    if (qemu_fdatasync(aiocb->aio_fildes) != -1) {
        return 0;
    }

    /*
     * The semantics of a failing fsync() are not clearly defined, so the
     * worst has to be assumed - and on Linux that assumption is correct:
     * some pages are now probably marked clean in the page cache although
     * they differ from what is on disk.  A following fdatasync() would
     * succeed without any further writeback attempt.  There is no way back
     * to a state in which the on-disk contents are known (at least
     * everything touched since the last fdatasync() would have to be
     * rewritten), so make bdrv_flush() fail permanently.  As the behaviour
     * is not really defined, other OSes are unlikely to do better.
     *
     * O_DIRECT bypasses the page cache and is therefore not affected.
     */
    if (!(s->open_flags & O_DIRECT)) {
        s->page_cache_inconsistent = true;
    }
    return -errno;
}

/*
 * Thin wrappers around preadv()/pwritev().  Without CONFIG_PREADV both
 * wrappers return -ENOSYS and preadv_present starts out false.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

/* Vectored read at @offset; -ENOSYS when built without CONFIG_PREADV. */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

/* Vectored write at @offset; -ENOSYS when built without CONFIG_PREADV. */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

/*
 * Perform the vectored read or write described by @aiocb in one call,
 * retrying while the call is interrupted (EINTR).
 *
 * Returns the value of the underlying call, or -errno if it returned -1.
 */
static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
{
    const bool is_write = aiocb->aio_type & QEMU_AIO_WRITE;
    ssize_t done;

    for (;;) {
        if (is_write) {
            done = qemu_pwritev(aiocb->aio_fildes, aiocb->aio_iov,
                                aiocb->aio_niov, aiocb->aio_offset);
        } else {
            done = qemu_preadv(aiocb->aio_fildes, aiocb->aio_iov,
                               aiocb->aio_niov, aiocb->aio_offset);
        }

        if (done != -1) {
            return done;
        }
        if (errno != EINTR) {
            return -errno;
        }
    }
}

/*
 * Transfer the request described by @aiocb to/from the linear buffer @buf
 * using pread()/pwrite(), looping until everything has been handled.
 *
 * Returns the number of bytes handled or -errno in case of an error.  The
 * result is shorter than requested only when a call transferred nothing,
 * or in the O_DIRECT read case described in the loop below.
 */
static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
{
    const bool is_write = aiocb->aio_type & QEMU_AIO_WRITE;
    ssize_t done = 0;

    while (done < aiocb->aio_nbytes) {
        ssize_t n;

        if (is_write) {
            n = pwrite(aiocb->aio_fildes,
                       (const char *)buf + done,
                       aiocb->aio_nbytes - done,
                       aiocb->aio_offset + done);
        } else {
            n = pread(aiocb->aio_fildes,
                      buf + done,
                      aiocb->aio_nbytes - done,
                      aiocb->aio_offset + done);
        }

        if (n == -1) {
            if (errno == EINTR) {
                continue;
            }
            if (errno == EINVAL &&
                (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
                !is_write &&
                done > 0) {
                /*
                 * O_DIRECT pread() may fail with EINVAL when the offset is
                 * unaligned after a short read.  Assume that O_DIRECT short
                 * reads only occur at EOF, so this is a short read and not
                 * an I/O error.
                 */
                break;
            }
            return -errno;
        }
        if (n == 0) {
            break;
        }
        done += n;
    }

    return done;
}

/*
 * Carry out the read or write request described by @aiocb.
 *
 * Requests not flagged QEMU_AIO_MISALIGNED are issued directly on the
 * caller's buffers (plain pread/pwrite for a single iovec, preadv/pwritev
 * otherwise).  Everything else goes through a temporary aligned buffer.
 *
 * Returns the number of bytes handled or a negative errno.
 */
static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
{
    const bool is_write = aiocb->aio_type & QEMU_AIO_WRITE;
    ssize_t done;
    char *bounce;
    char *cursor;
    int idx;

    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
        /*
         * A single, properly aligned buffer can be handled with plain
         * pread/pwrite without any problems.
         */
        if (aiocb->aio_niov == 1) {
            return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
        }

        /*
         * Several iovecs, all properly aligned: try preadv/pwritev first
         * and fall back to linearizing the buffer if it is not supported.
         */
        if (preadv_present) {
            done = handle_aiocb_rw_vector(aiocb);
            if (done == aiocb->aio_nbytes) {
                return done;
            }
            if (done < 0 && done != -ENOSYS) {
                return done;
            }
            preadv_present = false;
        }

        /*
         * XXX(hch): short read/write.  There is no easy way to handle the
         * remainder using these interfaces.  For now retry using plain
         * pread/pwrite?
         */
    }

    /* The hard way: go through one single aligned buffer. */
    bounce = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
    if (!bounce) {
        return -ENOMEM;
    }

    if (is_write) {
        /* Gather all segments into the bounce buffer before writing. */
        cursor = bounce;
        for (idx = 0; idx < aiocb->aio_niov; idx++) {
            const struct iovec *seg = &aiocb->aio_iov[idx];

            memcpy(cursor, seg->iov_base, seg->iov_len);
            cursor += seg->iov_len;
        }
        assert(cursor - bounce == aiocb->aio_nbytes);
    }

    done = handle_aiocb_rw_linear(aiocb, bounce);

    if (!is_write) {
        /* Scatter the bounce buffer back into the caller's segments. */
        size_t remaining = aiocb->aio_nbytes;

        cursor = bounce;
        for (idx = 0; idx < aiocb->aio_niov && remaining; idx++) {
            const struct iovec *seg = &aiocb->aio_iov[idx];
            size_t chunk = MIN(remaining, seg->iov_len);

            memcpy(seg->iov_base, cursor, chunk);
            assert(remaining >= chunk);
            cursor += chunk;
            remaining -= chunk;
        }
        assert(remaining == 0);
    }

    qemu_vfree(bounce);
    return done;
}

P
Paolo Bonzini 已提交
1338
#ifdef CONFIG_XFS
1339 1340 1341
/*
 * Zero @bytes bytes starting at @offset using the XFS_IOC_ZERO_RANGE
 * xfsctl.  Returns 0 on success, -errno on failure.
 */
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 range;
    int saved_errno;

    memset(&range, 0, sizeof(range));
    range.l_len = bytes;
    range.l_start = offset;
    range.l_whence = SEEK_SET;

    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &range) >= 0) {
        return 0;
    }

    /* Capture errno before the debug output can disturb it. */
    saved_errno = errno;
    DPRINTF("cannot write zero range (%s)\n", strerror(errno));
    return -saved_errno;
}

P
Paolo Bonzini 已提交
1358 1359 1360
/*
 * Release @bytes bytes starting at @offset using the XFS_IOC_UNRESVSP64
 * xfsctl.  Returns 0 on success, -errno on failure.
 */
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 range;
    int saved_errno;

    memset(&range, 0, sizeof(range));
    range.l_len = bytes;
    range.l_start = offset;
    range.l_whence = SEEK_SET;

    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &range) >= 0) {
        return 0;
    }

    /* Capture errno before the debug output can disturb it. */
    saved_errno = errno;
    DPRINTF("cannot punch hole (%s)\n", strerror(errno));
    return -saved_errno;
}
#endif

1378 1379 1380 1381 1382 1383 1384 1385 1386
/*
 * Map the negative errno values -ENODEV, -ENOSYS, -EOPNOTSUPP and -ENOTTY
 * to -ENOTSUP; every other value is returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}

1387
#ifdef CONFIG_FALLOCATE
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398
/*
 * Call fallocate(), retrying while it is interrupted (EINTR).
 * Returns 0 on success, otherwise -errno passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
#endif

1399
/*
 * Zero the range described by @aiocb with the BLKZEROOUT ioctl.
 *
 * Returns 0 on success, or a negative errno translated by translate_err().
 * -ENOTSUP is returned when BLKZEROOUT is unavailable at build time or was
 * already found unsupported; the latter is remembered in
 * s->has_write_zeroes.
 */
static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
{
    BDRVRawState *s = aiocb->bs->opaque;
    int rc = -ENOTSUP;

    if (!s->has_write_zeroes) {
        return -ENOTSUP;
    }

#ifdef BLKZEROOUT
    /* Retry for as long as the ioctl is interrupted. */
    for (;;) {
        uint64_t span[2] = { aiocb->aio_offset, aiocb->aio_nbytes };

        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, span) == 0) {
            return 0;
        }
        if (errno != EINTR) {
            break;
        }
    }

    rc = translate_err(-errno);
#endif

    /* Do not try again once the operation turned out to be unsupported. */
    if (rc == -ENOTSUP) {
        s->has_write_zeroes = false;
    }
    return rc;
}

1425 1426
/*
 * Zero the range described by @aiocb without writing data buffers.
 *
 * Block devices are delegated to handle_aiocb_write_zeroes_block().  For
 * files, the methods compiled in are tried in order: the XFS zero-range
 * xfsctl, fallocate(FALLOC_FL_ZERO_RANGE), punch-hole followed by a plain
 * fallocate(), and finally a plain fallocate() when the range starts at or
 * beyond the current length.  A method that reports -ENOTSUP is disabled
 * through the corresponding has_* flag and the next one is tried.
 *
 * Returns 0 on success, -ENOTSUP if no method is usable, or another
 * negative errno.
 */
static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
{
#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
    BDRVRawState *s = aiocb->bs->opaque;
#endif

    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
        return handle_aiocb_write_zeroes_block(aiocb);
    }

#ifdef CONFIG_XFS
    if (s->is_xfs) {
        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
    }
#endif

#ifdef CONFIG_FALLOCATE_ZERO_RANGE
    if (s->has_write_zeroes) {
        int rc = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
                              aiocb->aio_offset, aiocb->aio_nbytes);
        if (rc != -ENOTSUP) {
            return rc;
        }
        s->has_write_zeroes = false;
    }
#endif

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
    if (s->has_discard && s->has_fallocate) {
        int rc = do_fallocate(s->fd,
                              FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              aiocb->aio_offset, aiocb->aio_nbytes);
        if (rc == -ENOTSUP) {
            s->has_discard = false;
        } else if (rc != 0) {
            return rc;
        } else {
            /* The hole is punched; now allocate the range again. */
            rc = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
            if (rc != -ENOTSUP) {
                return rc;
            }
            s->has_fallocate = false;
        }
    }
#endif

#ifdef CONFIG_FALLOCATE
    {
        /*
         * Last resort: we are trying to extend the file with zeroed data.
         * This can be done via fallocate(fd, 0).
         */
        int64_t cur_len = bdrv_getlength(aiocb->bs);

        if (s->has_fallocate && cur_len >= 0 && aiocb->aio_offset >= cur_len) {
            int rc = do_fallocate(s->fd, 0,
                                  aiocb->aio_offset, aiocb->aio_nbytes);
            if (rc != -ENOTSUP) {
                return rc;
            }
            s->has_fallocate = false;
        }
    }
#endif

    return -ENOTSUP;
}

1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513
#if !defined(HAVE_COPY_FILE_RANGE)
/*
 * Fallback used when the C library provides no copy_file_range(): invoke
 * the raw system call if its number is known, otherwise fail with ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#if defined(__NR_copy_file_range)
    long copied = syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                          out_off, len, flags);

    return copied;
#else
    errno = ENOSYS;
    return -1;
#endif
}
#endif

/*
 * Copy aio_nbytes bytes from (aio_fildes, aio_offset) to
 * (aio_fd2, aio_offset2) with copy_file_range(), looping until the whole
 * range has been copied.
 *
 * Returns 0 on success, -ENOSPC when a call makes no progress, -ENOTSUP
 * when the system call is not implemented, or another negative errno.
 */
static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
{
    uint64_t remaining = aiocb->aio_nbytes;
    off_t src_pos = aiocb->aio_offset;
    off_t dst_pos = aiocb->aio_offset2;

    while (remaining != 0) {
        ssize_t copied = copy_file_range(aiocb->aio_fildes, &src_pos,
                                         aiocb->aio_fd2, &dst_pos,
                                         remaining, 0);
        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, src_pos,
                                   aiocb->aio_fd2, dst_pos, remaining, 0,
                                   copied);

        if (copied > 0) {
            remaining -= copied;
            continue;
        }
        if (copied == 0) {
            /*
             * No progress (e.g. when beyond EOF), let the caller fall back
             * to buffer I/O.
             */
            return -ENOSPC;
        }

        /* The call failed: retry on EINTR, otherwise report the error. */
        if (errno == EINTR) {
            continue;
        }
        return errno == ENOSYS ? -ENOTSUP : -errno;
    }
    return 0;
}

P
Paolo Bonzini 已提交
1536 1537 1538 1539 1540
/*
 * Discard the range described by @aiocb.
 *
 * Block devices use the BLKDISCARD ioctl.  Files use the XFS xfsctl when
 * s->is_xfs is set, otherwise fallocate() with FALLOC_FL_PUNCH_HOLE.
 * Methods that are not compiled in are skipped.
 *
 * Returns 0 on success or a negative errno.  When the (translated) result
 * is -ENOTSUP, discard is disabled for later requests via s->has_discard.
 */
static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
{
    BDRVRawState *s = aiocb->bs->opaque;
    int rc = -EOPNOTSUPP;

    if (!s->has_discard) {
        return -ENOTSUP;
    }

    if (!(aiocb->aio_type & QEMU_AIO_BLKDEV)) {
#ifdef CONFIG_XFS
        if (s->is_xfs) {
            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
        }
#endif

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
        rc = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          aiocb->aio_offset, aiocb->aio_nbytes);
#endif
    } else {
#ifdef BLKDISCARD
        /* Retry for as long as the ioctl is interrupted. */
        for (;;) {
            uint64_t span[2] = { aiocb->aio_offset, aiocb->aio_nbytes };

            if (ioctl(aiocb->aio_fildes, BLKDISCARD, span) == 0) {
                return 0;
            }
            if (errno != EINTR) {
                break;
            }
        }

        rc = -errno;
#endif
    }

    rc = translate_err(rc);
    if (rc == -ENOTSUP) {
        s->has_discard = false;
    }
    return rc;
}

1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648
/*
 * Resize the file behind @aiocb to aiocb->aio_offset and, when growing,
 * fill the new space according to aiocb->prealloc.
 *
 * Shrinking is only allowed with PREALLOC_MODE_OFF.  If preallocation
 * fails, an attempt is made to restore the previous file length.
 *
 * Returns 0 on success, a negative errno on failure (with the error
 * details stored through aiocb->errp).
 */
static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
{
    Error **errp = aiocb->errp;
    int64_t offset = aiocb->aio_offset;
    int fd = aiocb->aio_fildes;
    int64_t old_length = 0;
    char *zeros = NULL;
    struct stat st;
    int rc = 0;

    if (fstat(fd, &st) < 0) {
        rc = -errno;
        error_setg_errno(errp, -rc, "Could not stat file");
        return rc;
    }

    old_length = st.st_size;
    if (aiocb->prealloc != PREALLOC_MODE_OFF && old_length > offset) {
        error_setg(errp, "Cannot use preallocation for shrinking files");
        return -ENOTSUP;
    }

    switch (aiocb->prealloc) {
#ifdef CONFIG_POSIX_FALLOCATE
    case PREALLOC_MODE_FALLOC:
        /*
         * Truncating before posix_fallocate() makes it about twice slower on
         * file systems that do not support fallocate(), trying to check if a
         * block is allocated before allocating it, so don't do that here.
         */
        if (offset == old_length) {
            rc = 0;
        } else {
            rc = -posix_fallocate(fd, old_length, offset - old_length);
            if (rc != 0) {
                /* posix_fallocate() doesn't set errno. */
                error_setg_errno(errp, -rc,
                                 "Could not preallocate new data");
            }
        }
        goto out;
#endif
    case PREALLOC_MODE_FULL:
    {
        int64_t chunk = 0, todo = offset - old_length;
        off_t pos;

        /*
         * Knowing the final size from the beginning could allow the file
         * system driver to do less allocations and possibly avoid
         * fragmentation of the file.
         */
        if (ftruncate(fd, offset) != 0) {
            rc = -errno;
            error_setg_errno(errp, -rc, "Could not resize file");
            goto out;
        }

        zeros = g_malloc0(65536);

        pos = lseek(fd, old_length, SEEK_SET);
        if (pos < 0) {
            rc = -errno;
            error_setg_errno(errp, -rc,
                             "Failed to seek to the old end of file");
            goto out;
        }

        /* Write zeros over the new area, at most 64 KiB at a time. */
        while (todo > 0) {
            chunk = MIN(todo, 65536);
            rc = write(fd, zeros, chunk);
            if (rc < 0) {
                if (errno == EINTR) {
                    continue;
                }
                rc = -errno;
                error_setg_errno(errp, -rc,
                                 "Could not write zeros for preallocation");
                goto out;
            }
            todo -= rc;
        }
        if (rc >= 0) {
            rc = fsync(fd);
            if (rc < 0) {
                rc = -errno;
                error_setg_errno(errp, -rc,
                                 "Could not flush file to disk");
                goto out;
            }
        }
        goto out;
    }
    case PREALLOC_MODE_OFF:
        if (ftruncate(fd, offset) != 0) {
            rc = -errno;
            error_setg_errno(errp, -rc, "Could not resize file");
        }
        return rc;
    default:
        rc = -ENOTSUP;
        error_setg(errp, "Unsupported preallocation mode: %s",
                   PreallocMode_str(aiocb->prealloc));
        return rc;
    }

out:
    /* On failure, try to put the file back to its previous length. */
    if (rc < 0) {
        if (ftruncate(fd, old_length) < 0) {
            error_report("Failed to restore old file length: %s",
                         strerror(errno));
        }
    }

    g_free(zeros);
    return rc;
}

1695 1696 1697 1698 1699 1700 1701 1702
static int aio_worker(void *arg)
{
    RawPosixAIOData *aiocb = arg;
    ssize_t ret = 0;

    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
    case QEMU_AIO_READ:
        ret = handle_aiocb_rw(aiocb);
M
Max Reitz 已提交
1703
        if (ret >= 0 && ret < aiocb->aio_nbytes) {
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728
            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
                      0, aiocb->aio_nbytes - ret);

            ret = aiocb->aio_nbytes;
        }
        if (ret == aiocb->aio_nbytes) {
            ret = 0;
        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
            ret = -EINVAL;
        }
        break;
    case QEMU_AIO_WRITE:
        ret = handle_aiocb_rw(aiocb);
        if (ret == aiocb->aio_nbytes) {
            ret = 0;
        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
            ret = -EINVAL;
        }
        break;
    case QEMU_AIO_FLUSH:
        ret = handle_aiocb_flush(aiocb);
        break;
    case QEMU_AIO_IOCTL:
        ret = handle_aiocb_ioctl(aiocb);
        break;
P
Paolo Bonzini 已提交
1729 1730 1731
    case QEMU_AIO_DISCARD:
        ret = handle_aiocb_discard(aiocb);
        break;
1732 1733 1734
    case QEMU_AIO_WRITE_ZEROES:
        ret = handle_aiocb_write_zeroes(aiocb);
        break;
1735 1736 1737
    case QEMU_AIO_COPY_RANGE:
        ret = handle_aiocb_copy_range(aiocb);
        break;
1738 1739 1740
    case QEMU_AIO_TRUNCATE:
        ret = handle_aiocb_truncate(aiocb);
        break;
1741 1742 1743 1744 1745 1746
    default:
        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
        ret = -EINVAL;
        break;
    }

1747
    g_free(aiocb);
1748 1749 1750
    return ret;
}

1751 1752 1753 1754
/*
 * Build a RawPosixAIOData request from the arguments and hand it to
 * aio_worker() through the thread pool of @bs's AioContext.
 *
 * @fd2/@offset2 describe the second file of a two-file request; @qiov may
 * be NULL for requests that carry no data buffers.  aio_worker() frees the
 * request.
 *
 * Returns the value of thread_pool_submit_co().
 */
static int paio_submit_co_full(BlockDriverState *bs, int fd,
                               int64_t offset, int fd2, int64_t offset2,
                               QEMUIOVector *qiov,
                               int bytes, int type)
{
    /*
     * Zero-allocate so that fields not assigned below (aio_iov and aio_niov
     * when @qiov is NULL) hold defined values instead of heap garbage,
     * like the zero-filling initializer in raw_regular_truncate().
     */
    RawPosixAIOData *acb = g_new0(RawPosixAIOData, 1);
    ThreadPool *pool;

    acb->bs = bs;
    acb->aio_type = type;
    acb->aio_fildes = fd;
    acb->aio_fd2 = fd2;
    acb->aio_offset2 = offset2;

    acb->aio_nbytes = bytes;
    acb->aio_offset = offset;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
        assert(qiov->size == bytes);
    }

    trace_file_paio_submit_co(offset, bytes, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_co(pool, aio_worker, acb);
}

1779 1780 1781 1782 1783 1784 1785
/*
 * Convenience wrapper around paio_submit_co_full() for requests that
 * involve a single file: the second descriptor is passed as -1 and the
 * second offset as 0.
 */
static inline int paio_submit_co(BlockDriverState *bs, int fd,
                                 int64_t offset, QEMUIOVector *qiov,
                                 int bytes, int type)
{
    const int no_fd2 = -1;
    const int64_t no_offset2 = 0;

    return paio_submit_co_full(bs, fd, offset, no_fd2, no_offset2,
                               qiov, bytes, type);
}

1786 1787
/*
 * Common read/write path: submit @qiov for the request type @type
 * (QEMU_AIO_READ or QEMU_AIO_WRITE).
 *
 * Returns -EIO if fd_open() fails, otherwise the result of the submission.
 */
static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
                                   uint64_t bytes, QEMUIOVector *qiov, int type)
{
    BDRVRawState *s = bs->opaque;

    if (fd_open(bs) < 0) {
        return -EIO;
    }

    /*
     * When the underlying device requires aligned requests, check whether
     * this request is aligned.  If it is not, tell the low-level code that
     * it needs to copy the buffer; if it is and Linux AIO is in use, submit
     * it there instead of to the thread pool.
     */
    if (s->needs_alignment) {
        bool aligned = bdrv_qiov_is_aligned(bs, qiov);

        if (!aligned) {
            type |= QEMU_AIO_MISALIGNED;
        }
#ifdef CONFIG_LINUX_AIO
        if (aligned && s->use_linux_aio) {
            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));

            assert(qiov->size == bytes);
            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
        }
#endif
    }

    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
}

1815 1816 1817
/* Read @bytes bytes at @offset into @qiov; @flags is not used. */
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                      uint64_t bytes, QEMUIOVector *qiov,
                                      int flags)
{
    int rc = raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);

    return rc;
}

1822 1823 1824
/* Write @bytes bytes from @qiov at @offset; @flags must be 0. */
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
{
    int rc;

    assert(flags == 0);
    rc = raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
    return rc;
}

1830 1831 1832
/*
 * Forward a plug request to the Linux AIO state of @bs's AioContext.
 * Does nothing unless built with CONFIG_LINUX_AIO and Linux AIO is in use.
 */
static void raw_aio_plug(BlockDriverState *bs)
{
#ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;

    if (!s->use_linux_aio) {
        return;
    }
    laio_io_plug(bs, aio_get_linux_aio(bdrv_get_aio_context(bs)));
#endif
}

/*
 * Forward an unplug request to the Linux AIO state of @bs's AioContext.
 * Does nothing unless built with CONFIG_LINUX_AIO and Linux AIO is in use.
 */
static void raw_aio_unplug(BlockDriverState *bs)
{
#ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;

    if (!s->use_linux_aio) {
        return;
    }
    laio_io_unplug(bs, aio_get_linux_aio(bdrv_get_aio_context(bs)));
#endif
}

1852
/*
 * Submit a QEMU_AIO_FLUSH request for @bs.
 * Returns the fd_open() error if that fails, otherwise the result of the
 * submitted request.
 */
static int raw_co_flush_to_disk(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int rc = fd_open(bs);

    if (rc >= 0) {
        rc = paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH);
    }
    return rc;
}

1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880
/*
 * Called when @bs is attached to @new_context.  When Linux AIO is in use,
 * set it up for the new context; if that fails, report the error and fall
 * back to the thread pool by clearing s->use_linux_aio.
 */
static void raw_aio_attach_aio_context(BlockDriverState *bs,
                                       AioContext *new_context)
{
#ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        /*
         * Must start out NULL: an Error pointer handed to a function as
         * its errp argument may not hold a stale or indeterminate value.
         */
        Error *local_err = NULL;
        if (!aio_setup_linux_aio(new_context, &local_err)) {
            error_reportf_err(local_err, "Unable to use native AIO, "
                                         "falling back to thread pool: ");
            s->use_linux_aio = false;
        }
    }
#endif
}

B
bellard 已提交
1881 1882 1883
/*
 * Close the file descriptors held by @bs (the data descriptor first, then
 * the lock descriptor) and mark each of them as closed with -1.
 */
static void raw_close(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int *const fds[] = { &s->fd, &s->lock_fd };
    size_t i;

    for (i = 0; i < sizeof(fds) / sizeof(fds[0]); i++) {
        if (*fds[i] >= 0) {
            qemu_close(*fds[i]);
            *fds[i] = -1;
        }
    }
}

1895 1896 1897 1898 1899 1900
/**
 * Truncates the given regular file @fd to @offset and, when growing, fills the
 * new space according to @prealloc.
 *
 * Returns: 0 on success, -errno on failure.
 */
1901 1902 1903
static int coroutine_fn
raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
                     PreallocMode prealloc, Error **errp)
1904
{
1905 1906
    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
    ThreadPool *pool;
1907

1908 1909 1910 1911 1912 1913 1914 1915
    *acb = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = fd,
        .aio_type       = QEMU_AIO_TRUNCATE,
        .aio_offset     = offset,
        .prealloc       = prealloc,
        .errp           = errp,
    };
1916

1917 1918 1919
    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_co(pool, aio_worker, acb);
1920 1921
}

1922 1923
/*
 * Resize the image to @offset bytes.
 *
 * - Regular files are handed to raw_regular_truncate(), which also applies
 *   @prealloc.
 * - Anything else rejects preallocation modes other than PREALLOC_MODE_OFF
 *   with -ENOTSUP.
 * - Character and block devices cannot grow: a request beyond the current
 *   length fails with -EINVAL, anything else is accepted without touching
 *   the device.
 * - All other file types fail with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno on failure (with @errp set).
 */
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                        PreallocMode prealloc, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;
    int ret;

    if (fstat(s->fd, &st)) {
        ret = -errno;
        error_setg_errno(errp, -ret, "Failed to fstat() the file");
        return ret;
    }

    if (S_ISREG(st.st_mode)) {
        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Preallocation mode '%s' unsupported for this "
                   "non-regular file", PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
        int64_t cur_length = raw_getlength(bs);

        /*
         * raw_getlength() reports failures as a negative errno.  Comparing
         * that against @offset would mislabel the failure as an attempt to
         * grow the device, so report it for what it is.
         */
        if (cur_length < 0) {
            error_setg_errno(errp, (int)-cur_length,
                             "Failed to get the device size");
            return (int)cur_length;
        }
        if (offset > cur_length) {
            error_setg(errp, "Cannot grow device files");
            return -EINVAL;
        }
    } else {
        error_setg(errp, "Resizing this file is not supported");
        return -ENOTSUP;
    }

    return 0;
}

1958 1959 1960 1961 1962 1963 1964 1965
#ifdef __OpenBSD__
/*
 * OpenBSD: return the length of the image in bytes, or -errno on failure.
 *
 * For character/block devices the size is computed from the disklabel
 * (sector size times the size of the partition selected by st_rdev); for
 * everything else st_size is used.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct disklabel label;
    struct stat st;

    if (fstat(s->fd, &st)) {
        return -errno;
    }

    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
        return st.st_size;
    }

    if (ioctl(s->fd, DIOCGDINFO, &label)) {
        return -errno;
    }
    return (uint64_t)label.d_secsize *
        label.d_partitions[DISKPART(st.st_rdev)].p_size;
}
1977 1978 1979 1980 1981 1982 1983 1984
#elif defined(__NetBSD__)
/*
 * NetBSD: return the length of the image in bytes, or -errno on failure.
 *
 * For character/block devices the wedge information is tried first
 * (dkw_size is multiplied by 512); if that ioctl fails, the disklabel is
 * used instead.  Everything else reports st_size.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct dkwedge_info wedge;
    struct disklabel label;
    struct stat st;

    if (fstat(s->fd, &st)) {
        return -errno;
    }

    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
        return st.st_size;
    }

    if (ioctl(s->fd, DIOCGWEDGEINFO, &wedge) != -1) {
        return wedge.dkw_size * 512;
    }

    /* No wedge information: fall back to the disklabel */
    if (ioctl(s->fd, DIOCGDINFO, &label)) {
        return -errno;
    }
    return (uint64_t)label.d_secsize *
        label.d_partitions[DISKPART(st.st_rdev)].p_size;
}
C
Christoph Hellwig 已提交
2002 2003 2004 2005 2006 2007
#elif defined(__sun__)
/*
 * Solaris: return the length of the image in bytes, or a negative errno.
 *
 * The DKIOCGMEDIAINFO ioctl is tried first; when it fails, the size is
 * taken from lseek(SEEK_END).
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct dk_minfo minfo;
    int64_t end;
    int ret;

    ret = fd_open(bs);
    if (ret < 0) {
        return ret;
    }

    /*
     * Use the DKIOCGMEDIAINFO ioctl to read the size.
     */
    if (ioctl(s->fd, DKIOCGMEDIAINFO, &minfo) != -1) {
        return minfo.dki_lbsize * minfo.dki_capacity;
    }

    /*
     * There are reports that lseek on some devices fails, but
     * irc discussion said that contingency on contingency was overkill.
     */
    end = lseek(s->fd, 0, SEEK_END);
    if (end < 0) {
        return -errno;
    }
    return end;
}
#elif defined(CONFIG_BSD)
/*
 * Generic BSD (including macOS): return the length of the image in bytes,
 * or a negative errno.
 *
 * For descriptors whose mode has the S_IFCHR bits set, a platform specific
 * ioctl is tried first (DIOCGMEDIASIZE, DIOCGPART, or the macOS
 * DKIOCGETBLOCKCOUNT/DKIOCGETBLOCKSIZE pair), with lseek(SEEK_END) as the
 * fallback.  On FreeBSD, a CD drive that reports no medium is reopened once
 * and queried again.  Everything else uses lseek(SEEK_END) directly.
 *
 * NOTE: the size probing below relies on "if" statements whose body is
 * supplied by a *following* preprocessor branch.  Keep the statement order
 * intact when editing.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
#ifdef DIOCGMEDIASIZE
        /* On failure, the statement after the #endif is the fallback */
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        /* A zero size selects the fallback after the #endif */
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                /*
                 * cdrom_reopen() is only given @bs, so a replaced descriptor
                 * can only be picked up through the driver state; do not keep
                 * using the copy taken at function entry.
                 */
                fd = s->fd;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
C
Christoph Hellwig 已提交
2107 2108 2109 2110 2111
#else
/*
 * Fallback implementation: return the length of the image in bytes as
 * reported by lseek(SEEK_END), or a negative errno on failure.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int64_t end;
    int rc = fd_open(bs);

    if (rc < 0) {
        return rc;
    }

    end = lseek(s->fd, 0, SEEK_END);
    if (end < 0) {
        return -errno;
    }
    return end;
}
2125
#endif
B
bellard 已提交
2126

2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137
/*
 * Return the number of bytes allocated for the image, computed as
 * st_blocks * 512 (i.e. st_blocks is treated as a count of 512-byte units),
 * or -errno when fstat() fails.
 */
static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct stat sb;

    if (fstat(s->fd, &sb) < 0) {
        return -errno;
    }

    return (int64_t)sb.st_blocks * 512;
}

2138 2139
/*
 * Create (or reset) a raw image file as described by @options.
 *
 * The file is opened with O_CREAT, locked, truncated to zero, optionally
 * flagged NOCOW (Linux only, best effort) and finally resized/preallocated
 * to the requested size.  The locks are dropped and the descriptor is closed
 * before returning.
 *
 * Returns 0 on success, a negative errno on failure (with @errp set).
 */
static int coroutine_fn
raw_co_create(BlockdevCreateOptions *options, Error **errp)
{
    BlockdevCreateOptionsFile *file_opts;
    Error *local_err = NULL;
    uint64_t perm, shared;
    int result = 0;
    int fd;

    /* Validate options and set default values */
    assert(options->driver == BLOCKDEV_DRIVER_FILE);
    file_opts = &options->u.file;

    if (!file_opts->has_nocow) {
        file_opts->nocow = false;
    }
    if (!file_opts->has_preallocation) {
        file_opts->preallocation = PREALLOC_MODE_OFF;
    }

    /* Create file */
    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
    if (fd < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not create file");
        goto out;
    }

    /* Take permissions: We want to discard everything, so we need
     * BLK_PERM_WRITE; and truncation to the desired size requires
     * BLK_PERM_RESIZE.
     * On the other hand, we cannot share the RESIZE permission
     * because we promise that after this function, the file has the
     * size given in the options.  If someone else were to resize it
     * concurrently, we could not guarantee that.
     * Note that after this function, we can no longer guarantee that
     * the file is not touched by a third party, so it may be resized
     * then. */
    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;

    /* Step one: Take locks */
    result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp);
    if (result < 0) {
        goto out_close;
    }

    /* Step two: Check that nobody else has taken conflicting locks */
    result = raw_check_lock_bytes(fd, perm, shared, errp);
    if (result < 0) {
        goto out_unlock;
    }

    /* Clear the file by truncating it to 0 */
    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
    if (result < 0) {
        goto out_unlock;
    }

    if (file_opts->nocow) {
#ifdef __linux__
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
         * will be ignored since any failure of this operation should not
         * block the remaining work.
         */
        int attr;
        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
            attr |= FS_NOCOW_FL;
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
        }
#endif
    }

    /* Resize and potentially preallocate the file to the desired
     * final size.  Success and failure both continue with the unlock
     * step right below, so no branch is needed here. */
    result = raw_regular_truncate(NULL, fd, file_opts->size,
                                  file_opts->preallocation, errp);

out_unlock:
    raw_apply_lock_bytes(fd, 0, 0, true, &local_err);
    if (local_err) {
        /* The above call should not fail, and if it does, that does
         * not mean the whole creation operation has failed.  So
         * report it to the user for their convenience, but do not
         * report it to the caller. */
        error_report_err(local_err);
    }

out_close:
    /* A close failure only becomes the result if nothing failed earlier */
    if (qemu_close(fd) != 0 && result == 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not close the new file");
    }
out:
    return result;
}

K
Kevin Wolf 已提交
2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278
/*
 * Legacy (QemuOpts based) image creation entry point.
 *
 * Strips an optional "file:" prefix from @filename, reads size, nocow and
 * preallocation mode out of @opts, and forwards everything to
 * raw_co_create().  An unparsable preallocation mode yields -EINVAL.
 */
static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
                                           Error **errp)
{
    BlockdevCreateOptions options;
    Error *local_err = NULL;
    PreallocMode prealloc;
    int64_t total_size;
    bool nocow;
    char *prealloc_str;

    /* Skip file: protocol prefix */
    strstart(filename, "file:", &filename);

    /* Read out options; the size is rounded up to a whole sector */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);

    prealloc_str = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(&PreallocMode_lookup, prealloc_str,
                               PREALLOC_MODE_OFF, &local_err);
    g_free(prealloc_str);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    options = (BlockdevCreateOptions) {
        .driver     = BLOCKDEV_DRIVER_FILE,
        .u.file     = {
            .filename           = (char *) filename,
            .size               = total_size,
            .has_preallocation  = true,
            .preallocation      = prealloc,
            .has_nocow          = true,
            .nocow              = nocow,
        },
    };

    return raw_co_create(&options, errp);
}

2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
#if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
    off_t pos;

    /*
     * SEEK_DATA cases:
     * D1. pos == start: start is in data
     * D2. pos > start: start is in a hole, next data at pos
     * D3. pos < 0, errno = ENXIO: either start is in a trailing hole
     *                             or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. pos < 0, errno != ENXIO: we learned nothing
     */
    pos = lseek(s->fd, start, SEEK_DATA);
    if (pos < 0) {
        return -errno;          /* D3 or D4 */
    }
    if (pos < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like D4. */
        return -EIO;
    }
    if (pos > start) {
        /* D2: in hole, next data at pos */
        *hole = start;
        *data = pos;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. pos == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. pos > start: either start is in data, next hole at pos,
     *                  or start is in trailing hole, EOF at pos
     *     Linux treats trailing holes like any other hole: pos ==
     *     start.  Solaris seeks to EOF instead: pos > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. pos < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. pos < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    pos = lseek(s->fd, start, SEEK_HOLE);
    if (pos < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    if (pos < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like H4. */
        return -EIO;
    }
    if (pos == start) {
        /* D1 and H1 */
        return -EBUSY;
    }

    /*
     * D1 and H2: either in data, next hole at pos, or it was in
     * data but is now in a trailing hole.  In the latter case,
     * all bets are off.  Treating it as if there was data all
     * the way to EOF is safe, so simply do that.
     */
    *data = start;
    *hole = pos;
    return 0;
#else
    return -ENOTSUP;
#endif
}

/*
2375
 * Returns the allocation status of the specified offset.
2376
 *
2377
 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2378
 *
2379 2380
 * 'pnum' is set to the number of bytes (including and immediately following
 * the specified offset) that are known to be in the same
2381 2382
 * allocated/unallocated state.
 *
2383
 * 'bytes' is the max value 'pnum' should be set to.
2384
 */
2385 2386 2387 2388 2389 2390 2391 2392
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
                                            bool want_zero,
                                            int64_t offset,
                                            int64_t bytes, int64_t *pnum,
                                            int64_t *map,
                                            BlockDriverState **file)
{
    off_t data = 0, hole = 0;
2393
    int ret;
2394 2395 2396 2397 2398 2399

    ret = fd_open(bs);
    if (ret < 0) {
        return ret;
    }

2400 2401 2402 2403 2404
    if (!want_zero) {
        *pnum = bytes;
        *map = offset;
        *file = bs;
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2405
    }
2406

2407
    ret = find_allocation(bs, offset, &data, &hole);
2408 2409
    if (ret == -ENXIO) {
        /* Trailing hole */
2410
        *pnum = bytes;
2411 2412 2413
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
2414
        *pnum = bytes;
2415
        ret = BDRV_BLOCK_DATA;
2416 2417
    } else if (data == offset) {
        /* On a data extent, compute bytes to the end of the extent,
2418
         * possibly including a partial sector at EOF. */
2419
        *pnum = MIN(bytes, hole - offset);
2420
        ret = BDRV_BLOCK_DATA;
2421
    } else {
2422 2423 2424
        /* On a hole, compute bytes to the beginning of the next extent.  */
        assert(hole == offset);
        *pnum = MIN(bytes, data - offset);
2425
        ret = BDRV_BLOCK_ZERO;
2426
    }
2427
    *map = offset;
2428
    *file = bs;
2429
    return ret | BDRV_BLOCK_OFFSET_VALID;
2430 2431
}

2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498
#if defined(__linux__)
/* Verify that the file is not in the page cache */
/*
 * Verify that the file is not in the page cache.
 *
 * The file is mapped PROT_NONE one window (at most 128 MiB) at a time and
 * mincore() is asked which pages of the window are resident.  The first
 * problem found (mmap failure, mincore failure, or a resident page) is
 * stored in @errp and ends the scan; at most one error is ever set.
 */
static void check_cache_dropped(BlockDriverState *bs, Error **errp)
{
    const size_t window_size = 128 * 1024 * 1024;
    BDRVRawState *s = bs->opaque;
    void *window = NULL;
    size_t length = 0;
    unsigned char *vec;
    size_t page_size;
    off_t offset;
    off_t end;

    /* mincore(2) page status information requires 1 byte per page */
    page_size = sysconf(_SC_PAGESIZE);
    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));

    end = raw_getlength(bs);

    for (offset = 0; offset < end; offset += window_size) {
        size_t new_length;
        size_t vec_end;
        size_t i;
        int ret;

        /*
         * Always drop the previous window before mapping the next one.
         * mmap() without MAP_FIXED only treats the address as a hint and
         * never replaces an existing mapping, so keeping the old window
         * around would leak it.
         */
        if (window) {
            munmap(window, length);
            window = NULL;
            length = 0;
        }

        new_length = MIN(end - offset, window_size);
        window = mmap(NULL, new_length, PROT_NONE, MAP_PRIVATE,
                      s->fd, offset);
        if (window == MAP_FAILED) {
            error_setg_errno(errp, errno, "mmap failed");
            window = NULL;
            break;
        }
        length = new_length;

        ret = mincore(window, length, vec);
        if (ret < 0) {
            error_setg_errno(errp, errno, "mincore failed");
            break;
        }

        vec_end = DIV_ROUND_UP(length, page_size);
        for (i = 0; i < vec_end; i++) {
            if (vec[i] & 0x1) {
                break;
            }
        }
        if (i < vec_end) {
            /*
             * Stop the whole scan, not just the page loop: setting an
             * error a second time on the same @errp is not allowed.
             */
            error_setg(errp, "page cache still in use!");
            break;
        }
    }

    if (window) {
        munmap(window, length);
    }

    g_free(vec);
}
#endif /* __linux__ */

2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531
/*
 * Drop host page cache contents for the image so that later reads see what
 * is on disk.
 *
 * Nothing is done when the file was opened with O_DIRECT.  On Linux the
 * node is flushed, then POSIX_FADV_DONTNEED is issued for the whole file
 * and, if s->check_cache_dropped is set, the result is verified.  On other
 * hosts this is a no-op.  Failures are reported through @errp.
 */
static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
                                                 Error **errp)
{
    BDRVRawState *s = bs->opaque;
    int rc = fd_open(bs);

    if (rc < 0) {
        error_setg_errno(errp, -rc, "The file descriptor is not open");
        return;
    }

    if (s->open_flags & O_DIRECT) {
        return; /* No host kernel page cache */
    }

#if defined(__linux__)
    /* This sets the scene for the next syscall... */
    rc = bdrv_co_flush(bs);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "flush failed");
        return;
    }

    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
     * process.  These limitations are okay because we just fsynced the file,
     * we don't use mmap, and the file should not be in use by other processes.
     */
    rc = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
    if (rc != 0) { /* the return value is a positive errno */
        error_setg_errno(errp, rc, "fadvise failed");
        return;
    }

    if (s->check_cache_dropped) {
        check_cache_dropped(bs, errp);
    }
#else /* __linux__ */
    /* Do nothing.  Live migration to a remote host with cache.direct=off is
     * unsupported on other host operating systems.  Cache consistency issues
     * may occur but no error is reported here, partly because that's the
     * historical behavior and partly because it's hard to differentiate valid
     * configurations that should not cause errors.
     */
#endif /* !__linux__ */
}

2546 2547
/*
 * Discard @bytes bytes starting at @offset by submitting a QEMU_AIO_DISCARD
 * request for the image's descriptor; returns paio_submit_co()'s result.
 */
static coroutine_fn int
raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    BDRVRawState *state = bs->opaque;
    int fd = state->fd;

    return paio_submit_co(bs, fd, offset, NULL, bytes, QEMU_AIO_DISCARD);
}
2553

2554 2555
/*
 * Zero @bytes bytes starting at @offset.
 *
 * Without BDRV_REQ_MAY_UNMAP a QEMU_AIO_WRITE_ZEROES request is submitted.
 * With it, the range is discarded instead, but only if s->discard_zeroes is
 * set; otherwise -ENOTSUP is returned.
 */
static int coroutine_fn raw_co_pwrite_zeroes(
    BlockDriverState *bs, int64_t offset,
    int bytes, BdrvRequestFlags flags)
{
    BDRVRawState *s = bs->opaque;

    if (flags & BDRV_REQ_MAY_UNMAP) {
        if (!s->discard_zeroes) {
            return -ENOTSUP;
        }
        return paio_submit_co(bs, s->fd, offset, NULL, bytes,
                              QEMU_AIO_DISCARD);
    }

    return paio_submit_co(bs, s->fd, offset, NULL, bytes,
                          QEMU_AIO_WRITE_ZEROES);
}

/*
 * Fill in driver information: unallocated blocks are reported as reading
 * zero exactly when s->discard_zeroes is set.  Always returns 0.
 */
static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    const BDRVRawState *state = bs->opaque;

    bdi->unallocated_blocks_are_zero = state->discard_zeroes;

    return 0;
}

2578 2579 2580 2581 2582 2583 2584 2585 2586
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
2587 2588 2589 2590 2591
        {
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
2592 2593 2594 2595 2596
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, falloc, full)"
        },
2597 2598
        { /* end of list */ }
    }
2599 2600
};

2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619
/*
 * Permission check hook: runs the RAW_PL_PREPARE step of the permission
 * lock handling for @perm / @shared and returns its result.
 */
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
                          Error **errp)
{
    int ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);

    return ret;
}

/*
 * Permission commit hook: runs the RAW_PL_COMMIT step of the permission
 * lock handling and then records the new permissions in the driver state.
 */
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
{
    BDRVRawState *state = bs->opaque;

    /* Commit first; the stored values are only updated afterwards */
    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);

    state->perm = perm;
    state->shared_perm = shared;
}

/*
 * Permission abort hook: runs the RAW_PL_ABORT step of the permission lock
 * handling.  No permissions are passed and errors are not collected.
 */
static void raw_abort_perm_update(BlockDriverState *bs)
{
    const uint64_t no_perm = 0;

    raw_handle_perm_lock(bs, RAW_PL_ABORT, no_perm, no_perm, NULL);
}

2620 2621 2622 2623
/*
 * Source side of a copy-range request: simply forwards the request to
 * bdrv_co_copy_range_to() with unchanged arguments and returns its result.
 */
static int coroutine_fn raw_co_copy_range_from(
        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
{
    int ret;

    ret = bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
                                read_flags, write_flags);
    return ret;
}

/*
 * Destination side of a copy-range request.
 *
 * Only handled when the source node uses this very implementation as its
 * bdrv_co_copy_range_to callback; otherwise -ENOTSUP is returned.  Both
 * descriptors must pass fd_open() (else -EIO); the copy is then submitted
 * as a QEMU_AIO_COPY_RANGE request between the two descriptors.
 */
static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
                                             BdrvChild *src,
                                             uint64_t src_offset,
                                             BdrvChild *dst,
                                             uint64_t dst_offset,
                                             uint64_t bytes,
                                             BdrvRequestFlags read_flags,
                                             BdrvRequestFlags write_flags)
{
    BDRVRawState *dst_state = bs->opaque;
    BDRVRawState *src_state;

    assert(dst->bs == bs);

    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
        return -ENOTSUP;
    }
    src_state = src->bs->opaque;

    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
        return -EIO;
    }

    return paio_submit_co_full(bs, src_state->fd, src_offset,
                               dst_state->fd, dst_offset,
                               NULL, bytes, QEMU_AIO_COPY_RANGE);
}

2654
BlockDriver bdrv_file = {
2655 2656
    .format_name = "file",
    .protocol_name = "file",
B
blueswir1 已提交
2657
    .instance_size = sizeof(BDRVRawState),
2658
    .bdrv_needs_filename = true,
B
blueswir1 已提交
2659
    .bdrv_probe = NULL, /* no probe for protocols */
2660
    .bdrv_parse_filename = raw_parse_filename,
2661
    .bdrv_file_open = raw_open,
J
Jeff Cody 已提交
2662 2663 2664
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
B
blueswir1 已提交
2665
    .bdrv_close = raw_close,
K
Kevin Wolf 已提交
2666
    .bdrv_co_create = raw_co_create,
2667
    .bdrv_co_create_opts = raw_co_create_opts,
2668
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2669
    .bdrv_co_block_status = raw_co_block_status,
2670
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
2671
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
2672

2673 2674
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
2675 2676
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_co_pdiscard       = raw_co_pdiscard,
2677 2678
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
2679
    .bdrv_refresh_limits = raw_refresh_limits,
2680 2681
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
2682
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
2683

2684
    .bdrv_co_truncate = raw_co_truncate,
B
bellard 已提交
2685
    .bdrv_getlength = raw_getlength,
2686
    .bdrv_get_info = raw_get_info,
2687 2688
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
2689 2690 2691
    .bdrv_check_perm = raw_check_perm,
    .bdrv_set_perm   = raw_set_perm,
    .bdrv_abort_perm_update = raw_abort_perm_update,
2692
    .create_opts = &raw_create_opts,
B
bellard 已提交
2693 2694
};

B
bellard 已提交
2695 2696 2697
/***********************************************/
/* host device */

2698
#if defined(__APPLE__) && defined(__MACH__)
2699 2700
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
                                CFIndex maxPathSize, int flags);
2701
/*
 * Look for ejectable optical media, trying the DVD media class first and
 * the CD media class second.
 *
 * On a match, *mediaIterator is left non-zero and a newly allocated copy
 * (g_strdup) of the matching class name is returned.  Returns NULL when no
 * class produced a match.
 */
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
{
    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
    CFMutableDictionaryRef classesToMatch;
    kern_return_t kernResult = KERN_FAILURE;
    mach_port_t masterPort;
    int index;

    kernResult = IOMasterPort(MACH_PORT_NULL, &masterPort);
    if (kernResult != KERN_SUCCESS) {
        printf( "IOMasterPort returned %d\n", kernResult );
    }

    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
        const char *media_class = matching_array[index];

        classesToMatch = IOServiceMatching(media_class);
        if (classesToMatch == NULL) {
            error_report("IOServiceMatching returned NULL for %s",
                         media_class);
            continue;
        }

        /* Restrict the match to ejectable media */
        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
                             kCFBooleanTrue);
        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
                                                  mediaIterator);
        if (kernResult != KERN_SUCCESS) {
            error_report("Note: IOServiceGetMatchingServices returned %d",
                         kernResult);
            continue;
        }

        /* If a match was found, stop searching */
        if (*mediaIterator != 0) {
            DPRINTF("Matching using %s\n", media_class);
            return g_strdup(media_class);
        }
    }

    return NULL;
}

2742 2743
/*
 * Build the BSD device path of the next medium in @mediaIterator.
 *
 * @bsdPath is first set to the empty string.  If the iterator yields a
 * medium with a BSD name, the path is composed as _PATH_DEV, followed by
 * "r" when BDRV_O_NOCACHE is set in @flags, followed by the BSD name.
 *
 * Returns KERN_SUCCESS when the name was copied into @bsdPath, KERN_FAILURE
 * otherwise.
 */
kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
                         CFIndex maxPathSize, int flags)
{
    kern_return_t result = KERN_FAILURE;
    io_object_t media;
    CFTypeRef bsdName;
    size_t prefixLength;

    *bsdPath = '\0';

    media = IOIteratorNext(mediaIterator);
    if (!media) {
        return result;
    }

    bsdName = IORegistryEntryCreateCFProperty(media, CFSTR(kIOBSDNameKey),
                                              kCFAllocatorDefault, 0);
    if (bsdName) {
        strcpy(bsdPath, _PATH_DEV);
        if (flags & BDRV_O_NOCACHE) {
            strcat(bsdPath, "r");
        }

        /* The BSD name is appended right after the prefix built above */
        prefixLength = strlen(bsdPath);
        if (CFStringGetCString(bsdName, bsdPath + prefixLength,
                               maxPathSize - prefixLength,
                               kCFStringEncodingASCII)) {
            result = KERN_SUCCESS;
        }
        CFRelease(bsdName);
    }
    IOObjectRelease(media);

    return result;
}

2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810
/*
 * Sets up a real cdrom for use in QEMU.
 *
 * Tries to open "<bsd_path>s0" and then "<bsd_path>s1" read-only.  The first
 * one that opens is copied back into @bsd_path (bounded by MAXPATHLEN) and
 * true is returned.  If neither opens, @errp is set and false is returned.
 */
static bool setup_cdrom(char *bsd_path, Error **errp)
{
    char candidate[MAXPATHLEN];
    int slice;

    /* look for a working partition */
    for (slice = 0; slice < 2; slice++) {
        int probe_fd;

        snprintf(candidate, sizeof(candidate), "%ss%d", bsd_path, slice);
        probe_fd = qemu_open(candidate, O_RDONLY | O_BINARY | O_LARGEFILE);
        if (probe_fd < 0) {
            continue;
        }

        /* The open was only a probe; the caller reopens the path itself */
        qemu_close(probe_fd);
        DPRINTF("Using %s as optical disc\n", candidate);
        pstrcpy(bsd_path, MAXPATHLEN, candidate);
        return true;
    }

    /* a working partition on the device was not found */
    error_setg(errp, "Failed to find a working partition on disc");
    return false;
}

/*
 * Prints directions on mounting and unmounting a device.
 *
 * Emits three error_report() lines naming @file_name: a hint to unmount the
 * device first, and the diskutil commands to unmount and to mount it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: "
                 "diskutil unmountDisk %s", file_name);
    error_report("Command to mount device: "
                 "diskutil mountDisk %s", file_name);
}

#endif /* defined(__APPLE__) && defined(__MACH__) */
B
bellard 已提交
2811

2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827
/*
 * Probe score of the host_device driver for @filename.
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for any other path
 * that stat() reports as a character or block device, and 0 otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &st) < 0) {
        return 0;
    }

    return (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) ? 100 : 0;
}

2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860
/*
 * Check whether the opened host device accepts writes.
 *
 * Returns 0 if writable (or if the check is unavailable because BLKROGET is
 * not defined), -EACCES if a block device is flagged read-only, or -errno
 * if fstat()/ioctl() fail.
 */
static int check_hdev_writable(BDRVRawState *s)
{
#if defined(BLKROGET)
    /*
     * Linux block devices can be configured "read-only" using blockdev(8).
     * This is independent of device node permissions and therefore open(2)
     * with O_RDWR succeeds.  Actual writes fail with EPERM.
     *
     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
     * check for read-only block devices so that Linux block devices behave
     * properly.
     */
    struct stat st;
    int readonly = 0;

    if (fstat(s->fd, &st) != 0) {
        return -errno;
    }

    /* Only block devices are queried with BLKROGET */
    if (S_ISBLK(st.st_mode)) {
        if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
            return -errno;
        }
        if (readonly) {
            return -EACCES;
        }
    }
#endif /* defined(BLKROGET) */
    return 0;
}

2861 2862 2863
/*
 * .bdrv_parse_filename callback of the host_device driver: hands @filename
 * to bdrv_parse_filename_strip_prefix() with the "host_device:" prefix.
 * @errp is not used.
 */
static void hdev_parse_filename(const char *filename, QDict *options,
                                Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
}

2867 2868 2869 2870 2871
/*
 * Report whether @bs is backed by a SCSI generic (sg) device.
 *
 * On Linux this requires bs->filename to be a character device and both the
 * SG_GET_VERSION_NUM and SG_GET_SCSI_ID ioctls to succeed on the opened fd.
 * On every other platform the answer is always false.
 */
static bool hdev_is_sg(BlockDriverState *bs)
{
#if defined(__linux__)
    BDRVRawState *s = bs->opaque;
    struct stat st;
    struct sg_scsi_id scsiid;
    int sg_version;

    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
        return false;
    }

    if (ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version) < 0) {
        return false;
    }

    if (ioctl(s->fd, SG_GET_SCSI_ID, &scsiid) < 0) {
        return false;
    }

    DPRINTF("SG device found: type=%d, version=%d\n",
            scsiid.scsi_type, sg_version);
    return true;
#else
    return false;
#endif
}

M
Max Reitz 已提交
2899 2900
/*
 * .bdrv_file_open callback of the host_device driver.
 *
 * On macOS, a "filename" option of exactly "/dev/cdrom" is first replaced by
 * the BSD path of the ejectable optical medium; failing to resolve it sets
 * @errp and returns -ENOENT.  The device is then opened through
 * raw_open_common(), bs->sg is filled in, and for BDRV_O_RDWR opens the
 * device is additionally checked for writability.
 *
 * Returns the raw_open_common() result on success, a negative errno with
 * @errp set on failure.
 */
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVRawState *s = bs->opaque;
    Error *local_err = NULL;
    int ret;

#if defined(__APPLE__) && defined(__MACH__)
    /*
     * Caution: while qdict_get_str() is fine, getting non-string types
     * would require more care.  When @options come from -blockdev or
     * blockdev_add, its members are typed according to the QAPI
     * schema, but when they come from -drive, they're all QString.
     */
    const char *filename = qdict_get_str(options, "filename");
    char bsd_path[MAXPATHLEN] = "";

    /* If using a real cdrom */
    if (strcmp(filename, "/dev/cdrom") == 0) {
        io_iterator_t mediaIterator = 0;
        char *mediaType = FindEjectableOpticalMedia(&mediaIterator);
        bool resolved = false;

        /* Each step runs only if all previous ones succeeded */
        if (mediaType == NULL) {
            error_setg(errp, "Please make sure your CD/DVD is in the optical"
                       " drive");
        } else if (GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path),
                              flags) != KERN_SUCCESS) {
            error_setg(errp, "Could not get BSD path for optical drive");
        } else if (bsd_path[0] == '\0') {
            /* a real optical drive was not found */
            error_setg(errp, "Failed to obtain bsd path for optical drive");
        } else if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
                   !setup_cdrom(bsd_path, errp)) {
            /* cdrom disc, but no partition on the disc could be opened */
            print_unmounting_directions(bsd_path);
        } else {
            qdict_put_str(options, "filename", bsd_path);
            resolved = true;
        }

        /* Common cleanup for success and failure alike */
        g_free(mediaType);
        if (mediaIterator) {
            IOObjectRelease(mediaIterator);
        }
        if (!resolved) {
            return -ENOENT;
        }
    }
#endif /* defined(__APPLE__) && defined(__MACH__) */

    s->type = FTYPE_FILE;

    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
#if defined(__APPLE__) && defined(__MACH__)
        if (*bsd_path) {
            filename = bsd_path;
        }
        /* if a physical device experienced an error while being opened */
        if (strncmp(filename, "/dev/", 5) == 0) {
            print_unmounting_directions(filename);
        }
#endif /* defined(__APPLE__) && defined(__MACH__) */
        return ret;
    }

    /* Since this does ioctl the device must be already opened */
    bs->sg = hdev_is_sg(bs);

    if (flags & BDRV_O_RDWR) {
        ret = check_hdev_writable(s);
        if (ret < 0) {
            raw_close(bs);
            error_setg_errno(errp, -ret, "The device is not writable");
            return ret;
        }
    }

    return ret;
}

2998
#if defined(__linux__)
2999

3000
static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
3001
        unsigned long int req, void *buf,
3002
        BlockCompletionFunc *cb, void *opaque)
3003
{
3004
    BDRVRawState *s = bs->opaque;
3005
    RawPosixAIOData *acb;
3006
    ThreadPool *pool;
3007

3008 3009
    if (fd_open(bs) < 0)
        return NULL;
3010

3011 3012 3013 3014 3015 3016 3017 3018 3019
    if (req == SG_IO && s->pr_mgr) {
        struct sg_io_hdr *io_hdr = buf;
        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
                                      s->fd, io_hdr, cb, opaque);
        }
    }

3020
    acb = g_new(RawPosixAIOData, 1);
3021 3022 3023 3024 3025 3026
    acb->bs = bs;
    acb->aio_type = QEMU_AIO_IOCTL;
    acb->aio_fildes = s->fd;
    acb->aio_offset = 0;
    acb->aio_ioctl_buf = buf;
    acb->aio_ioctl_cmd = req;
3027 3028
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
3029
}
M
Max Reitz 已提交
3030
#endif /* linux */
3031

B
blueswir1 已提交
3032 3033 3034 3035 3036 3037 3038 3039 3040
/*
 * Sanity check used by the I/O operations: returns 0 when s->fd holds a
 * non-negative descriptor and -EIO otherwise.  Nothing is actually opened.
 */
static int fd_open(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    return s->fd >= 0 ? 0 : -EIO;
}
3041

3042 3043
/*
 * Discard @bytes bytes at @offset on the host device by submitting a
 * QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV request.
 *
 * Returns the fd_open() error if the descriptor is not usable, otherwise
 * the result of paio_submit_co().
 */
static coroutine_fn int
hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    BDRVRawState *s = bs->opaque;
    int status = fd_open(bs);

    if (status < 0) {
        return status;
    }

    return paio_submit_co(bs, s->fd, offset, NULL, bytes,
                          QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV);
}

3056
/*
 * Zero @bytes bytes at @offset on the host device.
 *
 * Without BDRV_REQ_MAY_UNMAP a write-zeroes request is submitted.  With it,
 * a discard is submitted instead, but only when s->discard_zeroes is set;
 * otherwise -ENOTSUP is returned.  An fd_open() failure is returned as is.
 */
static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BDRVRawState *s = bs->opaque;
    int aio_type;
    int rc = fd_open(bs);

    if (rc < 0) {
        return rc;
    }

    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        aio_type = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV;
    } else if (s->discard_zeroes) {
        aio_type = QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV;
    } else {
        return -ENOTSUP;
    }

    return paio_submit_co(bs, s->fd, offset, NULL, bytes, aio_type);
}

3076 3077
/*
 * "Create" an image on an existing host device.
 *
 * Nothing is really created: the function verifies that @filename (with an
 * optional "host_device:" or "host_cdrom:" prefix) is a block or character
 * device at least as large as the requested BLOCK_OPT_SIZE (rounded up to
 * BDRV_SECTOR_SIZE), and then zeroes the first sector (or less, if the
 * requested size is smaller).
 *
 * Returns 0 on success, a negative errno with @errp set on failure.
 */
static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
                                            Error **errp)
{
    int fd;
    int ret = 0;
    struct stat stat_buf;
    int64_t total_size = 0;
    bool has_prefix;

    /* This function is used by both protocol block drivers and therefore either
     * of these prefixes may be given.
     * The return value has to be stored somewhere, otherwise this is an error
     * due to -Werror=unused-value. */
    has_prefix =
        strstart(filename, "host_device:", &filename) ||
        strstart(filename, "host_cdrom:" , &filename);

    (void)has_prefix;

    ret = raw_normalize_devicepath(&filename);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not normalize device path");
        return ret;
    }

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

    fd = qemu_open(filename, O_WRONLY | O_BINARY);
    if (fd < 0) {
        ret = -errno;
        error_setg_errno(errp, -ret, "Could not open device");
        return ret;
    }

    if (fstat(fd, &stat_buf) < 0) {
        ret = -errno;
        error_setg_errno(errp, -ret, "Could not stat device");
    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
        error_setg(errp,
                   "The given file is neither a block nor a character device");
        ret = -ENODEV;
    } else if (lseek(fd, 0, SEEK_END) < total_size) {
        error_setg(errp, "Device is too small");
        ret = -ENOSPC;
    }

    if (!ret && total_size) {
        uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
        int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);

        /*
         * Every failure below must set @errp as well: returning a negative
         * value with no error object leaves the caller without a message.
         */
        if (lseek(fd, 0, SEEK_SET) == -1) {
            ret = -errno;
            error_setg_errno(errp, -ret,
                             "Could not seek to the start of the device");
        } else {
            ret = qemu_write_full(fd, buf, zero_size);
            if (ret == zero_size) {
                ret = 0;
            } else {
                ret = -errno;
                error_setg_errno(errp, -ret,
                                 "Could not zero the start of the device");
            }
        }
    }
    qemu_close(fd);
    return ret;
}

3138
static BlockDriver bdrv_host_device = {
3139
    .format_name        = "host_device",
3140
    .protocol_name        = "host_device",
3141
    .instance_size      = sizeof(BDRVRawState),
3142
    .bdrv_needs_filename = true,
3143
    .bdrv_probe_device  = hdev_probe_device,
3144
    .bdrv_parse_filename = hdev_parse_filename,
3145
    .bdrv_file_open     = hdev_open,
3146
    .bdrv_close         = raw_close,
3147 3148 3149
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
3150
    .bdrv_co_create_opts = hdev_co_create_opts,
3151
    .create_opts         = &raw_create_opts,
3152
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3153
    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3154

3155 3156
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
3157 3158
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3159 3160
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3161
    .bdrv_refresh_limits = raw_refresh_limits,
3162 3163
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
3164
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3165

3166
    .bdrv_co_truncate       = raw_co_truncate,
3167
    .bdrv_getlength	= raw_getlength,
3168
    .bdrv_get_info = raw_get_info,
3169 3170
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
3171 3172 3173
    .bdrv_check_perm = raw_check_perm,
    .bdrv_set_perm   = raw_set_perm,
    .bdrv_abort_perm_update = raw_abort_perm_update,
3174 3175
    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
    .bdrv_probe_geometry = hdev_probe_geometry,
B
bellard 已提交
3176

3177
    /* generic scsi device */
3178 3179 3180
#ifdef __linux__
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
#endif
3181 3182
};

3183 3184 3185 3186
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
/*
 * .bdrv_parse_filename callback of the host_cdrom driver: hands @filename
 * to bdrv_parse_filename_strip_prefix() with the "host_cdrom:" prefix.
 * @errp is not used.
 */
static void cdrom_parse_filename(const char *filename, QDict *options,
                                 Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
}
#endif

#ifdef __linux__
M
Max Reitz 已提交
3192 3193
/*
 * .bdrv_file_open callback of the Linux host_cdrom driver: marks the state
 * as FTYPE_CD and opens the device through raw_open_common(), whose result
 * is returned unchanged.
 */
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVRawState *state = bs->opaque;
    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
    const int extra_open_flags = O_NONBLOCK;

    state->type = FTYPE_CD;

    return raw_open_common(bs, options, flags, extra_open_flags, true, errp);
}

3203 3204
static int cdrom_probe_device(const char *filename)
{
3205 3206
    int fd, ret;
    int prio = 0;
3207
    struct stat st;
3208

3209
    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3210 3211 3212
    if (fd < 0) {
        goto out;
    }
3213 3214 3215 3216
    ret = fstat(fd, &st);
    if (ret == -1 || !S_ISBLK(st.st_mode)) {
        goto outc;
    }
3217 3218 3219 3220 3221 3222

    /* Attempt to detect via a CDROM specific ioctl */
    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
    if (ret >= 0)
        prio = 100;

3223
outc:
3224
    qemu_close(fd);
3225 3226
out:
    return prio;
3227 3228
}

3229
/*
 * Returns true when the CDROM_DRIVE_STATUS ioctl on the device reports
 * CDS_DISC_OK; any other status or an ioctl failure yields false.
 */
static bool cdrom_is_inserted(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    return ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT) == CDS_DISC_OK;
}

3238
/*
 * Open (@eject_flag true) or close (@eject_flag false) the drive tray.
 *
 * Failures are only reported on stderr through perror(); each message is
 * labelled with the ioctl that actually failed.
 */
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *s = bs->opaque;

    if (eject_flag) {
        if (ioctl(s->fd, CDROMEJECT, NULL) < 0) {
            perror("CDROMEJECT");
        }
    } else {
        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) {
            /* Was mislabelled "CDROMEJECT", hiding which request failed */
            perror("CDROMCLOSETRAY");
        }
    }
}

3251
/*
 * Lock (@locked true) or unlock the drive door with the CDROM_LOCKDOOR
 * ioctl.  The ioctl result is deliberately ignored.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    /*
     * Note: an error can happen if the distribution automatically
     * mounts the CD-ROM, so failures are not reported.
     */
    (void)ioctl(s->fd, CDROM_LOCKDOOR, locked);
}

static BlockDriver bdrv_host_cdrom = {
    .format_name        = "host_cdrom",
3266
    .protocol_name      = "host_cdrom",
3267
    .instance_size      = sizeof(BDRVRawState),
3268
    .bdrv_needs_filename = true,
3269
    .bdrv_probe_device	= cdrom_probe_device,
3270
    .bdrv_parse_filename = cdrom_parse_filename,
3271
    .bdrv_file_open     = cdrom_open,
3272
    .bdrv_close         = raw_close,
3273 3274 3275
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
3276
    .bdrv_co_create_opts = hdev_co_create_opts,
3277
    .create_opts         = &raw_create_opts,
3278
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3279

3280 3281 3282

    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
3283
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3284
    .bdrv_refresh_limits = raw_refresh_limits,
3285 3286
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
3287
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3288

3289
    .bdrv_co_truncate    = raw_co_truncate,
3290 3291
    .bdrv_getlength      = raw_getlength,
    .has_variable_length = true,
3292 3293
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
3294 3295 3296 3297

    /* removable device support */
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
3298
    .bdrv_lock_medium   = cdrom_lock_medium,
3299 3300

    /* generic scsi device */
3301
    .bdrv_aio_ioctl     = hdev_aio_ioctl,
3302 3303 3304
};
#endif /* __linux__ */

A
Aurelien Jarno 已提交
3305
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3306 3307
/*
 * .bdrv_file_open callback of the FreeBSD host_cdrom driver: marks the
 * state as FTYPE_CD, opens the device through raw_open_common() and then
 * issues CDIOCALLOW on the new descriptor.
 *
 * Returns 0 on success; on failure, the raw_open_common() result with the
 * error propagated to @errp.
 */
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    BDRVRawState *s = bs->opaque;
    Error *local_err = NULL;
    int ret;

    s->type = FTYPE_CD;

    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
    if (ret != 0) {
        error_propagate(errp, local_err);
        return ret;
    }

    /* make sure the door isn't locked at this time */
    ioctl(s->fd, CDIOCALLOW);

    return 0;
}

3326 3327 3328 3329 3330 3331 3332 3333
/*
 * Probe score of the FreeBSD host_cdrom driver: 100 for names starting
 * with "/dev/cd" or "/dev/acd", 0 for everything else.
 */
static int cdrom_probe_device(const char *filename)
{
    bool looks_like_cdrom = strstart(filename, "/dev/cd", NULL) ||
                            strstart(filename, "/dev/acd", NULL);

    return looks_like_cdrom ? 100 : 0;
}

3334 3335 3336 3337 3338 3339 3340 3341 3342 3343
/*
 * Close and reopen the device node named by bs->filename with the flags
 * stored in s->open_flags, then issue CDIOCALLOW on the new descriptor.
 *
 * Returns 0 on success.  If the reopen fails, s->fd is left at -1 and
 * -EIO is returned.
 */
static int cdrom_reopen(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int new_fd;

    /*
     * Force reread of possibly changed/newly loaded disc,
     * FreeBSD seems to not notice sometimes...
     */
    if (s->fd >= 0) {
        qemu_close(s->fd);
    }

    new_fd = qemu_open(bs->filename, s->open_flags, 0644);
    s->fd = (new_fd < 0) ? -1 : new_fd;
    if (new_fd < 0) {
        return -EIO;
    }

    /* make sure the door isn't locked at this time */
    ioctl(s->fd, CDIOCALLOW);
    return 0;
}

3357
/*
 * A medium counts as inserted when raw_getlength() reports a positive
 * length for the device.
 */
static bool cdrom_is_inserted(BlockDriverState *bs)
{
    return (raw_getlength(bs) > 0) ? true : false;
}

3362
/*
 * Open (@eject_flag true) or close (@eject_flag false) the drive tray and
 * then reopen the device node.  Does nothing when no descriptor is open.
 *
 * The door is unlocked with CDIOCALLOW first; a failing eject/close ioctl
 * is only reported through perror().
 */
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *s = bs->opaque;
    int rc;

    if (s->fd < 0) {
        return;
    }

    (void) ioctl(s->fd, CDIOCALLOW);

    rc = eject_flag ? ioctl(s->fd, CDIOCEJECT) : ioctl(s->fd, CDIOCCLOSE);
    if (rc < 0) {
        perror(eject_flag ? "CDIOCEJECT" : "CDIOCCLOSE");
    }

    cdrom_reopen(bs);
}

3382
/*
 * Lock (@locked true, CDIOCPREVENT) or unlock (CDIOCALLOW) the drive door.
 * Does nothing when no descriptor is open; the ioctl result is deliberately
 * ignored.
 */
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd < 0) {
        return;
    }

    /*
     * Note: an error can happen if the distribution automatically
     * mounts the CD-ROM, so failures are not reported.
     */
    (void)ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
}

static BlockDriver bdrv_host_cdrom = {
    .format_name        = "host_cdrom",
3399
    .protocol_name      = "host_cdrom",
3400
    .instance_size      = sizeof(BDRVRawState),
3401
    .bdrv_needs_filename = true,
3402
    .bdrv_probe_device	= cdrom_probe_device,
3403
    .bdrv_parse_filename = cdrom_parse_filename,
3404
    .bdrv_file_open     = cdrom_open,
3405
    .bdrv_close         = raw_close,
3406 3407 3408
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
3409
    .bdrv_co_create_opts = hdev_co_create_opts,
3410
    .create_opts        = &raw_create_opts,
3411

3412 3413
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
3414
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3415
    .bdrv_refresh_limits = raw_refresh_limits,
3416 3417
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
3418
    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3419

3420
    .bdrv_co_truncate    = raw_co_truncate,
3421 3422
    .bdrv_getlength      = raw_getlength,
    .has_variable_length = true,
3423 3424
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
3425

B
bellard 已提交
3426
    /* removable device support */
3427 3428
    .bdrv_is_inserted   = cdrom_is_inserted,
    .bdrv_eject         = cdrom_eject,
3429
    .bdrv_lock_medium   = cdrom_lock_medium,
B
bellard 已提交
3430
};
3431
#endif /* __FreeBSD__ */
3432

3433
static void bdrv_file_init(void)
3434
{
3435 3436 3437 3438
    /*
     * Register all the drivers.  Note that order is important, the driver
     * registered last will get probed first.
     */
3439
    bdrv_register(&bdrv_file);
3440
    bdrv_register(&bdrv_host_device);
3441 3442 3443
#ifdef __linux__
    bdrv_register(&bdrv_host_cdrom);
#endif
A
Aurelien Jarno 已提交
3444
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3445 3446
    bdrv_register(&bdrv_host_cdrom);
#endif
3447 3448
}

3449
block_init(bdrv_file_init);