You need to sign in or sign up before continuing.
block_int.h 38.9 KB
Newer Older
B
bellard 已提交
1 2
/*
 * QEMU System Emulator block driver
3
 *
B
bellard 已提交
4
 * Copyright (c) 2003 Fabrice Bellard
5
 *
B
bellard 已提交
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef BLOCK_INT_H
#define BLOCK_INT_H

27
#include "block/accounting.h"
28
#include "block/block.h"
29 30
#include "qemu/option.h"
#include "qemu/queue.h"
31
#include "qemu/coroutine.h"
32
#include "qemu/stats64.h"
33
#include "qemu/timer.h"
L
Luiz Capitulino 已提交
34
#include "qapi-types.h"
35
#include "qemu/hbitmap.h"
36
#include "block/snapshot.h"
37
#include "qemu/main-loop.h"
38
#include "qemu/throttle.h"
P
pbrook 已提交
39

40
#define BLOCK_FLAG_LAZY_REFCOUNTS   8
41

42 43
#define BLOCK_OPT_SIZE              "size"
#define BLOCK_OPT_ENCRYPT           "encryption"
44
#define BLOCK_OPT_ENCRYPT_FORMAT    "encrypt.format"
45
#define BLOCK_OPT_COMPAT6           "compat6"
46
#define BLOCK_OPT_HWVERSION         "hwversion"
47 48 49 50 51 52 53 54
#define BLOCK_OPT_BACKING_FILE      "backing_file"
#define BLOCK_OPT_BACKING_FMT       "backing_fmt"
#define BLOCK_OPT_CLUSTER_SIZE      "cluster_size"
#define BLOCK_OPT_TABLE_SIZE        "table_size"
#define BLOCK_OPT_PREALLOC          "preallocation"
#define BLOCK_OPT_SUBFMT            "subformat"
#define BLOCK_OPT_COMPAT_LEVEL      "compat"
#define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
55
#define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
56
#define BLOCK_OPT_REDUNDANCY        "redundancy"
57
#define BLOCK_OPT_NOCOW             "nocow"
58
#define BLOCK_OPT_OBJECT_SIZE       "object_size"
59
#define BLOCK_OPT_REFCOUNT_BITS     "refcount_bits"
60

61 62
#define BLOCK_PROBE_BUF_SIZE        512

63 64 65 66 67 68
/* Kind of operation recorded in the 'type' field of a BdrvTrackedRequest. */
enum BdrvTrackedRequestType {
    BDRV_TRACKED_READ    = 0,
    BDRV_TRACKED_WRITE   = 1,
    BDRV_TRACKED_DISCARD = 2,
};

69 70
typedef struct BdrvTrackedRequest {
    BlockDriverState *bs;
71 72
    int64_t offset;
    unsigned int bytes;
73
    enum BdrvTrackedRequestType type;
74

75
    bool serialising;
76 77 78
    int64_t overlap_offset;
    unsigned int overlap_bytes;

79 80 81
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
82 83

    struct BdrvTrackedRequest *waiting_for;
84 85
} BdrvTrackedRequest;

B
bellard 已提交
86 87 88
struct BlockDriver {
    const char *format_name;
    int instance_size;
89

90 91 92 93
    /* set to true if the BlockDriver is a block filter */
    bool is_filter;
    /* For snapshots, block filters like Quorum can implement the
     * following recursive callback.
94 95 96
     * Its purpose is to recurse on the filter children while calling
     * bdrv_recurse_is_first_non_filter on them.
     * For a sample implementation look in the future Quorum block filter.
97
     */
98 99
    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
                                             BlockDriverState *candidate);
100

B
bellard 已提交
101
    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
102
    int (*bdrv_probe_device)(const char *filename);
103 104 105

    /* Any driver implementing this callback is expected to be able to handle
     * NULL file names in its .bdrv_open() implementation */
106
    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
107 108 109 110 111 112
    /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
     * this field set to true, except ones that are defined only by their
     * child's bs.
     * An example of the last type will be the quorum block driver.
     */
    bool bdrv_needs_filename;
113

114 115 116
    /* Set if a driver can support backing files */
    bool supports_backing;

117 118 119 120 121
    /* For handling image reopen for split or non-split files */
    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp);
    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
122
    void (*bdrv_join_options)(QDict *options, QDict *old_options);
123

M
Max Reitz 已提交
124 125 126 127
    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp);
    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp);
B
bellard 已提交
128
    void (*bdrv_close)(BlockDriverState *bs);
C
Chunyan Liu 已提交
129
    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
B
bellard 已提交
130
    int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
131
    int (*bdrv_make_empty)(BlockDriverState *bs);
M
Max Reitz 已提交
132

133
    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
M
Max Reitz 已提交
134

B
bellard 已提交
135
    /* aio */
136
    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
137
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
138
        BlockCompletionFunc *cb, void *opaque);
139
    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
140
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
141
        BlockCompletionFunc *cb, void *opaque);
142
    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
143
        BlockCompletionFunc *cb, void *opaque);
144
    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
145
        int64_t offset, int bytes,
146
        BlockCompletionFunc *cb, void *opaque);
B
bellard 已提交
147

K
Kevin Wolf 已提交
148 149
    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
150 151
    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
K
Kevin Wolf 已提交
152 153
    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
154 155
    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
156 157
    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
158

159 160 161
    /*
     * Efficiently zero a region of the disk image.  Typically an image format
     * would use a compact metadata representation to implement this.  This
162 163
     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
     * will be called instead.
164
     */
E
Eric Blake 已提交
165
    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
166
        int64_t offset, int bytes, BdrvRequestFlags flags);
167
    int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
168
        int64_t offset, int bytes);
169 170 171 172 173 174 175

    /*
     * Building block for bdrv_block_status[_above]. The driver should
     * answer only according to the current layer, and should not
     * set BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW.  See block.h
     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID.
     */
176
    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
177 178
        int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file);
K
Kevin Wolf 已提交
179

180 181 182
    /*
     * Invalidate any cached meta-data.
     */
183
    void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
184
    int (*bdrv_inactivate)(BlockDriverState *bs);
185

P
Pavel Dovgalyuk 已提交
186 187 188 189 190 191 192
    /*
     * Flushes all data for all layers by calling bdrv_co_flush for underlying
     * layers, if needed. This function is needed for deterministic
     * synchronization of the flush finishing callback.
     */
    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);

193 194
    /*
     * Flushes all data that was already written to the OS all the way down to
195
     * the disk (for example file-posix.c calls fsync()).
196 197 198
     */
    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);

K
Kevin Wolf 已提交
199 200 201 202 203 204 205
    /*
     * Flushes all internal caches to the OS. The data may still sit in a
     * writeback cache of the host OS, but it will survive a crash of the qemu
     * process.
     */
    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);

B
bellard 已提交
206
    const char *protocol_name;
207
    int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset, Error **errp);
208

B
bellard 已提交
209
    int64_t (*bdrv_getlength)(BlockDriverState *bs);
210
    bool has_variable_length;
211
    int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
212

213 214 215
    int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov);

216
    int (*bdrv_snapshot_create)(BlockDriverState *bs,
B
bellard 已提交
217
                                QEMUSnapshotInfo *sn_info);
218
    int (*bdrv_snapshot_goto)(BlockDriverState *bs,
B
bellard 已提交
219
                              const char *snapshot_id);
220 221 222 223
    int (*bdrv_snapshot_delete)(BlockDriverState *bs,
                                const char *snapshot_id,
                                const char *name,
                                Error **errp);
224
    int (*bdrv_snapshot_list)(BlockDriverState *bs,
B
bellard 已提交
225
                              QEMUSnapshotInfo **psn_info);
E
edison 已提交
226
    int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
227 228 229
                                  const char *snapshot_id,
                                  const char *name,
                                  Error **errp);
B
bellard 已提交
230
    int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
M
Max Reitz 已提交
231
    ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);
B
bellard 已提交
232

233 234 235 236 237 238
    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);
    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);
239

K
Kevin Wolf 已提交
240 241 242
    int (*bdrv_change_backing_file)(BlockDriverState *bs,
        const char *backing_file, const char *backing_fmt);

B
bellard 已提交
243
    /* removable device specific */
244
    bool (*bdrv_is_inserted)(BlockDriverState *bs);
B
bellard 已提交
245
    int (*bdrv_media_changed)(BlockDriverState *bs);
246
    void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
247
    void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
248

249
    /* to control generic scsi devices */
250
    BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
251
        unsigned long int req, void *buf,
252
        BlockCompletionFunc *cb, void *opaque);
253 254
    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
                                      unsigned long int req, void *buf);
255

256
    /* List of options for creating images, terminated by name == NULL */
257
    QemuOptsList *create_opts;
258

259 260 261 262
    /*
     * Returns 0 for completed check, -errno for internal errors.
     * The check results are stored in result.
     */
263
    int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result,
264
        BdrvCheckMode fix);
A
aliguori 已提交
265

266
    int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
267 268
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque);
M
Max Reitz 已提交
269

270
    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
K
Kevin Wolf 已提交
271

K
Kevin Wolf 已提交
272 273 274
    /* TODO Better pass an option string/QDict/QemuOpts to add any rule? */
    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
        const char *tag);
F
Fam Zheng 已提交
275 276
    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
        const char *tag);
K
Kevin Wolf 已提交
277 278 279
    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);

280
    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
281

K
Kevin Wolf 已提交
282 283 284 285 286
    /*
     * Returns 1 if newly created images are guaranteed to contain only
     * zeros, 0 otherwise.
     */
    int (*bdrv_has_zero_init)(BlockDriverState *bs);
287

288 289 290 291 292 293 294 295 296 297 298 299 300
    /* Remove fd handlers, timers, and other event loop callbacks so the event
     * loop is no longer in use.  Called with no in-flight requests and in
     * depth-first traversal order with parents before child nodes.
     */
    void (*bdrv_detach_aio_context)(BlockDriverState *bs);

    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
     * can be processed again.  Called with no in-flight requests and in
     * depth-first traversal order with child nodes before parent nodes.
     */
    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
                                    AioContext *new_context);

301 302 303 304
    /* io queue for linux-aio */
    void (*bdrv_io_plug)(BlockDriverState *bs);
    void (*bdrv_io_unplug)(BlockDriverState *bs);

305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
    /**
     * Try to get @bs's logical and physical block size.
     * On success, store them in @bsz and return zero.
     * On failure, return negative errno.
     */
    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
    /**
     * Try to get @bs's geometry (cyls, heads, sectors)
     * On success, store them in @geo and return 0.
     * On failure return -errno.
     * Only drivers that want to override guest geometry implement this
     * callback; see hd_geometry_guess().
     */
    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);

320 321 322 323 324 325
    /**
     * Drain and stop any internal sources of requests in the driver, and
     * remain so until next I/O callback (e.g. bdrv_co_writev) is called.
     */
    void (*bdrv_drain)(BlockDriverState *bs);

326 327 328 329 330
    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
                           Error **errp);
    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
                           Error **errp);

331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
    /**
     * Informs the block driver that a permission change is intended. The
     * driver checks whether the change is permissible and may take other
     * preparations for the change (e.g. get file system locks). This operation
     * is always followed by a call to either .bdrv_set_perm or
     * .bdrv_abort_perm_update.
     *
     * Checks whether the requested set of cumulative permissions in @perm
     * can be granted for accessing @bs and whether no other users are using
     * permissions other than those given in @shared (both arguments take
     * BLK_PERM_* bitmasks).
     *
     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
     * and errp is set to an error describing the conflict.
     */
    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
                           uint64_t shared, Error **errp);

    /**
     * Called to inform the driver that the cumulative set of used
     * permissions for @bs has changed to @perm, and the set of sharable
     * permission to @shared. The driver can use this to propagate changes to
     * its children (i.e. request permissions only if a parent actually needs
     * them).
     *
     * This function is only invoked after bdrv_check_perm(), so block drivers
     * may rely on preparations made in their .bdrv_check_perm implementation.
     */
    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);

    /*
     * Called to inform the driver that after a previous bdrv_check_perm()
     * call, the permission update is not performed and any preparations made
     * for it (e.g. taken file locks) need to be undone.
     *
     * This function can be called even for nodes that never saw a
     * bdrv_check_perm() call. It is a no-op then.
     */
    void (*bdrv_abort_perm_update)(BlockDriverState *bs);

    /**
     * Returns in @nperm and @nshared the permissions that the driver for @bs
     * needs on its child @c, based on the cumulative permissions requested by
     * the parents in @parent_perm and @parent_shared.
     *
     * If @c is NULL, return the permissions for attaching a new child for the
     * given @role.
     */
     void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
                             const BdrvChildRole *role,
                             uint64_t parent_perm, uint64_t parent_shared,
                             uint64_t *nperm, uint64_t *nshared);

384
    QLIST_ENTRY(BlockDriver) list;
B
bellard 已提交
385 386
};

387
typedef struct BlockLimits {
388 389 390 391 392 393
    /* Alignment requirement, in bytes, for offset/length of I/O
     * requests. Must be a power of 2 less than INT_MAX; defaults to
     * 1 for drivers with modern byte interfaces, and to 512
     * otherwise. */
    uint32_t request_alignment;

394 395
    /* Maximum number of bytes that can be discarded at once (since it
     * is signed, it must be < 2G, if set). Must be multiple of
396 397 398 399
     * pdiscard_alignment, but need not be power of 2. May be 0 if no
     * inherent 32-bit limit */
    int32_t max_pdiscard;

400 401 402 403
    /* Optimal alignment for discard requests in bytes. A power of 2
     * is best but not mandatory.  Must be a multiple of
     * bl.request_alignment, and must be less than max_pdiscard if
     * that is set. May be 0 if bl.request_alignment is good enough */
404
    uint32_t pdiscard_alignment;
405

406 407
    /* Maximum number of bytes that can be zeroized at once (since it is
     * signed, it must be < 2G, if set). Must be multiple of
408
     * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */
409
    int32_t max_pwrite_zeroes;
410

411 412 413 414 415
    /* Optimal alignment for write zeroes requests in bytes. A power
     * of 2 is best but not mandatory.  Must be a multiple of
     * bl.request_alignment, and must be less than max_pwrite_zeroes
     * if that is set. May be 0 if bl.request_alignment is good
     * enough */
416
    uint32_t pwrite_zeroes_alignment;
417

418 419 420
    /* Optimal transfer length in bytes.  A power of 2 is best but not
     * mandatory.  Must be a multiple of bl.request_alignment, or 0 if
     * no preferred size */
421 422
    uint32_t opt_transfer;

423 424 425 426
    /* Maximal transfer length in bytes.  Need not be power of 2, but
     * must be multiple of opt_transfer and bl.request_alignment, or 0
     * for no 32-bit limit.  For now, anything larger than INT_MAX is
     * clamped down. */
427
    uint32_t max_transfer;
428

429
    /* memory alignment, in bytes so that no bounce buffer is needed */
430 431
    size_t min_mem_alignment;

432
    /* memory alignment, in bytes, for bounce buffer */
433
    size_t opt_mem_alignment;
434 435 436

    /* maximum number of iovec elements */
    int max_iov;
437 438
} BlockLimits;

439 440
typedef struct BdrvOpBlocker BdrvOpBlocker;

M
Max Reitz 已提交
441 442 443 444 445
typedef struct BdrvAioNotifier {
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    void (*detach_aio_context)(void *opaque);

    void *opaque;
446
    bool deleted;
M
Max Reitz 已提交
447 448 449 450

    QLIST_ENTRY(BdrvAioNotifier) list;
} BdrvAioNotifier;

451
struct BdrvChildRole {
452 453
    /* If true, bdrv_replace_node() doesn't change the node this BdrvChild
     * points to. */
454 455
    bool stay_at_node;

456 457
    void (*inherit_options)(int *child_flags, QDict *child_options,
                            int parent_flags, QDict *parent_options);
458

459 460 461
    void (*change_media)(BdrvChild *child, bool load);
    void (*resize)(BdrvChild *child);

462 463 464
    /* Returns a name that is supposedly more useful for human users than the
     * node name for identifying the node in question (in particular, a BB
     * name), or NULL if the parent can't provide a better name. */
465
    const char *(*get_name)(BdrvChild *child);
466

467 468 469 470
    /* Returns a malloced string that describes the parent of the child for a
     * human reader. This could be a node-name, BlockBackend name, qdev ID or
     * QOM path of the device owning the BlockBackend, job type and ID etc. The
     * caller is responsible for freeing the memory. */
471
    char *(*get_parent_desc)(BdrvChild *child);
472

473 474 475 476 477 478 479 480 481 482
    /*
     * If this pair of functions is implemented, the parent doesn't issue new
     * requests after returning from .drained_begin() until .drained_end() is
     * called.
     *
     * Note that this can be nested. If drained_begin() was called twice, new
     * I/O is allowed only after drained_end() was called twice, too.
     */
    void (*drained_begin)(BdrvChild *child);
    void (*drained_end)(BdrvChild *child);
483

484 485 486
    /* Notifies the parent that the child has been activated/inactivated (e.g.
     * when migration is completing) and it can start/stop requesting
     * permissions and doing I/O on it. */
487
    void (*activate)(BdrvChild *child, Error **errp);
488
    int (*inactivate)(BdrvChild *child);
489

490 491
    void (*attach)(BdrvChild *child);
    void (*detach)(BdrvChild *child);
492 493 494 495
};

extern const BdrvChildRole child_file;
extern const BdrvChildRole child_format;
K
Kevin Wolf 已提交
496
extern const BdrvChildRole child_backing;
497

K
Kevin Wolf 已提交
498
struct BdrvChild {
499
    BlockDriverState *bs;
500
    char *name;
501
    const BdrvChildRole *role;
K
Kevin Wolf 已提交
502
    void *opaque;
503 504 505 506 507 508 509 510 511 512 513 514

    /**
     * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask)
     */
    uint64_t perm;

    /**
     * Permissions that can still be granted to other users of @bs while this
     * BdrvChild is still attached to it. (BLK_PERM_* bitmask)
     */
    uint64_t shared_perm;

515
    QLIST_ENTRY(BdrvChild) next;
K
Kevin Wolf 已提交
516
    QLIST_ENTRY(BdrvChild) next_parent;
K
Kevin Wolf 已提交
517
};
518

519 520 521 522 523 524
/*
 * Note: the function bdrv_append() copies and swaps contents of
 * BlockDriverStates, so if you add new fields to this struct, please
 * inspect bdrv_append() to determine if the new fields need to be
 * copied as well.
 */
B
bellard 已提交
525
struct BlockDriverState {
526 527 528
    /* Protected by big QEMU lock or read-only after opening.  No special
     * locking needed during I/O...
     */
529
    int open_flags; /* flags used to open the file, re-used for re-open */
530 531 532 533
    bool read_only; /* if true, the media is read only */
    bool encrypted; /* if true, the media is encrypted */
    bool sg;        /* if true, the device is a /dev/sg* */
    bool probed;    /* if true, format was probed rather than specified */
534
    bool force_share; /* if true, always allow all shared permissions */
535

B
bellard 已提交
536
    BlockDriver *drv; /* NULL means no media */
B
bellard 已提交
537 538
    void *opaque;

539
    AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
M
Max Reitz 已提交
540 541 542 543
    /* long-running tasks intended to always use the same AioContext as this
     * BDS may register themselves in this list to be notified of changes
     * regarding this BDS's context */
    QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
544
    bool walking_aio_notifiers; /* to make removal during iteration safe */
545

546 547 548
    char filename[PATH_MAX];
    char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
                                    this file image */
549
    char backing_format[16]; /* if non-zero and backing_file exists */
B
bellard 已提交
550

M
Max Reitz 已提交
551
    QDict *full_open_options;
552
    char exact_filename[PATH_MAX];
M
Max Reitz 已提交
553

554
    BdrvChild *backing;
K
Kevin Wolf 已提交
555
    BdrvChild *file;
556

557 558 559
    /* I/O Limits */
    BlockLimits bl;

560 561
    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
    unsigned int supported_write_flags;
E
Eric Blake 已提交
562
    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
563 564
     * BDRV_REQ_MAY_UNMAP) */
    unsigned int supported_zero_flags;
565

566 567 568 569
    /* the following member gives a name to every node on the bs graph. */
    char node_name[32];
    /* element of the list of named nodes building the graph */
    QTAILQ_ENTRY(BlockDriverState) node_list;
570 571
    /* element of the list of all BlockDriverStates (all_bdrv_states) */
    QTAILQ_ENTRY(BlockDriverState) bs_list;
572 573
    /* element of the list of monitor-owned BDS */
    QTAILQ_ENTRY(BlockDriverState) monitor_list;
574
    int refcnt;
S
Stefan Hajnoczi 已提交
575

576 577 578
    /* operation blockers */
    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];

579 580
    /* long-running background operation */
    BlockJob *job;
581

582 583 584 585
    /* The node that this node inherited default options from (and a reopen on
     * which can affect this node by changing these defaults). This is always a
     * parent node of this node. */
    BlockDriverState *inherits_from;
586
    QLIST_HEAD(, BdrvChild) children;
K
Kevin Wolf 已提交
587
    QLIST_HEAD(, BdrvChild) parents;
588

589
    QDict *options;
K
Kevin Wolf 已提交
590
    QDict *explicit_options;
591
    BlockdevDetectZeroesOptions detect_zeroes;
592 593 594

    /* The error object in use for blocking operations on backing_hd */
    Error *backing_blocker;
595

596 597 598
    /* Protected by AioContext lock */

    /* If we are reading a disk image, give its size in sectors.
599 600
     * Generally read-only; it is written to by load_snapshot and
     * save_snapshot, but the block layer is quiescent during those.
601 602 603 604 605 606
     */
    int64_t total_sectors;

    /* Callback before write request is processed */
    NotifierWithReturnList before_write_notifiers;

607 608 609
    /* threshold limit for writes, in bytes. "High water mark". */
    uint64_t write_threshold_offset;
    NotifierWithReturn write_threshold_notifier;
610

611 612
    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
     * Reading from the list can be done with either the BQL or the
613 614
     * dirty_bitmap_mutex.  Modifying a bitmap only requires
     * dirty_bitmap_mutex.  */
615
    QemuMutex dirty_bitmap_mutex;
616 617
    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;

618 619 620
    /* Offset after the highest byte written to */
    Stat64 wr_highest_offset;

621 622 623 624 625 626
    /* If non-zero, copy read backing sectors into image.  Can be >1 if more
     * than one client has requested copy-on-read.  Accessed with atomic
     * ops.
     */
    int copy_on_read;

627 628 629 630 631 632
    /* number of in-flight requests; overall and serialising.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
    unsigned int serialising_in_flight;

633 634 635 636 637
    /* Internal to BDRV_POLL_WHILE and bdrv_wakeup.  Accessed with atomic
     * ops.
     */
    bool wakeup;

638 639 640 641 642
    /* counter for nested bdrv_io_plug.
     * Accessed with atomic ops.
    */
    unsigned io_plugged;

643 644 645
    /* do we need to tell the guest if we have a volatile write cache? */
    int enable_write_cache;

646
    /* Accessed with atomic ops.  */
647
    int quiesce_counter;
648
    unsigned int write_gen;               /* Current data generation */
649 650 651 652 653 654 655 656 657

    /* Protected by reqs_lock.  */
    CoMutex reqs_lock;
    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
    CoQueue flush_queue;                  /* Serializing flush queue */
    bool active_flush_req;                /* Flush request in flight? */

    /* Only read/written by whoever has set active_flush_req to true.  */
    unsigned int flushed_gen;             /* Flushed write generation */
B
bellard 已提交
658 659
};

M
Max Reitz 已提交
660 661 662 663 664 665
/*
 * Bundle of three settings: open flags, read-only state and the
 * detect-zeroes mode.
 * NOTE(review): the fields share names and types with members of
 * BlockDriverState; presumably they hold the same information for a
 * BlockBackend root node -- confirm against the users of this struct.
 */
struct BlockBackendRootState {
    int open_flags;     /* NOTE(review): presumably BDRV_O_* flags -- confirm */
    bool read_only;
    BlockdevDetectZeroesOptions detect_zeroes;
};

M
Max Reitz 已提交
666 667 668 669 670 671 672 673 674 675 676 677 678 679
/* Selects what happens to the target's backing chain. */
typedef enum BlockMirrorBackingMode {
    /*
     * Reuse the existing backing chain from the source for the target:
     *   sync=full -- set backing BDS to NULL
     *   sync=top  -- use source's backing BDS
     *   sync=none -- use source as the backing BDS
     */
    MIRROR_SOURCE_BACKING_CHAIN = 0,

    /* Open the target's backing chain completely anew. */
    MIRROR_OPEN_BACKING_CHAIN = 1,

    /* Leave the target's backing BDS untouched after job completion. */
    MIRROR_LEAVE_BACKING_CHAIN = 2,
} BlockMirrorBackingMode;

680 681 682 683 684
/*
 * Return the node referenced by @bs's backing child, or NULL when @bs has
 * no backing child.  @bs is dereferenced unconditionally and must not be
 * NULL.
 */
static inline BlockDriverState *backing_bs(BlockDriverState *bs)
{
    if (bs->backing == NULL) {
        return NULL;
    }

    return bs->backing->bs;
}

685 686 687 688 689 690 691

/* Essential block drivers which must always be statically linked into qemu, and
 * which therefore can be accessed without using bdrv_find_format() */
extern BlockDriver bdrv_file;
extern BlockDriver bdrv_raw;
extern BlockDriver bdrv_qcow2;

692
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
693 694
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
695
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
696 697
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
698

699
int get_tmp_filename(char *filename, int size);
700 701
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename);
702

703 704 705
void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
                                      QDict *options);

706

707 708 709 710 711 712 713 714 715
/**
 * bdrv_add_before_write_notifier:
 * @bs: The BlockDriverState the notifier is registered on.
 * @notifier: The notifier to register.
 *
 * Register a callback that is invoked before write requests are processed but
 * after any throttling or waiting for overlapping requests.
 */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier);

716 717 718 719 720
/**
 * bdrv_detach_aio_context:
 *
 * May be called from .bdrv_detach_aio_context() to detach children from the
 * current #AioContext.  This is only needed by block drivers that manage their
 * own children.  Both ->file and ->backing are automatically handled and
 * block drivers should not call this function on them explicitly.
 */
void bdrv_detach_aio_context(BlockDriverState *bs);

/**
 * bdrv_attach_aio_context:
 *
 * May be called from .bdrv_attach_aio_context() to attach children to the new
 * #AioContext.  This is only needed by block drivers that manage their own
 * children.  Both ->file and ->backing are automatically handled and block
 * drivers should not call this function on them explicitly.
 */
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context);

M
Max Reitz 已提交
737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764
/**
 * bdrv_add_aio_context_notifier:
 * @bs: The BDS whose AioContext association is to be watched.
 * @attached_aio_context: Callback taking the new AioContext and @opaque.
 * @detach_aio_context: Callback taking @opaque.
 * @opaque: Opaque pointer value passed to both callbacks.
 *
 * If a long-running job intends to be always run in the same AioContext as a
 * certain BDS, it may use this function to be notified of changes regarding the
 * association of the BDS to an AioContext.
 *
 * attached_aio_context() is called after the target BDS has been attached to a
 * new AioContext; detach_aio_context() is called before the target BDS is
 * detached from its old AioContext.
 */
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque);

/**
 * bdrv_remove_aio_context_notifier:
 *
 * Unsubscribe from change notifications regarding the BDS's AioContext. The
 * parameters given here have to be the same as those given to
 * bdrv_add_aio_context_notifier().
 */
void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*aio_context_attached)(AioContext *,
                                                                   void *),
                                      void (*aio_context_detached)(void *),
                                      void *opaque);

765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
/**
 * bdrv_wakeup:
 * @bs: The BlockDriverState for which an I/O operation has been completed.
 *
 * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
 * synchronous I/O on a BlockDriverState that is attached to another
 * I/O thread, the main thread lets the I/O thread's event loop run,
 * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
 * up the main thread if necessary.
 *
 * Manual calls to bdrv_wakeup are rarely necessary, because
 * bdrv_dec_in_flight already calls it.
 */
void bdrv_wakeup(BlockDriverState *bs);

780 781 782 783
#ifdef _WIN32
/* Only declared when building for Windows (guarded by _WIN32).
 * NOTE(review): presumably returns non-zero when @filename designates a
 * Windows drive -- confirm against the definition. */
int is_windows_drive(const char *filename);
#endif

P
Paolo Bonzini 已提交
784 785
/**
 * stream_start:
786 787
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
P
Paolo Bonzini 已提交
788 789 790
 * @bs: Block device to operate on.
 * @base: Block device that will become the new base, or %NULL to
 * flatten the whole backing file chain onto @bs.
791 792
 * @backing_file_str: The file name that will be written to @bs as the
 * the new backing file if the job completes. Ignored if @base is %NULL.
793
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
P
Paolo Bonzini 已提交
794
 * @on_error: The action to take upon error.
795
 * @errp: Error object.
P
Paolo Bonzini 已提交
796 797 798 799 800
 *
 * Start a streaming operation on @bs.  Clusters that are unallocated
 * in @bs, but allocated in any image between @base and @bs (both
 * exclusive) will be written to @bs.  At the end of a successful
 * streaming job, the backing file of @bs will be changed to
801 802
 * @backing_file_str in the written image and to @base in the live
 * BlockDriverState.
P
Paolo Bonzini 已提交
803
 */
804 805
void stream_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, const char *backing_file_str,
806
                  int64_t speed, BlockdevOnError on_error, Error **errp);
807

808 809
/**
 * commit_start:
810 811
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
F
Fam Zheng 已提交
812 813 814
 * @bs: Active block device.
 * @top: Top block device to be committed.
 * @base: Block device that will be written into, and become the new top.
815 816
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @on_error: The action to take upon error.
817
 * @backing_file_str: String to use as the backing file in @top's overlay
818 819 820
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the commit job inserts into the graph above @top. NULL means
 * that a node name should be autogenerated.
821 822 823
 * @errp: Error object.
 *
 */
824 825
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
826
                  BlockdevOnError on_error, const char *backing_file_str,
827
                  const char *filter_node_name, Error **errp);
F
Fam Zheng 已提交
828 829
/**
 * commit_active_start:
830 831
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
F
Fam Zheng 已提交
832 833
 * @bs: Active block device to be committed.
 * @base: Block device that will be written into, and become the new top.
834 835
 * @creation_flags: Flags that control the behavior of the Job lifetime.
 *                  See @BlockJobCreateFlags
F
Fam Zheng 已提交
836 837
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @on_error: The action to take upon error.
838 839 840
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the commit job inserts into the graph above @bs. NULL means that
 * a node name should be autogenerated.
F
Fam Zheng 已提交
841 842
 * @cb: Completion function for the job.
 * @opaque: Opaque pointer value passed to @cb.
843
 * @auto_complete: Auto complete the job.
844
 * @errp: Error object.
F
Fam Zheng 已提交
845 846
 *
 */
847
void commit_active_start(const char *job_id, BlockDriverState *bs,
848 849
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
850
                         const char *filter_node_name,
851 852
                         BlockCompletionFunc *cb, void *opaque,
                         bool auto_complete, Error **errp);
P
Paolo Bonzini 已提交
853 854
/*
 * mirror_start:
855 856
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
P
Paolo Bonzini 已提交
857 858
 * @bs: Block device to operate on.
 * @target: Block device to write to.
859 860
 * @replaces: Block graph node name to replace once the mirror is done. Can
 *            only be used when full mirroring is selected.
P
Paolo Bonzini 已提交
861
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
862
 * @granularity: The chosen granularity for the dirty bitmap.
863
 * @buf_size: The amount of data that can be in flight at one time.
P
Paolo Bonzini 已提交
864
 * @mode: Whether to collapse all images in the chain to the target.
M
Max Reitz 已提交
865
 * @backing_mode: How to establish the target's backing chain after completion.
866 867
 * @on_source_error: The action to take upon error reading from the source.
 * @on_target_error: The action to take upon error writing to the target.
868
 * @unmap: Whether to unmap target where source sectors only contain zeroes.
869 870 871
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the mirror job inserts into the graph above @bs. NULL means that
 * a node name should be autogenerated.
P
Paolo Bonzini 已提交
872 873 874
 * @errp: Error object.
 *
 * Start a mirroring operation on @bs.  Clusters that are allocated
875
 * in @bs will be written to @target until the job is cancelled or
P
Paolo Bonzini 已提交
876 877 878
 * manually completed.  At the end of a successful mirroring job,
 * @bs will be switched to read from @target.
 */
879 880
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
881
                  int64_t speed, uint32_t granularity, int64_t buf_size,
M
Max Reitz 已提交
882 883
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
884
                  BlockdevOnError on_target_error,
885
                  bool unmap, const char *filter_node_name, Error **errp);
P
Paolo Bonzini 已提交
886

887
/*
888
 * backup_job_create:
889 890
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
891 892 893
 * @bs: Block device to operate on.
 * @target: Block device to write to.
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
894
 * @sync_mode: What parts of the disk image should be copied to the destination.
895
 * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
896 897
 * @on_source_error: The action to take upon error reading from the source.
 * @on_target_error: The action to take upon error writing to the target.
898 899
 * @creation_flags: Flags that control the behavior of the Job lifetime.
 *                  See @BlockJobCreateFlags
900 901
 * @cb: Completion function for the job.
 * @opaque: Opaque pointer value passed to @cb.
902
 * @txn: Transaction that this job is part of (may be NULL).
903
 *
904
 * Create a backup operation on @bs.  Clusters in @bs are written to @target
905 906
 * until the job is cancelled or manually completed.
 */
907 908 909 910 911 912 913 914 915 916
BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                            BlockDriverState *target, int64_t speed,
                            MirrorSyncMode sync_mode,
                            BdrvDirtyBitmap *sync_bitmap,
                            bool compress,
                            BlockdevOnError on_source_error,
                            BlockdevOnError on_target_error,
                            int creation_flags,
                            BlockCompletionFunc *cb, void *opaque,
                            BlockJobTxn *txn, Error **errp);
917

918 919
/* NOTE(review): presumably the monitor (HMP) helper that adds a block node
 * described by the option string @optstr -- confirm against the definition. */
void hmp_drive_add_node(Monitor *mon, const char *optstr);

920 921
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                  const char *child_name,
922
                                  const BdrvChildRole *child_role,
923 924
                                  uint64_t perm, uint64_t shared_perm,
                                  void *opaque, Error **errp);
925 926
/* NOTE(review): presumably the counterpart of bdrv_root_attach_child() that
 * releases @child -- confirm against the definition. */
void bdrv_root_unref_child(BdrvChild *child);

927 928 929
/* Takes the child @c, the permission masks @perm and @shared, and @errp.
 * NOTE(review): presumably tries to change the permissions of @c and
 * returns 0 on success or a negative errno with @errp set -- confirm
 * against the definition. */
int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
                            Error **errp);

930 931 932 933 934 935 936 937
/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to
 * all children */
void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
                               const BdrvChildRole *role,
                               uint64_t perm, uint64_t shared,
                               uint64_t *nperm, uint64_t *nshared);

938 939 940 941 942 943 944 945
/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 * (non-raw) image formats: Like bdrv_filter_default_perms() for bs->backing,
 * but for bs->file it requires WRITE | RESIZE for read-write images, always
 * requires CONSISTENT_READ and doesn't share WRITE. */
void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
                               const BdrvChildRole *role,
                               uint64_t perm, uint64_t shared,
                               uint64_t *nperm, uint64_t *nshared);
946

K
Kevin Wolf 已提交
947
const char *bdrv_get_parent_name(const BlockDriverState *bs);
948
void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
949
bool blk_dev_has_removable_media(BlockBackend *blk);
M
Max Reitz 已提交
950
bool blk_dev_has_tray(BlockBackend *blk);
951 952 953 954
void blk_dev_eject_request(BlockBackend *blk, bool force);
bool blk_dev_is_tray_open(BlockBackend *blk);
bool blk_dev_is_medium_locked(BlockBackend *blk);

955
/* NOTE(review): presumably marks @nr_sect sectors starting at @cur_sector
 * as dirty in the dirty bitmaps of @bs -- confirm against the definition. */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sect);

/* NOTE(review): presumably reports whether @bs still has requests
 * outstanding -- confirm against the definition. */
bool bdrv_requests_pending(BlockDriverState *bs);
957

F
Fam Zheng 已提交
958 959 960
/* Clear / undo-clear pair for a dirty bitmap.
 * NOTE(review): presumably bdrv_clear_dirty_bitmap() hands the previous
 * contents back through @out so that bdrv_undo_clear_dirty_bitmap() can
 * restore them from @in -- confirm against the definitions. */
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);

961 962 963
/* In-flight accounting for @bs.  As noted in the bdrv_wakeup() documentation
 * in this header, bdrv_dec_in_flight() already calls bdrv_wakeup().
 * NOTE(review): presumably every bdrv_inc_in_flight() must be balanced by
 * one bdrv_dec_in_flight() -- confirm against the definitions. */
void bdrv_inc_in_flight(BlockDriverState *bs);
void bdrv_dec_in_flight(BlockDriverState *bs);

964 965
/* NOTE(review): presumably releases all BlockDriverStates owned by the
 * blockdev layer, e.g. at shutdown -- confirm against the definition. */
void blockdev_close_all_bdrv_states(void);

B
bellard 已提交
966
#endif /* BLOCK_INT_H */