sheepdog.c 87.8 KB
Newer Older
1 2 3 4 5 6 7 8 9
/*
 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

P
Peter Maydell 已提交
15
#include "qemu/osdep.h"
M
Markus Armbruster 已提交
16
#include "qapi-visit.h"
17
#include "qapi/error.h"
18
#include "qapi/qmp/qdict.h"
M
Markus Armbruster 已提交
19
#include "qapi/qobject-input-visitor.h"
M
MORITA Kazutaka 已提交
20
#include "qemu/uri.h"
21 22
#include "qemu/error-report.h"
#include "qemu/sockets.h"
23
#include "block/block_int.h"
24
#include "sysemu/block-backend.h"
25
#include "qemu/bitops.h"
26
#include "qemu/cutils.h"
27 28 29 30

/* Version number placed in the proto_ver field of protocol headers */
#define SD_PROTO_VER 0x01

/*
 * Server address used when none is given.  SD_DEFAULT_PORT must stay a
 * bare integer literal because it is turned into a string with
 * stringify() in sd_socket_address().
 */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT 7000

/* Opcodes: object operations */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ       0x02
#define SD_OP_WRITE_OBJ      0x03
/* 0x04 is used internally by Sheepdog */

/* Opcodes: VDI and cluster operations */
#define SD_OP_NEW_VDI        0x11
#define SD_OP_LOCK_VDI       0x12
#define SD_OP_RELEASE_VDI    0x13
#define SD_OP_GET_VDI_INFO   0x14
#define SD_OP_READ_VDIS      0x15
#define SD_OP_FLUSH_VDI      0x16
#define SD_OP_DEL_VDI        0x17
#define SD_OP_GET_CLUSTER_DEFAULT   0x18

/* Request flags */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */

/* Result codes; sd_strerror() maps them to text */
#define SD_RES_SUCCESS       0x00 /* Success */
#define SD_RES_UNKNOWN       0x01 /* Unknown error */
#define SD_RES_NO_OBJ        0x02 /* No object found */
#define SD_RES_EIO           0x03 /* I/O error */
#define SD_RES_VDI_EXIST     0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
#define SD_RES_VDI_LOCKED    0x07 /* Vdi is locked */
#define SD_RES_NO_VDI        0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
#define SD_RES_VDI_READ      0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */
#define SD_RES_STARTUP       0x0F /* Sheepdog is on starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI      0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
#define SD_RES_HALT          0x19 /* Sheepdog is stopped serving IO request */
#define SD_RES_READONLY      0x1A /* Object is read-only */
79 80 81 82 83 84 85 86

/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT   32
#define VDI_BIT (UINT64_C(1) << 63)
#define VMSTATE_BIT (UINT64_C(1) << 62)
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
#define MAX_CHILDREN 1024
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS   (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
/* log2 of SD_DATA_OBJ_SIZE */
#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
/*
 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
 * (SD_EC_MAX_STRIP - 1) for parity strips
 *
 * SD_MAX_COPIES is sum of number of data strips and parity strips.
 */
#define SD_EC_MAX_STRIP 16
#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)

#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0

#define LOCK_TYPE_NORMAL 0
#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */

116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
/*
 * Generic request header.  The fields proto_ver through data_length are
 * laid out identically at the start of the other Sheepdog*Req structures
 * in this file; the trailing 32 bytes depend on the opcode.
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;                    /* compared with AIOReq.id for responses */
    uint32_t data_length;
    uint32_t opcode_specific[8];    /* opcode-dependent payload of the header */
} SheepdogReq;

/*
 * Generic response header.  Same total size as SheepdogReq: one of the
 * opcode-specific words is replaced by the result field.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;           /* do_co_req() caps the read length by this */
    uint32_t result;                /* compared against the SD_RES_* codes */
    uint32_t opcode_specific[7];
} SheepdogRsp;

/*
 * Request header for object operations.  Has the same size and the same
 * leading fields (proto_ver .. data_length) as SheepdogReq.
 */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;           /* object id, see "Object ID rules" */
    uint64_t cow_oid;       /* presumably the copy-on-write source object */
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[6];    /* keeps @offset 8-byte aligned */
    uint64_t offset;
} SheepdogObjReq;

/*
 * Response header for object operations.  aio_read_response() reads
 * exactly sizeof(SheepdogObjRsp) bytes from the socket for each response.
 */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* matched against AIOReq.id */
    uint32_t data_length;   /* number of payload bytes following the header */
    uint32_t result;        /* SD_RES_* */
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[2];
    uint32_t pad[6];
} SheepdogObjRsp;

/*
 * Request header for VDI operations.  Has the same size and the same
 * leading fields (proto_ver .. data_length) as SheepdogReq.
 */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t store_policy;
    uint8_t block_size_shift;
    uint32_t snapid;
    uint32_t type;
    uint32_t pad[2];
} SheepdogVdiReq;

/*
 * Response header for VDI operations.  Shares the leading fields
 * (proto_ver .. result) with SheepdogRsp.
 */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* SD_RES_* */
    uint32_t rsvd;
    uint32_t vdi_id;
    uint32_t pad[5];
} SheepdogVdiRsp;

197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
/*
 * Response header carrying cluster-wide settings (presumably the reply
 * to SD_OP_GET_CLUSTER_DEFAULT -- confirm against the caller).  Shares
 * the leading fields (proto_ver .. result) with SheepdogRsp.
 */
typedef struct SheepdogClusterRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* SD_RES_* */
    uint8_t nr_copies;
    uint8_t copy_policy;
    uint8_t block_size_shift;
    uint8_t __pad1;
    uint32_t __pad2[6];
} SheepdogClusterRsp;

212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
/*
 * VDI metadata ("inode") object.  Everything before data_vdi_id[] forms
 * the fixed header (see SD_INODE_HEADER_SIZE); data_vdi_id[] has one
 * entry per data object index.
 */
typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;            /* non-zero means snapshot, see is_snapshot() */
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;              /* size in bytes, see count_data_objs() */
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;
    uint8_t block_size_shift;       /* log2 of the data object size */
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    /* id of the VDI owning each data object, see is_data_obj_writable() */
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;

230 231
/* Size of the fixed part of SheepdogInode, i.e. all fields before data_vdi_id[] */
#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)

232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
/*
 * Initial basis (non-zero) for the 64 bit FNV-1a hash
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash of the @len bytes at @buf.
 *
 * @hval is the running hash value: pass FNV1A_64_INIT to start a new
 * hash, or a previous return value to continue hashing more data.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * Multiply by the 64 bit FNV prime; modulo 2^64 this equals
         * hval + (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7)
         *      + (hval << 8) + (hval << 40).
         */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}

252
/*
 * Return true if the data object at index @idx is recorded in @inode as
 * belonging to this very VDI (data_vdi_id[idx] equals the inode's own
 * vdi_id).
 */
static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
{
    return inode->vdi_id == inode->data_vdi_id[idx];
}

257
static inline bool is_data_obj(uint64_t oid)
258 259 260 261 262 263 264 265 266
{
    return !(VDI_BIT & oid);
}

/* Extract the data object index (the low bits below MAX_DATA_OBJS) from @oid */
static inline uint64_t data_oid_to_idx(uint64_t oid)
{
    const uint64_t idx_mask = MAX_DATA_OBJS - 1;

    return idx_mask & oid;
}

267 268 269 270 271
/*
 * Extract the VDI id from @oid: clear VDI_BIT, then drop the low
 * VDI_SPACE_SHIFT bits.  The result is truncated to 32 bits on return.
 */
static inline uint32_t oid_to_vid(uint64_t oid)
{
    uint64_t without_vdi_bit = oid & ~VDI_BIT;

    return without_vdi_bit >> VDI_SPACE_SHIFT;
}

272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
/* Build the object id of the VDI (inode) object for VDI id @vid */
static inline uint64_t vid_to_vdi_oid(uint32_t vid)
{
    uint64_t vdi_part = (uint64_t)vid << VDI_SPACE_SHIFT;

    return vdi_part | VDI_BIT;
}

/* Build the object id of VM state object number @idx of VDI id @vid */
static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
{
    uint64_t vdi_part = (uint64_t)vid << VDI_SPACE_SHIFT;

    return VMSTATE_BIT | vdi_part | idx;
}

/* Build the object id of data object number @idx of VDI id @vid */
static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
{
    uint64_t vdi_part = (uint64_t)vid << VDI_SPACE_SHIFT;

    return vdi_part | idx;
}

287
static inline bool is_snapshot(struct SheepdogInode *inode)
288 289 290 291
{
    return !!inode->snap_ctime;
}

292 293 294 295 296 297
/*
 * Return the number of data objects needed to hold vdi_size bytes when
 * each object is (1 << block_size_shift) bytes, rounding up.
 *
 * The object size is computed in 64 bits: "1UL" is only 32 bits wide on
 * ILP32 and LLP64 hosts, and the rest of this file already uses
 * UINT64_C(1) for object-size arithmetic.
 */
static inline size_t count_data_objs(const struct SheepdogInode *inode)
{
    return DIV_ROUND_UP(inode->vdi_size,
                        (UINT64_C(1) << inode->block_size_shift));
}

298
/*
 * Debug output.  DPRINTF() is always compiled (so its arguments keep
 * being type-checked) but only prints when DEBUG_SDOG is defined.
 */
#undef DPRINTF
#ifdef DEBUG_SDOG
#define DEBUG_SDOG_PRINT 1
#else
#define DEBUG_SDOG_PRINT 0
#endif
#define DPRINTF(fmt, args...)                                           \
    do {                                                                \
        if (DEBUG_SDOG_PRINT) {                                         \
            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
        }                                                               \
    } while (0)
310 311

typedef struct SheepdogAIOCB SheepdogAIOCB;
P
Paolo Bonzini 已提交
312
typedef struct BDRVSheepdogState BDRVSheepdogState;
313 314 315 316 317 318 319 320 321 322 323

typedef struct AIOReq {
    SheepdogAIOCB *aiocb;
    unsigned int iov_offset;

    uint64_t oid;
    uint64_t base_oid;
    uint64_t offset;
    unsigned int data_len;
    uint8_t flags;
    uint32_t id;
324
    bool create;
325

326
    QLIST_ENTRY(AIOReq) aio_siblings;
327 328 329 330 331
} AIOReq;

/* Kind of operation a SheepdogAIOCB performs (stored in aiocb_type) */
enum AIOCBState {
    AIOCB_WRITE_UDATA,
    AIOCB_READ_UDATA,
    AIOCB_FLUSH_CACHE,
    AIOCB_DISCARD_OBJ,
};

336
/*
 * True if the [min_affect_data_idx, max_affect_data_idx] ranges of the
 * two AIOCBs pointed to by @x and @y intersect (both bounds inclusive).
 * The arguments are parenthesized so that any pointer expression may be
 * passed.
 */
#define AIOCBOverlapping(x, y)                                      \
    (!((x)->max_affect_data_idx < (y)->min_affect_data_idx          \
       || (y)->max_affect_data_idx < (x)->min_affect_data_idx))

340
struct SheepdogAIOCB {
P
Paolo Bonzini 已提交
341
    BDRVSheepdogState *s;
342 343 344 345 346 347 348 349 350

    QEMUIOVector *qiov;

    int64_t sector_num;
    int nb_sectors;

    int ret;
    enum AIOCBState aiocb_type;

M
MORITA Kazutaka 已提交
351
    Coroutine *coroutine;
352
    int nr_pending;
353 354 355 356

    uint32_t min_affect_data_idx;
    uint32_t max_affect_data_idx;

357 358 359 360 361 362 363 364 365
    /*
     * The difference between affect_data_idx and dirty_data_idx:
     * affect_data_idx represents range of index of all request types.
     * dirty_data_idx represents range of index updated by COW requests.
     * dirty_data_idx is used for updating an inode object.
     */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

366
    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
367 368
};

P
Paolo Bonzini 已提交
369
struct BDRVSheepdogState {
370
    BlockDriverState *bs;
371
    AioContext *aio_context;
372

373 374 375
    SheepdogInode inode;

    char name[SD_MAX_VDI_LEN];
376
    bool is_snapshot;
377
    uint32_t cache_flags;
378
    bool discard_supported;
379

380
    SocketAddress *addr;
381 382
    int fd;

M
MORITA Kazutaka 已提交
383 384 385 386
    CoMutex lock;
    Coroutine *co_send;
    Coroutine *co_recv;

387
    uint32_t aioreq_seq_num;
388 389

    /* Every aio request must be linked to either of these queues. */
390
    QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
391
    QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
392

P
Paolo Bonzini 已提交
393
    CoMutex queue_lock;
394
    CoQueue overlapping_queue;
395
    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
P
Paolo Bonzini 已提交
396
};
397

L
Liu Yuan 已提交
398 399 400 401 402
/*
 * Values prepared for a reopen; the field names mirror fd and
 * cache_flags in BDRVSheepdogState (the users are outside this part of
 * the file -- confirm there).
 */
typedef struct BDRVSheepdogReopenState {
    int fd;
    int cache_flags;
} BDRVSheepdogReopenState;

J
Jeff Cody 已提交
403
static const char *sd_strerror(int err)
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
{
    int i;

    static const struct {
        int err;
        const char *desc;
    } errors[] = {
        {SD_RES_SUCCESS, "Success"},
        {SD_RES_UNKNOWN, "Unknown error"},
        {SD_RES_NO_OBJ, "No object found"},
        {SD_RES_EIO, "I/O error"},
        {SD_RES_VDI_EXIST, "VDI exists already"},
        {SD_RES_INVALID_PARMS, "Invalid parameters"},
        {SD_RES_SYSTEM_ERROR, "System error"},
        {SD_RES_VDI_LOCKED, "VDI is already locked"},
        {SD_RES_NO_VDI, "No vdi found"},
        {SD_RES_NO_BASE_VDI, "No base VDI found"},
        {SD_RES_VDI_READ, "Failed read the requested VDI"},
        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
        {SD_RES_NO_TAG, "Failed to find the requested tag"},
        {SD_RES_STARTUP, "The system is still booting"},
        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
        {SD_RES_SHUTDOWN, "The system is shutting down"},
        {SD_RES_NO_MEM, "Out of memory on the server"},
        {SD_RES_FULL_VDI, "We already have the maximum vdis"},
        {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
        {SD_RES_NO_SPACE, "Server has no space for new objects"},
        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
436
        {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
437
        {SD_RES_READONLY, "Object is read-only"},
438 439 440 441 442 443 444 445 446 447 448 449 450 451
    };

    for (i = 0; i < ARRAY_SIZE(errors); ++i) {
        if (errors[i].err == err) {
            return errors[i].desc;
        }
    }

    return "Invalid error code";
}

/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
 *    link the requests to the inflight_list in the
 *    BDRVSheepdogState.  The function yields while waiting for
 *    the response.
 *
 * 2. We receive the response in aio_read_response, the fd handler to
 *    the sheepdog connection.  We switch back to sd_co_readv/sd_co_writev
 *    after all the requests belonging to the AIOCB are finished.  If
 *    needed, sd_co_writev will send further requests for the vdi object.
 */

/*
 * Allocate an AIOReq belonging to @acb and fill it in from the arguments.
 *
 * The request gets the next id from s->aioreq_seq_num and acb->nr_pending
 * is incremented.  The aio_siblings list entry is left uninitialized:
 * linking the request into a list is up to the caller.
 */
static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
                                    uint64_t oid, unsigned int data_len,
                                    uint64_t offset, uint8_t flags, bool create,
                                    uint64_t base_oid, unsigned int iov_offset)
{
    AIOReq *aio_req;

    aio_req = g_malloc(sizeof(*aio_req));
    aio_req->aiocb = acb;
    aio_req->iov_offset = iov_offset;
    aio_req->oid = oid;
    aio_req->base_oid = base_oid;
    aio_req->offset = offset;
    aio_req->data_len = data_len;
    aio_req->flags = flags;
    aio_req->id = s->aioreq_seq_num++;
    aio_req->create = create;

    acb->nr_pending++;
    return aio_req;
}

485 486 487 488 489 490 491
/*
 * Block the calling coroutine until no AIOCB on s->inflight_aiocb_head
 * overlaps @acb (see AIOCBOverlapping()).
 *
 * sd_aio_setup() calls this with s->queue_lock held; the lock is passed
 * to qemu_co_queue_wait() while waiting.  After every wakeup the list is
 * scanned again from the start because it may have changed meanwhile.
 */
static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
{
    SheepdogAIOCB *cb;

retry:
    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
        if (AIOCBOverlapping(acb, cb)) {
            qemu_co_queue_wait(&s->overlapping_queue, &s->queue_lock);
            goto retry;
        }
    }
}

P
Paolo Bonzini 已提交
498 499 500
/*
 * Initialize @acb for a request of kind @type (an AIOCBState value)
 * covering @nb_sectors sectors starting at @sector_num, with data in
 * @qiov, to be executed by the calling coroutine.
 *
 * Except for AIOCB_FLUSH_CACHE, the function waits until no in-flight
 * AIOCB overlaps the new one and then links @acb into
 * s->inflight_aiocb_head.
 */
static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
                         QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
                         int type)
{
    uint32_t object_size;

    object_size = (UINT32_C(1) << s->inode.block_size_shift);

    acb->s = s;

    acb->qiov = qiov;

    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;

    acb->coroutine = qemu_coroutine_self();
    acb->ret = 0;
    acb->nr_pending = 0;

    /*
     * Indexes of the first and last data object touched.  The upper bound
     * is derived from the end offset of the request (one past its last
     * byte), so a request ending exactly on an object boundary also
     * claims the following object.
     */
    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;

    /* Empty dirty range: min > max until a created object is recorded */
    acb->min_dirty_data_idx = UINT32_MAX;
    acb->max_dirty_data_idx = 0;
    acb->aiocb_type = type;

    if (type == AIOCB_FLUSH_CACHE) {
        return;
    }

    qemu_co_mutex_lock(&s->queue_lock);
    wait_for_overlapping_aiocb(s, acb);
    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
    qemu_co_mutex_unlock(&s->queue_lock);
}

535
static SocketAddress *sd_socket_address(const char *path,
536 537
                                        const char *host, const char *port)
{
538
    SocketAddress *addr = g_new0(SocketAddress, 1);
539 540

    if (path) {
541 542
        addr->type = SOCKET_ADDRESS_TYPE_UNIX;
        addr->u.q_unix.path = g_strdup(path);
543
    } else {
544 545 546
        addr->type = SOCKET_ADDRESS_TYPE_INET;
        addr->u.inet.host = g_strdup(host ?: SD_DEFAULT_ADDR);
        addr->u.inet.port = g_strdup(port ?: stringify(SD_DEFAULT_PORT));
547 548 549 550 551
    }

    return addr;
}

552
/*
 * Extract the "server.*" members from @options and convert them into a
 * SocketAddress with the QAPI input visitor.
 *
 * Returns the address on success.  On failure an error is stored in
 * @errp; the return value is then whatever the visitor left in the
 * output pointer (initialized to NULL here).
 */
static SocketAddress *sd_server_config(QDict *options, Error **errp)
{
    QDict *server = NULL;
    QObject *crumpled_server = NULL;
    Visitor *iv = NULL;
    SocketAddress *saddr = NULL;
    Error *local_err = NULL;

    qdict_extract_subqdict(options, &server, "server.");

    crumpled_server = qdict_crumple(server, errp);
    if (!crumpled_server) {
        goto done;
    }

    /*
     * FIXME .numeric, .to, .ipv4 or .ipv6 don't work with -drive
     * server.type=inet.  .to doesn't matter, it's ignored anyway.
     * That's because when @options come from -blockdev or
     * blockdev_add, members are typed according to the QAPI schema,
     * but when they come from -drive, they're all QString.  The
     * visitor expects the former.
     */
    iv = qobject_input_visitor_new(crumpled_server);
    visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto done;
    }

done:
    visit_free(iv);
    qobject_decref(crumpled_server);
    QDECREF(server);
    return saddr;
}

L
Liu Yuan 已提交
589
/* Return -EIO in case of error, file descriptor on success */
590
static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
591
{
592
    int fd;
593

594
    fd = socket_connect(s->addr, errp);
595

596
    if (s->addr->type == SOCKET_ADDRESS_TYPE_INET && fd >= 0) {
597 598 599
        int ret = socket_set_nodelay(fd);
        if (ret < 0) {
            error_report("%s", strerror(errno));
600 601
        }
    }
602

603
    if (fd >= 0) {
604
        qemu_set_nonblock(fd);
L
Liu Yuan 已提交
605 606
    } else {
        fd = -EIO;
607 608 609 610 611
    }

    return fd;
}

L
Liu Yuan 已提交
612
/* Return 0 on success and -errno in case of error */
613 614
static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
                                    unsigned int *wlen)
615 616 617 618
{
    int ret;

    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
619
    if (ret != sizeof(*hdr)) {
620
        error_report("failed to send a req, %s", strerror(errno));
621
        return -errno;
622 623 624
    }

    ret = qemu_co_send(sockfd, data, *wlen);
625
    if (ret != *wlen) {
626
        error_report("failed to send a req, %s", strerror(errno));
627
        return -errno;
628 629 630 631
    }

    return ret;
}
632

633 634
typedef struct SheepdogReqCo {
    int sockfd;
P
Paolo Bonzini 已提交
635
    BlockDriverState *bs;
636
    AioContext *aio_context;
637 638 639 640 641 642
    SheepdogReq *hdr;
    void *data;
    unsigned int *wlen;
    unsigned int *rlen;
    int ret;
    bool finished;
643
    Coroutine *co;
644 645
} SheepdogReqCo;

646 647 648 649 650 651 652
static void restart_co_req(void *opaque)
{
    SheepdogReqCo *srco = opaque;

    aio_co_wake(srco->co);
}

653
static coroutine_fn void do_co_req(void *opaque)
654 655
{
    int ret;
656 657 658 659 660 661
    SheepdogReqCo *srco = opaque;
    int sockfd = srco->sockfd;
    SheepdogReq *hdr = srco->hdr;
    void *data = srco->data;
    unsigned int *wlen = srco->wlen;
    unsigned int *rlen = srco->rlen;
662

663
    srco->co = qemu_coroutine_self();
664
    aio_set_fd_handler(srco->aio_context, sockfd, false,
665
                       NULL, restart_co_req, NULL, srco);
666 667 668 669 670 671

    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
        goto out;
    }

672
    aio_set_fd_handler(srco->aio_context, sockfd, false,
673
                       restart_co_req, NULL, NULL, srco);
674

675
    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
676
    if (ret != sizeof(*hdr)) {
677
        error_report("failed to get a rsp, %s", strerror(errno));
678
        ret = -errno;
679 680 681 682 683 684 685 686 687
        goto out;
    }

    if (*rlen > hdr->data_length) {
        *rlen = hdr->data_length;
    }

    if (*rlen) {
        ret = qemu_co_recv(sockfd, data, *rlen);
688
        if (ret != *rlen) {
689
            error_report("failed to get the data, %s", strerror(errno));
690
            ret = -errno;
691 692 693 694 695
            goto out;
        }
    }
    ret = 0;
out:
696 697
    /* there is at most one request for this sockfd, so it is safe to
     * set each handler to NULL. */
698
    aio_set_fd_handler(srco->aio_context, sockfd, false,
699
                       NULL, NULL, NULL, NULL);
700

701
    srco->co = NULL;
702
    srco->ret = ret;
703 704
    /* Set srco->finished before reading bs->wakeup.  */
    atomic_mb_set(&srco->finished, true);
705 706 707
    if (srco->bs) {
        bdrv_wakeup(srco->bs);
    }
708 709
}

L
Liu Yuan 已提交
710 711 712 713 714
/*
 * Send the request to the sheep in a synchronous manner.
 *
 * Return 0 on success, -errno in case of error.
 */
P
Paolo Bonzini 已提交
715
static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
716
                  void *data, unsigned int *wlen, unsigned int *rlen)
717 718 719 720
{
    Coroutine *co;
    SheepdogReqCo srco = {
        .sockfd = sockfd,
P
Paolo Bonzini 已提交
721 722
        .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
        .bs = bs,
723 724 725 726 727 728 729 730 731 732 733
        .hdr = hdr,
        .data = data,
        .wlen = wlen,
        .rlen = rlen,
        .ret = 0,
        .finished = false,
    };

    if (qemu_in_coroutine()) {
        do_co_req(&srco);
    } else {
734
        co = qemu_coroutine_create(do_co_req, &srco);
P
Paolo Bonzini 已提交
735
        if (bs) {
736
            bdrv_coroutine_enter(bs, co);
P
Paolo Bonzini 已提交
737 738 739 740 741 742
            BDRV_POLL_WHILE(bs, !srco.finished);
        } else {
            qemu_coroutine_enter(co);
            while (!srco.finished) {
                aio_poll(qemu_get_aio_context(), true);
            }
743 744 745 746
        }
    }

    return srco.ret;
747 748
}

749
static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
750 751
                                         struct iovec *iov, int niov,
                                         enum AIOCBState aiocb_type);
752
static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
753
static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
754
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
755
static void co_write_request(void *opaque);
756

757 758 759 760 761
static coroutine_fn void reconnect_to_sdog(void *opaque)
{
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

762
    aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
763
                       NULL, NULL, NULL);
764 765 766 767 768 769 770 771 772 773
    close(s->fd);
    s->fd = -1;

    /* Wait for outstanding write requests to be completed. */
    while (s->co_send != NULL) {
        co_write_request(opaque);
    }

    /* Try to reconnect the sheepdog server every one second. */
    while (s->fd < 0) {
774
        Error *local_err = NULL;
775
        s->fd = get_sheep_fd(s, &local_err);
776 777
        if (s->fd < 0) {
            DPRINTF("Wait for connection to be established\n");
778
            error_report_err(local_err);
779
            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000000ULL);
780 781 782 783 784 785 786 787 788 789
        }
    };

    /*
     * Now we have to resend all the request in the inflight queue.  However,
     * resend_aioreq() can yield and newly created requests can be added to the
     * inflight queue before the coroutine is resumed.  To avoid mixing them, we
     * have to move all the inflight requests to the failed queue before
     * resend_aioreq() is called.
     */
P
Paolo Bonzini 已提交
790
    qemu_co_mutex_lock(&s->queue_lock);
791 792 793 794 795 796 797 798 799
    QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
        QLIST_REMOVE(aio_req, aio_siblings);
        QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
    }

    /* Resend all the failed aio requests. */
    while (!QLIST_EMPTY(&s->failed_aio_head)) {
        aio_req = QLIST_FIRST(&s->failed_aio_head);
        QLIST_REMOVE(aio_req, aio_siblings);
P
Paolo Bonzini 已提交
800
        qemu_co_mutex_unlock(&s->queue_lock);
801
        resend_aioreq(s, aio_req);
P
Paolo Bonzini 已提交
802
        qemu_co_mutex_lock(&s->queue_lock);
803
    }
P
Paolo Bonzini 已提交
804
    qemu_co_mutex_unlock(&s->queue_lock);
805 806
}

807 808 809 810 811 812
/*
 * Receive responses of the I/O requests.
 *
 * This function is registered as a fd handler, and called from the
 * main loop when s->fd is ready for reading responses.
 */
P
Paolo Bonzini 已提交
813
static void coroutine_fn aio_read_response(void *opaque)
814 815 816 817 818 819 820
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
821
    uint64_t idx;
822 823

    /* read a header */
824
    ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
825
    if (ret != sizeof(rsp)) {
826
        error_report("failed to get the header, %s", strerror(errno));
827
        goto err;
828 829
    }

830 831
    /* find the right aio_req from the inflight aio list */
    QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
832 833 834 835 836
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    if (!aio_req) {
837
        error_report("cannot find aio_req %x", rsp.id);
838
        goto err;
839 840 841 842 843 844 845 846 847 848 849
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

850
        if (aio_req->create) {
851 852 853 854 855 856
            /*
             * If the object is newly created one, we need to update
             * the vdi object (metadata object).  min_dirty_data_idx
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             */
857 858
            if (rsp.result == SD_RES_SUCCESS) {
                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
859 860
                acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
                acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
861
            }
862 863 864
        }
        break;
    case AIOCB_READ_UDATA:
865 866
        ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
                            aio_req->iov_offset, rsp.data_length);
867
        if (ret != rsp.data_length) {
868
            error_report("failed to get the data, %s", strerror(errno));
869
            goto err;
870 871
        }
        break;
872 873
    case AIOCB_FLUSH_CACHE:
        if (rsp.result == SD_RES_INVALID_PARMS) {
874
            DPRINTF("disable cache since the server doesn't support it\n");
875 876 877 878
            s->cache_flags = SD_FLAG_CMD_DIRECT;
            rsp.result = SD_RES_SUCCESS;
        }
        break;
879 880 881
    case AIOCB_DISCARD_OBJ:
        switch (rsp.result) {
        case SD_RES_INVALID_PARMS:
882
            error_report("server doesn't support discard command");
883 884 885 886 887 888
            rsp.result = SD_RES_SUCCESS;
            s->discard_supported = false;
            break;
        default:
            break;
        }
889 890
    }

891 892 893 894 895
    /* No more data for this aio_req (reload_inode below uses its own file
     * descriptor handler which doesn't use co_recv).
    */
    s->co_recv = NULL;

P
Paolo Bonzini 已提交
896
    qemu_co_mutex_lock(&s->queue_lock);
897
    QLIST_REMOVE(aio_req, aio_siblings);
P
Paolo Bonzini 已提交
898 899
    qemu_co_mutex_unlock(&s->queue_lock);

900 901 902 903
    switch (rsp.result) {
    case SD_RES_SUCCESS:
        break;
    case SD_RES_READONLY:
904 905 906
        if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
            ret = reload_inode(s, 0, "");
            if (ret < 0) {
907
                goto err;
908 909 910 911 912 913 914 915
            }
        }
        if (is_data_obj(aio_req->oid)) {
            aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
                                           data_oid_to_idx(aio_req->oid));
        } else {
            aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
        }
916
        resend_aioreq(s, aio_req);
917
        return;
918
    default:
919
        acb->ret = -EIO;
920
        error_report("%s", sd_strerror(rsp.result));
921
        break;
922 923
    }

924 925 926
    g_free(aio_req);

    if (!--acb->nr_pending) {
927 928
        /*
         * We've finished all requests which belong to the AIOCB, so
M
MORITA Kazutaka 已提交
929
         * we can switch back to sd_co_readv/writev now.
930
         */
931
        aio_co_wake(acb->coroutine);
932
    }
933

934
    return;
935

936 937
err:
    reconnect_to_sdog(opaque);
M
MORITA Kazutaka 已提交
938 939 940 941 942 943 944
}

static void co_read_response(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    if (!s->co_recv) {
945
        s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
M
MORITA Kazutaka 已提交
946 947
    }

948
    aio_co_enter(s->aio_context, s->co_recv);
M
MORITA Kazutaka 已提交
949 950 951 952 953 954
}

static void co_write_request(void *opaque)
{
    BDRVSheepdogState *s = opaque;

955
    aio_co_wake(s->co_send);
956 957 958
}

/*
D
Deepak Kathayat 已提交
959
 * Return a socket descriptor to read/write objects.
960
 *
D
Deepak Kathayat 已提交
961
 * We cannot use this descriptor for other operations because
962 963
 * the block driver may be on waiting response from the server.
 */
964
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
965
{
966
    int fd;
967

968
    fd = connect_to_sdog(s, errp);
969
    if (fd < 0) {
970
        return fd;
971 972
    }

973
    aio_set_fd_handler(s->aio_context, fd, false,
974
                       co_read_response, NULL, NULL, s);
975 976 977
    return fd;
}

978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
/*
 * Parse the numeric snapshot ID in @str.
 *
 * Returns false when @str is not a number (*@snapid is left untouched).
 * Returns true otherwise; *@snapid receives the number, or zero when the
 * number is zero or does not fit in 32 bits.
 */
static bool sd_parse_snapid(const char *str, uint32_t *snapid)
{
    unsigned long value;
    int rc = qemu_strtoul(str, NULL, 10, &value);

    if (rc == -ERANGE) {
        /* out of range counts as "too large" */
        *snapid = 0;
        return true;
    }
    if (rc != 0) {
        return false;
    }

    *snapid = value > UINT32_MAX ? 0 : value;
    return true;
}

/*
 * Interpret @str as either a numeric snapshot ID or a snapshot tag.
 *
 * A number must be non-zero; it is stored in *@snapid and @tag is set to
 * the empty string.  Anything else is copied into @tag (which must hold
 * SD_MAX_VDI_TAG_LEN bytes) and *@snapid is set to zero.
 *
 * Returns false for a zero/oversized ID or a tag that does not fit.
 */
static bool sd_parse_snapid_or_tag(const char *str,
                                   uint32_t *snapid, char tag[])
{
    if (sd_parse_snapid(str, snapid)) {
        if (*snapid == 0) {
            return false;
        }
        tag[0] = 0;
        return true;
    }

    *snapid = 0;
    return g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) < SD_MAX_VDI_TAG_LEN;
}

1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
/*
 * Result of parsing a sheepdog filename/URI (filled in by sd_parse_uri()).
 * String pointers refer to storage owned by @uri / @qp, so the structure
 * must be released with sd_config_done() once the values have been used.
 */
typedef struct {
    const char *path;           /* non-null iff transport is unix */
    const char *host;           /* valid when transport is tcp */
    int port;                   /* valid when transport is tcp */
    char vdi[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snap_id;
    /* Remainder is only for sd_config_done() */
    URI *uri;
    QueryParams *qp;
} SheepdogConfig;

/*
 * Release the parser state (query parameters and URI) held by @cfg.
 * The structure itself is not freed.
 */
static void sd_config_done(SheepdogConfig *cfg)
{
    QueryParams *params = cfg->qp;

    if (params != NULL) {
        query_params_free(params);
    }
    uri_free(cfg->uri);
}

static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
1042
                         Error **errp)
M
MORITA Kazutaka 已提交
1043
{
1044
    Error *err = NULL;
M
MORITA Kazutaka 已提交
1045
    QueryParams *qp = NULL;
1046 1047
    bool is_unix;
    URI *uri;
M
MORITA Kazutaka 已提交
1048

1049 1050 1051
    memset(cfg, 0, sizeof(*cfg));

    cfg->uri = uri = uri_parse(filename);
M
MORITA Kazutaka 已提交
1052
    if (!uri) {
1053 1054
        error_setg(&err, "invalid URI");
        goto out;
M
MORITA Kazutaka 已提交
1055 1056
    }

1057
    /* transport */
1058
    if (!g_strcmp0(uri->scheme, "sheepdog")) {
1059
        is_unix = false;
1060
    } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) {
1061
        is_unix = false;
1062
    } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) {
1063
        is_unix = true;
1064
    } else {
1065 1066
        error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
                   " or 'sheepdog+unix'");
1067 1068 1069
        goto out;
    }

M
MORITA Kazutaka 已提交
1070
    if (uri->path == NULL || !strcmp(uri->path, "/")) {
1071
        error_setg(&err, "missing file path in URI");
M
MORITA Kazutaka 已提交
1072 1073
        goto out;
    }
1074 1075
    if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN)
        >= SD_MAX_VDI_LEN) {
1076
        error_setg(&err, "VDI name is too long");
1077 1078
        goto out;
    }
M
MORITA Kazutaka 已提交
1079

1080
    cfg->qp = qp = query_params_parse(uri->query);
1081

1082
    if (is_unix) {
1083
        /* sheepdog+unix:///vdiname?socket=path */
1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
        if (uri->server || uri->port) {
            error_setg(&err, "URI scheme %s doesn't accept a server address",
                       uri->scheme);
            goto out;
        }
        if (!qp->n) {
            error_setg(&err,
                       "URI scheme %s requires query parameter 'socket'",
                       uri->scheme);
            goto out;
        }
        if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) {
            error_setg(&err, "unexpected query parameters");
1097 1098
            goto out;
        }
1099
        cfg->path = qp->p[0].value;
1100 1101
    } else {
        /* sheepdog[+tcp]://[host:port]/vdiname */
1102 1103 1104 1105
        if (qp->n) {
            error_setg(&err, "unexpected query parameters");
            goto out;
        }
1106 1107
        cfg->host = uri->server;
        cfg->port = uri->port;
1108
    }
M
MORITA Kazutaka 已提交
1109 1110 1111

    /* snapshot tag */
    if (uri->fragment) {
1112 1113
        if (!sd_parse_snapid_or_tag(uri->fragment,
                                    &cfg->snap_id, cfg->tag)) {
1114 1115
            error_setg(&err, "'%s' is not a valid snapshot ID",
                       uri->fragment);
1116
            goto out;
M
MORITA Kazutaka 已提交
1117 1118
        }
    } else {
1119
        cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
M
MORITA Kazutaka 已提交
1120 1121 1122
    }

out:
1123 1124
    if (err) {
        error_propagate(errp, err);
1125
        sd_config_done(cfg);
M
MORITA Kazutaka 已提交
1126 1127 1128
    }
}

1129
/*
M
MORITA Kazutaka 已提交
1130
 * Parse a filename (old syntax)
1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
 *
 * filename must be one of the following formats:
 *   1. [vdiname]
 *   2. [vdiname]:[snapid]
 *   3. [vdiname]:[tag]
 *   4. [hostname]:[port]:[vdiname]
 *   5. [hostname]:[port]:[vdiname]:[snapid]
 *   6. [hostname]:[port]:[vdiname]:[tag]
 *
 * You can boot from the snapshot images by specifying `snapid` or
 * `tag'.
 *
 * You can run VMs outside the Sheepdog cluster by specifying
 * `hostname' and `port' (experimental).
 */
1146
static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
1147
                          Error **errp)
1148
{
1149
    Error *err = NULL;
M
MORITA Kazutaka 已提交
1150 1151
    char *p, *q, *uri;
    const char *host_spec, *vdi_spec;
1152
    int nr_sep;
1153

L
Laurent Vivier 已提交
1154
    strstart(filename, "sheepdog:", &filename);
1155
    p = q = g_strdup(filename);
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166

    /* count the number of separators */
    nr_sep = 0;
    while (*p) {
        if (*p == ':') {
            nr_sep++;
        }
        p++;
    }
    p = q;

M
MORITA Kazutaka 已提交
1167
    /* use the first two tokens as host_spec. */
1168
    if (nr_sep >= 2) {
M
MORITA Kazutaka 已提交
1169
        host_spec = p;
1170
        p = strchr(p, ':');
M
MORITA Kazutaka 已提交
1171
        p++;
1172 1173 1174
        p = strchr(p, ':');
        *p++ = '\0';
    } else {
M
MORITA Kazutaka 已提交
1175
        host_spec = "";
1176 1177
    }

M
MORITA Kazutaka 已提交
1178
    vdi_spec = p;
1179

M
MORITA Kazutaka 已提交
1180
    p = strchr(vdi_spec, ':');
1181
    if (p) {
M
MORITA Kazutaka 已提交
1182
        *p++ = '#';
1183 1184
    }

M
MORITA Kazutaka 已提交
1185
    uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
1186

1187 1188 1189 1190
    /*
     * FIXME We to escape URI meta-characters, e.g. "x?y=z"
     * produces "sheepdog://x?y=z".  Because of that ...
     */
1191
    sd_parse_uri(cfg, uri, &err);
1192 1193 1194 1195 1196 1197 1198 1199 1200
    if (err) {
        /*
         * ... this can fail, but the error message is misleading.
         * Replace it by the traditional useless one until the
         * escaping is fixed.
         */
        error_free(err);
        error_setg(errp, "Can't parse filename");
    }
M
MORITA Kazutaka 已提交
1201 1202 1203

    g_free(q);
    g_free(uri);
1204 1205
}

1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
/*
 * Translate a sheepdog filename (URI or old syntax) into block driver
 * options.
 *
 * Filenames containing "://" are parsed as URIs, everything else with the
 * legacy parser.  The resulting values are stored in @options as defaults
 * for "server.*", "vdi", "tag" and (when non-zero) "snap-id".
 * On a parse error @errp is set and @options is left untouched.
 */
static void sd_parse_filename(const char *filename, QDict *options,
                              Error **errp)
{
    Error *err = NULL;
    SheepdogConfig cfg;
    char buf[32];

    if (strstr(filename, "://")) {
        sd_parse_uri(&cfg, filename, &err);
    } else {
        parse_vdiname(&cfg, filename, &err);
    }
    if (err) {
        error_propagate(errp, err);
        return;
    }

    if (cfg.path) {
        qdict_set_default_str(options, "server.path", cfg.path);
        qdict_set_default_str(options, "server.type", "unix");
    } else {
        qdict_set_default_str(options, "server.type", "inet");
        qdict_set_default_str(options, "server.host",
                              cfg.host ?: SD_DEFAULT_ADDR);
        snprintf(buf, sizeof(buf), "%d", cfg.port ?: SD_DEFAULT_PORT);
        qdict_set_default_str(options, "server.port", buf);
    }
    qdict_set_default_str(options, "vdi", cfg.vdi);
    qdict_set_default_str(options, "tag", cfg.tag);
    if (cfg.snap_id) {
        /*
         * snap_id is unsigned 32-bit: format it as such so that IDs above
         * INT_MAX are not rendered as negative numbers.
         */
        snprintf(buf, sizeof(buf), "%" PRIu32, cfg.snap_id);
        qdict_set_default_str(options, "snap-id", buf);
    }

    sd_config_done(&cfg);
}

1243 1244
/*
 * Look up the VDI named @filename (optionally a snapshot selected by
 * @snapid / @tag) and store its ID in *@vid.
 *
 * When @lock is true the VDI is locked (SD_OP_LOCK_VDI), otherwise only
 * its info is queried (SD_OP_GET_VDI_INFO).  A temporary connection is
 * opened for the request and closed before returning.
 *
 * Returns 0 on success.  On failure sets @errp and returns the error from
 * connect_to_sdog()/do_req(), or -ENOENT (no such VDI), -EBUSY (VDI
 * locked) or -EIO (any other server result).
 */
static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                         uint32_t snapid, const char *tag, uint32_t *vid,
                         bool lock, Error **errp)
{
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    char payload[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
    unsigned int rlen = 0;
    int sock, ret;

    sock = connect_to_sdog(s, errp);
    if (sock < 0) {
        return sock;
    }

    /*
     * strncpy is used on purpose: it pads each field with zeroes, so the
     * whole buffer is initialized before it is sent to the server.
     */
    strncpy(payload, filename, SD_MAX_VDI_LEN);
    strncpy(payload + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&hdr, 0, sizeof(hdr));
    if (lock) {
        hdr.opcode = SD_OP_LOCK_VDI;
        hdr.type = LOCK_TYPE_NORMAL;
    } else {
        hdr.opcode = SD_OP_GET_VDI_INFO;
    }
    hdr.proto_ver = SD_PROTO_VER;
    hdr.data_length = wlen;
    hdr.snapid = snapid;
    hdr.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(sock, s->bs, (SheepdogReq *)&hdr, payload, &wlen, &rlen);
    if (ret) {
        error_setg_errno(errp, -ret, "cannot get vdi info");
        goto out;
    }

    if (rsp->result == SD_RES_SUCCESS) {
        *vid = rsp->vdi_id;
        ret = 0;
        goto out;
    }

    error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
               sd_strerror(rsp->result), filename, snapid, tag);
    switch (rsp->result) {
    case SD_RES_NO_VDI:
        ret = -ENOENT;
        break;
    case SD_RES_VDI_LOCKED:
        ret = -EBUSY;
        break;
    default:
        ret = -EIO;
        break;
    }

out:
    closesocket(sock);
    return ret;
}

1304
/*
 * Queue @aio_req on the in-flight list and send it to the server.
 *
 * The request header is built from @aio_req and @aiocb_type; for writes
 * and discards the payload described by @iov/@niov follows the header.
 * While sending, co_write_request() is installed as the socket's write
 * handler and the socket is corked; both are undone before returning.
 * Send failures are only reported via error_report().
 */
static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                                         struct iovec *iov, int niov,
                                         enum AIOCBState aiocb_type)
{
    int nr_copies = s->inode.nr_copies;
    uint64_t oid = aio_req->oid;
    uint64_t cow_oid = aio_req->base_oid;
    uint64_t offset = aio_req->offset;
    unsigned int datalen = aio_req->data_len;
    uint8_t req_flags = aio_req->flags;
    bool create = aio_req->create;
    unsigned int payload_len = 0;
    SheepdogObjReq hdr;
    int ret;

    qemu_co_mutex_lock(&s->queue_lock);
    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
    qemu_co_mutex_unlock(&s->queue_lock);

    if (nr_copies == 0) {
        error_report("bug");
    }

    memset(&hdr, 0, sizeof(hdr));

    switch (aiocb_type) {
    case AIOCB_FLUSH_CACHE:
        hdr.opcode = SD_OP_FLUSH_VDI;
        break;
    case AIOCB_READ_UDATA:
        hdr.opcode = SD_OP_READ_OBJ;
        hdr.flags = req_flags;
        break;
    case AIOCB_WRITE_UDATA:
        hdr.opcode = create ? SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | req_flags;
        payload_len = datalen;
        break;
    case AIOCB_DISCARD_OBJ:
        /*
         * A discard is sent as a write to the VDI object that clears the
         * data_vdi_id[] slot of the discarded data object.
         */
        hdr.opcode = SD_OP_WRITE_OBJ;
        hdr.flags = SD_FLAG_CMD_WRITE | req_flags;
        s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
        offset = offsetof(SheepdogInode,
                          data_vdi_id[data_oid_to_idx(oid)]);
        oid = vid_to_vdi_oid(s->inode.vdi_id);
        datalen = sizeof(uint32_t);
        payload_len = datalen;
        break;
    }

    hdr.flags |= s->cache_flags;

    hdr.oid = oid;
    hdr.cow_oid = cow_oid;
    hdr.copies = s->inode.nr_copies;
    hdr.data_length = datalen;
    hdr.offset = offset;
    hdr.id = aio_req->id;

    qemu_co_mutex_lock(&s->lock);
    s->co_send = qemu_coroutine_self();
    aio_set_fd_handler(s->aio_context, s->fd, false,
                       co_read_response, co_write_request, NULL, s);
    socket_set_cork(s->fd, 1);

    /* send a header, then the payload (if any) */
    ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
    if (ret != sizeof(hdr)) {
        error_report("failed to send a req, %s", strerror(errno));
    } else if (payload_len) {
        ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset,
                            payload_len);
        if (ret != payload_len) {
            error_report("failed to send a data, %s", strerror(errno));
        }
    }

    socket_set_cork(s->fd, 0);
    aio_set_fd_handler(s->aio_context, s->fd, false,
                       co_read_response, NULL, NULL, s);
    s->co_send = NULL;
    qemu_co_mutex_unlock(&s->lock);
}

P
Paolo Bonzini 已提交
1397
/*
 * Synchronously read or write @datalen bytes of object @oid at @offset
 * over the connection @fd, using @buf as the data buffer.
 *
 * @write selects the direction; @create (writes only) selects
 * SD_OP_CREATE_AND_WRITE_OBJ instead of SD_OP_WRITE_OBJ.  @cache_flags is
 * OR-ed into the request flags.
 *
 * Returns 0 on success, the do_req() error if the request could not be
 * performed, or -EIO if the server reported a failure.
 */
static int read_write_object(int fd, BlockDriverState *bs, char *buf,
                             uint64_t oid, uint8_t copies,
                             unsigned int datalen, uint64_t offset,
                             bool write, bool create, uint32_t cache_flags)
{
    SheepdogObjReq hdr;
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
    unsigned int wlen = write ? datalen : 0;
    unsigned int rlen = write ? 0 : datalen;
    int ret;

    memset(&hdr, 0, sizeof(hdr));

    if (!write) {
        hdr.opcode = SD_OP_READ_OBJ;
    } else {
        hdr.flags = SD_FLAG_CMD_WRITE;
        hdr.opcode = create ? SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
    }
    hdr.flags |= cache_flags;

    hdr.oid = oid;
    hdr.data_length = datalen;
    hdr.offset = offset;
    hdr.copies = copies;

    ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_report("failed to send a request to the sheep");
        return ret;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_report("%s", sd_strerror(rsp->result));
        return -EIO;
    }
    return 0;
}

P
Paolo Bonzini 已提交
1446
/*
 * Read @datalen bytes of object @oid at @offset into @buf.
 * Thin wrapper around read_write_object() with write and create disabled.
 */
static int read_object(int fd, BlockDriverState *bs, char *buf,
                       uint64_t oid, uint8_t copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
{
    const bool is_write = false;
    const bool do_create = false;

    return read_write_object(fd, bs, buf, oid, copies, datalen, offset,
                             is_write, do_create, cache_flags);
}

P
Paolo Bonzini 已提交
1456
/*
 * Write @datalen bytes from @buf to object @oid at @offset; @create asks
 * for the object to be created as part of the write.
 * Thin wrapper around read_write_object() with write enabled.
 */
static int write_object(int fd, BlockDriverState *bs, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
{
    const bool is_write = true;

    return read_write_object(fd, bs, buf, oid, copies, datalen, offset,
                             is_write, create, cache_flags);
}

1466 1467 1468
/*
 * Update the cached inode with the latest state.
 *
 * Looks up the VDI named s->name (snapshot selected by @snapid / @tag),
 * reads its inode header over a temporary connection and, if the VDI ID
 * differs from the cached one, overwrites the header part of s->inode.
 *
 * Returns -EIO if the connection fails, otherwise the result of the
 * lookup / read (0 on success).  Errors are reported, not propagated.
 */
static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
{
    Error *local_err = NULL;
    SheepdogInode *header;
    uint32_t vid = 0;
    int sock;
    int ret = 0;

    sock = connect_to_sdog(s, &local_err);
    if (sock < 0) {
        error_report_err(local_err);
        return -EIO;
    }

    header = g_malloc(SD_INODE_HEADER_SIZE);

    ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
    if (ret) {
        error_report_err(local_err);
    } else {
        ret = read_object(sock, s->bs, (char *)header, vid_to_vdi_oid(vid),
                          s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
                          s->cache_flags);
        if (ret >= 0 && header->vdi_id != s->inode.vdi_id) {
            memcpy(&s->inode, header, SD_INODE_HEADER_SIZE);
        }
    }

    g_free(header);
    closesocket(sock);

    return ret;
}

1506
/*
 * Send @aio_req again.
 *
 * The create flag is recomputed first: a user-data write to a data object
 * that is not writable in the current inode becomes a create request, and
 * a copy-on-write one if the inode already references a data object at
 * that index.  Data-object requests are resent with the AIOCB's own I/O
 * vector; other requests resend the whole cached inode as a write.
 */
static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
{
    SheepdogAIOCB *acb = aio_req->aiocb;

    aio_req->create = false;

    /* check whether this request becomes a CoW one */
    if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
        int idx = data_oid_to_idx(aio_req->oid);

        if (!is_data_obj_writable(&s->inode, idx)) {
            if (s->inode.data_vdi_id[idx]) {
                aio_req->base_oid =
                    vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
                aio_req->flags |= SD_FLAG_CMD_COW;
            }
            aio_req->create = true;
        }
    }

    if (!is_data_obj(aio_req->oid)) {
        struct iovec inode_iov;

        inode_iov.iov_base = &s->inode;
        inode_iov.iov_len = sizeof(s->inode);
        add_aio_request(s, aio_req, &inode_iov, 1, AIOCB_WRITE_UDATA);
    } else {
        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                        acb->aiocb_type);
    }
}

1538 1539 1540 1541
/*
 * Remove every handler for the driver's socket from its current
 * AioContext.
 */
static void sd_detach_aio_context(BlockDriverState *bs)
{
    BDRVSheepdogState *state = bs->opaque;

    aio_set_fd_handler(state->aio_context, state->fd, false,
                       NULL, NULL, NULL, NULL);
}

/*
 * Record @new_context as the driver's AioContext and register
 * co_read_response() there as the read handler for the driver's socket.
 */
static void sd_attach_aio_context(BlockDriverState *bs,
                                  AioContext *new_context)
{
    BDRVSheepdogState *state = bs->opaque;

    state->aio_context = new_context;
    aio_set_fd_handler(state->aio_context, state->fd, false,
                       co_read_response, NULL, NULL, state);
}

1556 1557 1558 1559
static QemuOptsList runtime_opts = {
    .name = "sheepdog",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
1560 1561 1562 1563 1564 1565 1566 1567 1568 1569
        {
            .name = "vdi",
            .type = QEMU_OPT_STRING,
        },
        {
            .name = "snap-id",
            .type = QEMU_OPT_NUMBER,
        },
        {
            .name = "tag",
1570 1571 1572 1573 1574 1575
            .type = QEMU_OPT_STRING,
        },
        { /* end of list */ }
    },
};

M
Max Reitz 已提交
1576 1577
/*
 * Open a sheepdog VDI.
 *
 * Validates the "vdi", "snap-id" and "tag" options, connects to the
 * server, locks the VDI and reads its inode into the driver state.
 *
 * Returns 0 on success.  On failure sets @errp, releases the socket if
 * one was opened, and returns a negative errno (-EINVAL for any invalid
 * option).
 */
static int sd_open(BlockDriverState *bs, QDict *options, int flags,
                   Error **errp)
{
    BDRVSheepdogState *s = bs->opaque;
    Error *local_err = NULL;
    QemuOpts *opts;
    const char *vdi, *snap_id_str, *tag;
    uint64_t snap_id;
    uint32_t vid = 0;
    char *inode_buf = NULL;
    int inode_fd;
    int ret;

    s->bs = bs;
    s->aio_context = bdrv_get_aio_context(bs);

    /* every failure up to the first connection is an invalid option */
    ret = -EINVAL;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto err_no_fd;
    }

    s->addr = sd_server_config(options, errp);
    if (!s->addr) {
        goto err_no_fd;
    }

    vdi = qemu_opt_get(opts, "vdi");
    snap_id_str = qemu_opt_get(opts, "snap-id");
    snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
    tag = qemu_opt_get(opts, "tag");

    if (vdi == NULL) {
        error_setg(errp, "parameter 'vdi' is missing");
        goto err_no_fd;
    }
    if (strlen(vdi) >= SD_MAX_VDI_LEN) {
        error_setg(errp, "value of parameter 'vdi' is too long");
        goto err_no_fd;
    }

    /* an explicit snap-id must be a non-zero 32-bit number */
    if (snap_id > UINT32_MAX) {
        snap_id = 0;
    }
    if (snap_id_str && snap_id == 0) {
        error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
                   snap_id_str);
        goto err_no_fd;
    }

    if (tag == NULL) {
        tag = "";
    }
    if (strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
        error_setg(errp, "value of parameter 'tag' is too long");
        goto err_no_fd;
    }

    QLIST_INIT(&s->inflight_aio_head);
    QLIST_INIT(&s->failed_aio_head);
    QLIST_INIT(&s->inflight_aiocb_head);

    s->fd = get_sheep_fd(s, errp);
    if (s->fd < 0) {
        ret = s->fd;
        goto err_no_fd;
    }

    ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
    if (ret) {
        goto err;
    }

    /*
     * QEMU block layer emulates writethrough cache as 'writeback + flush', so
     * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
     */
    s->cache_flags = (flags & BDRV_O_NOCACHE) ? SD_FLAG_CMD_DIRECT
                                              : SD_FLAG_CMD_CACHE;
    s->discard_supported = true;

    if (snap_id || tag[0]) {
        DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = true;
    }

    /* read the inode over a separate, short-lived connection */
    inode_fd = connect_to_sdog(s, errp);
    if (inode_fd < 0) {
        ret = inode_fd;
        goto err;
    }

    inode_buf = g_malloc(SD_INODE_SIZE);
    ret = read_object(inode_fd, s->bs, inode_buf, vid_to_vdi_oid(vid),
                      0, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(inode_fd);

    if (ret) {
        error_setg(errp, "Can't read snapshot inode");
        goto err;
    }

    memcpy(&s->inode, inode_buf, sizeof(s->inode));

    bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
    pstrcpy(s->name, sizeof(s->name), vdi);
    qemu_co_mutex_init(&s->lock);
    qemu_co_mutex_init(&s->queue_lock);
    qemu_co_queue_init(&s->overlapping_queue);
    qemu_opts_del(opts);
    g_free(inode_buf);
    return 0;

err:
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
err_no_fd:
    qemu_opts_del(opts);
    g_free(inode_buf);
    return ret;
}

L
Liu Yuan 已提交
1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736
/*
 * Prepare a reopen: compute the cache flags for the new open flags and
 * open a fresh connection, both stored in a BDRVSheepdogReopenState hung
 * off state->opaque for sd_reopen_commit() / sd_reopen_abort().
 *
 * Returns 0 on success.  On failure sets @errp, returns the negative
 * value from get_sheep_fd() and leaves state->opaque NULL, so nothing is
 * leaked whether or not sd_reopen_abort() is called afterwards.
 */
static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
                             Error **errp)
{
    BDRVSheepdogState *s = state->bs->opaque;
    BDRVSheepdogReopenState *re_s;
    int ret = 0;

    re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);

    re_s->cache_flags = SD_FLAG_CMD_CACHE;
    if (state->flags & BDRV_O_NOCACHE) {
        re_s->cache_flags = SD_FLAG_CMD_DIRECT;
    }

    re_s->fd = get_sheep_fd(s, errp);
    if (re_s->fd < 0) {
        ret = re_s->fd;
        /*
         * Don't leave a state holding an invalid descriptor behind:
         * sd_reopen_abort() treats a NULL opaque as "nothing to undo".
         */
        g_free(re_s);
        state->opaque = NULL;
        return ret;
    }

    return ret;
}

/*
 * Commit a prepared reopen: drop the old socket (handlers removed, then
 * closed), adopt the descriptor and cache flags prepared by
 * sd_reopen_prepare(), and free the reopen state.
 */
static void sd_reopen_commit(BDRVReopenState *state)
{
    BDRVSheepdogState *s = state->bs->opaque;
    BDRVSheepdogReopenState *prepared = state->opaque;

    if (s->fd != 0) {
        aio_set_fd_handler(s->aio_context, s->fd, false,
                           NULL, NULL, NULL, NULL);
        closesocket(s->fd);
    }

    s->fd = prepared->fd;
    s->cache_flags = prepared->cache_flags;

    g_free(prepared);
    state->opaque = NULL;
}

/*
 * Abort a reopen: close the connection opened by sd_reopen_prepare()
 * (if any) and free the reopen state.  Does nothing when no reopen state
 * is attached.
 */
static void sd_reopen_abort(BDRVReopenState *state)
{
    BDRVSheepdogReopenState *re_s = state->opaque;
    BDRVSheepdogState *s = state->bs->opaque;

    if (re_s == NULL) {
        return;
    }

    /*
     * Only a non-negative value is a descriptor: get_sheep_fd() reports
     * failure with a negative value, which must not be passed on to
     * aio_set_fd_handler()/closesocket().
     */
    if (re_s->fd >= 0) {
        aio_set_fd_handler(s->aio_context, re_s->fd, false,
                           NULL, NULL, NULL, NULL);
        closesocket(re_s->fd);
    }

    g_free(state->opaque);
    state->opaque = NULL;

    return;
}

1772 1773
/*
 * Ask the server to create a new VDI named s->name, using the size,
 * redundancy and object-size settings from s->inode and s->inode.vdi_id
 * as the base VDI.  @snapshot is passed through as the request's snapid.
 *
 * On success stores the new VDI's ID in *@vdi_id (if non-NULL) and
 * returns 0.  On failure sets @errp and returns the connection / request
 * error, or -EIO if the server rejected the request.
 */
static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
                        Error **errp)
{
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    char name_buf[SD_MAX_VDI_LEN];
    unsigned int wlen = SD_MAX_VDI_LEN;
    unsigned int rlen = 0;
    int sock, ret;

    sock = connect_to_sdog(s, errp);
    if (sock < 0) {
        return sock;
    }

    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
     */
    memset(name_buf, 0, sizeof(name_buf));
    pstrcpy(name_buf, sizeof(name_buf), s->name);

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = SD_OP_NEW_VDI;
    hdr.flags = SD_FLAG_CMD_WRITE;
    hdr.data_length = wlen;
    hdr.snapid = snapshot;
    hdr.base_vdi_id = s->inode.vdi_id;
    hdr.vdi_size = s->inode.vdi_size;
    hdr.copy_policy = s->inode.copy_policy;
    hdr.copies = s->inode.nr_copies;
    hdr.block_size_shift = s->inode.block_size_shift;

    ret = do_req(sock, NULL, (SheepdogReq *)&hdr, name_buf, &wlen, &rlen);

    closesocket(sock);

    if (ret) {
        error_setg_errno(errp, -ret, "create failed");
        return ret;
    }
    if (rsp->result != SD_RES_SUCCESS) {
        error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
        return -EIO;
    }

    if (vdi_id != NULL) {
        *vdi_id = rsp->vdi_id;
    }
    return 0;
}

1828
/*
 * Pre-allocate every object of the image @filename by reading each
 * object-sized chunk and writing it straight back.
 *
 * Returns 0 on success or a negative errno with @errp set.
 */
static int sd_prealloc(const char *filename, Error **errp)
{
    BlockBackend *blk = NULL;
    BDRVSheepdogState *base = NULL;
    unsigned long buf_size;
    uint32_t idx, max_idx;
    uint32_t object_size;
    int64_t vdi_size;
    void *buf = NULL;
    int ret;

    blk = blk_new_open(filename, NULL, NULL,
                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (blk == NULL) {
        ret = -EIO;
        goto out_with_err_set;
    }

    blk_set_allow_write_beyond_eof(blk, true);

    vdi_size = blk_getlength(blk);
    if (vdi_size < 0) {
        ret = vdi_size;
        goto out;
    }

    base = blk_bs(blk)->opaque;
    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);

    max_idx = DIV_ROUND_UP(vdi_size, buf_size);

    for (idx = 0; idx < max_idx; idx++) {
        /*
         * Compute the byte offset in 64 bits: idx * buf_size in
         * unsigned long would wrap on hosts where long is 32 bits as
         * soon as the image reaches 4 GiB.
         */
        int64_t offset = (int64_t)idx * buf_size;

        /*
         * The created image can be a cloned image, so we need to read
         * a data from the source image.
         */
        ret = blk_pread(blk, offset, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
        ret = blk_pwrite(blk, offset, buf, buf_size, 0);
        if (ret < 0) {
            goto out;
        }
    }

    ret = 0;
out:
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Can't pre-allocate");
    }
out_with_err_set:
    if (blk) {
        blk_unref(blk);
    }
    g_free(buf);

    return ret;
}

1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915
/*
 * Sheepdog support two kinds of redundancy, full replication and erasure
 * coding.
 *
 * # create a fully replicated vdi with x copies
 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
 *
 * # create a erasure coded vdi with x data strips and y parity strips
 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
 *
 * Parses @opt in one of the two forms above and stores the result in
 * s->inode.copy_policy / s->inode.nr_copies.
 *
 * Returns 0 on success, -EINVAL if @opt is malformed (including numbers
 * followed by trailing garbage) or out of range.
 */
static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
{
    struct SheepdogInode *inode = &s->inode;
    const char *n1, *n2;
    long copy, parity;
    char p[10];

    pstrcpy(p, sizeof(p), opt);
    n1 = strtok(p, ":");
    n2 = strtok(NULL, ":");

    if (!n1) {
        return -EINVAL;
    }

    /* qemu_strtol() with a NULL endptr rejects empty and trailing input */
    if (qemu_strtol(n1, NULL, 10, &copy) < 0) {
        return -EINVAL;
    }
    if (copy > SD_MAX_COPIES || copy < 1) {
        return -EINVAL;
    }
    if (!n2) {
        inode->copy_policy = 0;
        inode->nr_copies = copy;
        return 0;
    }

    if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
        return -EINVAL;
    }

    if (qemu_strtol(n2, NULL, 10, &parity) < 0) {
        return -EINVAL;
    }
    if (parity >= SD_EC_MAX_STRIP || parity < 1) {
        return -EINVAL;
    }

    /*
     * 4 bits for parity and 4 bits for data.
     * We have to compress upper data bits because it can't represent 16
     */
    inode->copy_policy = ((copy / 2) << 4) + parity;
    inode->nr_copies = copy + parity;

    return 0;
}

1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956
/*
 * Consume the BLOCK_OPT_OBJECT_SIZE option from @opt and, when it is
 * non-zero, store it in s->inode.block_size_shift as a power-of-two
 * exponent.
 *
 * Returns 0 on success (including when no size was given) and -EINVAL when
 * the size is not a power of two or its exponent is outside 20..31.
 */
static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
{
    uint64_t size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
    int shift;

    if (!size) {
        /* nothing requested: leave the inode's current value alone */
        return 0;
    }

    /* a power of two has exactly one bit set */
    if (size & (size - 1)) {
        return -EINVAL;
    }

    shift = ctz32(size);
    if (shift < 20 || shift > 31) {
        return -EINVAL;
    }

    s->inode.block_size_shift = (uint8_t)shift;
    return 0;
}

1967
static int sd_create(const char *filename, QemuOpts *opts,
1968
                     Error **errp)
1969
{
1970
    Error *err = NULL;
1971
    int ret = 0;
L
Liu Yuan 已提交
1972
    uint32_t vid = 0;
1973
    char *backing_file = NULL;
1974
    char *buf = NULL;
1975
    BDRVSheepdogState *s;
1976
    SheepdogConfig cfg;
1977
    uint64_t max_vdi_size;
1978
    bool prealloc = false;
1979

1980
    s = g_new0(BDRVSheepdogState, 1);
1981

M
MORITA Kazutaka 已提交
1982
    if (strstr(filename, "://")) {
1983
        sd_parse_uri(&cfg, filename, &err);
M
MORITA Kazutaka 已提交
1984
    } else {
1985
        parse_vdiname(&cfg, filename, &err);
M
MORITA Kazutaka 已提交
1986
    }
1987 1988
    if (err) {
        error_propagate(errp, err);
1989
        goto out;
1990 1991
    }

1992 1993 1994 1995 1996 1997
    buf = cfg.port ? g_strdup_printf("%d", cfg.port) : NULL;
    s->addr = sd_socket_address(cfg.path, cfg.host, buf);
    g_free(buf);
    strcpy(s->name, cfg.vdi);
    sd_config_done(&cfg);

1998 1999
    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                                 BDRV_SECTOR_SIZE);
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!buf || !strcmp(buf, "off")) {
        prealloc = false;
    } else if (!strcmp(buf, "full")) {
        prealloc = true;
    } else {
        error_setg(errp, "Invalid preallocation mode: '%s'", buf);
        ret = -EINVAL;
        goto out;
    }

    g_free(buf);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
    if (buf) {
        ret = parse_redundancy(s, buf);
        if (ret < 0) {
            error_setg(errp, "Invalid redundancy mode: '%s'", buf);
            goto out;
2019 2020
        }
    }
2021 2022 2023 2024 2025
    ret = parse_block_size_shift(s, opts);
    if (ret < 0) {
        error_setg(errp, "Invalid object_size."
                         " obect_size needs to be power of 2"
                         " and be limited from 2^20 to 2^31");
2026
        goto out;
2027 2028 2029
    }

    if (backing_file) {
2030
        BlockBackend *blk;
2031
        BDRVSheepdogState *base;
2032 2033 2034
        BlockDriver *drv;

        /* Currently, only Sheepdog backing image is supported. */
2035
        drv = bdrv_find_protocol(backing_file, true, NULL);
2036
        if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
2037
            error_setg(errp, "backing_file must be a sheepdog image");
2038 2039
            ret = -EINVAL;
            goto out;
2040 2041
        }

2042
        blk = blk_new_open(backing_file, NULL, NULL,
2043
                           BDRV_O_PROTOCOL, errp);
2044 2045
        if (blk == NULL) {
            ret = -EIO;
2046
            goto out;
2047
        }
2048

2049
        base = blk_bs(blk)->opaque;
2050

2051
        if (!is_snapshot(&base->inode)) {
2052
            error_setg(errp, "cannot clone from a non snapshot vdi");
2053
            blk_unref(blk);
2054 2055
            ret = -EINVAL;
            goto out;
2056
        }
2057
        s->inode.vdi_id = base->inode.vdi_id;
2058
        blk_unref(blk);
2059 2060
    }

2061
    s->aio_context = qemu_get_aio_context();
2062 2063 2064 2065 2066 2067 2068 2069

    /* if block_size_shift is not specified, get cluster default value */
    if (s->inode.block_size_shift == 0) {
        SheepdogVdiReq hdr;
        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
        int fd;
        unsigned int wlen = 0, rlen = 0;

2070
        fd = connect_to_sdog(s, errp);
2071
        if (fd < 0) {
2072
            ret = fd;
2073 2074 2075 2076 2077 2078 2079
            goto out;
        }

        memset(&hdr, 0, sizeof(hdr));
        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
        hdr.proto_ver = SD_PROTO_VER;

P
Paolo Bonzini 已提交
2080
        ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103
                     NULL, &wlen, &rlen);
        closesocket(fd);
        if (ret) {
            error_setg_errno(errp, -ret, "failed to get cluster default");
            goto out;
        }
        if (rsp->result == SD_RES_SUCCESS) {
            s->inode.block_size_shift = rsp->block_size_shift;
        } else {
            s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
        }
    }

    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;

    if (s->inode.vdi_size > max_vdi_size) {
        error_setg(errp, "An image is too large."
                         " The maximum image size is %"PRIu64 "GB",
                         max_vdi_size / 1024 / 1024 / 1024);
        ret = -EINVAL;
        goto out;
    }

2104
    ret = do_sd_create(s, &vid, 0, errp);
2105
    if (ret) {
2106
        goto out;
2107 2108
    }

2109
    if (prealloc) {
2110
        ret = sd_prealloc(filename, errp);
2111
    }
2112
out:
2113 2114
    g_free(backing_file);
    g_free(buf);
2115 2116
    g_free(s);
    return ret;
2117 2118 2119 2120
}

/*
 * Close the block device: send SD_OP_RELEASE_VDI for the VDI to the
 * server, then tear down the driver's own socket and address.
 *
 * Failing to reach the server only skips the release request; the local
 * fd handler, s->fd and s->addr are released in every case.
 */
static void sd_close(BlockDriverState *bs)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    SheepdogVdiReq hdr;
    /* the response is read back through the request header storage */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    int fd, ret;

    DPRINTF("%s\n", s->name);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        /* cannot send the release request, but still clean up locally */
        goto out;
    }

    memset(&hdr, 0, sizeof(hdr));

    hdr.opcode = SD_OP_RELEASE_VDI;
    hdr.type = LOCK_TYPE_NORMAL;
    hdr.base_vdi_id = s->inode.vdi_id;
    /* the request payload is the NUL-terminated VDI name */
    wlen = strlen(s->name) + 1;
    hdr.data_length = wlen;
    hdr.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 s->name, &wlen, &rlen);

    closesocket(fd);

    if (!ret && rsp->result != SD_RES_SUCCESS &&
        rsp->result != SD_RES_VDI_NOT_LOCKED) {
        error_report("%s, %s", sd_strerror(rsp->result), s->name);
    }

out:
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
                       false, NULL, NULL, NULL, NULL);
    closesocket(s->fd);
    qapi_free_SocketAddress(s->addr);
}

/* Report the VDI size recorded in the driver's cached inode. */
static int64_t sd_getlength(BlockDriverState *bs)
{
    const BDRVSheepdogState *state = bs->opaque;
    int64_t length = state->inode.vdi_size;

    return length;
}

2168 2169
/*
 * Grow the VDI to @offset bytes by rewriting the inode header.
 *
 * Only PREALLOC_MODE_OFF is supported, shrinking is rejected, and @offset
 * may not exceed the maximum size allowed by the inode's object size.
 *
 * Returns 0 on success; on failure returns a negative errno value and
 * sets @errp.
 */
static int sd_truncate(BlockDriverState *bs, int64_t offset,
                       PreallocMode prealloc, Error **errp)
{
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd;
    unsigned int datalen;
    uint64_t max_vdi_size;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
    if (offset < s->inode.vdi_size) {
        error_setg(errp, "shrinking is not supported");
        return -EINVAL;
    } else if (offset > max_vdi_size) {
        error_setg(errp, "too big image size");
        return -EINVAL;
    }

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    s->inode.vdi_size = offset;
    ret = write_object(fd, s->bs, (char *)&s->inode,
                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                       datalen, 0, false, s->cache_flags);
    /* fd is a socket: close it the same way as everywhere else in the file */
    closesocket(fd);

    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to update an inode");
    }

    return ret;
}

/*
 * This function is called after writing data objects.  If we need to
 * update metadata, this sends a write request to the vdi object.
 */
P
Paolo Bonzini 已提交
2215
static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
2216
{
P
Paolo Bonzini 已提交
2217
    BDRVSheepdogState *s = acb->s;
2218 2219 2220 2221
    struct iovec iov;
    AIOReq *aio_req;
    uint32_t offset, data_len, mn, mx;

2222 2223
    mn = acb->min_dirty_data_idx;
    mx = acb->max_dirty_data_idx;
2224 2225
    if (mn <= mx) {
        /* we need to update the vdi object. */
2226
        ++acb->nr_pending;
2227 2228 2229 2230
        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
            mn * sizeof(s->inode.data_vdi_id[0]);
        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);

2231 2232
        acb->min_dirty_data_idx = UINT32_MAX;
        acb->max_dirty_data_idx = 0;
2233 2234 2235 2236

        iov.iov_base = &s->inode;
        iov.iov_len = sizeof(s->inode);
        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
2237 2238
                                data_len, offset, 0, false, 0, offset);
        add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2239 2240 2241
        if (--acb->nr_pending) {
            qemu_coroutine_yield();
        }
2242 2243 2244
    }
}

L
Liu Yuan 已提交
2245 2246 2247
/*
 * Delete current working VDI on the snapshot chain.
 *
 * Returns true when the server reports success or that the VDI was
 * already gone, false on any connection, request or server error.
 */
static bool sd_delete(BDRVSheepdogState *s)
{
    Error *local_err = NULL;
    unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
    SheepdogVdiReq hdr = {
        .opcode = SD_OP_DEL_VDI,
        .base_vdi_id = s->inode.vdi_id,
        .data_length = wlen,
        .flags = SD_FLAG_CMD_WRITE,
    };
    /* the response is read back through the request header storage */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    int sock, err;

    sock = connect_to_sdog(s, &local_err);
    if (sock < 0) {
        error_report_err(local_err);
        return false;
    }

    err = do_req(sock, s->bs, (SheepdogReq *)&hdr,
                 s->name, &wlen, &rlen);
    closesocket(sock);
    if (err) {
        return false;
    }

    if (rsp->result == SD_RES_NO_VDI) {
        /* already deleted counts as success, but is worth mentioning */
        error_report("%s was already deleted", s->name);
        return true;
    }
    if (rsp->result != SD_RES_SUCCESS) {
        error_report("%s, %s", sd_strerror(rsp->result), s->name);
        return false;
    }

    return true;
}

2285 2286 2287 2288 2289
/*
 * Create a writable VDI from a snapshot.
 *
 * On success the freshly created VDI's inode replaces s->inode, the state
 * is no longer flagged as a snapshot, and 0 is returned.  Otherwise a
 * non-zero value is returned.
 */
static int sd_create_branch(BDRVSheepdogState *s)
{
    Error *local_err = NULL;
    char *inode_buf;
    uint32_t new_vid;
    bool deleted;
    int sock, ret;

    DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);

    inode_buf = g_malloc(SD_INODE_SIZE);

    /*
     * Even if the deletion fails, we just create an extra snapshot based
     * on the working VDI which was supposed to be deleted.  So there is
     * no need to bail out here.
     */
    deleted = sd_delete(s);
    ret = do_sd_create(s, &new_vid, !deleted, &local_err);
    if (ret) {
        error_report_err(local_err);
        goto out;
    }

    DPRINTF("%" PRIx32 " is created.\n", new_vid);

    sock = connect_to_sdog(s, &local_err);
    if (sock < 0) {
        error_report_err(local_err);
        ret = sock;
        goto out;
    }

    ret = read_object(sock, s->bs, inode_buf, vid_to_vdi_oid(new_vid),
                      s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(sock);

    if (ret >= 0) {
        /* switch the driver state over to the new working VDI */
        memcpy(&s->inode, inode_buf, sizeof(s->inode));

        s->is_snapshot = false;
        ret = 0;
        DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
    }

out:
    g_free(inode_buf);

    return ret;
}

/*
 * Send I/O requests to the server.
 *
 * This function sends requests to the server, links the requests to
2346
 * the inflight_list in BDRVSheepdogState, and exits without
2347 2348 2349
 * waiting the response.  The responses are received in the
 * `aio_read_response' function which is called from the main loop as
 * a fd handler.
M
MORITA Kazutaka 已提交
2350 2351 2352
 *
 * Returns 1 when we need to wait a response, 0 when there is no sent
 * request and -errno in error cases.
2353
 */
P
Paolo Bonzini 已提交
2354
static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
2355 2356
{
    int ret = 0;
L
Liu Yuan 已提交
2357
    unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
2358 2359
    unsigned long idx;
    uint32_t object_size;
2360
    uint64_t oid;
2361
    uint64_t offset;
P
Paolo Bonzini 已提交
2362
    BDRVSheepdogState *s = acb->s;
2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373
    SheepdogInode *inode = &s->inode;
    AIOReq *aio_req;

    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
        /*
         * In the case we open the snapshot VDI, Sheepdog creates the
         * writable VDI when we do a write operation first.
         */
        ret = sd_create_branch(s);
        if (ret) {
            acb->ret = -EIO;
2374
            return;
2375 2376 2377
        }
    }

2378 2379 2380 2381
    object_size = (UINT32_C(1) << inode->block_size_shift);
    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;

2382 2383 2384 2385 2386 2387
    /*
     * Make sure we don't free the aiocb before we are done with all requests.
     * This additional reference is dropped at the end of this function.
     */
    acb->nr_pending++;

2388 2389 2390
    while (done != total) {
        uint8_t flags = 0;
        uint64_t old_oid = 0;
2391
        bool create = false;
2392 2393 2394

        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);

2395
        len = MIN(total - done, object_size - offset);
2396

2397 2398 2399 2400
        switch (acb->aiocb_type) {
        case AIOCB_READ_UDATA:
            if (!inode->data_vdi_id[idx]) {
                qemu_iovec_memset(acb->qiov, done, 0, len);
2401 2402
                goto done;
            }
2403 2404 2405
            break;
        case AIOCB_WRITE_UDATA:
            if (!inode->data_vdi_id[idx]) {
2406
                create = true;
2407 2408
            } else if (!is_data_obj_writable(inode, idx)) {
                /* Copy-On-Write */
2409
                create = true;
2410 2411 2412 2413
                old_oid = oid;
                flags = SD_FLAG_CMD_COW;
            }
            break;
2414 2415 2416 2417 2418
        case AIOCB_DISCARD_OBJ:
            /*
             * We discard the object only when the whole object is
             * 1) allocated 2) trimmed. Otherwise, simply skip it.
             */
2419
            if (len != object_size || inode->data_vdi_id[idx] == 0) {
2420 2421 2422
                goto done;
            }
            break;
2423 2424
        default:
            break;
2425 2426 2427
        }

        if (create) {
2428
            DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
2429
                    inode->vdi_id, oid,
2430 2431
                    vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
            oid = vid_to_data_oid(inode->vdi_id, idx);
2432
            DPRINTF("new oid %" PRIx64 "\n", oid);
2433 2434
        }

2435
        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
2436 2437 2438
                                old_oid,
                                acb->aiocb_type == AIOCB_DISCARD_OBJ ?
                                0 : done);
2439
        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
2440
                        acb->aiocb_type);
2441 2442 2443 2444 2445
    done:
        offset = 0;
        idx++;
        done += len;
    }
2446 2447
    if (--acb->nr_pending) {
        qemu_coroutine_yield();
2448 2449 2450
    }
}

2451
/*
 * Finish a request: unlink @acb from its sibling list and restart every
 * coroutine waiting on the overlapping queue.  Flush requests are left
 * untouched.
 */
static void sd_aio_complete(SheepdogAIOCB *acb)
{
    BDRVSheepdogState *state;

    if (acb->aiocb_type != AIOCB_FLUSH_CACHE) {
        state = acb->s;

        qemu_co_mutex_lock(&state->queue_lock);
        QLIST_REMOVE(acb, aiocb_siblings);
        qemu_co_queue_restart_all(&state->overlapping_queue);
        qemu_co_mutex_unlock(&state->queue_lock);
    }
}

2465
/*
 * Write @nb_sectors sectors starting at @sector_num from @qiov.  The VDI
 * is grown first when the write ends beyond its current size.
 *
 * Returns the request's result code, or the negative value from
 * sd_truncate() if growing the VDI failed.
 */
static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    BDRVSheepdogState *s = bs->opaque;
    int64_t end_offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
    SheepdogAIOCB acb;

    if (end_offset > s->inode.vdi_size) {
        int rc = sd_truncate(bs, end_offset, PREALLOC_MODE_OFF, NULL);

        if (rc < 0) {
            return rc;
        }
    }

    sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
    sd_co_rw_vector(&acb);
    /* update the vdi object if the write touched it */
    sd_write_done(&acb);
    sd_aio_complete(&acb);

    return acb.ret;
}

2488
/*
 * Read @nb_sectors sectors starting at @sector_num into @qiov and return
 * the request's result code.
 */
static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
                       int nb_sectors, QEMUIOVector *qiov)
{
    BDRVSheepdogState *state = bs->opaque;
    SheepdogAIOCB request;

    sd_aio_setup(&request, state, qiov, sector_num, nb_sectors,
                 AIOCB_READ_UDATA);
    sd_co_rw_vector(&request);
    sd_aio_complete(&request);

    return request.ret;
}

2501 2502 2503
/*
 * Send a flush request for the VDI object and wait for it.  This is a
 * no-op returning 0 unless cache_flags is exactly SD_FLAG_CMD_CACHE.
 */
static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogAIOCB acb;
    AIOReq *flush_req;

    if (s->cache_flags != SD_FLAG_CMD_CACHE) {
        return 0;
    }

    sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);

    /* hold a reference while the request is being queued */
    acb.nr_pending++;
    flush_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
                              0, 0, 0, false, 0, 0);
    add_aio_request(s, flush_req, NULL, 0, acb.aiocb_type);

    if (--acb.nr_pending) {
        qemu_coroutine_yield();
    }

    sd_aio_complete(&acb);
    return acb.ret;
}

2526 2527
/*
 * Take a snapshot of the VDI.
 *
 * The snapshot metadata from @sn_info is written into the current inode,
 * a new VDI is created with do_sd_create(), and the header part of the new
 * VDI's inode is read back into s->inode.
 *
 * Returns -EINVAL if the open VDI is itself a snapshot, or the negative
 * value of the step that failed.
 */
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    int ret, fd = -1;
    uint32_t new_vid;
    SheepdogInode *inode;
    unsigned int datalen;

    DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
            "is_snapshot %d\n", sn_info->name, sn_info->id_str,
            s->name, sn_info->vm_state_size, s->is_snapshot);

    if (s->is_snapshot) {
        error_report("You can't create a snapshot of a snapshot VDI, "
                     "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);

        return -EINVAL;
    }

    DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);

    s->inode.vm_state_size = sn_info->vm_state_size;
    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
    /* It appears that inode.tag does not require a NUL terminator,
     * which means this use of strncpy is ok.
     */
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    inode = g_malloc(datalen);

    /* refresh inode. */
    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        ret = fd;
        goto cleanup;
    }

    ret = write_object(fd, s->bs, (char *)&s->inode,
                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                       datalen, 0, false, s->cache_flags);
    if (ret < 0) {
        error_report("failed to write snapshot's inode.");
        goto cleanup;
    }

    ret = do_sd_create(s, &new_vid, 1, &local_err);
    if (ret < 0) {
        error_reportf_err(local_err,
                          "failed to create inode for snapshot: ");
        goto cleanup;
    }

    ret = read_object(fd, s->bs, (char *)inode,
                      vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
                      s->cache_flags);

    if (ret < 0) {
        /* the failure reason is carried in ret; errno may be stale here */
        error_report("failed to read new inode info. %s", strerror(-ret));
        goto cleanup;
    }

    memcpy(&s->inode, inode, datalen);
    DPRINTF("s->inode: name %s snap_id %x oid %x\n",
            s->inode.name, s->inode.snap_id, s->inode.vdi_id);

cleanup:
    g_free(inode);
    /* the connect itself may have failed, leaving no socket to close */
    if (fd >= 0) {
        closesocket(fd);
    }
    return ret;
}

L
Liu Yuan 已提交
2600 2601 2602 2603
/*
 * We implement rollback(loadvm) operation to the specified snapshot by
 * 1) switch to the snapshot
 * 2) rely on sd_create_branch to delete working VDI and
D
Deepak Kathayat 已提交
2604
 * 3) create a new working VDI based on the specified snapshot
L
Liu Yuan 已提交
2605
 */
2606 2607 2608 2609
static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
    BDRVSheepdogState *s = bs->opaque;
    BDRVSheepdogState *old_s;
2610
    char tag[SD_MAX_VDI_TAG_LEN];
2611
    uint32_t snapid = 0;
2612 2613 2614 2615 2616
    int ret;

    if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
        return -EINVAL;
    }
2617

2618
    old_s = g_new(BDRVSheepdogState, 1);
2619 2620 2621

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

2622
    ret = reload_inode(s, snapid, tag);
2623 2624 2625 2626
    if (ret) {
        goto out;
    }

2627 2628
    ret = sd_create_branch(s);
    if (ret) {
2629 2630 2631
        goto out;
    }

2632
    g_free(old_s);
2633 2634 2635 2636 2637

    return 0;
out:
    /* recover bdrv_sd_state */
    memcpy(s, old_s, sizeof(BDRVSheepdogState));
2638
    g_free(old_s);
2639

2640
    error_report("failed to open. recover old bdrv_sd_state.");
2641 2642 2643 2644

    return ret;
}

2645 2646
#define NR_BATCHED_DISCARD 128

2647
/*
 * Clear every non-zero data_vdi_id entry of s->inode and write the cleared
 * ranges to the vdi object, at most NR_BATCHED_DISCARD cleared entries per
 * write.
 *
 * Returns 0 on success; on failure returns a negative value and sets
 * @errp.
 */
static int remove_objects(BDRVSheepdogState *s, Error **errp)
{
    SheepdogInode *inode = &s->inode;
    int sock, total, pos = 0;
    int ret;

    sock = connect_to_sdog(s, errp);
    if (sock < 0) {
        return sock;
    }

    total = count_data_objs(inode);
    while (pos < total) {
        int first, cleared = 0;

        /* skip entries that are already empty */
        for (; pos < total && !inode->data_vdi_id[pos]; pos++) {
        }
        first = pos;

        /* clear entries until the batch is full or the table ends */
        for (; pos < total && cleared < NR_BATCHED_DISCARD; pos++) {
            if (inode->data_vdi_id[pos]) {
                inode->data_vdi_id[pos] = 0;
                cleared++;
            }
        }

        /* write back exactly the slice [first, pos) of the table */
        ret = write_object(sock, s->bs,
                           (char *)&inode->data_vdi_id[first],
                           vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
                           (pos - first) * sizeof(uint32_t),
                           offsetof(struct SheepdogInode,
                                    data_vdi_id[first]),
                           false, s->cache_flags);
        if (ret < 0) {
            error_setg(errp, "Failed to discard snapshot inode");
            closesocket(sock);
            return ret;
        }
    }

    closesocket(sock);
    return 0;
}

2696 2697 2698 2699
/*
 * Delete the snapshot identified by @snapshot_id.
 *
 * @snapshot_id must parse as an unsigned decimal number that fits in 32
 * bits.  A non-zero value is used as the snapshot id; zero makes the
 * string itself be used as the snapshot tag.
 *
 * The identifier is validated before any object is removed, so an invalid
 * @snapshot_id fails with -EINVAL without touching the inode.
 *
 * Returns 0 on success; on failure returns a negative errno value and
 * sets @errp.
 */
static int sd_snapshot_delete(BlockDriverState *bs,
                              const char *snapshot_id,
                              const char *name,
                              Error **errp)
{
    /*
     * FIXME should delete the snapshot matching both @snapshot_id and
     * @name, but @name not used here
     */
    unsigned long snap_id = 0;
    char snap_tag[SD_MAX_VDI_TAG_LEN];
    int fd, ret;
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
    BDRVSheepdogState *s = bs->opaque;
    unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
    uint32_t vid;
    SheepdogVdiReq hdr = {
        .opcode = SD_OP_DEL_VDI,
        .data_length = wlen,
        .flags = SD_FLAG_CMD_WRITE,
    };
    /* the response is read back through the request header storage */
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;

    /* request payload: VDI name followed by the (optional) snapshot tag */
    memset(buf, 0, sizeof(buf));
    memset(snap_tag, 0, sizeof(snap_tag));
    pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
    /* TODO Use sd_parse_snapid() once this mess is cleaned up */
    ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
    if (ret || snap_id > UINT32_MAX) {
        /*
         * FIXME Since qemu_strtoul() returns -EINVAL when
         * @snapshot_id is null, @snapshot_id is mandatory.  Correct
         * would be to require at least one of @snapshot_id and @name.
         */
        error_setg(errp, "Invalid snapshot ID: %s",
                         snapshot_id ? snapshot_id : "<null>");
        return -EINVAL;
    }

    if (snap_id) {
        hdr.snapid = (uint32_t) snap_id;
    } else {
        /* FIXME I suspect we should use @name here */
        /* FIXME don't truncate silently */
        pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
        pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
    }

    /* only start the destructive part once the arguments are known good */
    ret = remove_objects(s, errp);
    if (ret) {
        return ret;
    }

    ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
    if (ret) {
        return ret;
    }

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                 buf, &wlen, &rlen);
    closesocket(fd);
    if (ret) {
        error_setg_errno(errp, -ret, "Couldn't send request to server");
        return ret;
    }

    switch (rsp->result) {
    case SD_RES_NO_VDI:
        error_setg(errp, "Can't find the snapshot");
        return -ENOENT;
    case SD_RES_SUCCESS:
        break;
    default:
        error_setg(errp, "%s", sd_strerror(rsp->result));
        return -EIO;
    }

    return 0;
}

/*
 * Build the list of snapshots that belong to this VDI.
 *
 * The in-use VDI bitmap is fetched from the server, then VDI ids are
 * probed starting at the hash of the VDI name until an unused id is hit or
 * the table (1024 entries) is full.  Inodes that cannot be read are
 * skipped.
 *
 * On success stores a newly allocated table in *@psn_tab and returns the
 * number of entries in it.  On failure returns a negative value and stores
 * NULL in *@psn_tab.
 */
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
    Error *local_err = NULL;
    BDRVSheepdogState *s = bs->opaque;
    SheepdogReq req;
    int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
    QEMUSnapshotInfo *sn_tab = NULL;
    unsigned wlen, rlen;
    int found = 0;
    static SheepdogInode inode;
    unsigned long *vdi_inuse;
    unsigned int start_nr;
    uint64_t hval;
    uint32_t vid;

    vdi_inuse = g_malloc(max);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        ret = fd;
        goto out;
    }

    rlen = max;
    wlen = 0;

    memset(&req, 0, sizeof(req));

    req.opcode = SD_OP_READ_VDIS;
    req.data_length = max;

    ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);

    closesocket(fd);
    if (ret) {
        goto out;
    }

    sn_tab = g_new0(QEMUSnapshotInfo, nr);

    /* calculate a vdi id with hash function */
    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
    start_nr = hval & (SD_NR_VDIS - 1);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        /* don't hand a table to the caller together with an error code */
        g_free(sn_tab);
        sn_tab = NULL;
        ret = fd;
        goto out;
    }

    for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
        if (!test_bit(vid, vdi_inuse)) {
            break;
        }

        /* we don't need to read entire object */
        ret = read_object(fd, s->bs, (char *)&inode,
                          vid_to_vdi_oid(vid),
                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
                          s->cache_flags);

        if (ret) {
            continue;
        }

        if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
            sn_tab[found].date_sec = inode.snap_ctime >> 32;
            sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
            sn_tab[found].vm_state_size = inode.vm_state_size;
            sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;

            snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
                     "%" PRIu32, inode.snap_id);
            pstrcpy(sn_tab[found].name,
                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
                    inode.tag);
            found++;
        }
    }

    closesocket(fd);
    /*
     * Unreadable inodes were skipped on purpose; a leftover failure code
     * from the last one must not turn the whole listing into an error.
     */
    ret = 0;
out:
    *psn_tab = sn_tab;

    g_free(vdi_inuse);

    if (ret < 0) {
        return ret;
    }

    return found;
}

/*
 * Transfer @size bytes of VM state between @data and the vmstate objects
 * of a VDI, starting at byte position @pos.
 *
 * @load selects the direction: non-zero reads the vmstate objects of
 * s->inode.parent_vdi_id into @data, zero writes @data to the vmstate
 * objects of s->inode.vdi_id.
 *
 * The transfer is split at object boundaries (object size is
 * 1 << s->inode.block_size_shift), one read_object()/write_object() call
 * per piece, over a connection opened and closed inside this function.
 *
 * Returns @size on success.  On failure returns the negative value handed
 * back by connect_to_sdog(), read_object() or write_object().
 */
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                int64_t pos, int size, int load)
{
    Error *local_err = NULL;
    bool create;
    int fd, ret = 0, remaining = size;
    unsigned int data_len;
    uint64_t vmstate_oid;
    uint64_t offset;
    uint32_t vdi_index;
    uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        return fd;
    }

    while (remaining) {
        /* Locate the object holding @pos and the offset inside it. */
        vdi_index = pos / object_size;
        offset = pos % object_size;

        /* Never cross an object boundary in a single request. */
        data_len = MIN(remaining, object_size - offset);

        vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);

        /* A write that starts at offset 0 is flagged as creating the object. */
        create = (offset == 0);
        if (load) {
            ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
                              s->inode.nr_copies, data_len, offset,
                              s->cache_flags);
        } else {
            ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset, create,
                               s->cache_flags);
        }

        if (ret < 0) {
            /*
             * Name the direction that actually failed and report the code
             * we were just handed instead of a possibly stale errno.
             * NOTE(review): assumes read_object()/write_object() return a
             * negative errno value on failure -- confirm.
             */
            error_report("failed to %s vmstate %s",
                         load ? "load" : "save", strerror(-ret));
            goto cleanup;
        }

        pos += data_len;
        data += data_len;
        remaining -= data_len;
    }
    ret = size;
cleanup:
    closesocket(fd);
    return ret;
}

2929 2930
/*
 * Save VM state: flatten @qiov into one contiguous bounce buffer and hand
 * it to do_load_save_vmstate() in "save" mode at position @pos.
 *
 * Returns the value of do_load_save_vmstate() (the byte count on success,
 * a negative value on failure).
 */
static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                           int64_t pos)
{
    BDRVSheepdogState *s = bs->opaque;
    uint8_t *bounce = qemu_blockalign(bs, qiov->size);
    int ret;

    qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
    ret = do_load_save_vmstate(s, bounce, pos, qiov->size, 0);
    qemu_vfree(bounce);

    return ret;
}

2944 2945
/*
 * Load VM state: read qiov->size bytes starting at position @pos into a
 * contiguous bounce buffer via do_load_save_vmstate() in "load" mode, then
 * scatter the buffer into @qiov.
 *
 * Returns the value of do_load_save_vmstate() (the byte count on success,
 * a negative value on failure).  On failure @qiov is left untouched.
 */
static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                           int64_t pos)
{
    BDRVSheepdogState *s = bs->opaque;
    void *buf;
    int ret;

    buf = qemu_blockalign(bs, qiov->size);
    ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
    if (ret >= 0) {
        /*
         * Copy back only after a successful load: nothing here initialises
         * the bounce buffer, so after a failure it may hold unwritten or
         * partially written data that must not reach the caller.
         */
        qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
    }
    qemu_vfree(buf);

    return ret;
}


2960
/*
 * Discard @bytes bytes starting at byte @offset.
 *
 * Returns 0 without doing anything when the driver state says discard is
 * not supported, -ENOTSUP when @offset or @bytes is not a multiple of
 * BDRV_SECTOR_SIZE, and otherwise the result recorded in the AIOCB after
 * the AIOCB_DISCARD_OBJ request has run.
 */
static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                      int bytes)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogAIOCB acb;
    uint32_t zero = 0;
    /*
     * One-element vector pointing at a zero word.
     * NOTE(review): presumably only a placeholder payload for the request
     * path -- confirm against sd_co_rw_vector().
     */
    struct iovec iov = {
        .iov_base = &zero,
        .iov_len = sizeof(zero),
    };
    QEMUIOVector discard_iov = {
        .iov = &iov,
        .niov = 1,
    };

    if (!s->discard_supported) {
        return 0;
    }
    if (!QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)) {
        return -ENOTSUP;
    }

    /* The request is expressed in sectors, not bytes. */
    sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
                 bytes >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
    sd_co_rw_vector(&acb);
    sd_aio_complete(&acb);

    return acb.ret;
}

2990 2991
/*
 * Report the allocation status of the sectors starting at @sector_num.
 *
 * The inode's data_vdi_id[] table is scanned object by object over the
 * range [@sector_num, @sector_num + @nb_sectors).  If the first object has
 * a non-zero entry, the run of consecutive non-zero entries is reported as
 * BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | byte offset and *@file is set
 * to @bs.  Otherwise 0 is returned and the run of zero entries is measured
 * instead.
 *
 * *@pnum receives the run length in sectors, measured from the start of
 * the first object and capped at @nb_sectors.
 */
static coroutine_fn int64_t
sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                       int *pnum, BlockDriverState **file)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
    unsigned long first = offset / object_size;
    unsigned long limit = DIV_ROUND_UP((sector_num + nb_sectors) *
                                       BDRV_SECTOR_SIZE, object_size);
    unsigned long cur = first;
    int64_t status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
    int count;

    /* Walk forward while objects are allocated. */
    while (cur < limit && inode->data_vdi_id[cur] != 0) {
        cur++;
    }

    if (cur == first) {
        /* Get the longest length of unallocated sectors */
        status = 0;
        cur = first + 1;
        while (cur < limit && inode->data_vdi_id[cur] == 0) {
            cur++;
        }
    }

    /* Convert the object count to sectors and clamp to the request. */
    count = (cur - first) * object_size / BDRV_SECTOR_SIZE;
    if (count > nb_sectors) {
        count = nb_sectors;
    }
    *pnum = count;

    if (status > 0 && status & BDRV_BLOCK_OFFSET_VALID) {
        *file = bs;
    }
    return status;
}

3029 3030 3031 3032
/*
 * Return the number of bytes backed by allocated objects: one full object
 * size for every non-zero data_vdi_id[] entry among the objects needed to
 * cover inode->vdi_size.
 */
static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
    unsigned long nr_objects = DIV_ROUND_UP(inode->vdi_size, object_size);
    unsigned long idx;
    uint64_t allocated = 0;

    for (idx = 0; idx < nr_objects; idx++) {
        if (inode->data_vdi_id[idx] != 0) {
            allocated += object_size;
        }
    }
    return allocated;
}

3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069
/* Options accepted when creating a sheepdog image. */
static QemuOptsList sd_create_opts = {
    .name = "sheepdog-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
    .desc = {
        {   /* image size */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size",
        },
        {   /* base image */
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image",
        },
        {   /* preallocation mode */
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)",
        },
        {   /* redundancy, given as a string */
            .name = BLOCK_OPT_REDUNDANCY,
            .type = QEMU_OPT_STRING,
            .help = "Redundancy of the image",
        },
        {   /* object size */
            .name = BLOCK_OPT_OBJECT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Object size of the image",
        },
        { /* end of list */ }
    }
};

M
MORITA Kazutaka 已提交
3079
static BlockDriver bdrv_sheepdog = {
J
Jeff Cody 已提交
3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_create                  = sd_create,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
3092
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
J
Jeff Cody 已提交
3093
    .bdrv_truncate                = sd_truncate,
3094

J
Jeff Cody 已提交
3095 3096 3097 3098 3099
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_get_block_status     = sd_co_get_block_status,
3100

J
Jeff Cody 已提交
3101 3102 3103 3104
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,
3105

J
Jeff Cody 已提交
3106 3107
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,
3108

J
Jeff Cody 已提交
3109 3110
    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,
3111

J
Jeff Cody 已提交
3112
    .create_opts                  = &sd_create_opts,
3113 3114
};

M
MORITA Kazutaka 已提交
3115
static BlockDriver bdrv_sheepdog_tcp = {
J
Jeff Cody 已提交
3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog+tcp",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_create                  = sd_create,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
3128
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
J
Jeff Cody 已提交
3129
    .bdrv_truncate                = sd_truncate,
M
MORITA Kazutaka 已提交
3130

J
Jeff Cody 已提交
3131 3132 3133 3134 3135
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_get_block_status     = sd_co_get_block_status,
M
MORITA Kazutaka 已提交
3136

J
Jeff Cody 已提交
3137 3138 3139 3140
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,
M
MORITA Kazutaka 已提交
3141

J
Jeff Cody 已提交
3142 3143
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,
M
MORITA Kazutaka 已提交
3144

J
Jeff Cody 已提交
3145 3146
    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,
3147

J
Jeff Cody 已提交
3148
    .create_opts                  = &sd_create_opts,
M
MORITA Kazutaka 已提交
3149 3150
};

3151
static BlockDriver bdrv_sheepdog_unix = {
J
Jeff Cody 已提交
3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163
    .format_name                  = "sheepdog",
    .protocol_name                = "sheepdog+unix",
    .instance_size                = sizeof(BDRVSheepdogState),
    .bdrv_parse_filename          = sd_parse_filename,
    .bdrv_file_open               = sd_open,
    .bdrv_reopen_prepare          = sd_reopen_prepare,
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
    .bdrv_create                  = sd_create,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
3164
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
J
Jeff Cody 已提交
3165
    .bdrv_truncate                = sd_truncate,
3166

J
Jeff Cody 已提交
3167 3168 3169 3170 3171
    .bdrv_co_readv                = sd_co_readv,
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
    .bdrv_co_get_block_status     = sd_co_get_block_status,
3172

J
Jeff Cody 已提交
3173 3174 3175 3176
    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
    .bdrv_snapshot_delete         = sd_snapshot_delete,
    .bdrv_snapshot_list           = sd_snapshot_list,
3177

J
Jeff Cody 已提交
3178 3179
    .bdrv_save_vmstate            = sd_save_vmstate,
    .bdrv_load_vmstate            = sd_load_vmstate,
3180

J
Jeff Cody 已提交
3181 3182
    .bdrv_detach_aio_context      = sd_detach_aio_context,
    .bdrv_attach_aio_context      = sd_attach_aio_context,
3183

J
Jeff Cody 已提交
3184
    .create_opts                  = &sd_create_opts,
3185 3186
};

3187 3188 3189
static void bdrv_sheepdog_init(void)
{
    bdrv_register(&bdrv_sheepdog);
M
MORITA Kazutaka 已提交
3190
    bdrv_register(&bdrv_sheepdog_tcp);
3191
    bdrv_register(&bdrv_sheepdog_unix);
3192 3193
}
block_init(bdrv_sheepdog_init);