/*
 * QEMU Block driver for NBD
 *
 * Copyright (c) 2019 Virtuozzo International GmbH.
 * Copyright (C) 2016 Red Hat, Inc.
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

P
Peter Maydell 已提交
31
#include "qemu/osdep.h"
32 33

#include "trace.h"
34
#include "qemu/uri.h"
35
#include "qemu/option.h"
36
#include "qemu/cutils.h"
37
#include "qemu/main-loop.h"
38

39
#include "qapi/qapi-visit-sockets.h"
40
#include "qapi/qmp/qstring.h"
41 42 43 44

#include "block/qdict.h"
#include "block/nbd.h"
#include "block/block_int.h"
45

46
/*
 * Literal ":exportname=" marker.
 * NOTE(review): its users are not visible in this part of the file;
 * presumably it is matched against legacy NBD filename strings - confirm.
 */
#define EN_OPTSTR ":exportname="

/* Size of BDRVNBDState.requests[] and upper bound of BDRVNBDState.in_flight */
#define MAX_NBD_REQUESTS    16

/*
 * A request handle is the slot index in BDRVNBDState.requests[] XORed with
 * the address passed as @bs.  Applying the same XOR a second time recovers
 * the index, so the two macros are inverses of each other.
 */
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))

/*
 * One slot of BDRVNBDState.requests[], describing a request that has been
 * (or is being) sent to the server.
 */
typedef struct {
    Coroutine *coroutine;   /* coroutine owning the slot; NULL = slot is free */
    uint64_t offset;        /* original offset of the request */
    bool receiving;         /* true while the owner has yielded, waiting to
                             * be woken by connection_co */
} NBDClientRequest;

58
/*
 * State of the client side of the connection.
 */
typedef enum NBDClientState {
    /* Reconnecting; nbd_co_send_request() blocks until the state changes */
    NBD_CLIENT_CONNECTING_WAIT,
    /* Reconnecting; nbd_co_send_request() fails with -EIO instead of waiting */
    NBD_CLIENT_CONNECTING_NOWAIT,
    /* The channel is usable for requests */
    NBD_CLIENT_CONNECTED,
    /* Terminal state: nbd_connection_entry() leaves its loop */
    NBD_CLIENT_QUIT
} NBDClientState;

65
typedef struct BDRVNBDState {
66 67 68 69 70 71 72
    QIOChannelSocket *sioc; /* The master data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
    NBDExportInfo info;

    CoMutex send_mutex;
    CoQueue free_sema;
    Coroutine *connection_co;
M
Max Reitz 已提交
73
    Coroutine *teardown_co;
74 75 76
    QemuCoSleepState *connection_co_sleep_ns_state;
    bool drained;
    bool wait_drained_end;
77
    int in_flight;
78
    NBDClientState state;
79 80 81
    int connect_status;
    Error *connect_err;
    bool wait_in_flight;
82 83 84 85

    NBDClientRequest requests[MAX_NBD_REQUESTS];
    NBDReply reply;
    BlockDriverState *bs;
86

87 88
    /* Connection parameters */
    uint32_t reconnect_delay;
89
    SocketAddress *saddr;
M
Max Reitz 已提交
90
    char *export, *tlscredsid;
91 92 93
    QCryptoTLSCreds *tlscreds;
    const char *hostname;
    char *x_dirty_bitmap;
94 95
} BDRVNBDState;

96 97 98 99
/*
 * Forward declarations: both helpers are called by nbd_reconnect_attempt()
 * before their definitions appear in the file.
 */
static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
                                                  Error **errp);
static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc,
                                Error **errp);
100

101 102 103 104 105 106 107 108 109 110 111 112 113
/*
 * nbd_clear_bdrvstate
 * Release the connection parameters held in @s (TLS credentials reference,
 * socket address, export name, TLS credentials id and dirty bitmap name).
 *
 * Every released member is reset to NULL so that no dangling pointer is left
 * behind in @s.
 */
static void nbd_clear_bdrvstate(BDRVNBDState *s)
{
    object_unref(OBJECT(s->tlscreds));
    s->tlscreds = NULL;
    qapi_free_SocketAddress(s->saddr);
    s->saddr = NULL;
    g_free(s->export);
    s->export = NULL;
    g_free(s->tlscredsid);
    s->tlscredsid = NULL;
    g_free(s->x_dirty_bitmap);
    s->x_dirty_bitmap = NULL;
}

114 115
/*
 * nbd_channel_error
 * Advance the client state machine after a failure on the channel.
 *
 * @ret == -EIO: a CONNECTED client switches to reconnecting, waiting
 *   (CONNECTING_WAIT) when a reconnect delay is configured and non-waiting
 *   (CONNECTING_NOWAIT) otherwise.  Any other state is left untouched.
 * any other @ret: the channel is shut down if the client was CONNECTED and
 *   the state unconditionally becomes QUIT.
 */
static void nbd_channel_error(BDRVNBDState *s, int ret)
{
    if (ret == -EIO) {
        if (s->state == NBD_CLIENT_CONNECTED) {
            s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
                                            NBD_CLIENT_CONNECTING_NOWAIT;
        }
    } else {
        if (s->state == NBD_CLIENT_CONNECTED) {
            qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        }
        s->state = NBD_CLIENT_QUIT;
    }
}

129
/*
 * nbd_recv_coroutines_wake_all
 * Wake every request coroutine that occupies a slot and is currently marked
 * as receiving, i.e. has yielded while waiting for its reply.
 */
static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
{
    int i;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        NBDClientRequest *req = &s->requests[i];

        if (req->coroutine && req->receiving) {
            aio_co_wake(req->coroutine);
        }
    }
}

/*
 * nbd_client_detach_aio_context
 * Detach the current I/O channel of @bs from its AioContext.
 */
static void nbd_client_detach_aio_context(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
}

static void nbd_client_attach_aio_context_bh(void *opaque)
{
    BlockDriverState *bs = opaque;
152
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
153 154 155 156 157 158 159

    /*
     * The node is still drained, so we know the coroutine has yielded in
     * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
     * entered for the first time. Both places are safe for entering the
     * coroutine.
     */
160
    qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
161 162 163 164 165 166
    bdrv_dec_in_flight(bs);
}

/*
 * nbd_client_attach_aio_context
 * Move the client to @new_context: re-attach the I/O channel (only while
 * connected) and re-enter connection_co from a bottom half running in
 * @new_context.
 *
 * An in-flight reference is taken here and released by the bottom half.
 */
static void nbd_client_attach_aio_context(BlockDriverState *bs,
                                          AioContext *new_context)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    /*
     * s->connection_co is either yielded from nbd_receive_reply or from
     * nbd_co_reconnect_loop()
     */
    if (s->state == NBD_CLIENT_CONNECTED) {
        qio_channel_attach_aio_context(QIO_CHANNEL(s->ioc), new_context);
    }

    bdrv_inc_in_flight(bs);

    /*
     * Need to wait here for the BH to run because the BH must run while the
     * node is still drained.
     */
    aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs);
}

186 187 188
/*
 * nbd_client_co_drain_begin
 * Mark the node as drained and, if connection_co is currently in its
 * wakeable reconnect sleep, wake it so that it notices the drain.
 */
static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    s->drained = true;
    if (s->connection_co_sleep_ns_state) {
        qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
    }
}

/*
 * nbd_client_co_drain_end
 * Clear the drained flag and, if connection_co parked itself waiting for the
 * end of the drained section, wake it up again.
 */
static void coroutine_fn nbd_client_co_drain_end(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    s->drained = false;
    if (s->wait_drained_end) {
        s->wait_drained_end = false;
        aio_co_wake(s->connection_co);
    }
}

207

208 209 210 211
/*
 * nbd_teardown_connection
 * Shut the channel down, move the client to the QUIT state and wait until
 * connection_co has terminated.
 *
 * When called from a coroutine the caller yields and is resumed by
 * connection_co itself; otherwise the function polls until
 * s->connection_co has been cleared.
 */
static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    if (s->ioc) {
        /* finish any pending coroutines */
        qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
    } else if (s->sioc) {
        /* abort negotiation */
        qio_channel_shutdown(QIO_CHANNEL(s->sioc), QIO_CHANNEL_SHUTDOWN_BOTH,
                             NULL);
    }

    s->state = NBD_CLIENT_QUIT;
    if (s->connection_co) {
        /* Cut a pending reconnect sleep short so the new state is seen */
        if (s->connection_co_sleep_ns_state) {
            qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
        }
    }
    if (qemu_in_coroutine()) {
        s->teardown_co = qemu_coroutine_self();
        /* connection_co resumes us when it terminates */
        qemu_coroutine_yield();
        s->teardown_co = NULL;
    } else {
        BDRV_POLL_WHILE(bs, s->connection_co);
    }
    assert(!s->connection_co);
}
237

238 239 240 241 242 243 244 245 246 247 248 249 250
/* Report whether the client is in either of the two reconnecting states. */
static bool nbd_client_connecting(BDRVNBDState *s)
{
    switch (s->state) {
    case NBD_CLIENT_CONNECTING_WAIT:
    case NBD_CLIENT_CONNECTING_NOWAIT:
        return true;
    default:
        return false;
    }
}

/* Report whether the client is reconnecting in the waiting state. */
static bool nbd_client_connecting_wait(BDRVNBDState *s)
{
    const bool waiting = (s->state == NBD_CLIENT_CONNECTING_WAIT);

    return waiting;
}

/*
 * nbd_reconnect_attempt
 * Perform one reconnect attempt; a no-op unless the client is in one of the
 * connecting states.
 *
 * The function first waits until no request is in flight, releases the
 * previous channel (if any), establishes a new socket connection and runs
 * the handshake.  The outcome is recorded in s->connect_status and
 * s->connect_err; on success the state becomes CONNECTED and all coroutines
 * queued on free_sema are restarted.
 *
 * The node's in-flight reference is dropped for the duration of the
 * handshake and only re-taken once the node is no longer drained.
 */
static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
{
    int ret;
    Error *local_err = NULL;
    QIOChannelSocket *sioc;

    if (!nbd_client_connecting(s)) {
        return;
    }

    /* Wait for completion of all in-flight requests */

    qemu_co_mutex_lock(&s->send_mutex);

    while (s->in_flight > 0) {
        qemu_co_mutex_unlock(&s->send_mutex);
        nbd_recv_coroutines_wake_all(s);
        s->wait_in_flight = true;
        qemu_coroutine_yield();
        s->wait_in_flight = false;
        qemu_co_mutex_lock(&s->send_mutex);
    }

    qemu_co_mutex_unlock(&s->send_mutex);

    /* The state may have changed while we were yielded above */
    if (!nbd_client_connecting(s)) {
        return;
    }

    /*
     * Now we are sure that nobody is accessing the channel, and no one will
     * try until we set the state to CONNECTED.
     */

    /* Finalize previous connection if any */
    if (s->ioc) {
        nbd_client_detach_aio_context(s->bs);
        object_unref(OBJECT(s->sioc));
        s->sioc = NULL;
        object_unref(OBJECT(s->ioc));
        s->ioc = NULL;
    }

    sioc = nbd_establish_connection(s->saddr, &local_err);
    if (!sioc) {
        ret = -ECONNREFUSED;
        goto out;
    }

    bdrv_dec_in_flight(s->bs);

    ret = nbd_client_handshake(s->bs, sioc, &local_err);

    if (s->drained) {
        s->wait_drained_end = true;
        while (s->drained) {
            /*
             * We may be entered once from nbd_client_attach_aio_context_bh
             * and then from nbd_client_co_drain_end. So here is a loop.
             */
            qemu_coroutine_yield();
        }
    }
    bdrv_inc_in_flight(s->bs);

out:
    s->connect_status = ret;
    error_free(s->connect_err);
    s->connect_err = NULL;
    error_propagate(&s->connect_err, local_err);

    if (ret >= 0) {
        /* successfully connected */
        s->state = NBD_CLIENT_CONNECTED;
        qemu_co_queue_restart_all(&s->free_sema);
    }
}

/*
 * nbd_co_reconnect_loop
 * Keep calling nbd_reconnect_attempt() until the client leaves the connecting
 * states.
 *
 * Between attempts the coroutine sleeps (wakeably) for a timeout that starts
 * at one second and doubles until it has reached 16 seconds.  Once
 * reconnect_delay seconds have elapsed since the loop started, a
 * CONNECTING_WAIT client is downgraded to CONNECTING_NOWAIT and the
 * coroutines queued on free_sema are restarted.
 */
static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s)
{
    uint64_t start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    uint64_t delay_ns = s->reconnect_delay * NANOSECONDS_PER_SECOND;
    uint64_t timeout = 1 * NANOSECONDS_PER_SECOND;
    uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND;

    nbd_reconnect_attempt(s);

    while (nbd_client_connecting(s)) {
        if (s->state == NBD_CLIENT_CONNECTING_WAIT &&
            qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time_ns > delay_ns)
        {
            s->state = NBD_CLIENT_CONNECTING_NOWAIT;
            qemu_co_queue_restart_all(&s->free_sema);
        }

        qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout,
                                  &s->connection_co_sleep_ns_state);
        if (s->drained) {
            /* Give up our in-flight reference while the node is drained */
            bdrv_dec_in_flight(s->bs);
            s->wait_drained_end = true;
            while (s->drained) {
                /*
                 * We may be entered once from nbd_client_attach_aio_context_bh
                 * and then from nbd_client_co_drain_end. So here is a loop.
                 */
                qemu_coroutine_yield();
            }
            bdrv_inc_in_flight(s->bs);
        }
        if (timeout < max_timeout) {
            timeout *= 2;
        }

        nbd_reconnect_attempt(s);
    }
}

static coroutine_fn void nbd_connection_entry(void *opaque)
{
368
    BDRVNBDState *s = opaque;
369 370 371 372
    uint64_t i;
    int ret = 0;
    Error *local_err = NULL;

373
    while (s->state != NBD_CLIENT_QUIT) {
374 375 376 377 378 379 380 381 382
        /*
         * The NBD client can only really be considered idle when it has
         * yielded from qio_channel_readv_all_eof(), waiting for data. This is
         * the point where the additional scheduled coroutine entry happens
         * after nbd_client_attach_aio_context().
         *
         * Therefore we keep an additional in_flight reference all the time and
         * only drop it temporarily here.
         */
383 384 385 386 387 388 389 390 391

        if (nbd_client_connecting(s)) {
            nbd_co_reconnect_loop(s);
        }

        if (s->state != NBD_CLIENT_CONNECTED) {
            continue;
        }

392 393 394 395 396 397
        assert(s->reply.handle == 0);
        ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, &local_err);

        if (local_err) {
            trace_nbd_read_reply_entry_fail(ret, error_get_pretty(local_err));
            error_free(local_err);
398
            local_err = NULL;
399 400
        }
        if (ret <= 0) {
401
            nbd_channel_error(s, ret ? ret : -EIO);
402
            continue;
403 404 405 406 407 408 409 410 411 412 413 414 415
        }

        /*
         * There's no need for a mutex on the receive side, because the
         * handler acts as a synchronization point and ensures that only
         * one coroutine is called until the reply finishes.
         */
        i = HANDLE_TO_INDEX(s, s->reply.handle);
        if (i >= MAX_NBD_REQUESTS ||
            !s->requests[i].coroutine ||
            !s->requests[i].receiving ||
            (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply))
        {
416
            nbd_channel_error(s, -EINVAL);
417
            continue;
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
        }

        /*
         * We're woken up again by the request itself.  Note that there
         * is no race between yielding and reentering connection_co.  This
         * is because:
         *
         * - if the request runs on the same AioContext, it is only
         *   entered after we yield
         *
         * - if the request runs on a different AioContext, reentering
         *   connection_co happens through a bottom half, which can only
         *   run after we yield.
         */
        aio_co_wake(s->requests[i].coroutine);
        qemu_coroutine_yield();
    }

436
    qemu_co_queue_restart_all(&s->free_sema);
437 438 439 440
    nbd_recv_coroutines_wake_all(s);
    bdrv_dec_in_flight(s->bs);

    s->connection_co = NULL;
441 442 443 444 445 446 447 448
    if (s->ioc) {
        nbd_client_detach_aio_context(s->bs);
        object_unref(OBJECT(s->sioc));
        s->sioc = NULL;
        object_unref(OBJECT(s->ioc));
        s->ioc = NULL;
    }

M
Max Reitz 已提交
449 450 451
    if (s->teardown_co) {
        aio_co_wake(s->teardown_co);
    }
452 453 454 455 456 457 458
    aio_wait_kick();
}

/*
 * nbd_co_send_request
 * Reserve a slot in s->requests[], store the matching handle in @request and
 * send the request header, followed by the data in @qiov when @qiov is not
 * NULL.
 *
 * The caller waits on free_sema while all slots are taken or while the client
 * is in the CONNECTING_WAIT state.
 *
 * Returns a non-negative value on success.  Returns -EIO when the client is
 * not connected or the payload could not be written, or the negative value
 * returned by nbd_send_request().  On failure the error is passed to
 * nbd_channel_error(), the slot is released again and either connection_co
 * (when it waits for in_flight to reach zero) or the next waiter on free_sema
 * is woken.
 */
static int nbd_co_send_request(BlockDriverState *bs,
                               NBDRequest *request,
                               QEMUIOVector *qiov)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int rc, i = -1;

    qemu_co_mutex_lock(&s->send_mutex);
    while (s->in_flight == MAX_NBD_REQUESTS || nbd_client_connecting_wait(s)) {
        qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
    }

    if (s->state != NBD_CLIENT_CONNECTED) {
        rc = -EIO;
        goto err;
    }

    s->in_flight++;

    /* Find a free slot; one must exist because in_flight was below the max */
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->requests[i].coroutine == NULL) {
            break;
        }
    }

    g_assert(qemu_in_coroutine());
    assert(i < MAX_NBD_REQUESTS);

    s->requests[i].coroutine = qemu_coroutine_self();
    s->requests[i].offset = request->from;
    s->requests[i].receiving = false;

    request->handle = INDEX_TO_HANDLE(s, i);

    assert(s->ioc);

    if (qiov) {
        /* Cork the channel while header and payload are being written */
        qio_channel_set_cork(s->ioc, true);
        rc = nbd_send_request(s->ioc, request);
        if (rc >= 0 && s->state == NBD_CLIENT_CONNECTED) {
            if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
                                       NULL) < 0) {
                rc = -EIO;
            }
        } else if (rc >= 0) {
            rc = -EIO;
        }
        qio_channel_set_cork(s->ioc, false);
    } else {
        rc = nbd_send_request(s->ioc, request);
    }

err:
    if (rc < 0) {
        nbd_channel_error(s, rc);
        /* i is still -1 when we failed before reserving a slot */
        if (i != -1) {
            s->requests[i].coroutine = NULL;
            s->in_flight--;
        }
        if (s->in_flight == 0 && s->wait_in_flight) {
            aio_co_wake(s->connection_co);
        } else {
            qemu_co_queue_next(&s->free_sema);
        }
    }
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
}

/*
 * Load the 16-bit value at *@payload with lduw_be_p() and advance *@payload
 * past it.
 */
static inline uint16_t payload_advance16(uint8_t **payload)
{
    uint8_t *field = *payload;

    *payload = field + 2;
    return lduw_be_p(field);
}

/*
 * Load the 32-bit value at *@payload with ldl_be_p() and advance *@payload
 * past it.
 */
static inline uint32_t payload_advance32(uint8_t **payload)
{
    uint8_t *field = *payload;

    *payload = field + 4;
    return ldl_be_p(field);
}

/*
 * Load the 64-bit value at *@payload with ldq_be_p() and advance *@payload
 * past it.
 */
static inline uint64_t payload_advance64(uint8_t **payload)
{
    uint8_t *field = *payload;

    *payload = field + 8;
    return ldq_be_p(field);
}

542
static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_offset,
                                         QEMUIOVector *qiov, Error **errp)
{
    uint64_t offset;
    uint32_t hole_size;

    if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_HOLE");
        return -EINVAL;
    }

    offset = payload_advance64(&payload);
    hole_size = payload_advance32(&payload);

    if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
        offset > orig_offset + qiov->size - hole_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
565 566
    if (s->info.min_block &&
        !QEMU_IS_ALIGNED(hole_size, s->info.min_block)) {
567 568 569 570 571 572 573 574 575 576 577 578 579
        trace_nbd_structured_read_compliance("hole");
    }

    qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);

    return 0;
}

/*
 * nbd_parse_blockstatus_payload
 * Based on our request, we expect only one extent in reply, for the
 * base:allocation context.
 */
580
static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
581 582 583 584 585 586 587 588 589 590 591 592 593 594
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_length,
                                         NBDExtent *extent, Error **errp)
{
    uint32_t context_id;

    /* The server succeeded, so it must have sent [at least] one extent */
    if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS");
        return -EINVAL;
    }

    context_id = payload_advance32(&payload);
595
    if (s->info.context_id != context_id) {
596 597 598
        error_setg(errp, "Protocol error: unexpected context id %d for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS, when negotiated context "
                         "id is %d", context_id,
599
                         s->info.context_id);
600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623
        return -EINVAL;
    }

    extent->length = payload_advance32(&payload);
    extent->flags = payload_advance32(&payload);

    if (extent->length == 0) {
        error_setg(errp, "Protocol error: server sent status chunk with "
                   "zero length");
        return -EINVAL;
    }

    /*
     * A server sending unaligned block status is in violation of the
     * protocol, but as qemu-nbd 3.1 is such a server (at least for
     * POSIX files that are not a multiple of 512 bytes, since qemu
     * rounds files up to 512-byte multiples but lseek(SEEK_HOLE)
     * still sees an implicit hole beyond the real EOF), it's nicer to
     * work around the misbehaving server. If the request included
     * more than the final unaligned block, truncate it back to an
     * aligned result; if the request was only the final block, round
     * up to the full block and change the status to fully-allocated
     * (always a safe status, even if it loses information).
     */
624 625
    if (s->info.min_block && !QEMU_IS_ALIGNED(extent->length,
                                                   s->info.min_block)) {
626
        trace_nbd_parse_blockstatus_compliance("extent length is unaligned");
627
        if (extent->length > s->info.min_block) {
628
            extent->length = QEMU_ALIGN_DOWN(extent->length,
629
                                             s->info.min_block);
630
        } else {
631
            extent->length = s->info.min_block;
632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696
            extent->flags = 0;
        }
    }

    /*
     * We used NBD_CMD_FLAG_REQ_ONE, so the server should not have
     * sent us any more than one extent, nor should it have included
     * status beyond our request in that extent. However, it's easy
     * enough to ignore the server's noncompliance without killing the
     * connection; just ignore trailing extents, and clamp things to
     * the length of our request.
     */
    if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
        trace_nbd_parse_blockstatus_compliance("more than one extent");
    }
    if (extent->length > orig_length) {
        extent->length = orig_length;
        trace_nbd_parse_blockstatus_compliance("extent length too large");
    }

    return 0;
}

/*
 * nbd_parse_error_payload
 * Decode the payload of a structured error chunk: a 32-bit NBD error code
 * followed by a 16-bit message length.
 *
 * On success 0 is returned and *@request_ret holds the negated system errno
 * that corresponds to the error sent by the server.  On a malformed payload
 * -EINVAL is returned and @errp describes the protocol violation.
 */
static int nbd_parse_error_payload(NBDStructuredReplyChunk *chunk,
                                   uint8_t *payload, int *request_ret,
                                   Error **errp)
{
    /* Fixed part of the payload: error code plus message length */
    const size_t header_len = sizeof(uint32_t) + sizeof(uint16_t);
    uint32_t sys_errno;
    uint16_t msg_len;

    assert(chunk->type & (1 << 15));

    if (chunk->length < header_len) {
        error_setg(errp,
                   "Protocol error: invalid payload for structured error");
        return -EINVAL;
    }

    sys_errno = nbd_errno_to_system_errno(payload_advance32(&payload));
    if (!sys_errno) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with error = 0");
        return -EINVAL;
    }
    *request_ret = -sys_errno;

    /* The announced message must fit into what is left of the chunk */
    msg_len = payload_advance16(&payload);
    if (msg_len > chunk->length - header_len) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with incorrect message size");
        return -EINVAL;
    }

    /* TODO: Add a trace point to mention the server complaint */

    /* TODO handle ERROR_OFFSET */

    return 0;
}

697
/*
 * nbd_co_receive_offset_data_payload
 * Read the payload of the NBD_REPLY_TYPE_OFFSET_DATA chunk described by
 * s->reply from the channel: first the 64-bit offset, then the data, which
 * is stored directly into the matching part of @qiov.
 *
 * @orig_offset: offset of the request that @qiov belongs to
 *
 * Returns 0 on success, -EINVAL (with @errp set) for a payload that is too
 * short or lies outside the requested region, and -EIO when reading from the
 * channel fails.  A data size that is not a multiple of the negotiated
 * minimum block size is only traced.
 */
static int nbd_co_receive_offset_data_payload(BDRVNBDState *s,
                                              uint64_t orig_offset,
                                              QEMUIOVector *qiov, Error **errp)
{
    QEMUIOVector sub_qiov;
    uint64_t offset;
    size_t data_size;
    int ret;
    NBDStructuredReplyChunk *chunk = &s->reply.structured;

    assert(nbd_reply_is_structured(&s->reply));

    /* The NBD spec requires at least one byte of payload */
    if (chunk->length <= sizeof(offset)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_DATA");
        return -EINVAL;
    }

    if (nbd_read64(s->ioc, &offset, "OFFSET_DATA offset", errp) < 0) {
        return -EIO;
    }

    data_size = chunk->length - sizeof(offset);
    assert(data_size);
    /*
     * data_size is compared against qiov->size first, so the subtraction in
     * the last condition cannot wrap around.
     */
    if (offset < orig_offset || data_size > qiov->size ||
        offset > orig_offset + qiov->size - data_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
    if (s->info.min_block && !QEMU_IS_ALIGNED(data_size, s->info.min_block)) {
        trace_nbd_structured_read_compliance("data");
    }

    /* Build a temporary vector covering only the destination range */
    qemu_iovec_init(&sub_qiov, qiov->niov);
    qemu_iovec_concat(&sub_qiov, qiov, offset - orig_offset, data_size);
    ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, errp);
    qemu_iovec_destroy(&sub_qiov);

    return ret < 0 ? -EIO : 0;
}

#define NBD_MAX_MALLOC_PAYLOAD 1000
static coroutine_fn int nbd_co_receive_structured_payload(
742
        BDRVNBDState *s, void **payload, Error **errp)
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
{
    int ret;
    uint32_t len;

    assert(nbd_reply_is_structured(&s->reply));

    len = s->reply.structured.length;

    if (len == 0) {
        return 0;
    }

    if (payload == NULL) {
        error_setg(errp, "Unexpected structured payload");
        return -EINVAL;
    }

    if (len > NBD_MAX_MALLOC_PAYLOAD) {
        error_setg(errp, "Payload too large");
        return -EINVAL;
    }

    *payload = g_new(char, len);
    ret = nbd_read(s->ioc, *payload, len, "structured payload", errp);
    if (ret < 0) {
        g_free(*payload);
        *payload = NULL;
        return ret;
    }

    return 0;
}

/*
 * nbd_co_do_receive_one_chunk
 * for simple reply:
 *   set request_ret to received reply error
 *   if qiov is not NULL: read payload to @qiov
 * for structured reply chunk:
 *   if error chunk: read payload, set @request_ret, do not set @payload
 *   else if offset_data chunk: read payload data to @qiov, do not set @payload
 *   else: read payload to @payload
 *
 * If function fails, @errp contains corresponding error message, and the
 * connection with the server is suspect.  If it returns 0, then the
 * transaction succeeded (although @request_ret may be a negative errno
 * corresponding to the server's error reply), and errp is unchanged.
 */
static coroutine_fn int nbd_co_do_receive_one_chunk(
792
        BDRVNBDState *s, uint64_t handle, bool only_structured,
793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808
        int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
{
    int ret;
    int i = HANDLE_TO_INDEX(s, handle);
    void *local_payload = NULL;
    NBDStructuredReplyChunk *chunk;

    if (payload) {
        *payload = NULL;
    }
    *request_ret = 0;

    /* Wait until we're woken up by nbd_connection_entry.  */
    s->requests[i].receiving = true;
    qemu_coroutine_yield();
    s->requests[i].receiving = false;
809
    if (s->state != NBD_CLIENT_CONNECTED) {
810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884
        error_setg(errp, "Connection closed");
        return -EIO;
    }
    assert(s->ioc);

    assert(s->reply.handle == handle);

    if (nbd_reply_is_simple(&s->reply)) {
        if (only_structured) {
            error_setg(errp, "Protocol error: simple reply when structured "
                             "reply chunk was expected");
            return -EINVAL;
        }

        *request_ret = -nbd_errno_to_system_errno(s->reply.simple.error);
        if (*request_ret < 0 || !qiov) {
            return 0;
        }

        return qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
                                     errp) < 0 ? -EIO : 0;
    }

    /* handle structured reply chunk */
    assert(s->info.structured_reply);
    chunk = &s->reply.structured;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        if (!(chunk->flags & NBD_REPLY_FLAG_DONE)) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk without"
                       " NBD_REPLY_FLAG_DONE flag set");
            return -EINVAL;
        }
        if (chunk->length) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk with"
                       " nonzero length");
            return -EINVAL;
        }
        return 0;
    }

    if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
        if (!qiov) {
            error_setg(errp, "Unexpected NBD_REPLY_TYPE_OFFSET_DATA chunk");
            return -EINVAL;
        }

        return nbd_co_receive_offset_data_payload(s, s->requests[i].offset,
                                                  qiov, errp);
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        payload = &local_payload;
    }

    ret = nbd_co_receive_structured_payload(s, payload, errp);
    if (ret < 0) {
        return ret;
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        ret = nbd_parse_error_payload(chunk, local_payload, request_ret, errp);
        g_free(local_payload);
        return ret;
    }

    return 0;
}

/*
 * nbd_co_receive_one_chunk
 * Read reply, wake up connection_co and set s->quit if needed.
 * Return value is a fatal error code or normal nbd reply error code
 */
static coroutine_fn int nbd_co_receive_one_chunk(
885
        BDRVNBDState *s, uint64_t handle, bool only_structured,
886 887 888 889 890 891 892
        int *request_ret, QEMUIOVector *qiov, NBDReply *reply, void **payload,
        Error **errp)
{
    int ret = nbd_co_do_receive_one_chunk(s, handle, only_structured,
                                          request_ret, qiov, payload, errp);

    if (ret < 0) {
E
Eric Blake 已提交
893
        memset(reply, 0, sizeof(*reply));
894
        nbd_channel_error(s, ret);
895 896
    } else {
        /* For assert at loop start in nbd_connection_entry */
E
Eric Blake 已提交
897
        *reply = s->reply;
898
    }
899
    s->reply.handle = 0;
900

901 902 903 904 905 906
    if (s->connection_co && !s->wait_in_flight) {
        /*
         * We must check s->wait_in_flight, because we may entered by
         * nbd_recv_coroutines_wake_all(), in this case we should not
         * wake connection_co here, it will woken by last request.
         */
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922
        aio_co_wake(s->connection_co);
    }

    return ret;
}

/*
 * Iteration state of NBD_FOREACH_REPLY_CHUNK.
 */
typedef struct NBDReplyChunkIter {
    int ret;            /* first channel error, see nbd_iter_channel_error() */
    int request_ret;    /* first request error, see nbd_iter_request_error() */
    Error *err;         /* error object that belongs to @ret */
    /*
     * done: the previous iteration was the last one
     * only_structured: initialised from the macro's @structured argument
     */
    bool done, only_structured;
} NBDReplyChunkIter;

/*
 * Record a channel-level failure in @iter.
 *
 * Only the first failure is kept: if @iter already holds an error code the
 * new Error object is freed, otherwise @ret is stored and *@local_err is
 * propagated into iter->err.  *@local_err is always reset to NULL, so the
 * caller can reuse the variable.
 *
 * @ret must be negative and *@local_err must be set.
 */
static void nbd_iter_channel_error(NBDReplyChunkIter *iter,
                                   int ret, Error **local_err)
{
    assert(local_err && *local_err);
    assert(ret < 0);

    if (iter->ret) {
        /* An earlier failure is already recorded; drop this one. */
        error_free(*local_err);
    } else {
        iter->ret = ret;
        error_propagate(&iter->err, *local_err);
    }

    *local_err = NULL;
}

/*
 * Record a request-level error code in @iter.  Only the first such code is
 * kept; later ones are ignored.  @ret must be negative.
 */
static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
{
    assert(ret < 0);

    if (iter->request_ret) {
        return;
    }
    iter->request_ret = ret;
}

/*
 * NBD_FOREACH_REPLY_CHUNK
 * Loop construct for consuming the reply chunks of one request.  @iter is
 * reset to a fresh NBDReplyChunkIter whose only_structured field is
 * @structured; nbd_reply_chunk_iter_receive() is then called before every
 * pass and the loop body runs for as long as it returns true.
 * The pointer stored in @payload requires g_free() to free it.
 */
#define NBD_FOREACH_REPLY_CHUNK(s, iter, handle, structured, \
                                qiov, reply, payload) \
    for (iter = (NBDReplyChunkIter) { .only_structured = structured }; \
         nbd_reply_chunk_iter_receive(s, &iter, handle, qiov, reply, payload);)

/*
 * nbd_reply_chunk_iter_receive
 * The pointer stored in @payload requires g_free() to free it.
 */
958
static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s,
959 960 961 962 963 964 965 966 967
                                         NBDReplyChunkIter *iter,
                                         uint64_t handle,
                                         QEMUIOVector *qiov, NBDReply *reply,
                                         void **payload)
{
    int ret, request_ret;
    NBDReply local_reply;
    NBDStructuredReplyChunk *chunk;
    Error *local_err = NULL;
968
    if (s->state != NBD_CLIENT_CONNECTED) {
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992
        error_setg(&local_err, "Connection closed");
        nbd_iter_channel_error(iter, -EIO, &local_err);
        goto break_loop;
    }

    if (iter->done) {
        /* Previous iteration was last. */
        goto break_loop;
    }

    if (reply == NULL) {
        reply = &local_reply;
    }

    ret = nbd_co_receive_one_chunk(s, handle, iter->only_structured,
                                   &request_ret, qiov, reply, payload,
                                   &local_err);
    if (ret < 0) {
        nbd_iter_channel_error(iter, ret, &local_err);
    } else if (request_ret < 0) {
        nbd_iter_request_error(iter, request_ret);
    }

    /* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */
993
    if (nbd_reply_is_simple(reply) || s->state != NBD_CLIENT_CONNECTED) {
994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
        goto break_loop;
    }

    chunk = &reply->structured;
    iter->only_structured = true;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        /* NBD_REPLY_FLAG_DONE is already checked in nbd_co_receive_one_chunk */
        assert(chunk->flags & NBD_REPLY_FLAG_DONE);
        goto break_loop;
    }

    if (chunk->flags & NBD_REPLY_FLAG_DONE) {
        /* This iteration is last. */
        iter->done = true;
    }

    /* Execute the loop body */
    return true;

break_loop:
    s->requests[HANDLE_TO_INDEX(s, handle)].coroutine = NULL;

    qemu_co_mutex_lock(&s->send_mutex);
    s->in_flight--;
1019 1020 1021 1022 1023
    if (s->in_flight == 0 && s->wait_in_flight) {
        aio_co_wake(s->connection_co);
    } else {
        qemu_co_queue_next(&s->free_sema);
    }
1024 1025 1026 1027 1028
    qemu_co_mutex_unlock(&s->send_mutex);

    return false;
}

1029
/*
 * Consume the whole reply for @handle without looking at any payload.
 *
 * The request-level result is stored in *@request_ret, the channel-level
 * error object (if any) is propagated to @errp, and the channel-level
 * result code is returned.
 */
static int nbd_co_receive_return_code(BDRVNBDState *s, uint64_t handle,
                                      int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
        /* nbd_reply_chunk_iter_receive does all the work */
    }

    *request_ret = iter.request_ret;
    error_propagate(errp, iter.err);

    return iter.ret;
}

1043
/*
 * Receive the reply to an NBD_CMD_READ request identified by @handle.
 *
 * OFFSET_DATA chunks need no work here (their data is already in @qiov),
 * OFFSET_HOLE chunks are handed to nbd_parse_offset_hole_payload(), error
 * chunks are left to the iterator, and any other chunk type is treated as a
 * channel error.
 *
 * The request-level result is stored in *@request_ret, the channel-level
 * error object is propagated to @errp, and the channel-level result code is
 * returned.
 */
static int nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t handle,
                                        uint64_t offset, QEMUIOVector *qiov,
                                        int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *err = NULL;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, s->info.structured_reply,
                            qiov, &reply, &payload)
    {
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
            /*
             * special cased in nbd_co_receive_one_chunk, data is already
             * in qiov
             */
        } else if (chunk->type == NBD_REPLY_TYPE_OFFSET_HOLE) {
            int rc = nbd_parse_offset_hole_payload(s, chunk, payload,
                                                   offset, qiov, &err);

            if (rc < 0) {
                nbd_channel_error(s, rc);
                nbd_iter_channel_error(&iter, rc, &err);
            }
        } else if (!nbd_reply_type_is_error(chunk->type)) {
            /* not allowed reply type */
            nbd_channel_error(s, -EINVAL);
            error_setg(&err,
                       "Unexpected reply type: %d (%s) for CMD_READ",
                       chunk->type, nbd_reply_type_lookup(chunk->type));
            nbd_iter_channel_error(&iter, -EINVAL, &err);
        }

        /* The payload of each chunk is owned by this loop. */
        g_free(payload);
        payload = NULL;
    }

    *request_ret = iter.request_ret;
    error_propagate(errp, iter.err);

    return iter.ret;
}

1095
/*
 * Receive the reply to an NBD_CMD_BLOCK_STATUS request identified by
 * @handle and fill in @extent, which must have a zero length on entry.
 *
 * At most one BLOCK_STATUS chunk is accepted; a second one, or any chunk
 * type other than BLOCK_STATUS or an error chunk, is a channel error.  A
 * reply that leaves @extent empty without reporting a request error is a
 * channel error (-EIO) as well.
 *
 * The request-level result is stored in *@request_ret, the channel-level
 * error object is propagated to @errp, and the channel-level result code is
 * returned.
 */
static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
                                            uint64_t handle, uint64_t length,
                                            NBDExtent *extent,
                                            int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *err = NULL;
    bool seen_status = false;

    assert(!extent->length);
    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        if (chunk->type == NBD_REPLY_TYPE_BLOCK_STATUS) {
            int rc;

            if (seen_status) {
                nbd_channel_error(s, -EINVAL);
                error_setg(&err, "Several BLOCK_STATUS chunks in reply");
                nbd_iter_channel_error(&iter, -EINVAL, &err);
            }
            seen_status = true;

            /* The payload is parsed even if a duplicate was just flagged. */
            rc = nbd_parse_blockstatus_payload(s, chunk, payload, length,
                                               extent, &err);
            if (rc < 0) {
                nbd_channel_error(s, rc);
                nbd_iter_channel_error(&iter, rc, &err);
            }
        } else if (!nbd_reply_type_is_error(chunk->type)) {
            nbd_channel_error(s, -EINVAL);
            error_setg(&err,
                       "Unexpected reply type: %d (%s) "
                       "for CMD_BLOCK_STATUS",
                       chunk->type, nbd_reply_type_lookup(chunk->type));
            nbd_iter_channel_error(&iter, -EINVAL, &err);
        }

        /* The payload of each chunk is owned by this loop. */
        g_free(payload);
        payload = NULL;
    }

    if (!extent->length && !iter.request_ret) {
        error_setg(&err, "Server did not reply with any status extents");
        nbd_iter_channel_error(&iter, -EIO, &err);
    }

    *request_ret = iter.request_ret;
    error_propagate(errp, iter.err);

    return iter.ret;
}

/*
 * Send @request (any command except NBD_CMD_READ) and wait for its return
 * code.  @write_qiov must be given for NBD_CMD_WRITE, with a size equal to
 * request->len, and must be NULL for every other command.
 *
 * The send/receive cycle is repeated for as long as it fails and
 * nbd_client_connecting_wait() returns true.  Receive errors are traced and
 * then discarded.
 *
 * Returns the channel-level error if there is one, otherwise the
 * request-level result.
 */
static int nbd_co_request(BlockDriverState *bs, NBDRequest *request,
                          QEMUIOVector *write_qiov)
{
    BDRVNBDState *s = bs->opaque;
    Error *err = NULL;
    int ret, request_ret;

    assert(request->type != NBD_CMD_READ);
    if (write_qiov) {
        assert(request->type == NBD_CMD_WRITE);
        assert(request->len == iov_size(write_qiov->iov, write_qiov->niov));
    } else {
        assert(request->type != NBD_CMD_WRITE);
    }

    do {
        ret = nbd_co_send_request(bs, request, write_qiov);
        if (ret >= 0) {
            ret = nbd_co_receive_return_code(s, request->handle,
                                             &request_ret, &err);
            if (err) {
                trace_nbd_co_request_fail(request->from, request->len,
                                          request->handle, request->flags,
                                          request->type,
                                          nbd_cmd_lookup(request->type),
                                          ret, error_get_pretty(err));
                error_free(err);
                err = NULL;
            }
        }
    } while (ret < 0 && nbd_client_connecting_wait(s));

    if (ret) {
        return ret;
    }
    return request_ret;
}

/*
 * Read @bytes bytes at @offset from the export into @qiov.
 *
 * @flags must be 0 and @bytes must not exceed NBD_MAX_BUFFER_SIZE.  A
 * zero-length read succeeds immediately.
 *
 * The send/receive cycle is repeated for as long as it fails and
 * nbd_client_connecting_wait() returns true.  Receive errors are traced and
 * then discarded.
 *
 * Returns the channel-level error if there is one, otherwise the
 * request-level result.
 */
static int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
                                uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BDRVNBDState *s = bs->opaque;
    Error *err = NULL;
    int ret, request_ret;
    NBDRequest request = {
        .type = NBD_CMD_READ,
        .from = offset,
        .len = bytes,
    };

    assert(bytes <= NBD_MAX_BUFFER_SIZE);
    assert(!flags);

    if (bytes == 0) {
        return 0;
    }

    /*
     * Work around the fact that the block layer doesn't do
     * byte-accurate sizing yet - if the read exceeds the server's
     * advertised size because the block layer rounded size up, then
     * truncate the request to the server and tail-pad with zero.
     */
    if (offset >= s->info.size) {
        /* Entirely past the end of the export: nothing to ask the server. */
        assert(bytes < BDRV_SECTOR_SIZE);
        qemu_iovec_memset(qiov, 0, 0, bytes);
        return 0;
    }
    if (offset + bytes > s->info.size) {
        uint64_t tail = offset + bytes - s->info.size;

        assert(tail < BDRV_SECTOR_SIZE);
        qemu_iovec_memset(qiov, bytes - tail, 0, tail);
        request.len -= tail;
    }

    do {
        ret = nbd_co_send_request(bs, &request, NULL);
        if (ret >= 0) {
            ret = nbd_co_receive_cmdread_reply(s, request.handle, offset,
                                               qiov, &request_ret, &err);
            if (err) {
                trace_nbd_co_request_fail(request.from, request.len,
                                          request.handle, request.flags,
                                          request.type,
                                          nbd_cmd_lookup(request.type),
                                          ret, error_get_pretty(err));
                error_free(err);
                err = NULL;
            }
        }
    } while (ret < 0 && nbd_client_connecting_wait(s));

    if (ret) {
        return ret;
    }
    return request_ret;
}

/*
 * Write @bytes bytes from @qiov to the export at @offset.
 *
 * The export must not be read-only, and BDRV_REQ_FUA may only be passed
 * when the server advertised NBD_FLAG_SEND_FUA (both are asserted).  A
 * zero-length write succeeds without contacting the server.
 *
 * Returns the result of nbd_co_request().
 */
static int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                 uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BDRVNBDState *s = bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_WRITE,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
    if (flags & BDRV_REQ_FUA) {
        assert(s->info.flags & NBD_FLAG_SEND_FUA);
        request.flags |= NBD_CMD_FLAG_FUA;
    }

    assert(bytes <= NBD_MAX_BUFFER_SIZE);

    return bytes ? nbd_co_request(bs, &request, qiov) : 0;
}

/*
 * Zero @bytes bytes of the export starting at @offset using
 * NBD_CMD_WRITE_ZEROES.
 *
 * Returns -ENOTSUP when the server did not advertise
 * NBD_FLAG_SEND_WRITE_ZEROES.  Block-layer flags are translated as follows:
 *   BDRV_REQ_FUA                -> NBD_CMD_FLAG_FUA
 *   absence of BDRV_REQ_MAY_UNMAP -> NBD_CMD_FLAG_NO_HOLE
 *   BDRV_REQ_NO_FALLBACK        -> NBD_CMD_FLAG_FAST_ZERO
 * A zero-length request succeeds without contacting the server; otherwise
 * the result of nbd_co_request() is returned.
 */
static int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    BDRVNBDState *s = bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_WRITE_ZEROES,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
    if (!(s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
        return -ENOTSUP;
    }

    if (flags & BDRV_REQ_FUA) {
        assert(s->info.flags & NBD_FLAG_SEND_FUA);
        request.flags |= NBD_CMD_FLAG_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        request.flags |= NBD_CMD_FLAG_NO_HOLE;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        assert(s->info.flags & NBD_FLAG_SEND_FAST_ZERO);
        request.flags |= NBD_CMD_FLAG_FAST_ZERO;
    }

    return bytes ? nbd_co_request(bs, &request, NULL) : 0;
}

/*
 * Flush the export with NBD_CMD_FLUSH (offset and length both 0).
 *
 * Returns 0 without contacting the server when it did not advertise
 * NBD_FLAG_SEND_FLUSH, otherwise the result of nbd_co_request().
 */
static int nbd_client_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_FLUSH,
        .from = 0,
        .len = 0,
    };

    if (s->info.flags & NBD_FLAG_SEND_FLUSH) {
        return nbd_co_request(bs, &request, NULL);
    }

    return 0;
}

/*
 * Discard @bytes bytes of the export starting at @offset with NBD_CMD_TRIM.
 *
 * The export must not be read-only (asserted).  Returns 0 without
 * contacting the server when it did not advertise NBD_FLAG_SEND_TRIM or
 * when @bytes is 0, otherwise the result of nbd_co_request().
 */
static int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BDRVNBDState *s = bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_TRIM,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));

    if ((s->info.flags & NBD_FLAG_SEND_TRIM) && bytes) {
        return nbd_co_request(bs, &request, NULL);
    }

    return 0;
}

/*
 * Query the allocation status of the export at @offset.
 *
 * Without a negotiated base:allocation context the whole range is reported
 * as data.  Otherwise a single-extent NBD_CMD_BLOCK_STATUS request
 * (NBD_CMD_FLAG_REQ_ONE) is sent and the returned extent is translated into
 * BDRV_BLOCK_* flags.
 *
 * The send/receive cycle is repeated for as long as it fails and
 * nbd_client_connecting_wait() returns true.  Receive errors are traced and
 * then discarded.
 *
 * On success *@pnum holds the number of bytes described and the BDRV_BLOCK_*
 * flags are returned; *@map and *@file are set except for the hole past the
 * end of the export.  On failure the channel-level error is returned if
 * there is one, otherwise the request-level error.
 */
static int coroutine_fn nbd_client_co_block_status(
        BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
        int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    int ret, request_ret;
    NBDExtent extent = { 0 };
    BDRVNBDState *s = bs->opaque;
    Error *err = NULL;

    NBDRequest request = {
        .type = NBD_CMD_BLOCK_STATUS,
        .from = offset,
        .len = MIN(QEMU_ALIGN_DOWN(INT_MAX, bs->bl.request_alignment),
                   MIN(bytes, s->info.size - offset)),
        .flags = NBD_CMD_FLAG_REQ_ONE,
    };

    if (!s->info.base_allocation) {
        *pnum = bytes;
        *map = offset;
        *file = bs;
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

    /*
     * Work around the fact that the block layer doesn't do
     * byte-accurate sizing yet - if the status request exceeds the
     * server's advertised size because the block layer rounded size
     * up, we truncated the request to the server (above), or are
     * called on just the hole.
     */
    if (offset >= s->info.size) {
        *pnum = bytes;
        assert(bytes < BDRV_SECTOR_SIZE);
        /* Intentionally don't report offset_valid for the hole */
        return BDRV_BLOCK_ZERO;
    }

    if (s->info.min_block) {
        assert(QEMU_IS_ALIGNED(request.len, s->info.min_block));
    }

    do {
        ret = nbd_co_send_request(bs, &request, NULL);
        if (ret >= 0) {
            ret = nbd_co_receive_blockstatus_reply(s, request.handle, bytes,
                                                   &extent, &request_ret,
                                                   &err);
            if (err) {
                trace_nbd_co_request_fail(request.from, request.len,
                                          request.handle, request.flags,
                                          request.type,
                                          nbd_cmd_lookup(request.type),
                                          ret, error_get_pretty(err));
                error_free(err);
                err = NULL;
            }
        }
    } while (ret < 0 && nbd_client_connecting_wait(s));

    if (ret < 0 || request_ret < 0) {
        return ret ? ret : request_ret;
    }

    assert(extent.length);
    *pnum = extent.length;
    *map = offset;
    *file = bs;

    /* Translate the NBD extent flags into block-layer status bits. */
    ret = BDRV_BLOCK_OFFSET_VALID;
    if (!(extent.flags & NBD_STATE_HOLE)) {
        ret |= BDRV_BLOCK_DATA;
    }
    if (extent.flags & NBD_STATE_ZERO) {
        ret |= BDRV_BLOCK_ZERO;
    }
    return ret;
}

1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424
/*
 * Reject a reopen that asks for read/write access (BDRV_O_RDWR) while the
 * server flagged the export as NBD_FLAG_READ_ONLY.
 *
 * Returns 0 when the reopen is acceptable, -EACCES (with @errp set)
 * otherwise.
 */
static int nbd_client_reopen_prepare(BDRVReopenState *state,
                                     BlockReopenQueue *queue, Error **errp)
{
    BDRVNBDState *s = state->bs->opaque;
    bool wants_rw = state->flags & BDRV_O_RDWR;
    bool export_ro = s->info.flags & NBD_FLAG_READ_ONLY;

    if (!wants_rw || !export_ro) {
        return 0;
    }

    error_setg(errp, "Can't reopen read-only NBD mount as read/write");
    return -EACCES;
}

1425 1426
/*
 * Close the client: send NBD_CMD_DISC if a channel is still present, then
 * tear the connection down.  The result of sending the disconnect request
 * is not checked.
 */
static void nbd_client_close(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;

    if (s->ioc) {
        NBDRequest disc = { .type = NBD_CMD_DISC };

        nbd_send_request(s->ioc, &disc);
    }

    nbd_teardown_connection(bs);
}

/*
 * Create a socket channel named "nbd-client" and connect it synchronously
 * to @saddr.
 *
 * Returns the connected channel with qio_channel_set_delay() switched to
 * false, or NULL with @errp set when the connection attempt failed (the
 * channel is released in that case).
 */
static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
                                                  Error **errp)
{
    ERRP_GUARD();
    QIOChannelSocket *sioc = qio_channel_socket_new();

    qio_channel_set_name(QIO_CHANNEL(sioc), "nbd-client");
    qio_channel_socket_connect_sync(sioc, saddr, errp);

    if (!*errp) {
        qio_channel_set_delay(QIO_CHANNEL(sioc), false);
        return sioc;
    }

    object_unref(OBJECT(sioc));
    return NULL;
}

1457 1458 1459
/*
 * nbd_client_handshake
 *
 * Run the NBD negotiation over @sioc and record what the server offers in
 * @bs and its BDRVNBDState (supported write/zero flags, read-only state,
 * the I/O channel to use).
 *
 * Takes ownership of @sioc.  On failure @sioc is unref'ed, s->sioc is reset
 * to NULL, and - when the failure happens after a successful negotiation -
 * a channel that negotiation stored in s->ioc is released as well.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc,
                                Error **errp)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    AioContext *aio_context = bdrv_get_aio_context(bs);
    int ret;

    trace_nbd_client_handshake(s->export);

    s->sioc = sioc;

    qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
    qio_channel_attach_aio_context(QIO_CHANNEL(sioc), aio_context);

    /* Features we ask for; s->info is re-read below after negotiation. */
    s->info.request_sizes = true;
    s->info.structured_reply = true;
    s->info.base_allocation = true;
    s->info.x_dirty_bitmap = g_strdup(s->x_dirty_bitmap);
    s->info.name = g_strdup(s->export ?: "");
    ret = nbd_receive_negotiate(aio_context, QIO_CHANNEL(sioc), s->tlscreds,
                                s->hostname, &s->ioc, &s->info, errp);
    /* The strings were only needed for the duration of the negotiation. */
    g_free(s->info.x_dirty_bitmap);
    g_free(s->info.name);
    if (ret < 0) {
        object_unref(OBJECT(sioc));
        s->sioc = NULL;
        return ret;
    }
    if (s->x_dirty_bitmap && !s->info.base_allocation) {
        error_setg(errp, "requested x-dirty-bitmap %s not found",
                   s->x_dirty_bitmap);
        ret = -EINVAL;
        goto fail;
    }
    if (s->info.flags & NBD_FLAG_READ_ONLY) {
        ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
        if (ret < 0) {
            goto fail;
        }
    }
    if (s->info.flags & NBD_FLAG_SEND_FUA) {
        bs->supported_write_flags = BDRV_REQ_FUA;
        bs->supported_zero_flags |= BDRV_REQ_FUA;
    }
    if (s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
        bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
        if (s->info.flags & NBD_FLAG_SEND_FAST_ZERO) {
            bs->supported_zero_flags |= BDRV_REQ_NO_FALLBACK;
        }
    }

    /*
     * Negotiation did not hand back a channel of its own: talk over the
     * socket channel directly and hold an extra reference for s->ioc.
     */
    if (!s->ioc) {
        s->ioc = QIO_CHANNEL(sioc);
        object_ref(OBJECT(s->ioc));
    }

    trace_nbd_client_handshake_success(s->export);

    return 0;

 fail:
    /*
     * We have connected, but must fail for other reasons.
     * Send NBD_CMD_DISC as a courtesy to the server.
     */
    {
        NBDRequest request = { .type = NBD_CMD_DISC };

        nbd_send_request(s->ioc ?: QIO_CHANNEL(sioc), &request);

        object_unref(OBJECT(sioc));
        s->sioc = NULL;

        /*
         * A non-NULL s->ioc at this point is the channel stored by the
         * negotiation (the success path above relies on it being NULL
         * otherwise).  Release it too, so that the reference is not leaked
         * and no stale channel pointer outlives the failed handshake.
         */
        if (s->ioc) {
            object_unref(OBJECT(s->ioc));
            s->ioc = NULL;
        }

        return ret;
    }
}

1535 1536 1537
/*
 * Parse nbd_open options
 */
1538

1539
/*
 * Translate an NBD URI into entries of @options.
 *
 * Accepted forms:
 *   nbd[+tcp]://host[:port]/export
 *   nbd+unix:///export?socket=path
 *
 * The export name is the URI path without its leading '/', and is only
 * stored when non-empty.  TCP URIs must not carry query parameters; Unix
 * URIs must carry exactly one, named "socket", and no host or port.
 *
 * Returns 0 on success, -EINVAL when the URI cannot be parsed or does not
 * match one of the forms above.
 */
static int nbd_parse_uri(const char *filename, QDict *options)
{
    URI *uri;
    const char *export;
    QueryParams *qp = NULL;
    int ret = 0;
    bool is_unix;
    bool bad_query;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport */
    if (!g_strcmp0(uri->scheme, "nbd") ||
        !g_strcmp0(uri->scheme, "nbd+tcp")) {
        is_unix = false;
    } else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
        is_unix = true;
    } else {
        ret = -EINVAL;
        goto out;
    }

    /* export name: URI path minus a leading slash */
    export = uri->path ? uri->path : "";
    if (export[0] == '/') {
        export++;
    }
    if (export[0]) {
        qdict_put_str(options, "export", export);
    }

    qp = query_params_parse(uri->query);
    bad_query = qp->n > 1 || (is_unix ? !qp->n : qp->n != 0);
    if (bad_query) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        /* nbd+unix:///export?socket=path */
        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        qdict_put_str(options, "server.type", "unix");
        qdict_put_str(options, "server.path", qp->p[0].value);
    } else {
        QString *host;
        char *port_str;

        /* nbd[+tcp]://host[:port]/export */
        if (!uri->server) {
            ret = -EINVAL;
            goto out;
        }

        /* strip braces from literal IPv6 address */
        if (uri->server[0] != '[') {
            host = qstring_from_str(uri->server);
        } else {
            host = qstring_from_substr(uri->server, 1,
                                       strlen(uri->server) - 1);
        }

        qdict_put_str(options, "server.type", "inet");
        qdict_put(options, "server.host", host);

        port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
        qdict_put_str(options, "server.port", port_str);
        g_free(port_str);
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}

1620 1621 1622 1623 1624 1625 1626 1627
/*
 * Check whether @options already contains a key that a file name would
 * also provide: "host", "port", "path", "export", or anything starting
 * with "server.".
 *
 * Returns true (with @errp naming the first offending key found) when there
 * is a conflict, false otherwise.
 */
static bool nbd_has_filename_options_conflict(QDict *options, Error **errp)
{
    static const char *const exact_keys[] = {
        "host", "port", "path", "export", NULL
    };
    const QDictEntry *entry = qdict_first(options);

    while (entry) {
        bool clash = false;
        int i;

        for (i = 0; exact_keys[i] && !clash; i++) {
            clash = !strcmp(entry->key, exact_keys[i]);
        }
        if (!clash) {
            clash = strstart(entry->key, "server.", NULL);
        }

        if (clash) {
            error_setg(errp, "Option '%s' cannot be used with a file name",
                       entry->key);
            return true;
        }

        entry = qdict_next(options, entry);
    }

    return false;
}

1640 1641
/*
 * Translate a file name into entries of @options.
 *
 * Names containing "://" are handed to nbd_parse_uri().  Everything else
 * must use the legacy syntax
 *   nbd:unix:PATH[:exportname=NAME]
 *   nbd:HOST:PORT[:exportname=NAME]
 * where the part after "nbd:" is parsed by inet_parse() unless it starts
 * with "unix:".
 *
 * Fails (with @errp set) when @options already holds conflicting keys, when
 * the URI is invalid, when the name does not start with "nbd:", or when
 * inet_parse() rejects the host specification.  An empty export name or an
 * empty host specification makes the function return early without setting
 * an error.
 */
static void nbd_parse_filename(const char *filename, QDict *options,
                               Error **errp)
{
    g_autofree char *file = NULL;
    char *opt;
    const char *host_spec;
    const char *unixpath;

    if (nbd_has_filename_options_conflict(options, errp)) {
        return;
    }

    if (strstr(filename, "://")) {
        if (nbd_parse_uri(filename, options) < 0) {
            error_setg(errp, "No valid URL specified");
        }
        return;
    }

    /* Work on a private copy because the string is cut in place below. */
    file = g_strdup(filename);

    opt = strstr(file, EN_OPTSTR);
    if (opt) {
        char *export_name = opt + strlen(EN_OPTSTR);

        if (*export_name == 0) {
            return;
        }
        *opt = 0; /* truncate 'file' */

        qdict_put_str(options, "export", export_name);
    }

    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
        error_setg(errp, "File name string for NBD must start with 'nbd:'");
        return;
    }

    if (!*host_spec) {
        return;
    }

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        qdict_put_str(options, "server.type", "unix");
        qdict_put_str(options, "server.path", unixpath);
    } else {
        InetSocketAddress *addr = g_new(InetSocketAddress, 1);

        if (!inet_parse(addr, host_spec, errp)) {
            qdict_put_str(options, "server.type", "inet");
            qdict_put_str(options, "server.host", addr->host);
            qdict_put_str(options, "server.port", addr->port);
        }
        qapi_free_InetSocketAddress(addr);
    }
}

M
Max Reitz 已提交
1702 1703 1704
/*
 * Convert the legacy "path", "host" and "port" options found in
 * @legacy_opts into "server.*" entries of @output_options.
 *
 * Nothing is done when none of the three options is present.  Otherwise it
 * is an error to also have any "server.*" key in @output_options, to give
 * both "path" and "host", or to give "port" together with "path".  When
 * "host" is given without "port", NBD_DEFAULT_PORT is used.
 *
 * Returns true on success, false with @errp set on error.
 */
static bool nbd_process_legacy_socket_options(QDict *output_options,
                                              QemuOpts *legacy_opts,
                                              Error **errp)
{
    const char *path = qemu_opt_get(legacy_opts, "path");
    const char *host = qemu_opt_get(legacy_opts, "host");
    const char *port = qemu_opt_get(legacy_opts, "port");
    const QDictEntry *entry;

    if (!path && !host && !port) {
        return true;
    }

    entry = qdict_first(output_options);
    while (entry) {
        if (strstart(entry->key, "server.", NULL)) {
            error_setg(errp, "Cannot use 'server' and path/host/port at the "
                       "same time");
            return false;
        }
        entry = qdict_next(output_options, entry);
    }

    if (path && host) {
        error_setg(errp, "path and host may not be used at the same time");
        return false;
    }

    if (path) {
        if (port) {
            error_setg(errp, "port may not be used without host");
            return false;
        }

        qdict_put_str(output_options, "server.type", "unix");
        qdict_put_str(output_options, "server.path", path);
    } else if (host) {
        qdict_put_str(output_options, "server.type", "inet");
        qdict_put_str(output_options, "server.host", host);
        qdict_put_str(output_options, "server.port",
                      port ?: stringify(NBD_DEFAULT_PORT));
    }

    return true;
}
1744

1745 1746
/*
 * Extract the "server.*" keys from @options and turn them into a
 * SocketAddress.
 *
 * Returns the value that visit_type_SocketAddress() stored, or NULL when
 * no "server.*" key was present or the input visitor could not be created;
 * errors are reported through @errp.  @s is not used.
 */
static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options,
                                 Error **errp)
{
    SocketAddress *saddr = NULL;
    QDict *server = NULL;
    Visitor *iv = NULL;

    qdict_extract_subqdict(options, &server, "server.");

    if (qdict_size(server) == 0) {
        error_setg(errp, "NBD server address missing");
    } else {
        iv = qobject_input_visitor_new_flat_confused(server, errp);
        if (iv) {
            /* Success or failure, saddr is returned as the visitor left it. */
            visit_type_SocketAddress(iv, NULL, &saddr, errp);
        }
    }

    qobject_unref(server);
    visit_free(iv);
    return saddr;
}
1772

1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802
/*
 * Look up the object named @id below the objects root and return it as
 * TLS credentials, taking a new reference on it.
 *
 * Returns NULL with @errp set when no such object exists, when it is not of
 * type TYPE_QCRYPTO_TLS_CREDS, or when its endpoint is not
 * QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT.
 */
static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
{
    QCryptoTLSCreds *creds;
    Object *obj = object_resolve_path_component(object_get_objects_root(),
                                                id);

    if (!obj) {
        error_setg(errp, "No TLS credentials with id '%s'",
                   id);
        return NULL;
    }

    creds = (QCryptoTLSCreds *)object_dynamic_cast(obj,
                                                   TYPE_QCRYPTO_TLS_CREDS);
    if (!creds) {
        error_setg(errp, "Object with id '%s' is not TLS credentials",
                   id);
        return NULL;
    }

    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
        error_setg(errp,
                   "Expecting TLS credentials with a client endpoint");
        return NULL;
    }

    /* The caller gets its own reference. */
    object_ref(obj);
    return creds;
}


1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831
/*
 * Runtime options understood by the NBD block driver, absorbed from the
 * options QDict in nbd_process_options().
 */
static QemuOptsList nbd_runtime_opts = {
    .name = "nbd",
    .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head),
    .desc = {
        /*
         * "host", "port" and "path" are the legacy addressing options that
         * nbd_process_legacy_socket_options() turns into "server.*" keys.
         */
        { .name = "host", .type = QEMU_OPT_STRING,
          .help = "TCP host to connect to", },
        { .name = "port", .type = QEMU_OPT_STRING,
          .help = "TCP port to connect to", },
        { .name = "path", .type = QEMU_OPT_STRING,
          .help = "Unix socket path to connect to", },

        { .name = "export", .type = QEMU_OPT_STRING,
          .help = "Name of the NBD export to open", },
        { .name = "tls-creds", .type = QEMU_OPT_STRING,
          .help = "ID of the TLS credentials to use", },
        { .name = "x-dirty-bitmap", .type = QEMU_OPT_STRING,
          .help = "experimental: expose named dirty bitmap in place of "
                  "block status", },
        { .name = "reconnect-delay", .type = QEMU_OPT_NUMBER,
          .help = "On an unexpected disconnect, the nbd client tries to "
                  "connect again until succeeding or encountering a serious "
                  "error.  During the first @reconnect-delay seconds, all "
                  "requests are paused and will be rerun on a successful "
                  "reconnect. After that time, any delayed requests and all "
                  "future requests before a successful reconnect will "
                  "immediately fail. Default 0", },

        { /* end of list */ }
    },
};

1853 1854
/*
 * Parse the runtime options in @options and store the result in the
 * driver state hanging off @bs.
 *
 * Returns 0 on success.  On any failure, returns -EINVAL after handing the
 * partially filled state to nbd_clear_bdrvstate(); failing callees and the
 * explicit length/transport checks below set @errp.
 */
static int nbd_process_options(BlockDriverState *bs, QDict *options,
                               Error **errp)
{
    BDRVNBDState *s = bs->opaque;
    QemuOpts *runtime_opts;
    int rc = -EINVAL;

    runtime_opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);

    if (!qemu_opts_absorb_qdict(runtime_opts, options, errp)) {
        goto out;
    }

    /* Translate @host, @port, and @path to a SocketAddress */
    if (!nbd_process_legacy_socket_options(options, runtime_opts, errp)) {
        goto out;
    }

    /* Pop the config into our state object. Exit if invalid. */
    s->saddr = nbd_config(s, options, errp);
    if (s->saddr == NULL) {
        goto out;
    }

    /* Export name: optional, but bounded by what can be sent to the server */
    s->export = g_strdup(qemu_opt_get(runtime_opts, "export"));
    if (s->export != NULL && strlen(s->export) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "export name too long to send to server");
        goto out;
    }

    /* TLS credentials: optional */
    s->tlscredsid = g_strdup(qemu_opt_get(runtime_opts, "tls-creds"));
    if (s->tlscredsid != NULL) {
        s->tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
        if (s->tlscreds == NULL) {
            goto out;
        }

        /* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
        if (s->saddr->type != SOCKET_ADDRESS_TYPE_INET) {
            error_setg(errp, "TLS only supported over IP sockets");
            goto out;
        }
        /* Borrowed pointer into s->saddr, not a copy */
        s->hostname = s->saddr->u.inet.host;
    }

    /* Dirty bitmap query string: same length limit as the export name */
    s->x_dirty_bitmap = g_strdup(qemu_opt_get(runtime_opts, "x-dirty-bitmap"));
    if (s->x_dirty_bitmap != NULL &&
        strlen(s->x_dirty_bitmap) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "x-dirty-bitmap query too long to send to server");
        goto out;
    }

    s->reconnect_delay = qemu_opt_get_number(runtime_opts,
                                             "reconnect-delay", 0);

    rc = 0;

 out:
    if (rc < 0) {
        nbd_clear_bdrvstate(s);
    }
    qemu_opts_del(runtime_opts);
    return rc;
}

1915 1916 1917 1918 1919
/*
 * Open an NBD node: parse @options into the driver state, connect to the
 * server, perform the client handshake and start the connection coroutine.
 *
 * @flags is not used by this function.
 *
 * Returns 0 on success.  On failure returns a negative errno: the value from
 * nbd_process_options() or nbd_client_handshake(), or -ECONNREFUSED when the
 * connection cannot be established.
 *
 * Every failure after option parsing releases the parsed state through
 * nbd_clear_bdrvstate().  The handshake failure path already did this; the
 * connection failure path previously returned without it, so whatever
 * nbd_process_options() had stored in the state was never released there.
 */
static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
{
    int ret;
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    QIOChannelSocket *sioc;

    ret = nbd_process_options(bs, options, errp);
    if (ret < 0) {
        /* nbd_process_options() clears the state itself on failure */
        return ret;
    }

    s->bs = bs;
    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_queue_init(&s->free_sema);

    /*
     * establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
    sioc = nbd_establish_connection(s->saddr, errp);
    if (!sioc) {
        ret = -ECONNREFUSED;
        goto fail;
    }

    ret = nbd_client_handshake(bs, sioc, errp);
    if (ret < 0) {
        goto fail;
    }
    /* successfully connected */
    s->state = NBD_CLIENT_CONNECTED;

    s->connection_co = qemu_coroutine_create(nbd_connection_entry, s);
    bdrv_inc_in_flight(bs);
    aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co);

    return 0;

 fail:
    nbd_clear_bdrvstate(s);
    return ret;
}

P
Paolo Bonzini 已提交
1955 1956
/*
 * Flush callback (installed as .bdrv_co_flush_to_os): forwards to
 * nbd_client_co_flush() and returns its result unchanged.
 */
static int nbd_co_flush(BlockDriverState *bs)
{
    int ret = nbd_client_co_flush(bs);

    return ret;
}

1960 1961
/*
 * Fill in bs->bl from the limits recorded in the NBD state (s->info).
 * @errp is never set by this function.
 */
static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    uint32_t align = s->info.min_block;
    uint32_t max_io = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);

    /*
     * If the server did not advertise an alignment:
     * - a size that is not sector-aligned implies that an alignment
     *   of 1 can be used to access those tail bytes
     * - advertisement of block status requires an alignment of 1, so
     *   that we don't violate block layer constraints that block
     *   status is always aligned (as we can't control whether the
     *   server will report sub-sector extents, such as a hole at EOF
     *   on an unaligned POSIX file)
     * - otherwise, assume the server is so old that we are safer avoiding
     *   sub-sector requests
     */
    if (align == 0) {
        if (!QEMU_IS_ALIGNED(s->info.size, BDRV_SECTOR_SIZE) ||
            s->info.base_allocation) {
            align = 1;
        } else {
            align = BDRV_SECTOR_SIZE;
        }
    }

    bs->bl.request_alignment = align;
    /* Largest multiple of the alignment that does not exceed INT_MAX */
    bs->bl.max_pdiscard = QEMU_ALIGN_DOWN(INT_MAX, align);
    bs->bl.max_pwrite_zeroes = max_io;
    bs->bl.max_transfer = max_io;

    /* Only ever raise opt_transfer, and only if a preferred size is set */
    if (s->info.opt_block) {
        if (s->info.opt_block > bs->bl.opt_transfer) {
            bs->bl.opt_transfer = s->info.opt_block;
        }
    }
}

1994 1995
/*
 * Close callback: shut the client down with nbd_client_close(), then
 * release the driver state with nbd_clear_bdrvstate().
 */
static void nbd_close(BlockDriverState *bs)
{
    BDRVNBDState *state = bs->opaque;

    nbd_client_close(bs);

    nbd_clear_bdrvstate(state);
}

2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028
/*
 * NBD cannot truncate.  Two requests are nevertheless accepted because they
 * need no change on our side: truncating to the current size, and
 * truncating to a smaller size when @exact is false.
 *
 * @prealloc and @flags are ignored: it does not seem useful to fail on them
 * when nothing is ever changed.
 *
 * Returns 0 when the request is accepted, -ENOTSUP for an exact resize to a
 * different size, and -EINVAL for an attempt to grow the node.
 */
static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset,
                                        bool exact, PreallocMode prealloc,
                                        BdrvRequestFlags flags, Error **errp)
{
    BDRVNBDState *s = bs->opaque;

    if (exact && offset != s->info.size) {
        error_setg(errp, "Cannot resize NBD nodes");
        return -ENOTSUP;
    }

    if (!(offset > s->info.size)) {
        /* Same size, or a non-exact shrink: nothing to do */
        return 0;
    }

    error_setg(errp, "Cannot grow NBD nodes");
    return -EINVAL;
}

2029 2030 2031 2032
/* Report the size recorded in the NBD state (s->info.size). */
static int64_t nbd_getlength(BlockDriverState *bs)
{
    const BDRVNBDState *state = bs->opaque;
    int64_t length = state->info.size;

    return length;
}

2036
/*
 * Build a pseudo-filename for the node in bs->exact_filename.
 *
 * A Unix socket address produces an nbd+unix:// URI.  A plain inet address
 * (none of ipv4, ipv6 or to given) produces an nbd:// URI.  For any other
 * address the buffer is not touched.  When the formatted name does not fit,
 * the buffer is set to the empty string.
 */
static void nbd_refresh_filename(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    const char *host = NULL, *port = NULL, *path = NULL;
    char *dst = bs->exact_filename;
    const size_t cap = sizeof(bs->exact_filename);
    size_t len = 0;

    switch (s->saddr->type) {
    case SOCKET_ADDRESS_TYPE_INET: {
        const InetSocketAddress *inet = &s->saddr->u.inet;

        if (!(inet->has_ipv4 || inet->has_ipv6 || inet->has_to)) {
            host = inet->host;
            port = inet->port;
        }
        break;
    }
    case SOCKET_ADDRESS_TYPE_UNIX:
        path = s->saddr->u.q_unix.path;
        break;
    default:
        /* can't represent as pseudo-filename */
        break;
    }

    if (path) {
        if (s->export) {
            len = snprintf(dst, cap, "nbd+unix:///%s?socket=%s",
                           s->export, path);
        } else {
            len = snprintf(dst, cap, "nbd+unix://?socket=%s", path);
        }
    } else if (host) {
        if (s->export) {
            len = snprintf(dst, cap, "nbd://%s:%s/%s",
                           host, port, s->export);
        } else {
            len = snprintf(dst, cap, "nbd://%s:%s", host, port);
        }
    }

    if (len >= cap) {
        /* Name is too long to represent exactly, so leave it empty. */
        dst[0] = '\0';
    }
}

2071 2072 2073 2074 2075 2076 2077 2078 2079 2080
/*
 * NBD nodes have no directory name.
 *
 * The generic bdrv_dirname() implementation is able to work out some
 * directory name for NBD nodes, but that would be wrong: so far there is no
 * specification for how "export paths" would work.  Always sets @errp and
 * returns NULL.
 */
static char *nbd_dirname(BlockDriverState *bs, Error **errp)
{
    char *no_dirname = NULL;

    error_setg(errp, "Cannot generate a base directory for NBD nodes");

    return no_dirname;
}

2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091
/*
 * NULL-terminated list of option names installed as .strong_runtime_opts in
 * each of the NBD BlockDriver tables.
 *
 * Of the options declared in nbd_runtime_opts, "x-dirty-bitmap" and
 * "reconnect-delay" are not listed here.
 */
static const char *const nbd_strong_runtime_opts[] = {
    "path",
    "host",
    "port",
    "export",
    "tls-creds",
    /* NOTE(review): trailing dot presumably matches every "server.*" key */
    "server.",

    NULL
};

2092
static BlockDriver bdrv_nbd = {
2093 2094 2095 2096
    .format_name                = "nbd",
    .protocol_name              = "nbd",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
2097 2098
    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
    .create_opts                = &bdrv_create_opts_simple,
2099
    .bdrv_file_open             = nbd_open,
2100
    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2101 2102
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2103
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2104 2105
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
2106
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2107
    .bdrv_refresh_limits        = nbd_refresh_limits,
2108
    .bdrv_co_truncate           = nbd_co_truncate,
2109
    .bdrv_getlength             = nbd_getlength,
2110 2111
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2112 2113
    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2114
    .bdrv_refresh_filename      = nbd_refresh_filename,
2115
    .bdrv_co_block_status       = nbd_client_co_block_status,
2116
    .bdrv_dirname               = nbd_dirname,
2117
    .strong_runtime_opts        = nbd_strong_runtime_opts,
P
Paolo Bonzini 已提交
2118 2119 2120
};

static BlockDriver bdrv_nbd_tcp = {
2121 2122 2123 2124
    .format_name                = "nbd",
    .protocol_name              = "nbd+tcp",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
2125 2126
    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
    .create_opts                = &bdrv_create_opts_simple,
2127
    .bdrv_file_open             = nbd_open,
2128
    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2129 2130
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2131
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2132 2133
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
2134
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2135
    .bdrv_refresh_limits        = nbd_refresh_limits,
2136
    .bdrv_co_truncate           = nbd_co_truncate,
2137
    .bdrv_getlength             = nbd_getlength,
2138 2139
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2140 2141
    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2142
    .bdrv_refresh_filename      = nbd_refresh_filename,
2143
    .bdrv_co_block_status       = nbd_client_co_block_status,
2144
    .bdrv_dirname               = nbd_dirname,
2145
    .strong_runtime_opts        = nbd_strong_runtime_opts,
P
Paolo Bonzini 已提交
2146 2147 2148
};

static BlockDriver bdrv_nbd_unix = {
2149 2150 2151 2152
    .format_name                = "nbd",
    .protocol_name              = "nbd+unix",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
2153 2154
    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
    .create_opts                = &bdrv_create_opts_simple,
2155
    .bdrv_file_open             = nbd_open,
2156
    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2157 2158
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2159
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2160 2161
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
2162
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2163
    .bdrv_refresh_limits        = nbd_refresh_limits,
2164
    .bdrv_co_truncate           = nbd_co_truncate,
2165
    .bdrv_getlength             = nbd_getlength,
2166 2167
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2168 2169
    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2170
    .bdrv_refresh_filename      = nbd_refresh_filename,
2171
    .bdrv_co_block_status       = nbd_client_co_block_status,
2172
    .bdrv_dirname               = nbd_dirname,
2173
    .strong_runtime_opts        = nbd_strong_runtime_opts,
2174
};
2175 2176 2177 2178

static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
P
Paolo Bonzini 已提交
2179 2180
    bdrv_register(&bdrv_nbd_tcp);
    bdrv_register(&bdrv_nbd_unix);
2181 2182 2183
}

block_init(bdrv_nbd_init);