/*
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
 *  Network Block Device Server Side
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
20
#include "qapi/error.h"
F
Fam Zheng 已提交
21
#include "nbd-internal.h"
22 23 24 25 26 27 28

static int system_errno_to_nbd_errno(int err)
{
    switch (err) {
    case 0:
        return NBD_SUCCESS;
    case EPERM:
29
    case EROFS:
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
        return NBD_EPERM;
    case EIO:
        return NBD_EIO;
    case ENOMEM:
        return NBD_ENOMEM;
#ifdef EDQUOT
    case EDQUOT:
#endif
    case EFBIG:
    case ENOSPC:
        return NBD_ENOSPC;
    case EINVAL:
    default:
        return NBD_EINVAL;
    }
}

47 48 49 50 51 52 53 54 55 56 57
/* Definitions for opaque data types */

/* Typedef so the struct can be referred to simply as NBDRequest. */
typedef struct NBDRequest NBDRequest;

/*
 * Book-keeping for one request being processed on behalf of a client.
 * Instances are created by nbd_request_get() and destroyed by
 * nbd_request_put().
 */
struct NBDRequest {
    /* Queue linkage (no user is visible in this part of the file). */
    QSIMPLEQ_ENTRY(NBDRequest) entry;
    /* Owning client; nbd_request_get() takes a reference on it. */
    NBDClient *client;
    /* Payload buffer, or NULL; released with qemu_vfree() on put. */
    uint8_t *data;
};

struct NBDExport {
58
    int refcount;
59 60
    void (*close)(NBDExport *exp);

M
Max Reitz 已提交
61
    BlockBackend *blk;
P
Paolo Bonzini 已提交
62
    char *name;
63 64 65
    off_t dev_offset;
    off_t size;
    uint32_t nbdflags;
66
    QTAILQ_HEAD(, NBDClient) clients;
P
Paolo Bonzini 已提交
67
    QTAILQ_ENTRY(NBDExport) next;
M
Max Reitz 已提交
68 69

    AioContext *ctx;
70 71

    Notifier eject_notifier;
72 73
};

P
Paolo Bonzini 已提交
74 75
/* Every export that currently has a name; see nbd_export_set_name(). */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

76 77 78 79 80
/*
 * State of one connected NBD client.  Reference counted through
 * nbd_client_get() / nbd_client_put().
 */
struct NBDClient {
    /* Number of outstanding references; the client is freed at zero. */
    int refcount;
    /* Optional callback invoked from client_close(). */
    void (*close)(NBDClient *client);

    /* Export this client is attached to, or NULL. */
    NBDExport *exp;
    /* TLS credentials and ACL name handed to the TLS channel, if any. */
    QCryptoTLSCreds *tlscreds;
    char *tlsaclname;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    /* Coroutine currently inside nbd_co_receive_request(), else NULL. */
    Coroutine *recv_coroutine;

    /* Serialises replies; held for the duration of nbd_co_send_reply(). */
    CoMutex send_lock;
    /* Coroutine currently sending a reply, else NULL. */
    Coroutine *send_coroutine;

    /* NOTE(review): presumably maintained by nbd_update_can_read(). */
    bool can_read;

    /* Linkage in NBDExport.clients. */
    QTAILQ_ENTRY(NBDClient) next;
    /* Requests in flight; asserted to stay below MAX_NBD_REQUESTS. */
    int nb_requests;
    /* Set once client_close() has started tearing the client down. */
    bool closing;
};

B
bellard 已提交
98 99
/* That's all folks */

M
Max Reitz 已提交
100 101 102 103
/* Forward declarations for helpers that are used before they are defined. */
static void nbd_set_handlers(NBDClient *client);
static void nbd_unset_handlers(NBDClient *client);
static void nbd_update_can_read(NBDClient *client);

104 105 106
/*
 * Channel watch callback used during negotiation: re-enters the
 * coroutine that registered the watch.
 *
 * @opaque: the coroutine passed to qio_channel_add_watch().
 *
 * Always returns TRUE, so the watch stays installed until the
 * registering code removes it.
 */
static gboolean nbd_negotiate_continue(QIOChannel *ioc,
                                       GIOCondition condition,
                                       void *opaque)
{
    Coroutine *waiter = opaque;

    qemu_coroutine_enter(waiter, NULL);
    return TRUE;
}

112
/*
 * Read @size bytes from @ioc into @buffer during negotiation.
 *
 * Must be called from a coroutine.  A G_IO_IN watch is installed for
 * the duration of the read so that the coroutine is re-entered via
 * nbd_negotiate_continue() when data becomes available.
 *
 * Returns whatever read_sync() returns.
 */
static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size)
{
    guint watch_id;
    ssize_t nread;

    assert(qemu_in_coroutine());

    /* Negotiation always runs in the main loop. */
    watch_id = qio_channel_add_watch(ioc, G_IO_IN, nbd_negotiate_continue,
                                     qemu_coroutine_self(), NULL);
    nread = read_sync(ioc, buffer, size);
    g_source_remove(watch_id);

    return nread;
}

130
/*
 * Write @size bytes from @buffer to @ioc during negotiation.
 *
 * Must be called from a coroutine.  A G_IO_OUT watch is installed for
 * the duration of the write so that the coroutine is re-entered via
 * nbd_negotiate_continue() when the channel becomes writable.
 *
 * Returns whatever write_sync() returns.
 */
static ssize_t nbd_negotiate_write(QIOChannel *ioc, void *buffer, size_t size)
{
    guint watch_id;
    ssize_t nwritten;

    assert(qemu_in_coroutine());

    /* Negotiation always runs in the main loop. */
    watch_id = qio_channel_add_watch(ioc, G_IO_OUT, nbd_negotiate_continue,
                                     qemu_coroutine_self(), NULL);
    nwritten = write_sync(ioc, buffer, size);
    g_source_remove(watch_id);

    return nwritten;
}

147
/*
 * Read and discard @size bytes from @ioc.
 *
 * The data is consumed in chunks of at most 64 KiB through a scratch
 * buffer that is freed before returning.
 *
 * Returns @size once everything has been discarded, or a negative
 * errno value on failure.  Callers compare the result with the
 * requested length, so any other value is treated as an error.
 */
static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size)
{
    ssize_t ret, dropped = size;
    uint8_t *buffer = g_malloc(MIN(65536, size));

    while (size > 0) {
        ret = nbd_negotiate_read(ioc, buffer, MIN(65536, size));
        if (ret <= 0) {
            /*
             * A zero-length read makes no progress (presumably the peer
             * closed the connection); without this check the loop would
             * spin forever.  Report it as an I/O error.
             */
            g_free(buffer);
            return ret < 0 ? ret : -EIO;
        }

        assert(ret <= size);
        size -= ret;
    }

    g_free(buffer);
    return dropped;
}

167
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/

194
/*
 * Send an option reply that carries no payload.
 *
 * @type: reply type (for example NBD_REP_ACK or an NBD_REP_ERR_* code).
 * @opt:  the option being answered.
 *
 * On the wire this is the reply magic, the option, the reply type and a
 * zero data length, each converted to big-endian.
 *
 * Returns 0 on success, -EINVAL if any of the writes comes up short.
 */
static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt)
{
    uint64_t magic;
    uint32_t len;

    /* Arguments follow the order named in the format string. */
    TRACE("Reply opt=%x type=%x", opt, type);

    magic = cpu_to_be64(NBD_REP_MAGIC);
    if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("write failed (rep magic)");
        return -EINVAL;
    }
    opt = cpu_to_be32(opt);
    if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) {
        LOG("write failed (rep opt)");
        return -EINVAL;
    }
    type = cpu_to_be32(type);
    if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) {
        LOG("write failed (rep type)");
        return -EINVAL;
    }
    len = cpu_to_be32(0);
    if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
        LOG("write failed (rep data length)");
        return -EINVAL;
    }
    return 0;
}
223

224
/*
 * Send one NBD_REP_SERVER reply describing @exp in answer to
 * NBD_OPT_LIST.
 *
 * On the wire: reply magic, option, reply type, total payload length
 * (name length plus the 4-byte length field), name length, then the
 * name bytes without a terminator.  All integers are big-endian.
 *
 * Returns 0 on success, -EINVAL if any write comes up short.
 */
static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp)
{
    uint64_t rep_magic, name_len;
    uint32_t rep_opt, rep_type, field;

    TRACE("Advertizing export name '%s'", exp->name ? exp->name : "");
    name_len = strlen(exp->name);

    rep_magic = cpu_to_be64(NBD_REP_MAGIC);
    if (nbd_negotiate_write(ioc, &rep_magic, sizeof(rep_magic)) !=
        sizeof(rep_magic)) {
        LOG("write failed (magic)");
        return -EINVAL;
    }

    rep_opt = cpu_to_be32(NBD_OPT_LIST);
    if (nbd_negotiate_write(ioc, &rep_opt, sizeof(rep_opt)) !=
        sizeof(rep_opt)) {
        LOG("write failed (opt)");
        return -EINVAL;
    }

    rep_type = cpu_to_be32(NBD_REP_SERVER);
    if (nbd_negotiate_write(ioc, &rep_type, sizeof(rep_type)) !=
        sizeof(rep_type)) {
        LOG("write failed (reply type)");
        return -EINVAL;
    }

    /* Payload length covers the name-length field and the name itself. */
    field = cpu_to_be32(name_len + sizeof(field));
    if (nbd_negotiate_write(ioc, &field, sizeof(field)) != sizeof(field)) {
        LOG("write failed (length)");
        return -EINVAL;
    }

    field = cpu_to_be32(name_len);
    if (nbd_negotiate_write(ioc, &field, sizeof(field)) != sizeof(field)) {
        LOG("write failed (length)");
        return -EINVAL;
    }

    if (nbd_negotiate_write(ioc, exp->name, name_len) != name_len) {
        LOG("write failed (buffer)");
        return -EINVAL;
    }

    return 0;
}

263
/*
 * Handle NBD_OPT_LIST.
 *
 * @length: payload length announced by the client; must be zero.
 *
 * A non-empty payload is drained and answered with NBD_REP_ERR_INVALID.
 * Otherwise one NBD_REP_SERVER reply is sent per named export, followed
 * by NBD_REP_ACK.
 *
 * Returns 0 on success, -EIO if the payload cannot be drained, -EINVAL
 * if an export reply fails, or the result of the final reply.
 */
static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length)
{
    QIOChannel *ioc = client->ioc;
    NBDExport *entry;

    if (length != 0) {
        if (nbd_negotiate_drop_sync(ioc, length) != length) {
            return -EIO;
        }
        return nbd_negotiate_send_rep(ioc, NBD_REP_ERR_INVALID, NBD_OPT_LIST);
    }

    /* For each export, send a NBD_REP_SERVER reply. */
    QTAILQ_FOREACH(entry, &exports, next) {
        if (nbd_negotiate_send_rep_list(ioc, entry) != 0) {
            return -EINVAL;
        }
    }

    /* Finish with a NBD_REP_ACK. */
    return nbd_negotiate_send_rep(ioc, NBD_REP_ACK, NBD_OPT_LIST);
}

285
/*
 * Handle NBD_OPT_EXPORT_NAME.
 *
 * Client sends:
 *     [20 ..  xx]   export name (length bytes)
 *
 * @length: number of name bytes that follow; at most 255.
 *
 * On success the client is attached to the named export (a reference
 * is taken and the client is appended to the export's client list).
 *
 * Returns 0 on success, -EINVAL on a bad length, a short read or an
 * unknown export name.
 */
static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length)
{
    char name[256];

    TRACE("Checking length");
    if (length > 255) {
        LOG("Bad length received");
        return -EINVAL;
    }

    if (nbd_negotiate_read(client->ioc, name, length) != length) {
        LOG("read failed");
        return -EINVAL;
    }
    /* The name arrives without a terminator. */
    name[length] = '\0';

    TRACE("Client requested export '%s'", name);

    client->exp = nbd_export_find(name);
    if (client->exp == NULL) {
        LOG("export not found");
        return -EINVAL;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);

    return 0;
}

319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367

/*
 * Handle NBD_OPT_STARTTLS: acknowledge the option, wrap the client's
 * channel in a TLS server channel and run the handshake to completion.
 *
 * @length: payload length announced by the client; must be zero.
 *
 * Returns the new TLS channel on success.  Returns NULL if the option
 * carried a payload (it is drained and NBD_REP_ERR_INVALID is sent), if
 * the acknowledgement cannot be sent, if the TLS channel cannot be
 * created, or if the handshake fails.
 */
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
                                                 uint32_t length)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSHandshakeData data = { 0 };

    TRACE("Setting up TLS");
    ioc = client->ioc;
    if (length) {
        if (nbd_negotiate_drop_sync(ioc, length) != length) {
            return NULL;
        }
        /* Best effort: the connection is abandoned either way. */
        nbd_negotiate_send_rep(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS);
        return NULL;
    }

    /* Do not start a handshake the client was never told about. */
    if (nbd_negotiate_send_rep(ioc, NBD_REP_ACK, NBD_OPT_STARTTLS) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsaclname,
                                      NULL);
    if (!tioc) {
        return NULL;
    }

    TRACE("Starting TLS handshake");
    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    qio_channel_tls_handshake(tioc,
                              nbd_tls_handshake,
                              &data,
                              NULL);

    /* Run the main loop only if the handshake has not completed yet. */
    if (!data.complete) {
        g_main_loop_run(data.loop);
    }
    g_main_loop_unref(data.loop);
    if (data.error) {
        object_unref(OBJECT(tioc));
        error_free(data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}


368
/*
 * Run the new-style option phase of the handshake.
 *
 * Reads the client flags and then loops over client options until
 * NBD_OPT_EXPORT_NAME selects an export or an error ends negotiation.
 * While TLS credentials are configured and the channel is still the
 * plain socket, only NBD_OPT_STARTTLS is accepted.
 *
 * Returns 0 once an export has been selected, or a negative errno
 * value if negotiation must be abandoned.
 */
static int nbd_negotiate_options(NBDClient *client)
{
    uint32_t flags;
    bool fixedNewstyle = false;

    /* Client sends:
        [ 0 ..   3]   client flags

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) !=
        sizeof(flags)) {
        LOG("read failed");
        return -EIO;
    }
    TRACE("Checking client flags");
    be32_to_cpus(&flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        TRACE("Client supports fixed newstyle handshake");
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
    }
    if (flags != 0) {
        TRACE("Unknown client flags 0x%x received", flags);
        return -EIO;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) !=
            sizeof(magic)) {
            LOG("read failed");
            return -EINVAL;
        }
        TRACE("Checking opts magic");
        /* magic is still in wire order; compare against the swapped constant */
        if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
            LOG("Bad magic received");
            return -EINVAL;
        }

        if (nbd_negotiate_read(client->ioc, &option,
                               sizeof(option)) != sizeof(option)) {
            LOG("read failed");
            return -EINVAL;
        }
        option = be32_to_cpu(option);

        if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) !=
            sizeof(length)) {
            LOG("read failed");
            return -EINVAL;
        }
        length = be32_to_cpu(length);

        TRACE("Checking option 0x%x", option);
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            /* TLS is configured but not yet active on this connection. */
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                TRACE("Unsupported option 0x%x", option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                tioc = nbd_negotiate_handle_starttls(client, length);
                if (!tioc) {
                    return -EIO;
                }
                /* Swap the plain channel for the TLS one. */
                object_unref(OBJECT(client->ioc));
                client->ioc = QIO_CHANNEL(tioc);
                break;

            default:
                TRACE("Option 0x%x not permitted before TLS", option);
                nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_TLS_REQD,
                                       option);
                return -EINVAL;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                ret = nbd_negotiate_handle_list(client, length);
                if (ret < 0) {
                    return ret;
                }
                break;

            case NBD_OPT_ABORT:
                return -EINVAL;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, length);

            case NBD_OPT_STARTTLS:
                if (client->tlscreds) {
                    TRACE("TLS already enabled");
                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_INVALID,
                                           option);
                } else {
                    TRACE("TLS not configured");
                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_POLICY,
                                           option);
                }
                return -EINVAL;
            default:
                TRACE("Unsupported option 0x%x", option);
                /*
                 * In fixed newstyle an unknown option is not fatal:
                 * discard its payload so the stream stays in sync, tell
                 * the client the option is unsupported, and keep
                 * negotiating.  Oversized payloads are still refused
                 * rather than drained.
                 */
                if (length > NBD_MAX_BUFFER_SIZE) {
                    return -EINVAL;
                }
                if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
                    return -EIO;
                }
                ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP,
                                             option);
                if (ret < 0) {
                    return ret;
                }
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, length);

            default:
                TRACE("Unsupported option 0x%x", option);
                return -EINVAL;
            }
        }
    }
}

507 508 509 510 511 512
/*
 * Argument bundle passed to nbd_negotiate().  Only `client` is read in
 * the part of the file visible here.
 */
typedef struct {
    NBDClient *client;  /* client being negotiated */
    Coroutine *co;      /* NOTE(review): presumably the negotiation coroutine */
} NBDClientNewData;

/*
 * Perform the server side of the NBD handshake for data->client.
 *
 * The old-style header is used when the client already has an export
 * and no TLS credentials are configured; otherwise the new-style
 * header is sent and options are negotiated before the export
 * information is transmitted.
 *
 * Returns 0 on success or a negative errno value on failure.
 */
static coroutine_fn int nbd_negotiate(NBDClientNewData *data)
{
    NBDClient *client = data->client;
    char buf[8 + 8 + 8 + 128];
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
    bool oldStyle;

    /* Old style negotiation header without options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       New style negotiation header with options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent....
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qio_channel_set_blocking(client->ioc, false, NULL);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);

    /* oldStyle implies that no TLS credentials are configured. */
    oldStyle = client->exp != NULL && !client->tlscreds;
    if (oldStyle) {
        assert ((client->exp->nbdflags & ~65535) == 0);
        stq_be_p(buf + 8, NBD_CLIENT_MAGIC);
        stq_be_p(buf + 16, client->exp->size);
        stw_be_p(buf + 26, client->exp->nbdflags | myflags);

        if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        stq_be_p(buf + 8, NBD_OPTS_MAGIC);
        stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE);

        if (nbd_negotiate_write(client->ioc, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_negotiate_options(client);
        if (rc != 0) {
            LOG("option negotiation failed");
            goto fail;
        }

        assert ((client->exp->nbdflags & ~65535) == 0);
        stq_be_p(buf + 18, client->exp->size);
        stw_be_p(buf + 26, client->exp->nbdflags | myflags);
        if (nbd_negotiate_write(client->ioc, buf + 18, sizeof(buf) - 18) !=
            sizeof(buf) - 18) {
            LOG("write failed");
            /* rc is 0 after the options phase; do not report success. */
            rc = -EINVAL;
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    return rc;
}

P
Paolo Bonzini 已提交
593
#ifdef __linux__
B
bellard 已提交
594 595 596

int nbd_disconnect(int fd)
{
N
Nick Thomas 已提交
597 598 599 600
    ioctl(fd, NBD_CLEAR_QUE);
    ioctl(fd, NBD_DISCONNECT);
    ioctl(fd, NBD_CLEAR_SOCK);
    return 0;
B
bellard 已提交
601 602
}

603 604 605 606
#else

int nbd_disconnect(int fd)
{
607
    return -ENOTSUP;
608 609
}
#endif
B
bellard 已提交
610

611
/*
 * Read one request header from @ioc and decode it into @request.
 *
 * Returns 0 on success, the negative value from read_sync() if the
 * read fails, or -EINVAL on a short read or a bad request magic.
 */
static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request)
{
    uint8_t buf[NBD_REQUEST_SIZE];
    uint32_t magic;
    ssize_t ret;

    ret = read_sync(ioc, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
        LOG("read failed");
        return -EINVAL;
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    /*
     * Decode with the byte-wise big-endian load helpers rather than
     * casting the byte buffer to wider pointer types, which is not
     * guaranteed to be suitably aligned.
     */
    magic = ldl_be_p(buf);
    request->type   = ldl_be_p(buf + 4);
    request->handle = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    request->len    = ldl_be_p(buf + 24);

    TRACE("Got request: "
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
          magic, request->type, request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
        return -EINVAL;
    }
    return 0;
}

652
/*
 * Encode @reply and write it to @ioc.
 *
 * reply->error is converted in place from a host errno value to its
 * NBD wire code before encoding.
 *
 * Returns 0 on success, the negative value from write_sync() if the
 * write fails, or -EINVAL on a short write.
 */
static ssize_t nbd_send_reply(QIOChannel *ioc, struct nbd_reply *reply)
{
    uint8_t wire[NBD_REPLY_SIZE];
    ssize_t written;

    reply->error = system_errno_to_nbd_errno(reply->error);

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 8 .. 15]    handle
     */
    stl_be_p(wire, NBD_REPLY_MAGIC);
    stl_be_p(wire + 4, reply->error);
    stq_be_p(wire + 8, reply->handle);

    TRACE("Sending response to client");

    written = write_sync(ioc, wire, sizeof(wire));
    if (written < 0) {
        return written;
    }
    if (written != sizeof(wire)) {
        LOG("writing to socket failed");
        return -EINVAL;
    }

    return 0;
}
B
bellard 已提交
681

P
Paolo Bonzini 已提交
682 683
/* Upper bound on requests in flight per client; see nbd_request_get(). */
#define MAX_NBD_REQUESTS 16

684
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}

689
/*
 * Drop one reference on @client.  When the last reference goes away
 * the handlers are removed, both channels and the TLS state are
 * released, the client is detached from its export (if any) and the
 * structure is freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount -= 1;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by client_close.
     */
    assert(client->closing);

    nbd_unset_handlers(client);
    object_unref(OBJECT(client->sioc));
    object_unref(OBJECT(client->ioc));
    if (client->tlscreds) {
        object_unref(OBJECT(client->tlscreds));
    }
    g_free(client->tlsaclname);

    if (client->exp) {
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}

M
Max Reitz 已提交
712
/*
 * Begin tearing down @client.  Calling it again after the first call
 * has no effect.
 */
static void client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);

        /* Also tell the client, so that they release their reference.  */
        if (client->close) {
            client->close(client);
        }
    }
}

732
/*
 * Allocate a zeroed request for @client.
 *
 * Increments the client's in-flight count (which must be below
 * MAX_NBD_REQUESTS on entry) and takes a client reference that
 * nbd_request_put() releases.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDRequest *request;

    assert(client->nb_requests < MAX_NBD_REQUESTS);
    client->nb_requests += 1;
    nbd_update_can_read(client);

    request = g_new0(NBDRequest, 1);
    nbd_client_get(client);
    request->client = client;

    return request;
}

746
/*
 * Release @req: free its data buffer (if any) and the request itself,
 * decrement the owning client's in-flight count and drop the client
 * reference taken by nbd_request_get().
 */
static void nbd_request_put(NBDRequest *req)
{
    /* Remember the owner before the request is freed. */
    NBDClient *owner = req->client;

    if (req->data != NULL) {
        qemu_vfree(req->data);
    }
    g_free(req);

    owner->nb_requests -= 1;
    nbd_update_can_read(owner);
    nbd_client_put(owner);
}

M
Max Reitz 已提交
760
/*
 * AioContext notifier: the export's backend was attached to @ctx.
 * Records the new context and installs handlers for every client of
 * the export.
 *
 * @opaque: the NBDExport registered with the notifier.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *export = opaque;
    NBDClient *cl;

    TRACE("Export %s: Attaching clients to AIO context %p\n", export->name, ctx);

    export->ctx = ctx;

    QTAILQ_FOREACH(cl, &export->clients, next) {
        nbd_set_handlers(cl);
    }
}

M
Max Reitz 已提交
774
static void blk_aio_detach(void *opaque)
M
Max Reitz 已提交
775 776 777 778 779 780 781 782 783 784 785 786 787
{
    NBDExport *exp = opaque;
    NBDClient *client;

    TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);

    QTAILQ_FOREACH(client, &exp->clients, next) {
        nbd_unset_handlers(client);
    }

    exp->ctx = NULL;
}

788 789 790 791 792 793
/*
 * Notifier callback registered by nbd_export_new(): closes the export
 * that embeds notifier @n.  @data is unused.
 */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    nbd_export_close(container_of(n, NBDExport, eject_notifier));
}

794
/*
 * Create an export for @blk.
 *
 * @dev_offset: offset stored in the export.
 * @size:       export length, or a negative value to use the length
 *              reported by blk_getlength().
 * @nbdflags:   export flags stored in the export.
 * @close:      optional callback run when the export is freed.
 * @errp:       set if the length cannot be determined.
 *
 * The length is rounded down to a multiple of BDRV_SECTOR_SIZE.  The
 * new export starts with one reference, holds a reference on @blk and
 * registers AioContext and removal notifiers on it.
 *
 * Returns the export, or NULL with @errp set on failure.
 */
NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
                          uint32_t nbdflags, void (*close)(NBDExport *),
                          Error **errp)
{
    NBDExport *export = g_new0(NBDExport, 1);

    export->refcount = 1;
    QTAILQ_INIT(&export->clients);
    export->blk = blk;
    export->dev_offset = dev_offset;
    export->nbdflags = nbdflags;

    if (size < 0) {
        export->size = blk_getlength(blk);
    } else {
        export->size = size;
    }
    if (export->size < 0) {
        error_setg_errno(errp, -export->size,
                         "Failed to determine the NBD export's length");
        /* Nothing else has been acquired yet. */
        g_free(export);
        return NULL;
    }
    export->size -= export->size % BDRV_SECTOR_SIZE;

    export->close = close;
    export->ctx = blk_get_aio_context(blk);
    blk_ref(blk);
    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, export);

    export->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &export->eject_notifier);

    /*
     * NBD exports are used for non-shared storage migration.  Make sure
     * that BDRV_O_INACTIVE is cleared and the image is ready for write
     * access since the export could be available before migration handover.
     */
    aio_context_acquire(export->ctx);
    blk_invalidate_cache(blk, NULL);
    aio_context_release(export->ctx);

    return export;
}

P
Paolo Bonzini 已提交
835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867
/*
 * Look up a named export.
 *
 * Returns the first export whose name equals @name, or NULL if there
 * is none.  No reference is taken.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *candidate;

    QTAILQ_FOREACH(candidate, &exports, next) {
        if (!strcmp(name, candidate->name)) {
            return candidate;
        }
    }

    return NULL;
}

/*
 * Change the name of @exp.
 *
 * A named export is a member of the global export list and that
 * membership holds one reference.  Passing NULL removes the name (and
 * the list membership); passing a string stores a copy of it and adds
 * the export to the list.  Nothing happens if @name is the very
 * pointer already stored.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    if (exp->name == name) {
        return;
    }

    /* Keep the export alive while the list reference may be dropped. */
    nbd_export_get(exp);

    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }

    if (name) {
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }

    nbd_export_put(exp);
}

P
Paolo Bonzini 已提交
868 869
/*
 * Close @exp: close every attached client and remove the export's
 * name.  A temporary reference keeps the export alive for the
 * duration of the call.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *cl, *following;

    nbd_export_get(exp);

    QTAILQ_FOREACH_SAFE(cl, &exp->clients, next, following) {
        client_close(cl);
    }
    nbd_export_set_name(exp, NULL);

    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.
 *
 * If this is the last reference the export is closed first.  When the
 * count then reaches zero the close callback runs, the notifiers are
 * removed, the backend reference is released and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount -= 1;
    if (exp->refcount != 0) {
        return;
    }

    assert(exp->name == NULL);

    if (exp->close) {
        exp->close(exp);
    }

    if (exp->blk) {
        notifier_remove(&exp->eject_notifier);
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, exp);
        blk_unref(exp->blk);
        exp->blk = NULL;
    }

    g_free(exp);
}

912
/* Return the block backend of @exp; no reference is taken. */
BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockBackend *backend = exp->blk;

    return backend;
}

P
Paolo Bonzini 已提交
917 918 919 920 921 922 923 924 925
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

P
Paolo Bonzini 已提交
926 927
/*
 * Send @reply, followed by @len bytes of req->data when @len is
 * non-zero, to the client that owns @req.
 *
 * Must run in a coroutine.  The client's send lock is held for the
 * whole transmission, and the channel is corked while header and
 * payload are written.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the header was
 * sent but the payload write came up short.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    ssize_t rc;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();
    nbd_set_handlers(client);

    if (len) {
        ssize_t sent;

        qio_channel_set_cork(client->ioc, true);
        rc = nbd_send_reply(client->ioc, reply);
        if (rc >= 0) {
            sent = write_sync(client->ioc, req->data, len);
            if (sent != len) {
                rc = -EIO;
            }
        }
        qio_channel_set_cork(client->ioc, false);
    } else {
        rc = nbd_send_reply(client->ioc, reply);
    }

    client->send_coroutine = NULL;
    nbd_set_handlers(client);
    qemu_co_mutex_unlock(&client->send_lock);

    return rc;
}

P
Paolo Bonzini 已提交
957
/*
 * Receive and validate one request for the client that owns @req.
 *
 * Must run in a coroutine.  For read and write commands a data buffer
 * of request->len bytes is allocated into req->data; for writes the
 * payload is then read into it.
 *
 * Returns 0 on success; -EAGAIN if nbd_receive_request() reported it;
 * -EIO for any other receive failure or a short payload read; -EINVAL
 * if from + len overflows or len exceeds NBD_MAX_BUFFER_SIZE; -ENOMEM
 * if the buffer cannot be allocated.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    uint32_t cmd;
    ssize_t rc;

    g_assert(qemu_in_coroutine());
    client->recv_coroutine = qemu_coroutine_self();
    nbd_update_can_read(client);

    rc = nbd_receive_request(client->ioc, request);
    if (rc < 0) {
        /* Every failure other than -EAGAIN is reported as -EIO. */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
        goto out;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
        goto out;
    }

    TRACE("Decoding type");

    cmd = request->type & NBD_CMD_MASK_COMMAND;
    if (cmd == NBD_CMD_READ || cmd == NBD_CMD_WRITE) {
        /* Both commands need a data buffer of the requested length. */
        if (request->len > NBD_MAX_BUFFER_SIZE) {
            LOG("len (%u) is larger than max len (%u)",
                request->len, NBD_MAX_BUFFER_SIZE);
            rc = -EINVAL;
            goto out;
        }

        req->data = blk_try_blockalign(client->exp->blk, request->len);
        if (!req->data) {
            rc = -ENOMEM;
            goto out;
        }
    }

    if (cmd == NBD_CMD_WRITE) {
        TRACE("Reading %u byte(s)", request->len);

        if (read_sync(client->ioc, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            rc = -EIO;
            goto out;
        }
    }

    rc = 0;

out:
    client->recv_coroutine = NULL;
    nbd_update_can_read(client);

    return rc;
}

P
Paolo Bonzini 已提交
1017
static void nbd_trip(void *opaque)
1018
{
P
Paolo Bonzini 已提交
1019
    NBDClient *client = opaque;
1020
    NBDExport *exp = client->exp;
1021
    NBDRequest *req;
N
Nick Thomas 已提交
1022 1023
    struct nbd_request request;
    struct nbd_reply reply;
P
Paolo Bonzini 已提交
1024
    ssize_t ret;
1025
    uint32_t command;
N
Nick Thomas 已提交
1026 1027

    TRACE("Reading request.");
1028 1029 1030
    if (client->closing) {
        return;
    }
N
Nick Thomas 已提交
1031

1032
    req = nbd_request_get(client);
P
Paolo Bonzini 已提交
1033
    ret = nbd_co_receive_request(req, &request);
1034 1035 1036
    if (ret == -EAGAIN) {
        goto done;
    }
1037
    if (ret == -EIO) {
P
Paolo Bonzini 已提交
1038
        goto out;
1039
    }
N
Nick Thomas 已提交
1040

1041 1042 1043
    reply.handle = request.handle;
    reply.error = 0;

1044 1045 1046
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
N
Nick Thomas 已提交
1047
    }
1048 1049
    command = request.type & NBD_CMD_MASK_COMMAND;
    if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
N
Nick Thomas 已提交
1050 1051
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
P
Paolo Bonzini 已提交
1052
                    request.from, request.len,
S
Stefan Weil 已提交
1053
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
N
Nick Thomas 已提交
1054
        LOG("requested operation past EOF--bad client?");
1055
        goto invalid_request;
N
Nick Thomas 已提交
1056 1057
    }

1058 1059 1060 1061 1062 1063 1064 1065
    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

1066
    switch (command) {
N
Nick Thomas 已提交
1067 1068 1069
    case NBD_CMD_READ:
        TRACE("Request type is READ");

P
Paolo Bonzini 已提交
1070
        if (request.type & NBD_CMD_FLAG_FUA) {
M
Max Reitz 已提交
1071
            ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1072 1073 1074 1075 1076 1077 1078
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

M
Max Reitz 已提交
1079 1080 1081
        ret = blk_read(exp->blk,
                       (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                       req->data, request.len / BDRV_SECTOR_SIZE);
1082
        if (ret < 0) {
N
Nick Thomas 已提交
1083
            LOG("reading from file failed");
1084
            reply.error = -ret;
1085
            goto error_reply;
N
Nick Thomas 已提交
1086 1087 1088
        }

        TRACE("Read %u byte(s)", request.len);
P
Paolo Bonzini 已提交
1089
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
P
Paolo Bonzini 已提交
1090
            goto out;
N
Nick Thomas 已提交
1091 1092 1093 1094
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

P
Paolo Bonzini 已提交
1095
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
N
Nick Thomas 已提交
1096
            TRACE("Server is read-only, return error");
1097 1098 1099 1100 1101 1102
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

M
Max Reitz 已提交
1103 1104 1105
        ret = blk_write(exp->blk,
                        (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                        req->data, request.len / BDRV_SECTOR_SIZE);
1106 1107 1108 1109 1110
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }
N
Nick Thomas 已提交
1111

1112
        if (request.type & NBD_CMD_FLAG_FUA) {
M
Max Reitz 已提交
1113
            ret = blk_co_flush(exp->blk);
1114
            if (ret < 0) {
1115
                LOG("flush failed");
1116
                reply.error = -ret;
1117
                goto error_reply;
1118
            }
N
Nick Thomas 已提交
1119 1120
        }

1121
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1122
            goto out;
1123
        }
N
Nick Thomas 已提交
1124 1125 1126 1127
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        errno = 0;
P
Paolo Bonzini 已提交
1128
        goto out;
P
Paolo Bonzini 已提交
1129 1130 1131
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

M
Max Reitz 已提交
1132
        ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1133 1134 1135 1136
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
1137
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1138
            goto out;
1139
        }
P
Paolo Bonzini 已提交
1140 1141 1142
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
M
Max Reitz 已提交
1143 1144 1145
        ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
                                       / BDRV_SECTOR_SIZE,
                             request.len / BDRV_SECTOR_SIZE);
P
Paolo Bonzini 已提交
1146 1147 1148 1149
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
1150
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1151
            goto out;
1152
        }
P
Paolo Bonzini 已提交
1153
        break;
N
Nick Thomas 已提交
1154 1155
    default:
        LOG("invalid request type (%u) received", request.type);
1156
    invalid_request:
Y
Yik Fang 已提交
1157
        reply.error = EINVAL;
1158
    error_reply:
1159
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1160
            goto out;
1161
        }
1162
        break;
N
Nick Thomas 已提交
1163 1164 1165 1166
    }

    TRACE("Request/Reply complete");

1167
done:
P
Paolo Bonzini 已提交
1168 1169 1170
    nbd_request_put(req);
    return;

P
Paolo Bonzini 已提交
1171
out:
1172
    nbd_request_put(req);
M
Max Reitz 已提交
1173
    client_close(client);
B
bellard 已提交
1174
}
P
Paolo Bonzini 已提交
1175

1176 1177 1178 1179
static void nbd_read(void *opaque)
{
    NBDClient *client = opaque;

P
Paolo Bonzini 已提交
1180 1181 1182 1183
    if (client->recv_coroutine) {
        qemu_coroutine_enter(client->recv_coroutine, NULL);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1184 1185 1186
    }
}

P
Paolo Bonzini 已提交
1187 1188 1189 1190 1191 1192 1193
/*
 * Handler installed by nbd_set_handlers() while the client has a send
 * coroutine set: re-enters that coroutine.
 */
static void nbd_restart_write(void *opaque)
{
    NBDClient *c = opaque;

    qemu_coroutine_enter(c->send_coroutine, NULL);
}

M
Max Reitz 已提交
1194 1195 1196
/*
 * (Re)install the fd handlers for the client's socket in the export's
 * AioContext.
 *
 * nbd_read is passed only while client->can_read is set, and
 * nbd_restart_write only while a send coroutine is recorded; otherwise
 * NULL is passed in the respective slot.  Nothing is done when the client
 * has no export or the export has no context.
 */
static void nbd_set_handlers(NBDClient *client)
{
    if (!client->exp || !client->exp->ctx) {
        return;
    }

    aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
                       true,
                       client->can_read ? nbd_read : NULL,
                       client->send_coroutine ? nbd_restart_write : NULL,
                       client);
}

/*
 * Clear the fd handlers for the client's socket in the export's
 * AioContext by passing NULL for both callbacks and the opaque pointer.
 * Nothing is done when the client has no export or the export has no
 * context.
 */
static void nbd_unset_handlers(NBDClient *client)
{
    if (!client->exp || !client->exp->ctx) {
        return;
    }

    aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
                       true, NULL, NULL, NULL);
}

/*
 * Recompute client->can_read and refresh the fd handlers when it changed.
 *
 * The client may be read from while a receive coroutine is recorded, or
 * while fewer than MAX_NBD_REQUESTS requests are outstanding.
 */
static void nbd_update_can_read(NBDClient *client)
{
    bool readable = client->recv_coroutine ||
                    client->nb_requests < MAX_NBD_REQUESTS;

    if (readable == client->can_read) {
        return;
    }

    client->can_read = readable;

    /* aio_set_fd_handler() in nbd_set_handlers() already takes care of
     * notification, so no separate aio_notify() call is needed here. */
    nbd_set_handlers(client);
}

1227 1228 1229 1230 1231 1232 1233 1234 1235 1236
/*
 * Coroutine body created by nbd_client_new(): negotiate with the client
 * and, on success, start serving it.
 *
 * @opaque: the NBDClientNewData allocated by nbd_client_new(); it is freed
 *          here on every path.
 *
 * If the client already has an export when the coroutine starts, a
 * reference to it is taken before negotiating and, after a successful
 * negotiation, the client is appended to that export's client list.  A
 * failed negotiation closes the client.
 */
static coroutine_fn void nbd_co_client_start(void *opaque)
{
    NBDClientNewData *data = opaque;
    NBDClient *client = data->client;
    NBDExport *exp = client->exp;

    if (exp) {
        nbd_export_get(exp);
    }

    if (nbd_negotiate(data)) {
        client_close(client);
    } else {
        qemu_co_mutex_init(&client->send_lock);
        nbd_set_handlers(client);

        if (exp) {
            QTAILQ_INSERT_TAIL(&exp->clients, client, next);
        }
    }

    g_free(data);
}

1250 1251
void nbd_client_new(NBDExport *exp,
                    QIOChannelSocket *sioc,
1252 1253
                    QCryptoTLSCreds *tlscreds,
                    const char *tlsaclname,
1254
                    void (*close_fn)(NBDClient *))
P
Paolo Bonzini 已提交
1255
{
1256
    NBDClient *client;
1257 1258
    NBDClientNewData *data = g_new(NBDClientNewData, 1);

1259 1260 1261
    client = g_malloc0(sizeof(NBDClient));
    client->refcount = 1;
    client->exp = exp;
1262 1263 1264 1265 1266
    client->tlscreds = tlscreds;
    if (tlscreds) {
        object_ref(OBJECT(client->tlscreds));
    }
    client->tlsaclname = g_strdup(tlsaclname);
1267 1268 1269 1270
    client->sioc = sioc;
    object_ref(OBJECT(client->sioc));
    client->ioc = QIO_CHANNEL(sioc);
    object_ref(OBJECT(client->ioc));
M
Max Reitz 已提交
1271
    client->can_read = true;
1272
    client->close = close_fn;
1273

1274 1275 1276
    data->client = client;
    data->co = qemu_coroutine_create(nbd_co_client_start);
    qemu_coroutine_enter(data->co, data);
P
Paolo Bonzini 已提交
1277
}