server.c 32.9 KB
Newer Older
1
/*
B
bellard 已提交
2 3
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
F
Fam Zheng 已提交
4
 *  Network Block Device Server Side
B
bellard 已提交
5 6 7 8 9 10 11 12 13 14 15
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
B
bellard 已提交
18

P
Peter Maydell 已提交
19
#include "qemu/osdep.h"
F
Fam Zheng 已提交
20
#include "nbd-internal.h"
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44

/*
 * Translate a host errno value into the NBD error code placed in a reply.
 *
 * 0 maps to NBD_SUCCESS.  Any errno without a dedicated mapping (EINVAL
 * included) is reported as NBD_EINVAL.
 */
static int system_errno_to_nbd_errno(int err)
{
    int nbd_err;

    switch (err) {
    case 0:
        nbd_err = NBD_SUCCESS;
        break;
    case EPERM:
        nbd_err = NBD_EPERM;
        break;
    case EIO:
        nbd_err = NBD_EIO;
        break;
    case ENOMEM:
        nbd_err = NBD_ENOMEM;
        break;
    /* Every "no room left" flavour collapses into one NBD code. */
    case ENOSPC:
    case EFBIG:
#ifdef EDQUOT
    case EDQUOT:
#endif
        nbd_err = NBD_ENOSPC;
        break;
    default:
        nbd_err = NBD_EINVAL;
        break;
    }

    return nbd_err;
}

45 46 47 48 49 50 51 52 53 54 55
/* Definitions for opaque data types */

typedef struct NBDRequest NBDRequest;

/*
 * Per-request state.  Created by nbd_request_get() and destroyed by
 * nbd_request_put().
 */
struct NBDRequest {
    /* Queue linkage. */
    QSIMPLEQ_ENTRY(NBDRequest) entry;
    /* Client that issued the request; nbd_request_get() takes a reference. */
    NBDClient *client;
    /*
     * Payload buffer for read/write commands, or NULL.  Allocated with
     * blk_try_blockalign() and released with qemu_vfree().
     */
    uint8_t *data;
};

struct NBDExport {
56
    int refcount;
57 58
    void (*close)(NBDExport *exp);

M
Max Reitz 已提交
59
    BlockBackend *blk;
P
Paolo Bonzini 已提交
60
    char *name;
61 62 63
    off_t dev_offset;
    off_t size;
    uint32_t nbdflags;
64
    QTAILQ_HEAD(, NBDClient) clients;
P
Paolo Bonzini 已提交
65
    QTAILQ_ENTRY(NBDExport) next;
M
Max Reitz 已提交
66 67

    AioContext *ctx;
68 69

    Notifier eject_notifier;
70 71
};

P
Paolo Bonzini 已提交
72 73
/*
 * All exports that currently have a name.  nbd_export_set_name() inserts
 * and removes entries; nbd_export_find() searches the list.
 */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

74 75 76 77 78
/* State of one connected client.  Reference counted. */
struct NBDClient {
    /* Manipulated through nbd_client_get() / nbd_client_put(). */
    int refcount;
    /* Optional hook invoked once by client_close(). */
    void (*close)(NBDClient *client);

    /* Export the client is attached to, or NULL before one is selected. */
    NBDExport *exp;
    /* TLS credentials and ACL name; both are released in nbd_client_put(). */
    QCryptoTLSCreds *tlscreds;
    char *tlsaclname;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    /* Coroutine currently inside nbd_co_receive_request(), else NULL. */
    Coroutine *recv_coroutine;

    /* Serialises nbd_co_send_reply() callers. */
    CoMutex send_lock;
    /* Coroutine currently sending a reply, else NULL. */
    Coroutine *send_coroutine;

    bool can_read;

    /* Linkage in NBDExport.clients. */
    QTAILQ_ENTRY(NBDClient) next;
    /* Number of live NBDRequest objects; bounded by MAX_NBD_REQUESTS. */
    int nb_requests;
    /* Set by client_close(); the client is being torn down. */
    bool closing;
};

B
bellard 已提交
96 97
/* That's all folks */

M
Max Reitz 已提交
98 99 100 101
/* Forward declarations: these helpers are called before they are defined. */
static void nbd_set_handlers(NBDClient *client);
static void nbd_unset_handlers(NBDClient *client);
static void nbd_update_can_read(NBDClient *client);

102 103 104
/*
 * Channel watch callback used while negotiating: re-enter the coroutine
 * passed as @opaque.  Always returns TRUE; the negotiation helpers remove
 * the watch themselves with g_source_remove().
 */
static gboolean nbd_negotiate_continue(QIOChannel *ioc,
                                       GIOCondition condition,
                                       void *opaque)
{
    Coroutine *negotiator = opaque;

    qemu_coroutine_enter(negotiator, NULL);
    return TRUE;
}

110
/*
 * Read @size bytes from @ioc into @buffer during negotiation.
 *
 * Must be called from a coroutine.  A G_IO_IN watch that re-enters this
 * coroutine is installed for the duration of the read and removed after.
 * Returns whatever read_sync() returns.
 */
static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size)
{
    guint watch_id;
    ssize_t nread;

    assert(qemu_in_coroutine());

    /* Negotiation always runs in the main loop. */
    watch_id = qio_channel_add_watch(ioc, G_IO_IN, nbd_negotiate_continue,
                                     qemu_coroutine_self(), NULL);
    nread = read_sync(ioc, buffer, size);
    g_source_remove(watch_id);

    return nread;
}

128
/*
 * Write @size bytes from @buffer to @ioc during negotiation.
 *
 * Must be called from a coroutine.  A G_IO_OUT watch that re-enters this
 * coroutine is installed for the duration of the write and removed after.
 * Returns whatever write_sync() returns.
 */
static ssize_t nbd_negotiate_write(QIOChannel *ioc, void *buffer, size_t size)
{
    guint watch_id;
    ssize_t nwritten;

    assert(qemu_in_coroutine());

    /* Negotiation always runs in the main loop. */
    watch_id = qio_channel_add_watch(ioc, G_IO_OUT, nbd_negotiate_continue,
                                     qemu_coroutine_self(), NULL);
    nwritten = write_sync(ioc, buffer, size);
    g_source_remove(watch_id);

    return nwritten;
}

145
/*
 * Read and discard @size bytes from @ioc.
 *
 * Data is consumed through a bounce buffer of at most 64 KiB.
 *
 * Returns @size on success or a negative errno on failure.  A read that
 * makes no progress is reported as -EIO: other callers in this file treat
 * short reads as possible (presumably at end of stream -- read_sync() is
 * defined elsewhere), and looping on a zero-length read would never end.
 */
static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size)
{
    const size_t chunk_max = 65536;
    size_t remaining = size;
    ssize_t result = size;
    uint8_t *buffer = g_malloc(MIN(chunk_max, size));

    while (remaining > 0) {
        ssize_t n = nbd_negotiate_read(ioc, buffer, MIN(chunk_max, remaining));

        if (n < 0) {
            result = n;
            break;
        }
        if (n == 0) {
            /* No progress: bail out instead of spinning forever. */
            result = -EIO;
            break;
        }

        assert((size_t)n <= remaining);
        remaining -= n;
    }

    g_free(buffer);
    return result;
}

165
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/

192
/*
 * Send an option reply that carries no payload.
 *
 * @type: reply type (e.g. NBD_REP_ACK or one of the NBD_REP_ERR_* codes)
 * @opt:  the option being answered
 *
 * Wire layout, all big-endian: 64-bit NBD_REP_MAGIC, 32-bit option,
 * 32-bit reply type, 32-bit data length (always 0 here).
 *
 * Returns 0 on success, -EINVAL if any of the writes falls short.
 */
static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt)
{
    uint64_t magic;
    uint32_t len;

    /* Argument order must follow the format string: option, then type. */
    TRACE("Reply opt=%x type=%x", opt, type);

    magic = cpu_to_be64(NBD_REP_MAGIC);
    if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("write failed (rep magic)");
        return -EINVAL;
    }
    opt = cpu_to_be32(opt);
    if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) {
        LOG("write failed (rep opt)");
        return -EINVAL;
    }
    type = cpu_to_be32(type);
    if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) {
        LOG("write failed (rep type)");
        return -EINVAL;
    }
    len = cpu_to_be32(0);
    if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
        LOG("write failed (rep data length)");
        return -EINVAL;
    }
    return 0;
}
221

222
/*
 * Send one NBD_REP_SERVER reply to NBD_OPT_LIST, advertising @exp.
 *
 * Wire layout, all big-endian: 64-bit NBD_REP_MAGIC, 32-bit NBD_OPT_LIST,
 * 32-bit NBD_REP_SERVER, 32-bit payload length (name length + 4),
 * 32-bit name length, then the name bytes without a terminator.
 *
 * Returns 0 on success, -EINVAL if any of the writes falls short.
 */
static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp)
{
    uint64_t be_magic, name_bytes;
    uint32_t be_opt, be_type, be_len;

    TRACE("Advertizing export name '%s'", exp->name ? exp->name : "");
    name_bytes = strlen(exp->name);

    be_magic = cpu_to_be64(NBD_REP_MAGIC);
    if (nbd_negotiate_write(ioc, &be_magic, sizeof(be_magic)) !=
        sizeof(be_magic)) {
        LOG("write failed (magic)");
        return -EINVAL;
    }

    be_opt = cpu_to_be32(NBD_OPT_LIST);
    if (nbd_negotiate_write(ioc, &be_opt, sizeof(be_opt)) != sizeof(be_opt)) {
        LOG("write failed (opt)");
        return -EINVAL;
    }

    be_type = cpu_to_be32(NBD_REP_SERVER);
    if (nbd_negotiate_write(ioc, &be_type, sizeof(be_type)) !=
        sizeof(be_type)) {
        LOG("write failed (reply type)");
        return -EINVAL;
    }

    /* Total payload: the 32-bit name length field plus the name itself. */
    be_len = cpu_to_be32(name_bytes + sizeof(be_len));
    if (nbd_negotiate_write(ioc, &be_len, sizeof(be_len)) != sizeof(be_len)) {
        LOG("write failed (length)");
        return -EINVAL;
    }

    be_len = cpu_to_be32(name_bytes);
    if (nbd_negotiate_write(ioc, &be_len, sizeof(be_len)) != sizeof(be_len)) {
        LOG("write failed (length)");
        return -EINVAL;
    }

    if (nbd_negotiate_write(ioc, exp->name, name_bytes) != name_bytes) {
        LOG("write failed (buffer)");
        return -EINVAL;
    }

    return 0;
}

261
/*
 * Handle NBD_OPT_LIST.
 *
 * A request carrying a payload (@length != 0) is answered with
 * NBD_REP_ERR_INVALID after the payload has been discarded.  Otherwise one
 * NBD_REP_SERVER reply is sent per named export, followed by NBD_REP_ACK.
 *
 * Returns 0 on success or a negative errno.
 */
static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length)
{
    NBDExport *export;

    if (length != 0) {
        if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
            return -EIO;
        }
        return nbd_negotiate_send_rep(client->ioc,
                                      NBD_REP_ERR_INVALID, NBD_OPT_LIST);
    }

    /* One NBD_REP_SERVER reply for every export. */
    QTAILQ_FOREACH(export, &exports, next) {
        if (nbd_negotiate_send_rep_list(client->ioc, export) != 0) {
            return -EINVAL;
        }
    }

    /* Terminate the list with NBD_REP_ACK. */
    return nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_LIST);
}

283
/*
 * Handle NBD_OPT_EXPORT_NAME: read the @length-byte export name, look it
 * up and attach @client to the export found.
 *
 * On success the client is appended to the export's client list and holds
 * a reference on the export.  Returns 0 on success, -EINVAL if the name is
 * longer than 255 bytes, cannot be read, or matches no export.
 */
static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length)
{
    char export_name[256];

    /* Client sends:
        [20 ..  xx]   export name (length bytes)
     */
    TRACE("Checking length");
    if (length >= sizeof(export_name)) {
        LOG("Bad length received");
        return -EINVAL;
    }
    if (nbd_negotiate_read(client->ioc, export_name, length) != length) {
        LOG("read failed");
        return -EINVAL;
    }
    export_name[length] = '\0';

    TRACE("Client requested export '%s'", export_name);

    client->exp = nbd_export_find(export_name);
    if (client->exp == NULL) {
        LOG("export not found");
        return -EINVAL;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);
    return 0;
}

317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365

/*
 * Handle NBD_OPT_STARTTLS: acknowledge the option and run the TLS server
 * handshake on top of the client's current channel.
 *
 * A request carrying a payload (@length != 0) is rejected with
 * NBD_REP_ERR_INVALID after the payload has been discarded.
 *
 * Returns the new TLS channel on success, or NULL on failure.  Failure
 * covers a malformed request, a failed ACK write, a failure to create the
 * TLS channel, and a failed handshake.
 */
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
                                                 uint32_t length)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSHandshakeData data = { 0 };

    TRACE("Setting up TLS");
    ioc = client->ioc;
    if (length) {
        if (nbd_negotiate_drop_sync(ioc, length) != length) {
            return NULL;
        }
        nbd_negotiate_send_rep(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS);
        return NULL;
    }

    /* Do not start a handshake the client was never told to expect. */
    if (nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK,
                               NBD_OPT_STARTTLS) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsaclname,
                                      NULL);
    if (!tioc) {
        return NULL;
    }

    TRACE("Starting TLS handshake");
    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    qio_channel_tls_handshake(tioc,
                              nbd_tls_handshake,
                              &data,
                              NULL);

    /* Run a nested main loop unless the handshake has already completed. */
    if (!data.complete) {
        g_main_loop_run(data.loop);
    }
    g_main_loop_unref(data.loop);
    if (data.error) {
        object_unref(OBJECT(tioc));
        error_free(data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}


366
/*
 * Run new-style option negotiation with @client.
 *
 * Reads the 32-bit client flags, then loops over option requests until
 * NBD_OPT_EXPORT_NAME selects an export or negotiation fails.
 *
 * When TLS credentials are configured and the client is still on the plain
 * socket channel, only NBD_OPT_STARTTLS is accepted.
 *
 * In fixed-newstyle mode an unknown option is not fatal.  Its payload is
 * discarded, NBD_REP_ERR_UNSUP is sent, and negotiation continues so the
 * client can fall back to an option it knows the server supports.
 *
 * Returns 0 once an export has been selected, or a negative errno.
 */
static int nbd_negotiate_options(NBDClient *client)
{
    uint32_t flags;
    bool fixedNewstyle = false;

    /* Client sends:
        [ 0 ..   3]   client flags

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) !=
        sizeof(flags)) {
        LOG("read failed");
        return -EIO;
    }
    TRACE("Checking client flags");
    be32_to_cpus(&flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        TRACE("Support supports fixed newstyle handshake");
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
    }
    if (flags != 0) {
        TRACE("Unknown client flags 0x%x received", flags);
        return -EIO;
    }

    while (1) {
        int ret;
        uint32_t clientflags, length;
        uint64_t magic;

        if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) !=
            sizeof(magic)) {
            LOG("read failed");
            return -EINVAL;
        }
        TRACE("Checking opts magic");
        if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
            LOG("Bad magic received");
            return -EINVAL;
        }

        if (nbd_negotiate_read(client->ioc, &clientflags,
                               sizeof(clientflags)) != sizeof(clientflags)) {
            LOG("read failed");
            return -EINVAL;
        }
        clientflags = be32_to_cpu(clientflags);

        if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) !=
            sizeof(length)) {
            LOG("read failed");
            return -EINVAL;
        }
        length = be32_to_cpu(length);

        TRACE("Checking option 0x%x", clientflags);
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            /* TLS is mandatory and has not been started yet. */
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                TRACE("Unsupported option 0x%x", clientflags);
                return -EINVAL;
            }
            switch (clientflags) {
            case NBD_OPT_STARTTLS:
                tioc = nbd_negotiate_handle_starttls(client, length);
                if (!tioc) {
                    return -EIO;
                }
                /* Switch all further I/O over to the TLS channel. */
                object_unref(OBJECT(client->ioc));
                client->ioc = QIO_CHANNEL(tioc);
                break;

            default:
                TRACE("Option 0x%x not permitted before TLS", clientflags);
                nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_TLS_REQD,
                                       clientflags);
                return -EINVAL;
            }
        } else if (fixedNewstyle) {
            switch (clientflags) {
            case NBD_OPT_LIST:
                ret = nbd_negotiate_handle_list(client, length);
                if (ret < 0) {
                    return ret;
                }
                break;

            case NBD_OPT_ABORT:
                return -EINVAL;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, length);

            case NBD_OPT_STARTTLS:
                if (client->tlscreds) {
                    TRACE("TLS already enabled");
                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_INVALID,
                                           clientflags);
                } else {
                    TRACE("TLS not configured");
                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_POLICY,
                                           clientflags);
                }
                return -EINVAL;
            default:
                TRACE("Unsupported option 0x%x", clientflags);
                /*
                 * Skip the option's payload so the stream stays in sync,
                 * report the option as unsupported and keep negotiating.
                 */
                if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
                    return -EIO;
                }
                ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP,
                                             clientflags);
                if (ret < 0) {
                    return ret;
                }
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (clientflags) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, length);

            default:
                TRACE("Unsupported option 0x%x", clientflags);
                return -EINVAL;
            }
        }
    }
}

505 506 507 508 509 510
/*
 * Argument bundle handed to nbd_negotiate().
 * NOTE(review): @co is not referenced in the code visible here; it is
 * presumably the coroutine that runs the negotiation -- confirm.
 */
typedef struct {
    NBDClient *client;
    Coroutine *co;
} NBDClientNewData;

/*
 * Perform the initial handshake with the client in @data.
 *
 * Old-style negotiation is used when the client already has an export and
 * no TLS credentials are configured: the whole 152-byte header is sent in
 * one go.  Otherwise the new-style greeting is sent, options are negotiated
 * (which selects the export), and the export information follows.
 *
 * Returns 0 on success or a negative errno.  Every failed write is
 * reported as -EINVAL, including the final new-style export header.
 */
static coroutine_fn int nbd_negotiate(NBDClientNewData *data)
{
    NBDClient *client = data->client;
    char buf[8 + 8 + 8 + 128];
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
    bool oldStyle;

    /* Old style negotiation header without options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       New style negotiation header with options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (NBD_FLAG_FIXED_NEWSTYLE)
        ....options sent....
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qio_channel_set_blocking(client->ioc, false, NULL);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);

    /* Old style implies no TLS credentials, so no TLS check is needed. */
    oldStyle = client->exp != NULL && !client->tlscreds;
    if (oldStyle) {
        assert ((client->exp->nbdflags & ~65535) == 0);
        stq_be_p(buf + 8, NBD_CLIENT_MAGIC);
        stq_be_p(buf + 16, client->exp->size);
        stw_be_p(buf + 26, client->exp->nbdflags | myflags);

        if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        stq_be_p(buf + 8, NBD_OPTS_MAGIC);
        stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE);

        if (nbd_negotiate_write(client->ioc, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_negotiate_options(client);
        if (rc != 0) {
            LOG("option negotiation failed");
            goto fail;
        }

        assert ((client->exp->nbdflags & ~65535) == 0);
        stq_be_p(buf + 18, client->exp->size);
        stw_be_p(buf + 26, client->exp->nbdflags | myflags);
        if (nbd_negotiate_write(client->ioc, buf + 18, sizeof(buf) - 18) !=
            sizeof(buf) - 18) {
            LOG("write failed");
            /* rc is 0 here after successful option negotiation. */
            rc = -EINVAL;
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    return rc;
}

P
Paolo Bonzini 已提交
591
#ifdef __linux__
B
bellard 已提交
592 593 594

int nbd_disconnect(int fd)
{
N
Nick Thomas 已提交
595 596 597 598
    ioctl(fd, NBD_CLEAR_QUE);
    ioctl(fd, NBD_DISCONNECT);
    ioctl(fd, NBD_CLEAR_SOCK);
    return 0;
B
bellard 已提交
599 600
}

601 602 603 604
#else

int nbd_disconnect(int fd)
{
605
    return -ENOTSUP;
606 607
}
#endif
B
bellard 已提交
608

609
/*
 * Read one request header from @ioc and decode it into @request.
 *
 * Returns 0 on success, the negative value from read_sync() if the read
 * fails, or -EINVAL on a short read or a bad magic number.  On a bad magic
 * the other fields of @request have already been filled in.
 */
static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request)
{
    uint8_t buf[NBD_REQUEST_SIZE];
    uint32_t magic;
    ssize_t ret;

    ret = read_sync(ioc, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
        LOG("read failed");
        return -EINVAL;
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    /*
     * buf is a byte array with no alignment guarantee, so decode it with
     * the byte-wise load helpers instead of casting to wider pointer types.
     * This matches the st*_be_p() stores used when building replies.
     */
    magic = ldl_be_p(buf);
    request->type   = ldl_be_p(buf + 4);
    request->handle = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    request->len    = ldl_be_p(buf + 24);

    TRACE("Got request: "
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
          magic, request->type, request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
        return -EINVAL;
    }
    return 0;
}

650
/*
 * Serialise @reply and write it to @ioc.
 *
 * reply->error is converted in place from a host errno to the NBD error
 * code before it is sent, so the caller's structure is modified.
 *
 * Returns 0 on success, the negative value from write_sync() if the write
 * fails, or -EINVAL on a short write.
 */
static ssize_t nbd_send_reply(QIOChannel *ioc, struct nbd_reply *reply)
{
    uint8_t wire[NBD_REPLY_SIZE];
    ssize_t nwritten;

    reply->error = system_errno_to_nbd_errno(reply->error);

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 8 .. 15]    handle
     */
    stl_be_p(wire, NBD_REPLY_MAGIC);
    stl_be_p(wire + 4, reply->error);
    stq_be_p(wire + 8, reply->handle);

    TRACE("Sending response to client");

    nwritten = write_sync(ioc, wire, sizeof(wire));
    if (nwritten < 0) {
        return nwritten;
    }
    if (nwritten != sizeof(wire)) {
        LOG("writing to socket failed");
        return -EINVAL;
    }

    return 0;
}
B
bellard 已提交
679

P
Paolo Bonzini 已提交
680 681
/* Upper bound on NBDClient.nb_requests, asserted in nbd_request_get(). */
#define MAX_NBD_REQUESTS 16

682
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}

687
/*
 * Drop one reference on @client.  When the count reaches zero the client
 * is destroyed: its handlers are removed, both channels and the TLS
 * credentials are unreferenced, it is unlinked from its export (dropping
 * the export reference) and its memory is freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by client_close.
     */
    assert(client->closing);

    nbd_unset_handlers(client);
    object_unref(OBJECT(client->sioc));
    object_unref(OBJECT(client->ioc));
    if (client->tlscreds != NULL) {
        object_unref(OBJECT(client->tlscreds));
    }
    g_free(client->tlsaclname);

    if (client->exp != NULL) {
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}

M
Max Reitz 已提交
710
/*
 * Begin tearing down @client.  Only the first call has any effect: it
 * marks the client as closing, shuts the channel down in both directions
 * and invokes the client's close hook, if any.
 */
static void client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);

        /* Also tell the client, so that they release their reference.  */
        if (client->close != NULL) {
            client->close(client);
        }
    }
}

730
/*
 * Allocate a zero-initialised request for @client.
 *
 * The client's request count is incremented (it must be below
 * MAX_NBD_REQUESTS on entry) and the request holds a client reference.
 * Release the request with nbd_request_put().
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDRequest *request;

    assert(client->nb_requests < MAX_NBD_REQUESTS);
    client->nb_requests += 1;
    nbd_update_can_read(client);

    request = g_new0(NBDRequest, 1);
    nbd_client_get(client);
    request->client = client;

    return request;
}

744
/*
 * Free @req and its payload buffer, decrement the owning client's request
 * count and drop the client reference taken by nbd_request_get().
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *owner = req->client;

    if (req->data != NULL) {
        qemu_vfree(req->data);
    }
    g_free(req);

    owner->nb_requests -= 1;
    nbd_update_can_read(owner);
    nbd_client_put(owner);
}

M
Max Reitz 已提交
758
/*
 * AioContext notifier: the export passed as @opaque is now served by
 * @ctx.  Record the context and install handlers for every attached client.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *export = opaque;
    NBDClient *cur;

    TRACE("Export %s: Attaching clients to AIO context %p\n", export->name, ctx);

    export->ctx = ctx;

    QTAILQ_FOREACH(cur, &export->clients, next) {
        nbd_set_handlers(cur);
    }
}

M
Max Reitz 已提交
772
static void blk_aio_detach(void *opaque)
M
Max Reitz 已提交
773 774 775 776 777 778 779 780 781 782 783 784 785
{
    NBDExport *exp = opaque;
    NBDClient *client;

    TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);

    QTAILQ_FOREACH(client, &exp->clients, next) {
        nbd_unset_handlers(client);
    }

    exp->ctx = NULL;
}

786 787 788 789 790 791
/*
 * Notifier callback registered by nbd_export_new(): close the export that
 * embeds @n.  @data is unused.
 */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    nbd_export_close(container_of(n, NBDExport, eject_notifier));
}

792
/*
 * Create an export for @blk.
 *
 * @dev_offset: stored in the export as its device offset
 * @size:       export length in bytes; a negative value means "use the
 *              length reported by blk_getlength()"
 * @nbdflags:   export flags stored in the export
 * @close:      optional hook run when the last reference is dropped
 * @errp:       set when the length cannot be determined
 *
 * The length is rounded down to a multiple of BDRV_SECTOR_SIZE.  On
 * success the export has one reference, holds a reference on @blk, and has
 * AioContext and remove-bs notifiers registered on it.
 *
 * Returns the new export, or NULL with @errp set on failure.
 */
NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
                          uint32_t nbdflags, void (*close)(NBDExport *),
                          Error **errp)
{
    NBDExport *export = g_new0(NBDExport, 1);

    export->refcount = 1;
    QTAILQ_INIT(&export->clients);
    export->blk = blk;
    export->dev_offset = dev_offset;
    export->nbdflags = nbdflags;

    if (size < 0) {
        export->size = blk_getlength(blk);
    } else {
        export->size = size;
    }
    if (export->size < 0) {
        error_setg_errno(errp, -export->size,
                         "Failed to determine the NBD export's length");
        /* Nothing else has been acquired yet; just release the struct. */
        g_free(export);
        return NULL;
    }
    export->size -= export->size % BDRV_SECTOR_SIZE;

    export->close = close;
    export->ctx = blk_get_aio_context(blk);
    blk_ref(blk);
    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach,
                                 export);

    export->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &export->eject_notifier);

    /*
     * NBD exports are used for non-shared storage migration.  Make sure
     * that BDRV_O_INACTIVE is cleared and the image is ready for write
     * access since the export could be available before migration handover.
     */
    aio_context_acquire(export->ctx);
    blk_invalidate_cache(blk, NULL);
    aio_context_release(export->ctx);

    return export;
}

P
Paolo Bonzini 已提交
833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
/*
 * Look up a named export.  Returns the first export whose name equals
 * @name exactly, or NULL if there is none.  No reference is taken.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *candidate;

    QTAILQ_FOREACH(candidate, &exports, next) {
        if (!strcmp(name, candidate->name)) {
            return candidate;
        }
    }

    return NULL;
}

/*
 * Rename @exp to @name, or make it unnamed when @name is NULL.
 *
 * A named export is linked into the global exports list and that
 * membership holds one reference.  The name is copied with g_strdup().
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    /* Same pointer (including NULL -> NULL): nothing to do. */
    if (exp->name == name) {
        return;
    }

    /*
     * Temporary reference: keeps exp alive while the reference held for
     * the old name is dropped below.
     */
    nbd_export_get(exp);
    if (exp->name != NULL) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }
    if (name != NULL) {
        /* Reference owned by the exports list entry. */
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }
    nbd_export_put(exp);
}

P
Paolo Bonzini 已提交
866 867
/*
 * Close every client attached to @exp and remove its name, which also
 * unlinks it from the global list.  A temporary reference is held for the
 * duration of the call.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *cur, *following;

    nbd_export_get(exp);

    QTAILQ_FOREACH_SAFE(cur, &exp->clients, next, following) {
        client_close(cur);
    }
    nbd_export_set_name(exp, NULL);

    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.
 *
 * If this is the last reference the export is closed first.  Once the
 * count reaches zero the close hook runs, the notifiers are removed, the
 * block backend reference is dropped and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    assert(exp->name == NULL);

    if (exp->close != NULL) {
        exp->close(exp);
    }

    if (exp->blk != NULL) {
        notifier_remove(&exp->eject_notifier);
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, exp);
        blk_unref(exp->blk);
        exp->blk = NULL;
    }

    g_free(exp);
}

910
/* Return the block backend behind @exp.  No reference is taken. */
BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockBackend *backend = exp->blk;

    return backend;
}

P
Paolo Bonzini 已提交
915 916 917 918 919 920 921 922 923
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

P
Paolo Bonzini 已提交
924 925
/*
 * Send @reply to the client that owns @req, followed by @len bytes of
 * req->data when @len is non-zero.
 *
 * Must run in a coroutine.  The client's send_lock is held for the whole
 * transmission.  When a payload follows, the channel is corked so that
 * header and payload can go out together.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the header went out
 * but the payload write fell short.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    const bool has_payload = len != 0;
    ssize_t rc;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();
    nbd_set_handlers(client);

    if (has_payload) {
        qio_channel_set_cork(client->ioc, true);
    }

    rc = nbd_send_reply(client->ioc, reply);

    if (has_payload) {
        if (rc >= 0 && write_sync(client->ioc, req->data, len) != len) {
            rc = -EIO;
        }
        qio_channel_set_cork(client->ioc, false);
    }

    client->send_coroutine = NULL;
    nbd_set_handlers(client);
    qemu_co_mutex_unlock(&client->send_lock);

    return rc;
}

P
Paolo Bonzini 已提交
955
/*
 * Receive the next request for the client that owns @req.
 *
 * The header is decoded into @request.  For reads and writes a buffer of
 * request->len bytes is allocated into req->data; for writes the payload
 * is then read into it.
 *
 * Must run in a coroutine.  client->recv_coroutine is set for the duration
 * of the call.
 *
 * Returns 0 on success or one of:
 *   -EAGAIN  passed through from nbd_receive_request()
 *   -EIO     any other header failure, or a short payload read
 *   -EINVAL  from + len overflows, or len exceeds NBD_MAX_BUFFER_SIZE
 *   -ENOMEM  the data buffer could not be allocated
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    uint32_t command;
    bool is_write;
    ssize_t rc;

    g_assert(qemu_in_coroutine());
    client->recv_coroutine = qemu_coroutine_self();
    nbd_update_can_read(client);

    rc = nbd_receive_request(client->ioc, request);
    if (rc < 0) {
        /* Every failure other than -EAGAIN is reported as an I/O error. */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
        goto out;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
        goto out;
    }

    TRACE("Decoding type");

    command = request->type & NBD_CMD_MASK_COMMAND;
    is_write = (command == NBD_CMD_WRITE);

    if (is_write || command == NBD_CMD_READ) {
        if (request->len > NBD_MAX_BUFFER_SIZE) {
            LOG("len (%u) is larger than max len (%u)",
                request->len, NBD_MAX_BUFFER_SIZE);
            rc = -EINVAL;
            goto out;
        }

        req->data = blk_try_blockalign(client->exp->blk, request->len);
        if (!req->data) {
            rc = -ENOMEM;
            goto out;
        }
    }

    if (is_write) {
        TRACE("Reading %u byte(s)", request->len);

        if (read_sync(client->ioc, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            rc = -EIO;
            goto out;
        }
    }

    rc = 0;

out:
    client->recv_coroutine = NULL;
    nbd_update_can_read(client);

    return rc;
}

P
Paolo Bonzini 已提交
1015
static void nbd_trip(void *opaque)
1016
{
P
Paolo Bonzini 已提交
1017
    NBDClient *client = opaque;
1018
    NBDExport *exp = client->exp;
1019
    NBDRequest *req;
N
Nick Thomas 已提交
1020 1021
    struct nbd_request request;
    struct nbd_reply reply;
P
Paolo Bonzini 已提交
1022
    ssize_t ret;
1023
    uint32_t command;
N
Nick Thomas 已提交
1024 1025

    TRACE("Reading request.");
1026 1027 1028
    if (client->closing) {
        return;
    }
N
Nick Thomas 已提交
1029

1030
    req = nbd_request_get(client);
P
Paolo Bonzini 已提交
1031
    ret = nbd_co_receive_request(req, &request);
1032 1033 1034
    if (ret == -EAGAIN) {
        goto done;
    }
1035
    if (ret == -EIO) {
P
Paolo Bonzini 已提交
1036
        goto out;
1037
    }
N
Nick Thomas 已提交
1038

1039 1040 1041
    reply.handle = request.handle;
    reply.error = 0;

1042 1043 1044
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
N
Nick Thomas 已提交
1045
    }
1046 1047
    command = request.type & NBD_CMD_MASK_COMMAND;
    if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
N
Nick Thomas 已提交
1048 1049
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
P
Paolo Bonzini 已提交
1050
                    request.from, request.len,
S
Stefan Weil 已提交
1051
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
N
Nick Thomas 已提交
1052
        LOG("requested operation past EOF--bad client?");
1053
        goto invalid_request;
N
Nick Thomas 已提交
1054 1055
    }

1056 1057 1058 1059 1060 1061 1062 1063
    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

1064
    switch (command) {
N
Nick Thomas 已提交
1065 1066 1067
    case NBD_CMD_READ:
        TRACE("Request type is READ");

P
Paolo Bonzini 已提交
1068
        if (request.type & NBD_CMD_FLAG_FUA) {
M
Max Reitz 已提交
1069
            ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1070 1071 1072 1073 1074 1075 1076
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

M
Max Reitz 已提交
1077 1078 1079
        ret = blk_read(exp->blk,
                       (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                       req->data, request.len / BDRV_SECTOR_SIZE);
1080
        if (ret < 0) {
N
Nick Thomas 已提交
1081
            LOG("reading from file failed");
1082
            reply.error = -ret;
1083
            goto error_reply;
N
Nick Thomas 已提交
1084 1085 1086
        }

        TRACE("Read %u byte(s)", request.len);
P
Paolo Bonzini 已提交
1087
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
P
Paolo Bonzini 已提交
1088
            goto out;
N
Nick Thomas 已提交
1089 1090 1091 1092
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

P
Paolo Bonzini 已提交
1093
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
N
Nick Thomas 已提交
1094
            TRACE("Server is read-only, return error");
1095 1096 1097 1098 1099 1100
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

M
Max Reitz 已提交
1101 1102 1103
        ret = blk_write(exp->blk,
                        (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                        req->data, request.len / BDRV_SECTOR_SIZE);
1104 1105 1106 1107 1108
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }
N
Nick Thomas 已提交
1109

1110
        if (request.type & NBD_CMD_FLAG_FUA) {
M
Max Reitz 已提交
1111
            ret = blk_co_flush(exp->blk);
1112
            if (ret < 0) {
1113
                LOG("flush failed");
1114
                reply.error = -ret;
1115
                goto error_reply;
1116
            }
N
Nick Thomas 已提交
1117 1118
        }

1119
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1120
            goto out;
1121
        }
N
Nick Thomas 已提交
1122 1123 1124 1125
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        errno = 0;
P
Paolo Bonzini 已提交
1126
        goto out;
P
Paolo Bonzini 已提交
1127 1128 1129
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

M
Max Reitz 已提交
1130
        ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1131 1132 1133 1134
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
1135
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1136
            goto out;
1137
        }
P
Paolo Bonzini 已提交
1138 1139 1140
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
M
Max Reitz 已提交
1141 1142 1143
        ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
                                       / BDRV_SECTOR_SIZE,
                             request.len / BDRV_SECTOR_SIZE);
P
Paolo Bonzini 已提交
1144 1145 1146 1147
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
1148
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1149
            goto out;
1150
        }
P
Paolo Bonzini 已提交
1151
        break;
N
Nick Thomas 已提交
1152 1153
    default:
        LOG("invalid request type (%u) received", request.type);
1154
    invalid_request:
Y
Yik Fang 已提交
1155
        reply.error = EINVAL;
1156
    error_reply:
1157
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1158
            goto out;
1159
        }
1160
        break;
N
Nick Thomas 已提交
1161 1162 1163 1164
    }

    TRACE("Request/Reply complete");

1165
done:
P
Paolo Bonzini 已提交
1166 1167 1168
    nbd_request_put(req);
    return;

P
Paolo Bonzini 已提交
1169
out:
1170
    nbd_request_put(req);
M
Max Reitz 已提交
1171
    client_close(client);
B
bellard 已提交
1172
}
P
Paolo Bonzini 已提交
1173

1174 1175 1176 1177
static void nbd_read(void *opaque)
{
    NBDClient *client = opaque;

P
Paolo Bonzini 已提交
1178 1179 1180 1181
    if (client->recv_coroutine) {
        qemu_coroutine_enter(client->recv_coroutine, NULL);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1182 1183 1184
    }
}

P
Paolo Bonzini 已提交
1185 1186 1187 1188 1189 1190 1191
/*
 * Write handler installed by nbd_set_handlers(): re-enter the coroutine
 * stored in the client's send_coroutine field.
 *
 * @opaque: the NBDClient.
 */
static void nbd_restart_write(void *opaque)
{
    NBDClient *sender = opaque;

    qemu_coroutine_enter(sender->send_coroutine, NULL);
}

M
Max Reitz 已提交
1192 1193 1194
/*
 * Register the client's socket fd with the AioContext of its export.
 *
 * The read handler is nbd_read() when client->can_read is set, and the write
 * handler is nbd_restart_write() when client->send_coroutine is set; either
 * one is NULL otherwise.  Nothing is done if the client has no export or the
 * export has no context.
 */
static void nbd_set_handlers(NBDClient *client)
{
    if (!client->exp || !client->exp->ctx) {
        return;
    }

    aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
                       true,
                       client->can_read ? nbd_read : NULL,
                       client->send_coroutine ? nbd_restart_write : NULL,
                       client);
}

/*
 * Counterpart of nbd_set_handlers(): install NULL read and write handlers
 * (and a NULL opaque) for the client's socket fd in the export's AioContext.
 * Nothing is done if the client has no export or the export has no context.
 */
static void nbd_unset_handlers(NBDClient *client)
{
    if (!client->exp || !client->exp->ctx) {
        return;
    }

    aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
                       true, NULL, NULL, NULL);
}

/*
 * Recompute client->can_read and, if it changed, re-register the fd
 * handlers through nbd_set_handlers().
 *
 * The client is readable while a receive coroutine is recorded or while
 * fewer than MAX_NBD_REQUESTS requests are counted in nb_requests.
 */
static void nbd_update_can_read(NBDClient *client)
{
    bool readable = client->recv_coroutine != NULL ||
                    client->nb_requests < MAX_NBD_REQUESTS;

    if (readable == client->can_read) {
        return;
    }

    client->can_read = readable;

    /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
     * in nbd_set_handlers() will have taken care of that */
    nbd_set_handlers(client);
}

1225 1226 1227 1228 1229 1230 1231 1232 1233 1234
/*
 * Coroutine body started by nbd_client_new(): negotiate with the client
 * and, on success, initialise its send lock and install its fd handlers.
 *
 * @opaque: NBDClientNewData carrying the client; it is freed here on every
 *          path.
 *
 * The export is sampled from client->exp before negotiation.  If it was
 * non-NULL at that point, a reference is taken with nbd_export_get() and,
 * after a successful negotiation, the client is appended to that export's
 * client list.  A failed negotiation closes the client instead.
 */
static coroutine_fn void nbd_co_client_start(void *opaque)
{
    NBDClientNewData *data = opaque;
    NBDClient *client = data->client;
    NBDExport *exp = client->exp;

    if (exp) {
        nbd_export_get(exp);
    }

    if (nbd_negotiate(data)) {
        client_close(client);
    } else {
        qemu_co_mutex_init(&client->send_lock);
        nbd_set_handlers(client);

        if (exp) {
            QTAILQ_INSERT_TAIL(&exp->clients, client, next);
        }
    }

    g_free(data);
}

1248 1249
void nbd_client_new(NBDExport *exp,
                    QIOChannelSocket *sioc,
1250 1251
                    QCryptoTLSCreds *tlscreds,
                    const char *tlsaclname,
1252
                    void (*close_fn)(NBDClient *))
P
Paolo Bonzini 已提交
1253
{
1254
    NBDClient *client;
1255 1256
    NBDClientNewData *data = g_new(NBDClientNewData, 1);

1257 1258 1259
    client = g_malloc0(sizeof(NBDClient));
    client->refcount = 1;
    client->exp = exp;
1260 1261 1262 1263 1264
    client->tlscreds = tlscreds;
    if (tlscreds) {
        object_ref(OBJECT(client->tlscreds));
    }
    client->tlsaclname = g_strdup(tlsaclname);
1265 1266 1267 1268
    client->sioc = sioc;
    object_ref(OBJECT(client->sioc));
    client->ioc = QIO_CHANNEL(sioc);
    object_ref(OBJECT(client->ioc));
M
Max Reitz 已提交
1269
    client->can_read = true;
1270
    client->close = close_fn;
1271

1272 1273 1274
    data->client = client;
    data->co = qemu_coroutine_create(nbd_co_client_start);
    qemu_coroutine_enter(data->co, data);
P
Paolo Bonzini 已提交
1275
}