/*
 *  Copyright (C) 2016-2017 Red Hat, Inc.
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
 *  Network Block Device Server Side
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

P
Peter Maydell 已提交
20
#include "qemu/osdep.h"
21
#include "qapi/error.h"
22
#include "trace.h"
F
Fam Zheng 已提交
23
#include "nbd-internal.h"
24 25 26 27 28 29 30

static int system_errno_to_nbd_errno(int err)
{
    switch (err) {
    case 0:
        return NBD_SUCCESS;
    case EPERM:
31
    case EROFS:
32 33 34 35 36 37 38 39 40 41 42
        return NBD_EPERM;
    case EIO:
        return NBD_EIO;
    case ENOMEM:
        return NBD_ENOMEM;
#ifdef EDQUOT
    case EDQUOT:
#endif
    case EFBIG:
    case ENOSPC:
        return NBD_ENOSPC;
43 44
    case EOVERFLOW:
        return NBD_EOVERFLOW;
45 46
    case ESHUTDOWN:
        return NBD_ESHUTDOWN;
47 48 49 50 51 52
    case EINVAL:
    default:
        return NBD_EINVAL;
    }
}

53 54
/* Definitions for opaque data types */

55
typedef struct NBDRequestData NBDRequestData;
56

57 58
struct NBDRequestData {
    QSIMPLEQ_ENTRY(NBDRequestData) entry;
59 60
    NBDClient *client;
    uint8_t *data;
61
    bool complete;
62 63 64
};

struct NBDExport {
65
    int refcount;
66 67
    void (*close)(NBDExport *exp);

M
Max Reitz 已提交
68
    BlockBackend *blk;
P
Paolo Bonzini 已提交
69
    char *name;
70
    char *description;
71 72
    off_t dev_offset;
    off_t size;
E
Eric Blake 已提交
73
    uint16_t nbdflags;
74
    QTAILQ_HEAD(, NBDClient) clients;
P
Paolo Bonzini 已提交
75
    QTAILQ_ENTRY(NBDExport) next;
M
Max Reitz 已提交
76 77

    AioContext *ctx;
78

79
    BlockBackend *eject_notifier_blk;
80
    Notifier eject_notifier;
81 82
};

P
Paolo Bonzini 已提交
83 84
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

85 86
struct NBDClient {
    int refcount;
87
    void (*close_fn)(NBDClient *client, bool negotiated);
88 89

    NBDExport *exp;
90 91
    QCryptoTLSCreds *tlscreds;
    char *tlsaclname;
92 93
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
94 95 96 97 98 99

    Coroutine *recv_coroutine;

    CoMutex send_lock;
    Coroutine *send_coroutine;

100
    QTAILQ_ENTRY(NBDClient) next;
101
    int nb_requests;
102
    bool closing;
103 104
};

B
bellard 已提交
105 106
/* That's all folks */

107
static void nbd_client_receive_next_request(NBDClient *client);
M
Max Reitz 已提交
108

109
/* Basic flow for negotiation
B
bellard 已提交
110 111 112

   Server         Client
   Negotiate
113 114 115 116 117 118 119 120 121 122 123 124 125

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
B
bellard 已提交
126 127 128 129 130 131 132
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)
133

B
bellard 已提交
134 135
*/

136 137 138
/* Emit an option reply header (magic, option, reply type, payload length)
 * without any payload.  Each field goes out big-endian in its own write.
 * @len must be below NBD_MAX_BUFFER_SIZE (asserted).
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type,
                                      uint32_t opt, uint32_t len, Error **errp)
{
    uint64_t wire_magic;
    uint32_t wire_opt;
    uint32_t wire_type;
    uint32_t wire_len;

    trace_nbd_negotiate_send_rep_len(opt, nbd_opt_lookup(opt),
                                     type, nbd_rep_lookup(type), len);

    assert(len < NBD_MAX_BUFFER_SIZE);

    /* Convert everything to network byte order up front. */
    wire_magic = cpu_to_be64(NBD_REP_MAGIC);
    wire_opt = cpu_to_be32(opt);
    wire_type = cpu_to_be32(type);
    wire_len = cpu_to_be32(len);

    if (nbd_write(ioc, &wire_magic, sizeof(wire_magic), errp) < 0) {
        error_prepend(errp, "write failed (rep magic): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, &wire_opt, sizeof(wire_opt), errp) < 0) {
        error_prepend(errp, "write failed (rep opt): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, &wire_type, sizeof(wire_type), errp) < 0) {
        error_prepend(errp, "write failed (rep type): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, &wire_len, sizeof(wire_len), errp) < 0) {
        error_prepend(errp, "write failed (rep data length): ");
        return -EINVAL;
    }

    return 0;
}
172

173 174
/* Send a reply header with default 0 length.
 * Return -errno on error, 0 on success. */
175 176
static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt,
                                  Error **errp)
177
{
178
    return nbd_negotiate_send_rep_len(ioc, type, opt, 0, errp);
179 180
}

181 182
/* Send an error reply.
 * Return -errno on error, 0 on success. */
183
static int GCC_FMT_ATTR(5, 6)
184
nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type,
185
                           uint32_t opt, Error **errp, const char *fmt, ...)
186 187 188 189 190 191 192 193 194 195 196
{
    va_list va;
    char *msg;
    int ret;
    size_t len;

    va_start(va, fmt);
    msg = g_strdup_vprintf(fmt, va);
    va_end(va);
    len = strlen(msg);
    assert(len < 4096);
197
    trace_nbd_negotiate_send_rep_err(msg);
198
    ret = nbd_negotiate_send_rep_len(ioc, type, opt, len, errp);
199 200 201
    if (ret < 0) {
        goto out;
    }
202 203
    if (nbd_write(ioc, msg, len, errp) < 0) {
        error_prepend(errp, "write failed (error message): ");
204 205 206 207
        ret = -EIO;
    } else {
        ret = 0;
    }
208

209 210 211 212 213
out:
    g_free(msg);
    return ret;
}

214 215
/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 * Return -errno on error, 0 on success. */
216 217
static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp,
                                       Error **errp)
218
{
219
    size_t name_len, desc_len;
220
    uint32_t len;
221 222
    const char *name = exp->name ? exp->name : "";
    const char *desc = exp->description ? exp->description : "";
223
    int ret;
224

225
    trace_nbd_negotiate_send_rep_list(name, desc);
226 227
    name_len = strlen(name);
    desc_len = strlen(desc);
228
    len = name_len + desc_len + sizeof(len);
229 230
    ret = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len,
                                     errp);
231 232
    if (ret < 0) {
        return ret;
233
    }
234

235
    len = cpu_to_be32(name_len);
236 237
    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
        error_prepend(errp, "write failed (name length): ");
238 239
        return -EINVAL;
    }
240 241 242

    if (nbd_write(ioc, name, name_len, errp) < 0) {
        error_prepend(errp, "write failed (name buffer): ");
243 244
        return -EINVAL;
    }
245 246 247

    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
        error_prepend(errp, "write failed (description buffer): ");
248 249
        return -EINVAL;
    }
250

251 252 253
    return 0;
}

254 255
/* Process the NBD_OPT_LIST command, with a potential series of replies.
 * Return -errno on error, 0 on success. */
256 257
static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length,
                                     Error **errp)
258 259 260 261
{
    NBDExport *exp;

    if (length) {
262
        if (nbd_drop(client->ioc, length, errp) < 0) {
263 264
            return -EIO;
        }
265 266
        return nbd_negotiate_send_rep_err(client->ioc,
                                          NBD_REP_ERR_INVALID, NBD_OPT_LIST,
267
                                          errp,
268
                                          "OPT_LIST should not have length");
269 270 271 272
    }

    /* For each export, send a NBD_REP_SERVER reply. */
    QTAILQ_FOREACH(exp, &exports, next) {
273
        if (nbd_negotiate_send_rep_list(client->ioc, exp, errp)) {
274 275 276 277
            return -EINVAL;
        }
    }
    /* Finish with a NBD_REP_ACK. */
278
    return nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_LIST, errp);
279 280
}

E
Eric Blake 已提交
281 282
/* Send a reply to NBD_OPT_EXPORT_NAME.
 * Return -errno on error, 0 on success. */
283
static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length,
284
                                            uint16_t myflags, bool no_zeroes,
285
                                            Error **errp)
286
{
287
    char name[NBD_MAX_NAME_SIZE + 1];
288
    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
289 290
    size_t len;
    int ret;
291

292 293
    /* Client sends:
        [20 ..  xx]   export name (length bytes)
294 295 296 297
       Server replies:
        [ 0 ..   7]   size
        [ 8 ..   9]   export flags
        [10 .. 133]   reserved     (0) [unless no_zeroes]
298
     */
299
    trace_nbd_negotiate_handle_export_name();
300
    if (length >= sizeof(name)) {
301
        error_setg(errp, "Bad length received");
302
        return -EINVAL;
303
    }
304 305
    if (nbd_read(client->ioc, name, length, errp) < 0) {
        error_prepend(errp, "read failed: ");
306
        return -EINVAL;
307 308 309
    }
    name[length] = '\0';

310
    trace_nbd_negotiate_handle_export_name_request(name);
311

312 313
    client->exp = nbd_export_find(name);
    if (!client->exp) {
314
        error_setg(errp, "export not found");
315
        return -EINVAL;
316 317
    }

318 319 320 321 322 323 324 325 326 327 328
    trace_nbd_negotiate_new_style_size_flags(client->exp->size,
                                             client->exp->nbdflags | myflags);
    stq_be_p(buf, client->exp->size);
    stw_be_p(buf + 8, client->exp->nbdflags | myflags);
    len = no_zeroes ? 10 : sizeof(buf);
    ret = nbd_write(client->ioc, buf, len, errp);
    if (ret < 0) {
        error_prepend(errp, "write failed: ");
        return ret;
    }

329 330
    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);
331 332

    return 0;
333 334
}

E
Eric Blake 已提交
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
/* Emit one NBD_REP_INFO reply for option @opt: a 16-bit big-endian info
 * type followed by @length bytes taken from @buf.  @buf holds only the
 * data that follows the info type, not the type itself.
 * Return -errno on error, 0 if ready to send more. */
static int nbd_negotiate_send_info(NBDClient *client, uint32_t opt,
                                   uint16_t info, uint32_t length, void *buf,
                                   Error **errp)
{
    QIOChannel *ioc = client->ioc;
    int rc;

    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);

    rc = nbd_negotiate_send_rep_len(ioc, NBD_REP_INFO, opt,
                                    sizeof(info) + length, errp);
    if (rc < 0) {
        return rc;
    }

    /* Info type first (in place, big-endian), then the caller's data. */
    cpu_to_be16s(&info);
    if (nbd_write(ioc, &info, sizeof(info), errp) < 0 ||
        nbd_write(ioc, buf, length, errp) < 0) {
        return -EIO;
    }

    return 0;
}

/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 *
 * Client payload (@length bytes in total):
 *   4 bytes: L, name length (can be 0)
 *   L bytes: export name
 *   2 bytes: N, number of requests (can be 0)
 *   N * 2 bytes: N requests
 *
 * Replies with a series of NBD_REP_INFO chunks (name if requested,
 * description if the export has one, block sizes, export size/flags)
 * followed by NBD_REP_ACK.  A malformed payload is drained and answered
 * with NBD_REP_ERR_INVALID; an unknown export with NBD_REP_ERR_UNKNOWN.
 * For NBD_OPT_GO the client is attached to the export on success.
 *
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase.  */
static int nbd_negotiate_handle_info(NBDClient *client, uint32_t length,
                                     uint32_t opt, uint16_t myflags,
                                     Error **errp)
{
    int rc;
    char name[NBD_MAX_NAME_SIZE + 1];
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    const char *msg;

    if (length < sizeof(namelen) + sizeof(requests)) {
        msg = "overall request too short";
        goto invalid;
    }
    if (nbd_read(client->ioc, &namelen, sizeof(namelen), errp) < 0) {
        return -EIO;
    }
    be32_to_cpus(&namelen);
    length -= sizeof(namelen);
    if (namelen > length - sizeof(requests) || (length - namelen) % 2) {
        msg = "name length is incorrect";
        goto invalid;
    }
    /* namelen is client-controlled: it must fit in name[] together with
     * the terminating NUL, otherwise the read below smashes the stack. */
    if (namelen >= sizeof(name)) {
        msg = "name too long for qemu";
        goto invalid;
    }
    if (nbd_read(client->ioc, name, namelen, errp) < 0) {
        return -EIO;
    }
    name[namelen] = '\0';
    length -= namelen;
    trace_nbd_negotiate_handle_export_name_request(name);

    if (nbd_read(client->ioc, &requests, sizeof(requests), errp) < 0) {
        return -EIO;
    }
    be16_to_cpus(&requests);
    length -= sizeof(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    if (requests != length / sizeof(request)) {
        msg = "incorrect number of  requests for overall length";
        goto invalid;
    }
    while (requests--) {
        if (nbd_read(client->ioc, &request, sizeof(request), errp) < 0) {
            return -EIO;
        }
        be16_to_cpus(&request);
        length -= sizeof(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }

    exp = nbd_export_find(name);
    if (!exp) {
        return nbd_negotiate_send_rep_err(client->ioc, NBD_REP_ERR_UNKNOWN,
                                          opt, errp, "export '%s' not present",
                                          name);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it.
     * The payload is the name itself, so its size is namelen; 'length'
     * has been fully consumed (it is 0) by the time we get here. */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, opt, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        rc = nbd_negotiate_send_info(client, opt, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or 512 if client is new enough.
     * TODO: consult blk_bs(blk)->bl.request_alignment? */
    sizes[0] = (opt == NBD_OPT_INFO || blocksize) ? BDRV_SECTOR_SIZE : 1;
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = 4096;
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    cpu_to_be32s(&sizes[0]);
    cpu_to_be32s(&sizes[1]);
    cpu_to_be32s(&sizes[2]);
    rc = nbd_negotiate_send_info(client, opt, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    trace_nbd_negotiate_new_style_size_flags(exp->size,
                                             exp->nbdflags | myflags);
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, exp->nbdflags | myflags);
    rc = nbd_negotiate_send_info(client, opt, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /* If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes, return an error.
     * TODO: consult blk_bs(blk)->request_align, and only error if it
     * is not 1? */
    if (opt == NBD_OPT_INFO && !blocksize) {
        return nbd_negotiate_send_rep_err(client->ioc,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD, opt,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, opt, errp);
    if (rc < 0) {
        return rc;
    }

    if (opt == NBD_OPT_GO) {
        /* Attach the client to the export and enter transmission phase. */
        client->exp = exp;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        nbd_export_get(client->exp);
        rc = 1;
    }
    return rc;

 invalid:
    /* Drain whatever is left of the option payload so the stream stays
     * in sync, then report the problem to the client. */
    if (nbd_drop(client->ioc, length, errp) < 0) {
        return -EIO;
    }
    return nbd_negotiate_send_rep_err(client->ioc, NBD_REP_ERR_INVALID, opt,
                                      errp, "%s", msg);
}


531 532
/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 * new channel for all further (now-encrypted) communication. */
533
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
534 535
                                                 uint32_t length,
                                                 Error **errp)
536 537 538 539 540
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSHandshakeData data = { 0 };

541
    trace_nbd_negotiate_handle_starttls();
542 543
    ioc = client->ioc;
    if (length) {
544
        if (nbd_drop(ioc, length, errp) < 0) {
545 546
            return NULL;
        }
547
        nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS,
548
                                   errp,
549
                                   "OPT_STARTTLS should not have length");
550 551 552
        return NULL;
    }

553
    if (nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK,
554
                               NBD_OPT_STARTTLS, errp) < 0) {
555 556
        return NULL;
    }
557 558 559 560

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsaclname,
561
                                      errp);
562 563 564 565
    if (!tioc) {
        return NULL;
    }

566
    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
567
    trace_nbd_negotiate_handle_starttls_handshake();
568 569 570 571 572 573 574 575 576 577 578 579
    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    qio_channel_tls_handshake(tioc,
                              nbd_tls_handshake,
                              &data,
                              NULL);

    if (!data.complete) {
        g_main_loop_run(data.loop);
    }
    g_main_loop_unref(data.loop);
    if (data.error) {
        object_unref(OBJECT(tioc));
580
        error_propagate(errp, data.error);
581 582 583 584 585 586
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}

587
/* nbd_negotiate_options
E
Eric Blake 已提交
588 589
 * Process all NBD_OPT_* client option commands, during fixed newstyle
 * negotiation.
590
 * Return:
591 592 593 594
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
595
 */
596 597
static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
                                 Error **errp)
598
{
M
Max Reitz 已提交
599
    uint32_t flags;
600
    bool fixedNewstyle = false;
601
    bool no_zeroes = false;
M
Max Reitz 已提交
602 603 604 605

    /* Client sends:
        [ 0 ..   3]   client flags

E
Eric Blake 已提交
606
       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
M
Max Reitz 已提交
607 608 609 610 611 612 613 614 615 616 617
        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

618 619
    if (nbd_read(client->ioc, &flags, sizeof(flags), errp) < 0) {
        error_prepend(errp, "read failed: ");
M
Max Reitz 已提交
620 621 622
        return -EIO;
    }
    be32_to_cpus(&flags);
623
    trace_nbd_negotiate_options_flags(flags);
624 625 626 627
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
    }
E
Eric Blake 已提交
628
    if (flags & NBD_FLAG_C_NO_ZEROES) {
629
        no_zeroes = true;
E
Eric Blake 已提交
630 631
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
632
    if (flags != 0) {
633
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
634
        return -EINVAL;
M
Max Reitz 已提交
635 636
    }

637
    while (1) {
M
Max Reitz 已提交
638
        int ret;
639
        uint32_t option, length;
640 641
        uint64_t magic;

642 643
        if (nbd_read(client->ioc, &magic, sizeof(magic), errp) < 0) {
            error_prepend(errp, "read failed: ");
644 645
            return -EINVAL;
        }
646 647 648
        magic = be64_to_cpu(magic);
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
649
            error_setg(errp, "Bad magic received");
650 651 652
            return -EINVAL;
        }

653 654
        if (nbd_read(client->ioc, &option,
                     sizeof(option), errp) < 0) {
655
            error_prepend(errp, "read failed: ");
656 657
            return -EINVAL;
        }
658
        option = be32_to_cpu(option);
659

660 661
        if (nbd_read(client->ioc, &length, sizeof(length), errp) < 0) {
            error_prepend(errp, "read failed: ");
662 663 664 665
            return -EINVAL;
        }
        length = be32_to_cpu(length);

666 667
        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
668 669 670 671
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            QIOChannel *tioc;
            if (!fixedNewstyle) {
672
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
673 674
                return -EINVAL;
            }
675
            switch (option) {
676
            case NBD_OPT_STARTTLS:
677
                tioc = nbd_negotiate_handle_starttls(client, length, errp);
678 679 680
                if (!tioc) {
                    return -EIO;
                }
681
                ret = 0;
682 683 684 685
                object_unref(OBJECT(client->ioc));
                client->ioc = QIO_CHANNEL(tioc);
                break;

686 687
            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
688
                error_setg(errp, "Option 0x%x not permitted before TLS",
689
                           option);
690 691
                return -EINVAL;

692
            default:
693
                if (nbd_drop(client->ioc, length, errp) < 0) {
694 695
                    return -EIO;
                }
696 697
                ret = nbd_negotiate_send_rep_err(client->ioc,
                                                 NBD_REP_ERR_TLS_REQD,
698
                                                 option, errp,
699 700
                                                 "Option 0x%" PRIx32
                                                 "not permitted before TLS",
701
                                                 option);
702 703 704
                /* Let the client keep trying, unless they asked to
                 * quit. In this mode, we've already sent an error, so
                 * we can't ack the abort.  */
705
                if (option == NBD_OPT_ABORT) {
706
                    return 1;
707
                }
708
                break;
709 710
            }
        } else if (fixedNewstyle) {
711
            switch (option) {
712
            case NBD_OPT_LIST:
713
                ret = nbd_negotiate_handle_list(client, length, errp);
714 715 716
                break;

            case NBD_OPT_ABORT:
717 718 719
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
720
                nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, option, NULL);
721
                return 1;
722 723

            case NBD_OPT_EXPORT_NAME:
724 725 726
                return nbd_negotiate_handle_export_name(client, length,
                                                        myflags, no_zeroes,
                                                        errp);
727

E
Eric Blake 已提交
728 729 730 731 732 733 734 735 736 737
            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, length, option,
                                                myflags, errp);
                if (ret == 1) {
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

738
            case NBD_OPT_STARTTLS:
739
                if (nbd_drop(client->ioc, length, errp) < 0) {
740 741
                    return -EIO;
                }
742
                if (client->tlscreds) {
743 744
                    ret = nbd_negotiate_send_rep_err(client->ioc,
                                                     NBD_REP_ERR_INVALID,
745
                                                     option, errp,
746
                                                     "TLS already enabled");
747
                } else {
748 749
                    ret = nbd_negotiate_send_rep_err(client->ioc,
                                                     NBD_REP_ERR_POLICY,
750
                                                     option, errp,
751
                                                     "TLS not configured");
752
                }
753
                break;
754
            default:
755
                if (nbd_drop(client->ioc, length, errp) < 0) {
756 757
                    return -EIO;
                }
758 759
                ret = nbd_negotiate_send_rep_err(client->ioc,
                                                 NBD_REP_ERR_UNSUP,
760
                                                 option, errp,
761
                                                 "Unsupported option 0x%"
762 763
                                                 PRIx32 " (%s)", option,
                                                 nbd_opt_lookup(option));
764
                break;
765 766 767 768 769 770
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
771
            switch (option) {
772
            case NBD_OPT_EXPORT_NAME:
773 774 775
                return nbd_negotiate_handle_export_name(client, length,
                                                        myflags, no_zeroes,
                                                        errp);
776 777

            default:
778 779
                error_setg(errp, "Unsupported option 0x%" PRIx32 " (%s)",
                           option, nbd_opt_lookup(option));
780
                return -EINVAL;
781
            }
782
        }
783 784 785
        if (ret < 0) {
            return ret;
        }
786 787 788
    }
}

789 790
/* nbd_negotiate
 * Return:
791 792 793 794
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
795
 */
796
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
B
bellard 已提交
797
{
798
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
799
    int ret;
E
Eric Blake 已提交
800
    const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
801 802
                              NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA |
                              NBD_FLAG_SEND_WRITE_ZEROES);
803
    bool oldStyle;
N
Nick Thomas 已提交
804

805
    /* Old style negotiation header, no room for options
806 807
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
N
Nick Thomas 已提交
808
        [16 ..  23]   size
809
        [24 ..  27]   export flags (zero-extended)
810 811
        [28 .. 151]   reserved     (0)

812
       New style negotiation header, client can send options
813 814 815
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
E
Eric Blake 已提交
816
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
N
Nick Thomas 已提交
817 818
     */

819
    qio_channel_set_blocking(client->ioc, false, NULL);
820

821
    trace_nbd_negotiate_begin();
N
Nick Thomas 已提交
822
    memcpy(buf, "NBDMAGIC", 8);
823 824 825

    oldStyle = client->exp != NULL && !client->tlscreds;
    if (oldStyle) {
826 827
        trace_nbd_negotiate_old_style(client->exp->size,
                                      client->exp->nbdflags | myflags);
J
John Snow 已提交
828 829
        stq_be_p(buf + 8, NBD_CLIENT_MAGIC);
        stq_be_p(buf + 16, client->exp->size);
830
        stl_be_p(buf + 24, client->exp->nbdflags | myflags);
N
Nick Thomas 已提交
831

832 833
        if (nbd_write(client->ioc, buf, sizeof(buf), errp) < 0) {
            error_prepend(errp, "write failed: ");
834
            return -EINVAL;
835 836
        }
    } else {
837 838 839
        stq_be_p(buf + 8, NBD_OPTS_MAGIC);
        stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

840 841
        if (nbd_write(client->ioc, buf, 18, errp) < 0) {
            error_prepend(errp, "write failed: ");
842
            return -EINVAL;
843
        }
844
        ret = nbd_negotiate_options(client, myflags, errp);
845
        if (ret != 0) {
846 847 848
            if (ret < 0) {
                error_prepend(errp, "option negotiation failed: ");
            }
849
            return ret;
850
        }
N
Nick Thomas 已提交
851 852
    }

853
    trace_nbd_negotiate_success();
854 855

    return 0;
B
bellard 已提交
856 857
}

858 859
/*
 * Read one fixed-size request header from @ioc and decode it into @request.
 *
 * Returns 0 on success, the negative value reported by nbd_read() when the
 * header cannot be read, or -EINVAL (with @errp set) when the magic number
 * is not NBD_REQUEST_MAGIC.  Note that @request is filled in before the
 * magic is validated.
 */
static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
                               Error **errp)
{
    uint8_t hdr[NBD_REQUEST_SIZE];
    uint32_t magic;
    int rc = nbd_read(ioc, hdr, sizeof(hdr), errp);

    if (rc < 0) {
        return rc;
    }

    /*
     * Layout of a request header on the wire:
     *   bytes  0 ..  3   magic   (NBD_REQUEST_MAGIC)
     *   bytes  4 ..  5   flags   (NBD_CMD_FLAG_FUA, ...)
     *   bytes  6 ..  7   type    (NBD_CMD_READ, ...)
     *   bytes  8 .. 15   handle
     *   bytes 16 .. 23   from
     *   bytes 24 .. 27   len
     */
    magic           = ldl_be_p(&hdr[0]);
    request->flags  = lduw_be_p(&hdr[4]);
    request->type   = lduw_be_p(&hdr[6]);
    request->handle = ldq_be_p(&hdr[8]);
    request->from   = ldq_be_p(&hdr[16]);
    request->len    = ldl_be_p(&hdr[24]);

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic == NBD_REQUEST_MAGIC) {
        return 0;
    }

    error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
    return -EINVAL;
}

P
Paolo Bonzini 已提交
896 897
#define MAX_NBD_REQUESTS 16

898
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}

903
/*
 * Drop one reference on @client.  When the count reaches zero the channel
 * objects, TLS state and export membership are released and the client
 * itself is freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /*
     * Reaching zero is only legal once client_close() has marked the
     * client as closing.
     */
    assert(client->closing);

    qio_channel_detach_aio_context(client->ioc);
    object_unref(OBJECT(client->sioc));
    object_unref(OBJECT(client->ioc));
    if (client->tlscreds) {
        object_unref(OBJECT(client->tlscreds));
    }
    g_free(client->tlsaclname);

    if (client->exp) {
        /* Leave the export's client list and give back its reference. */
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}

926
/*
 * Begin tearing down @client.  Calling this again after the first time is
 * a no-op.  @negotiated is forwarded unchanged to the close callback.
 */
static void client_close(NBDClient *client, bool negotiated)
{
    if (!client->closing) {
        client->closing = true;

        /*
         * Shut the channel down in both directions so that in-flight
         * requests finish and release their own references; the socket
         * is closed and the NBDClient freed once those are gone.
         */
        qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                             NULL);

        /* Notify the owner as well so it can release its reference. */
        if (client->close_fn) {
            client->close_fn(client, negotiated);
        }
    }
}

946
/*
 * Allocate the bookkeeping for a new in-flight request on @client.
 *
 * The request counts against MAX_NBD_REQUESTS and holds a reference on the
 * client; both are undone by nbd_request_put().
 */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
    NBDRequestData *new_req;

    /* There must be room for at least one more request. */
    assert(client->nb_requests < MAX_NBD_REQUESTS);
    client->nb_requests += 1;

    new_req = g_new0(NBDRequestData, 1);
    nbd_client_get(client);
    new_req->client = client;

    return new_req;
}

959
/*
 * Release @req: free its data buffer (if any) and the request itself, give
 * the slot back to the client, let the client start receiving another
 * request, and finally drop the reference the request held on the client.
 */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *owner = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    owner->nb_requests -= 1;
    /* A slot has just become free, so a new receive may be possible. */
    nbd_client_receive_next_request(owner);

    nbd_client_put(owner);
}

M
Max Reitz 已提交
974
/*
 * AioContext notifier: the export given as @opaque is now served by @ctx.
 * Records the context on the export, attaches every client channel to it
 * and schedules any pending receive/send coroutines there.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *c;

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->ctx = ctx;

    QTAILQ_FOREACH(c, &exp->clients, next) {
        qio_channel_attach_aio_context(c->ioc, ctx);

        if (c->recv_coroutine != NULL) {
            aio_co_schedule(ctx, c->recv_coroutine);
        }
        if (c->send_coroutine != NULL) {
            aio_co_schedule(ctx, c->send_coroutine);
        }
    }
}

M
Max Reitz 已提交
994
static void blk_aio_detach(void *opaque)
M
Max Reitz 已提交
995 996 997 998
{
    NBDExport *exp = opaque;
    NBDClient *client;

999
    trace_nbd_blk_aio_detach(exp->name, exp->ctx);
M
Max Reitz 已提交
1000 1001

    QTAILQ_FOREACH(client, &exp->clients, next) {
1002
        qio_channel_detach_aio_context(client->ioc);
M
Max Reitz 已提交
1003 1004 1005 1006 1007
    }

    exp->ctx = NULL;
}

1008 1009 1010 1011 1012 1013
/*
 * Notifier callback registered by nbd_export_new(): closes the export that
 * embeds @n as its eject_notifier.  @data is unused.
 */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    nbd_export_close(container_of(n, NBDExport, eject_notifier));
}

1014
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset, off_t size,
E
Eric Blake 已提交
1015
                          uint16_t nbdflags, void (*close)(NBDExport *),
1016
                          bool writethrough, BlockBackend *on_eject_blk,
M
Max Reitz 已提交
1017
                          Error **errp)
P
Paolo Bonzini 已提交
1018
{
1019
    AioContext *ctx;
1020
    BlockBackend *blk;
1021
    NBDExport *exp = g_new0(NBDExport, 1);
1022
    uint64_t perm;
1023
    int ret;
1024

1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
    /*
     * NBD exports are used for non-shared storage migration.  Make sure
     * that BDRV_O_INACTIVE is cleared and the image is ready for write
     * access since the export could be available before migration handover.
     */
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);
    bdrv_invalidate_cache(bs, NULL);
    aio_context_release(ctx);

1035 1036 1037 1038 1039 1040 1041 1042
    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    perm = BLK_PERM_CONSISTENT_READ;
    if ((nbdflags & NBD_FLAG_READ_ONLY) == 0) {
        perm |= BLK_PERM_WRITE;
    }
    blk = blk_new(perm, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                        BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
1043 1044 1045 1046
    ret = blk_insert_bs(blk, bs, errp);
    if (ret < 0) {
        goto fail;
    }
1047 1048
    blk_set_enable_write_cache(blk, !writethrough);

1049
    exp->refcount = 1;
1050
    QTAILQ_INIT(&exp->clients);
M
Max Reitz 已提交
1051
    exp->blk = blk;
P
Paolo Bonzini 已提交
1052 1053
    exp->dev_offset = dev_offset;
    exp->nbdflags = nbdflags;
M
Max Reitz 已提交
1054 1055 1056 1057 1058 1059 1060 1061
    exp->size = size < 0 ? blk_getlength(blk) : size;
    if (exp->size < 0) {
        error_setg_errno(errp, -exp->size,
                         "Failed to determine the NBD export's length");
        goto fail;
    }
    exp->size -= exp->size % BDRV_SECTOR_SIZE;

1062
    exp->close = close;
M
Max Reitz 已提交
1063 1064
    exp->ctx = blk_get_aio_context(blk);
    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1065

1066 1067 1068 1069 1070 1071
    if (on_eject_blk) {
        blk_ref(on_eject_blk);
        exp->eject_notifier_blk = on_eject_blk;
        exp->eject_notifier.notify = nbd_eject_notifier;
        blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier);
    }
P
Paolo Bonzini 已提交
1072
    return exp;
M
Max Reitz 已提交
1073 1074

fail:
1075
    blk_unref(blk);
M
Max Reitz 已提交
1076 1077
    g_free(exp);
    return NULL;
P
Paolo Bonzini 已提交
1078 1079
}

P
Paolo Bonzini 已提交
1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
/*
 * Look up a named export in the global exports list.
 * Returns the first export whose name equals @name, or NULL if none does.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *match = NULL;
    NBDExport *exp;

    QTAILQ_FOREACH(exp, &exports, next) {
        if (!strcmp(name, exp->name)) {
            match = exp;
            break;
        }
    }

    return match;
}

/*
 * Rename @exp to a copy of @name, or make it anonymous when @name is NULL.
 *
 * A named export is a member of the global exports list, and that
 * membership holds one reference on the export.  Passing the export's
 * current name pointer is a no-op.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    if (name == exp->name) {
        return;
    }

    /*
     * Temporary reference: giving up the list's reference below must not
     * be able to free the export while we are still using it.
     */
    nbd_export_get(exp);

    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }

    if (name) {
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }

    nbd_export_put(exp);
}

1113 1114 1115 1116 1117 1118
/*
 * Replace the description of @exp with a private copy of @description.
 * Passing NULL clears the description.
 *
 * The copy is taken before the old string is released, so calling this
 * with the export's current description pointer does not read freed
 * memory.
 */
void nbd_export_set_description(NBDExport *exp, const char *description)
{
    char *copy = g_strdup(description);

    g_free(exp->description);
    exp->description = copy;
}

P
Paolo Bonzini 已提交
1119 1120
/*
 * Shut down @exp: close every attached client, then clear the export's
 * name (which removes it from the global exports list) and description.
 *
 * A temporary reference is held for the duration of the call.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *cur;
    NBDClient *following;

    nbd_export_get(exp);

    QTAILQ_FOREACH_SAFE(cur, &exp->clients, next, following) {
        client_close(cur, true);
    }

    nbd_export_set_name(exp, NULL);
    nbd_export_set_description(exp, NULL);

    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.
 *
 * If the caller holds the last reference, the export is closed first.
 * Once the count reaches zero the close callback is invoked, the block
 * backend and its notifiers are released, and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);

    /* Last reference: make sure the export has been shut down. */
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    /* nbd_export_close() clears both of these. */
    assert(exp->name == NULL);
    assert(exp->description == NULL);

    if (exp->close) {
        exp->close(exp);
    }

    if (exp->blk) {
        if (exp->eject_notifier_blk) {
            notifier_remove(&exp->eject_notifier);
            blk_unref(exp->eject_notifier_blk);
        }
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, exp);
        blk_unref(exp->blk);
        exp->blk = NULL;
    }

    g_free(exp);
}

1168
/* Return the block backend stored on @exp (no new reference is taken). */
BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockBackend *backend = exp->blk;

    return backend;
}

P
Paolo Bonzini 已提交
1173 1174 1175 1176 1177 1178 1179 1180 1181
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
/*
 * Write the @niov buffers in @iov to the client's channel while holding
 * the client's send lock.  Must be called from coroutine context.
 *
 * Returns 0 on success, or -EIO (with @errp set by the channel layer) if
 * the write fails.
 */
static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret = 0;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    /* Record the sender for the duration of the write. */
    client->send_coroutine = qemu_coroutine_self();

    if (qio_channel_writev_all(client->ioc, iov, niov, errp) < 0) {
        ret = -EIO;
    }

    client->send_coroutine = NULL;
    qemu_co_mutex_unlock(&client->send_lock);

    return ret;
}

1199 1200 1201 1202 1203 1204 1205 1206
/*
 * Fill in @reply using the big-endian store helpers: the simple-reply
 * magic, the given @error code and the request @handle.
 */
static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
                                       uint64_t handle)
{
    stq_be_p(&reply->handle, handle);
    stl_be_p(&reply->error, error);
    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
}

1207
/*
 * Send a simple reply for the request identified by @handle.
 *
 * @error is a system errno value (0 for success) and is translated with
 * system_errno_to_nbd_errno() before being sent.  When @len is non-zero,
 * @len bytes from @data follow the reply header.
 *
 * Returns the result of nbd_co_send_iov().
 */
static int nbd_co_send_simple_reply(NBDClient *client,
                                    uint64_t handle,
                                    uint32_t error,
                                    void *data,
                                    size_t len,
                                    Error **errp)
{
    NBDSimpleReply hdr;
    int wire_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &hdr, .iov_len = sizeof(hdr)},
        {.iov_base = data, .iov_len = len}
    };
    /* The payload element is only transmitted when there is a payload. */
    unsigned niov = len ? 2 : 1;

    trace_nbd_co_send_simple_reply(handle, wire_err, nbd_err_lookup(wire_err),
                                   len);
    set_be_simple_reply(&hdr, wire_err, handle);

    return nbd_co_send_iov(client, iov, niov, errp);
}

1228 1229 1230 1231 1232 1233
/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, and any other negative value to report an error to
 * the client (although the caller may still need to disconnect after reporting
 * the error).
 */
1234 1235
static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
                                  Error **errp)
1236
{
1237
    NBDClient *client = req->client;
1238

1239
    g_assert(qemu_in_coroutine());
1240
    assert(client->recv_coroutine == qemu_coroutine_self());
1241
    if (nbd_receive_request(client->ioc, request, errp) < 0) {
1242
        return -EIO;
1243 1244
    }

1245 1246
    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
                                             nbd_cmd_lookup(request->type));
1247

1248
    if (request->type != NBD_CMD_WRITE) {
1249 1250 1251 1252
        /* No payload, we are ready to read the next request.  */
        req->complete = true;
    }

1253
    if (request->type == NBD_CMD_DISC) {
1254 1255
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
1256
        return -EIO;
1257 1258 1259 1260 1261
    }

    /* Check for sanity in the parameters, part 1.  Defer as many
     * checks as possible until after reading any NBD_CMD_WRITE
     * payload, so we can try and keep the connection alive.  */
1262
    if ((request->from + request->len) < request->from) {
1263 1264
        error_setg(errp,
                   "integer overflow detected, you're probably being attacked");
1265
        return -EINVAL;
1266 1267
    }

1268
    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE) {
1269
        if (request->len > NBD_MAX_BUFFER_SIZE) {
1270 1271
            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
                       request->len, NBD_MAX_BUFFER_SIZE);
1272
            return -EINVAL;
1273 1274
        }

1275 1276
        req->data = blk_try_blockalign(client->exp->blk, request->len);
        if (req->data == NULL) {
1277
            error_setg(errp, "No memory");
1278
            return -ENOMEM;
1279
        }
1280
    }
1281
    if (request->type == NBD_CMD_WRITE) {
1282 1283
        if (nbd_read(client->ioc, req->data, request->len, errp) < 0) {
            error_prepend(errp, "reading from socket failed: ");
1284
            return -EIO;
1285
        }
1286
        req->complete = true;
1287

1288 1289
        trace_nbd_co_receive_request_payload_received(request->handle,
                                                      request->len);
1290
    }
1291 1292 1293

    /* Sanity checks, part 2. */
    if (request->from + request->len > client->exp->size) {
1294 1295 1296
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
                   ", Size: %" PRIu64, request->from, request->len,
                   (uint64_t)client->exp->size);
1297
        return request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL;
1298
    }
1299
    if (request->flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
1300
        error_setg(errp, "unsupported flags (got 0x%x)", request->flags);
1301
        return -EINVAL;
E
Eric Blake 已提交
1302
    }
1303 1304
    if (request->type != NBD_CMD_WRITE_ZEROES &&
        (request->flags & NBD_CMD_FLAG_NO_HOLE)) {
1305
        error_setg(errp, "unexpected flags (got 0x%x)", request->flags);
1306
        return -EINVAL;
1307
    }
1308

1309
    return 0;
1310 1311
}

1312 1313
/* Owns a reference to the NBDClient passed as opaque.  */
static coroutine_fn void nbd_trip(void *opaque)
1314
{
P
Paolo Bonzini 已提交
1315
    NBDClient *client = opaque;
1316
    NBDExport *exp = client->exp;
1317
    NBDRequestData *req;
1318
    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
1319
    int ret;
1320
    int flags;
1321
    int reply_data_len = 0;
1322
    Error *local_err = NULL;
N
Nick Thomas 已提交
1323

1324
    trace_nbd_trip();
1325
    if (client->closing) {
1326
        nbd_client_put(client);
1327 1328
        return;
    }
N
Nick Thomas 已提交
1329

1330
    req = nbd_request_get(client);
1331
    ret = nbd_co_receive_request(req, &request, &local_err);
1332 1333
    client->recv_coroutine = NULL;
    nbd_client_receive_next_request(client);
1334
    if (ret == -EIO) {
1335
        goto disconnect;
1336
    }
N
Nick Thomas 已提交
1337

1338
    if (ret < 0) {
1339
        goto reply;
N
Nick Thomas 已提交
1340 1341
    }

1342 1343 1344 1345 1346 1347 1348 1349
    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

1350
    switch (request.type) {
N
Nick Thomas 已提交
1351
    case NBD_CMD_READ:
1352 1353
        /* XXX: NBD Protocol only documents use of FUA with WRITE */
        if (request.flags & NBD_CMD_FLAG_FUA) {
M
Max Reitz 已提交
1354
            ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1355
            if (ret < 0) {
1356
                error_setg_errno(&local_err, -ret, "flush failed");
1357
                break;
P
Paolo Bonzini 已提交
1358 1359 1360
            }
        }

1361 1362
        ret = blk_pread(exp->blk, request.from + exp->dev_offset,
                        req->data, request.len);
1363
        if (ret < 0) {
1364
            error_setg_errno(&local_err, -ret, "reading from file failed");
1365
            break;
N
Nick Thomas 已提交
1366 1367
        }

1368 1369
        reply_data_len = request.len;

N
Nick Thomas 已提交
1370 1371
        break;
    case NBD_CMD_WRITE:
P
Paolo Bonzini 已提交
1372
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1373
            error_setg(&local_err, "Export is read-only");
1374
            ret = -EROFS;
1375
            break;
1376 1377
        }

1378
        flags = 0;
1379
        if (request.flags & NBD_CMD_FLAG_FUA) {
1380 1381
            flags |= BDRV_REQ_FUA;
        }
1382
        ret = blk_pwrite(exp->blk, request.from + exp->dev_offset,
1383
                         req->data, request.len, flags);
1384
        if (ret < 0) {
1385
            error_setg_errno(&local_err, -ret, "writing to file failed");
1386
        }
N
Nick Thomas 已提交
1387

1388 1389 1390
        break;
    case NBD_CMD_WRITE_ZEROES:
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1391
            error_setg(&local_err, "Export is read-only");
1392
            ret = -EROFS;
1393
            break;
1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405
        }

        flags = 0;
        if (request.flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        if (!(request.flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        ret = blk_pwrite_zeroes(exp->blk, request.from + exp->dev_offset,
                                request.len, flags);
        if (ret < 0) {
1406
            error_setg_errno(&local_err, -ret, "writing to file failed");
1407 1408
        }

N
Nick Thomas 已提交
1409 1410
        break;
    case NBD_CMD_DISC:
1411 1412 1413
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

P
Paolo Bonzini 已提交
1414
    case NBD_CMD_FLUSH:
M
Max Reitz 已提交
1415
        ret = blk_co_flush(exp->blk);
P
Paolo Bonzini 已提交
1416
        if (ret < 0) {
1417
            error_setg_errno(&local_err, -ret, "flush failed");
P
Paolo Bonzini 已提交
1418
        }
1419

P
Paolo Bonzini 已提交
1420 1421
        break;
    case NBD_CMD_TRIM:
1422 1423 1424
        ret = blk_co_pdiscard(exp->blk, request.from + exp->dev_offset,
                              request.len);
        if (ret < 0) {
1425
            error_setg_errno(&local_err, -ret, "discard failed");
P
Paolo Bonzini 已提交
1426
        }
1427

P
Paolo Bonzini 已提交
1428
        break;
N
Nick Thomas 已提交
1429
    default:
1430 1431
        error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
                   request.type);
1432
        ret = -EINVAL;
1433 1434 1435
    }

reply:
1436
    if (local_err) {
1437 1438
        /* If we get here, local_err was not a fatal error, and should be sent
         * to the client. */
1439 1440 1441 1442
        error_report_err(local_err);
        local_err = NULL;
    }

1443
    if (nbd_co_send_simple_reply(req->client, request.handle,
1444
                                 ret < 0 ? -ret : 0,
1445
                                 req->data, reply_data_len, &local_err) < 0)
1446
    {
1447
        error_prepend(&local_err, "Failed to send reply: ");
1448 1449 1450
        goto disconnect;
    }

1451 1452 1453
    /* We must disconnect after NBD_CMD_WRITE if we did not
     * read the payload.
     */
1454 1455
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
1456
        goto disconnect;
N
Nick Thomas 已提交
1457 1458
    }

1459
done:
P
Paolo Bonzini 已提交
1460
    nbd_request_put(req);
1461
    nbd_client_put(client);
P
Paolo Bonzini 已提交
1462 1463
    return;

1464
disconnect:
1465 1466 1467
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }
1468
    nbd_request_put(req);
1469
    client_close(client, true);
1470
    nbd_client_put(client);
B
bellard 已提交
1471
}
P
Paolo Bonzini 已提交
1472

1473
/*
 * Start a new nbd_trip() coroutine for @client, unless one is already
 * receiving or the client has reached MAX_NBD_REQUESTS in-flight requests.
 * The new coroutine is given its own reference on the client.
 */
static void nbd_client_receive_next_request(NBDClient *client)
{
    if (client->recv_coroutine || client->nb_requests >= MAX_NBD_REQUESTS) {
        return;
    }

    nbd_client_get(client);
    client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
    aio_co_schedule(client->exp->ctx, client->recv_coroutine);
}

1482 1483
static coroutine_fn void nbd_co_client_start(void *opaque)
{
1484
    NBDClient *client = opaque;
1485
    NBDExport *exp = client->exp;
1486
    Error *local_err = NULL;
1487 1488 1489

    if (exp) {
        nbd_export_get(exp);
1490
        QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1491
    }
1492 1493
    qemu_co_mutex_init(&client->send_lock);

1494 1495 1496 1497
    if (nbd_negotiate(client, &local_err)) {
        if (local_err) {
            error_report_err(local_err);
        }
1498
        client_close(client, false);
1499
        return;
1500
    }
1501 1502

    nbd_client_receive_next_request(client);
1503 1504
}

1505 1506 1507 1508 1509 1510
/*
 * Create a new client listener on the given export @exp, using the
 * given channel @sioc.  Begin servicing it in a coroutine.  When the
 * connection closes, call @close_fn with an indication of whether the
 * client completed negotiation.
 */
1511 1512
void nbd_client_new(NBDExport *exp,
                    QIOChannelSocket *sioc,
1513 1514
                    QCryptoTLSCreds *tlscreds,
                    const char *tlsaclname,
1515
                    void (*close_fn)(NBDClient *, bool))
P
Paolo Bonzini 已提交
1516
{
1517
    NBDClient *client;
1518
    Coroutine *co;
1519

1520
    client = g_new0(NBDClient, 1);
1521 1522
    client->refcount = 1;
    client->exp = exp;
1523 1524 1525 1526 1527
    client->tlscreds = tlscreds;
    if (tlscreds) {
        object_ref(OBJECT(client->tlscreds));
    }
    client->tlsaclname = g_strdup(tlsaclname);
1528 1529 1530 1531
    client->sioc = sioc;
    object_ref(OBJECT(client->sioc));
    client->ioc = QIO_CHANNEL(sioc);
    object_ref(OBJECT(client->ioc));
1532
    client->close_fn = close_fn;
1533

1534 1535
    co = qemu_coroutine_create(nbd_co_client_start, client);
    qemu_coroutine_enter(co);
P
Paolo Bonzini 已提交
1536
}