nbd.c 30.6 KB
Newer Older
1
/*
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
 *  Network Block Device
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
B
bellard 已提交
18

19 20
#include "block/nbd.h"
#include "block/block.h"
B
bellard 已提交
21

22
#include "block/coroutine.h"
P
Paolo Bonzini 已提交
23

B
bellard 已提交
24 25
#include <errno.h>
#include <string.h>
26
#ifndef _WIN32
B
bellard 已提交
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30 31
#include <sys/ioccom.h>
#endif
B
bellard 已提交
32 33
#include <ctype.h>
#include <inttypes.h>
34

P
Paolo Bonzini 已提交
35 36 37 38
#ifdef __linux__
#include <linux/fs.h>
#endif

39 40
#include "qemu/sockets.h"
#include "qemu/queue.h"
41
#include "qemu/main-loop.h"
42 43 44 45

//#define DEBUG_NBD

#ifdef DEBUG_NBD
/* Debug tracing: forwards the message and its arguments to LOG(). */
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#else
/* Tracing disabled: expands to an empty statement; arguments are dropped. */
#define TRACE(msg, ...) \
    do { } while (0)
#endif
B
bellard 已提交
53 54 55 56 57 58 59 60

/* Print a diagnostic line on stderr, prefixed with the source file name,
 * the enclosing function name and the line number.  msg must be a string
 * literal (it is pasted between two other literals); a newline is appended.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

/* This is all part of the "official" NBD API */

P
Paolo Bonzini 已提交
61
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
N
Nick Thomas 已提交
62
#define NBD_REPLY_SIZE          (4 + 4 + 8)
B
bellard 已提交
63 64
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698
P
Paolo Bonzini 已提交
65 66
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
#define NBD_CLIENT_MAGIC        0x0000420281861253LL
B
bellard 已提交
67 68 69 70 71 72 73

#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
N
Nick Thomas 已提交
74 75
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
B
bellard 已提交
76
#define NBD_DISCONNECT          _IO(0xab, 8)
77 78
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)
B
bellard 已提交
79

N
Nick Thomas 已提交
80
#define NBD_OPT_EXPORT_NAME     (1 << 0)
81

82 83 84 85 86 87 88 89 90 91 92
/* Definitions for opaque data types */

typedef struct NBDRequest NBDRequest;

/* Per-request state for one NBD request handled on behalf of a client. */
struct NBDRequest {
    /* Queue linkage; NOTE(review): no user of this field is visible here. */
    QSIMPLEQ_ENTRY(NBDRequest) entry;
    /* Owning client; nbd_request_get() takes a client reference for it. */
    NBDClient *client;
    /* Payload buffer: allocated with qemu_blockalign() for READ/WRITE
     * commands and released with qemu_vfree() in nbd_request_put(). */
    uint8_t *data;
};

struct NBDExport {
93
    int refcount;
94 95
    void (*close)(NBDExport *exp);

96
    BlockDriverState *bs;
P
Paolo Bonzini 已提交
97
    char *name;
98 99 100
    off_t dev_offset;
    off_t size;
    uint32_t nbdflags;
101
    QTAILQ_HEAD(, NBDClient) clients;
P
Paolo Bonzini 已提交
102
    QTAILQ_ENTRY(NBDExport) next;
103 104
};

P
Paolo Bonzini 已提交
105 106
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

107 108 109 110 111 112 113 114 115 116 117 118
/* State of one client connection served by this process. */
struct NBDClient {
    /* Reference count, see nbd_client_get() / nbd_client_put(). */
    int refcount;
    /* Optional hook invoked once from nbd_client_close(). */
    void (*close)(NBDClient *client);

    /* Export this client is attached to; may be NULL before negotiation. */
    NBDExport *exp;
    /* Connected socket; closed and set to -1 when the last reference goes. */
    int sock;

    /* Coroutine currently receiving a request, or NULL. */
    Coroutine *recv_coroutine;

    /* Serializes replies; send_coroutine is the coroutine holding it. */
    CoMutex send_lock;
    Coroutine *send_coroutine;

    /* Linkage in NBDExport.clients. */
    QTAILQ_ENTRY(NBDClient) next;
    /* Number of requests in flight (bounded by MAX_NBD_REQUESTS). */
    int nb_requests;
    /* Set by nbd_client_close(); makes further close calls no-ops. */
    bool closing;
};

B
bellard 已提交
124 125
/* That's all folks */

126
/*
 * Transfer up to @size bytes between @buffer and socket @fd.
 *
 * @do_read selects the direction: true receives into @buffer, false sends
 * from it.  Inside a coroutine the work is delegated to qemu_co_recv() /
 * qemu_co_send().  Otherwise the call loops until @size bytes have been
 * moved, the peer closes the connection, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (less than @size on EOF), or a
 * negative errno value on error.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    /* Byte view of the buffer so pointer arithmetic is standard C. */
    uint8_t *bytes = buffer;
    size_t offset = 0;
    int err;

    if (qemu_in_coroutine()) {
        if (do_read) {
            return qemu_co_recv(fd, buffer, size);
        } else {
            return qemu_co_send(fd, buffer, size);
        }
    }

    while (offset < size) {
        ssize_t len;

        if (do_read) {
            len = qemu_recv(fd, bytes + offset, size - offset, 0);
        } else {
            len = send(fd, bytes + offset, size - offset, 0);
        }

        if (len < 0) {
            err = socket_error();

            /* recoverable error: EAGAIN is only retried once some data
             * has already been transferred */
            if (err == EINTR || (offset > 0 && err == EAGAIN)) {
                continue;
            }

            /* unrecoverable error */
            return -err;
        }

        /* eof */
        if (len == 0) {
            break;
        }

        offset += len;
    }

    return offset;
}

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
/* Receive @size bytes from @fd into @buffer via nbd_wr_sync(). */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool receiving = true;

    /* During negotiation the socket is in blocking mode.  Afterwards a
     * socket that is not readable just means another thread took our
     * request/reply; recv_coroutine provides the synchronization that
     * keeps this coroutine-safe.
     */
    return nbd_wr_sync(fd, buffer, size, receiving);
}

/*
 * Send @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for as
 * long as it reports -EAGAIN.  Returns what nbd_wr_sync() returned.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    /* ssize_t, not int: the result must not be truncated on its way out. */
    ssize_t ret;
    do {
        /* For writes, we do expect the socket to be writable.  */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);
    return ret;
}

191 192
/*
 * Format "address:port" into @buf (at most @len bytes, NUL-terminated by
 * snprintf).  An address containing ':' is taken to be an IPv6 literal and
 * is wrapped in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    /* If the address-part contains a colon, it's an IPv6 IP so needs [] */
    if (strchr(address, ':') != NULL) {
        snprintf(buf, len, "[%s]:%u", address, port);
    } else {
        snprintf(buf, len, "%s:%u", address, port);
    }
}

202 203 204 205 206 207 208 209 210 211 212 213
/*
 * Open an outgoing TCP connection described by @opts.  Any error reported
 * by inet_connect_opts() is forwarded to qerror_report_err() and freed.
 * Returns whatever inet_connect_opts() returned.
 */
int tcp_socket_outgoing_opts(QemuOpts *opts)
{
    Error *err = NULL;
    int sock;

    sock = inet_connect_opts(opts, &err, NULL, NULL);
    if (err) {
        qerror_report_err(err);
        error_free(err);
    }

    return sock;
}

214
/*
 * Build an "address:port" specification from @address and @port and hand
 * it to tcp_socket_incoming_spec(); returns that function's result.
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char address_and_port[128];
    combine_addr(address_and_port, sizeof(address_and_port), address, port);
    return tcp_socket_incoming_spec(address_and_port);
}
220

221 222
int tcp_socket_incoming_spec(const char *address_and_port)
{
223 224 225 226 227 228 229 230
    Error *local_err = NULL;
    int fd = inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, &local_err);

    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
231
}
232

233 234
int unix_socket_incoming(const char *path)
{
235 236
    Error *local_err = NULL;
    int fd = unix_listen(path, NULL, 0, &local_err);
237

238 239 240 241 242
    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
243 244
}

245 246
int unix_socket_outgoing(const char *path)
{
247 248 249 250 251 252 253 254
    Error *local_err = NULL;
    int fd = unix_connect(path, &local_err);

    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
255
}
256

257
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/

284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
/*
 * Read the client's option block, look up the export it names and attach
 * the client to it (taking an export reference).
 *
 * Returns 0 on success and -EINVAL on any read failure, malformed field or
 * unknown export name.
 */
static int nbd_receive_options(NBDClient *client)
{
    const int sock = client->sock;
    char export_name[256];
    uint32_t word, name_len;
    uint64_t opts_magic;

    /* Client sends:
        [ 0 ..   3]   reserved (0)
        [ 4 ..  11]   NBD_OPTS_MAGIC
        [12 ..  15]   NBD_OPT_EXPORT_NAME
        [16 ..  19]   length
        [20 ..  xx]   export name (length bytes)
     */

    /* reserved word, must be zero */
    if (read_sync(sock, &word, sizeof(word)) != sizeof(word)) {
        LOG("read failed");
        return -EINVAL;
    }
    TRACE("Checking reserved");
    if (word != 0) {
        LOG("Bad reserved received");
        return -EINVAL;
    }

    /* options magic, compared in wire byte order */
    if (read_sync(sock, &opts_magic, sizeof(opts_magic)) !=
        sizeof(opts_magic)) {
        LOG("read failed");
        return -EINVAL;
    }
    TRACE("Checking reserved");
    if (opts_magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
        LOG("Bad magic received");
        return -EINVAL;
    }

    /* option code: only NBD_OPT_EXPORT_NAME is accepted */
    if (read_sync(sock, &word, sizeof(word)) != sizeof(word)) {
        LOG("read failed");
        return -EINVAL;
    }
    TRACE("Checking option");
    if (word != be32_to_cpu(NBD_OPT_EXPORT_NAME)) {
        LOG("Bad option received");
        return -EINVAL;
    }

    /* name length; must leave room for the terminating NUL */
    if (read_sync(sock, &name_len, sizeof(name_len)) != sizeof(name_len)) {
        LOG("read failed");
        return -EINVAL;
    }
    TRACE("Checking length");
    name_len = be32_to_cpu(name_len);
    if (name_len > 255) {
        LOG("Bad length received");
        return -EINVAL;
    }
    if (read_sync(sock, export_name, name_len) != name_len) {
        LOG("read failed");
        return -EINVAL;
    }
    export_name[name_len] = '\0';

    client->exp = nbd_export_find(export_name);
    if (!client->exp) {
        LOG("export not found");
        return -EINVAL;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);

    TRACE("Option negotiation succeeded.");
    return 0;
}

362
/*
 * Server side of the NBD handshake on client->sock.
 *
 * If the client already has an export, a single header carrying the export
 * size and flags is sent.  Otherwise the first part of the header is sent,
 * the client's option block is read with nbd_receive_options() (which picks
 * the export), and then the remainder of the header is sent.
 *
 * The socket is switched to blocking mode for the exchange and back to
 * non-blocking mode before returning.  Returns 0 on success, a negative
 * errno value on failure.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    int csock = client->sock;
    char buf[8 + 8 + 8 + 128];
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);

    /* Negotiation header without options:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       Negotiation header with options, part 1:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)

       part 2 (after options are sent):
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qemu_set_block(csock);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);
    if (client->exp) {
        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
        cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
    } else {
        cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
    }

    if (client->exp) {
        if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        if (write_sync(csock, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_receive_options(client);
        if (rc < 0) {
            LOG("option negotiation failed");
            goto fail;
        }
        /* rc is 0 here; go back to the pessimistic default so that a
         * failure of the final write is not reported as success. */
        rc = -EINVAL;

        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
        if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
            LOG("write failed");
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    qemu_set_nonblock(csock);
    return rc;
}

436 437
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
                          off_t *size, size_t *blocksize)
B
bellard 已提交
438
{
N
Nick Thomas 已提交
439 440 441
    char buf[256];
    uint64_t magic, s;
    uint16_t tmp;
442
    int rc;
N
Nick Thomas 已提交
443

D
Dong Xu Wang 已提交
444
    TRACE("Receiving negotiation.");
N
Nick Thomas 已提交
445

446
    qemu_set_block(csock);
447 448
    rc = -EINVAL;

N
Nick Thomas 已提交
449 450
    if (read_sync(csock, buf, 8) != 8) {
        LOG("read failed");
451
        goto fail;
N
Nick Thomas 已提交
452 453 454 455 456
    }

    buf[8] = '\0';
    if (strlen(buf) == 0) {
        LOG("server connection closed");
457
        goto fail;
N
Nick Thomas 已提交
458 459 460 461 462 463 464 465 466 467 468 469 470 471
    }

    TRACE("Magic is %c%c%c%c%c%c%c%c",
          qemu_isprint(buf[0]) ? buf[0] : '.',
          qemu_isprint(buf[1]) ? buf[1] : '.',
          qemu_isprint(buf[2]) ? buf[2] : '.',
          qemu_isprint(buf[3]) ? buf[3] : '.',
          qemu_isprint(buf[4]) ? buf[4] : '.',
          qemu_isprint(buf[5]) ? buf[5] : '.',
          qemu_isprint(buf[6]) ? buf[6] : '.',
          qemu_isprint(buf[7]) ? buf[7] : '.');

    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
        LOG("Invalid magic received");
472
        goto fail;
N
Nick Thomas 已提交
473 474 475 476
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
477
        goto fail;
N
Nick Thomas 已提交
478 479 480 481 482 483 484 485 486 487
    }
    magic = be64_to_cpu(magic);
    TRACE("Magic is 0x%" PRIx64, magic);

    if (name) {
        uint32_t reserved = 0;
        uint32_t opt;
        uint32_t namesize;

        TRACE("Checking magic (opts_magic)");
P
Paolo Bonzini 已提交
488
        if (magic != NBD_OPTS_MAGIC) {
N
Nick Thomas 已提交
489
            LOG("Bad magic received");
490
            goto fail;
N
Nick Thomas 已提交
491 492 493
        }
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("flags read failed");
494
            goto fail;
N
Nick Thomas 已提交
495 496 497 498 499 500
        }
        *flags = be16_to_cpu(tmp) << 16;
        /* reserved for future use */
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
            sizeof(reserved)) {
            LOG("write failed (reserved)");
501
            goto fail;
N
Nick Thomas 已提交
502 503 504 505 506
        }
        /* write the export name */
        magic = cpu_to_be64(magic);
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
            LOG("write failed (magic)");
507
            goto fail;
N
Nick Thomas 已提交
508 509 510 511
        }
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
            LOG("write failed (opt)");
512
            goto fail;
N
Nick Thomas 已提交
513 514 515 516 517
        }
        namesize = cpu_to_be32(strlen(name));
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
            sizeof(namesize)) {
            LOG("write failed (namesize)");
518
            goto fail;
N
Nick Thomas 已提交
519 520 521
        }
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
            LOG("write failed (name)");
522
            goto fail;
N
Nick Thomas 已提交
523 524 525 526
        }
    } else {
        TRACE("Checking magic (cli_magic)");

P
Paolo Bonzini 已提交
527
        if (magic != NBD_CLIENT_MAGIC) {
N
Nick Thomas 已提交
528
            LOG("Bad magic received");
529
            goto fail;
N
Nick Thomas 已提交
530 531 532 533 534
        }
    }

    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
        LOG("read failed");
535
        goto fail;
N
Nick Thomas 已提交
536 537 538 539 540 541 542 543
    }
    *size = be64_to_cpu(s);
    *blocksize = 1024;
    TRACE("Size is %" PRIu64, *size);

    if (!name) {
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
            LOG("read failed (flags)");
544
            goto fail;
N
Nick Thomas 已提交
545 546 547 548 549
        }
        *flags = be32_to_cpup(flags);
    } else {
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("read failed (tmp)");
550
            goto fail;
N
Nick Thomas 已提交
551 552 553 554 555
        }
        *flags |= be32_to_cpu(tmp);
    }
    if (read_sync(csock, &buf, 124) != 124) {
        LOG("read failed (buf)");
556
        goto fail;
N
Nick Thomas 已提交
557
    }
558 559 560
    rc = 0;

fail:
561
    qemu_set_nonblock(csock);
562
    return rc;
563
}
B
bellard 已提交
564

P
Paolo Bonzini 已提交
565 566
#ifdef __linux__
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
567
{
568 569
    TRACE("Setting NBD socket");

570
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
571 572
        int serrno = errno;
        LOG("Failed to set NBD socket");
573
        return -serrno;
574 575
    }

N
Nick Thomas 已提交
576
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
B
bellard 已提交
577

578
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
N
Nick Thomas 已提交
579 580
        int serrno = errno;
        LOG("Failed setting NBD block size");
581
        return -serrno;
N
Nick Thomas 已提交
582
    }
B
bellard 已提交
583

B
Blue Swirl 已提交
584
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
B
bellard 已提交
585

586
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
N
Nick Thomas 已提交
587 588
        int serrno = errno;
        LOG("Failed setting size (in blocks)");
589
        return -serrno;
N
Nick Thomas 已提交
590
    }
B
bellard 已提交
591

P
Paolo Bonzini 已提交
592 593 594 595 596 597 598 599 600 601 602
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
        if (errno == ENOTTY) {
            int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
            TRACE("Setting readonly attribute");

            if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
                int serrno = errno;
                LOG("Failed setting read-only attribute");
                return -serrno;
            }
        } else {
P
Paolo Bonzini 已提交
603
            int serrno = errno;
P
Paolo Bonzini 已提交
604
            LOG("Failed setting flags");
605
            return -serrno;
P
Paolo Bonzini 已提交
606 607 608
        }
    }

N
Nick Thomas 已提交
609
    TRACE("Negotiation ended");
B
bellard 已提交
610

N
Nick Thomas 已提交
611
    return 0;
B
bellard 已提交
612 613 614 615
}

int nbd_disconnect(int fd)
{
N
Nick Thomas 已提交
616 617 618 619
    ioctl(fd, NBD_CLEAR_QUE);
    ioctl(fd, NBD_DISCONNECT);
    ioctl(fd, NBD_CLEAR_SOCK);
    return 0;
B
bellard 已提交
620 621
}

622
int nbd_client(int fd)
B
bellard 已提交
623
{
N
Nick Thomas 已提交
624 625
    int ret;
    int serrno;
B
bellard 已提交
626

N
Nick Thomas 已提交
627
    TRACE("Doing NBD loop");
B
bellard 已提交
628

N
Nick Thomas 已提交
629
    ret = ioctl(fd, NBD_DO_IT);
630
    if (ret < 0 && errno == EPIPE) {
631 632 633 634 635 636
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
         * that case.
         */
        ret = 0;
    }
N
Nick Thomas 已提交
637
    serrno = errno;
B
bellard 已提交
638

N
Nick Thomas 已提交
639
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
B
bellard 已提交
640

N
Nick Thomas 已提交
641 642
    TRACE("Clearing NBD queue");
    ioctl(fd, NBD_CLEAR_QUE);
B
bellard 已提交
643

N
Nick Thomas 已提交
644 645
    TRACE("Clearing NBD socket");
    ioctl(fd, NBD_CLEAR_SOCK);
B
bellard 已提交
646

N
Nick Thomas 已提交
647 648
    errno = serrno;
    return ret;
B
bellard 已提交
649
}
650
#else
P
Paolo Bonzini 已提交
651
/* Stub for hosts without the kernel NBD interface: always -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    return -ENOTSUP;
}

/* Stub for hosts without the kernel NBD interface: always -ENOTSUP. */
int nbd_disconnect(int fd)
{
    return -ENOTSUP;
}

661
/* Stub for hosts without the kernel NBD interface: always -ENOTSUP. */
int nbd_client(int fd)
{
    return -ENOTSUP;
}
#endif
B
bellard 已提交
666

P
Paolo Bonzini 已提交
667
ssize_t nbd_send_request(int csock, struct nbd_request *request)
B
bellard 已提交
668
{
P
Paolo Bonzini 已提交
669
    uint8_t buf[NBD_REQUEST_SIZE];
670
    ssize_t ret;
N
Nick Thomas 已提交
671 672 673 674 675 676

    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
677

N
Nick Thomas 已提交
678 679 680 681
    TRACE("Sending request to client: "
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
          request->from, request->len, request->handle, request->type);

682 683 684 685 686 687
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
688
        LOG("writing to socket failed");
689
        return -EINVAL;
N
Nick Thomas 已提交
690 691 692
    }
    return 0;
}
693

P
Paolo Bonzini 已提交
694
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
695
{
P
Paolo Bonzini 已提交
696
    uint8_t buf[NBD_REQUEST_SIZE];
N
Nick Thomas 已提交
697
    uint32_t magic;
698
    ssize_t ret;
N
Nick Thomas 已提交
699

700 701 702 703 704 705
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
706
        LOG("read failed");
707
        return -EINVAL;
N
Nick Thomas 已提交
708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    magic = be32_to_cpup((uint32_t*)buf);
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));

    TRACE("Got request: "
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
          magic, request->type, request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
730
        return -EINVAL;
N
Nick Thomas 已提交
731 732
    }
    return 0;
733 734
}

P
Paolo Bonzini 已提交
735
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
736
{
N
Nick Thomas 已提交
737 738
    uint8_t buf[NBD_REPLY_SIZE];
    uint32_t magic;
739
    ssize_t ret;
N
Nick Thomas 已提交
740

741 742 743 744 745 746
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
747
        LOG("read failed");
748
        return -EINVAL;
N
Nick Thomas 已提交
749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766
    }

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */

    magic = be32_to_cpup((uint32_t*)buf);
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));

    TRACE("Got reply: "
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
          magic, reply->error, reply->handle);

    if (magic != NBD_REPLY_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
767
        return -EINVAL;
N
Nick Thomas 已提交
768 769
    }
    return 0;
770 771
}

P
Paolo Bonzini 已提交
772
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
773
{
P
Paolo Bonzini 已提交
774
    uint8_t buf[NBD_REPLY_SIZE];
775
    ssize_t ret;
N
Nick Thomas 已提交
776 777 778 779 780 781 782 783 784 785 786 787

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);

    TRACE("Sending response to client");

788 789 790 791 792 793
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
794
        LOG("writing to socket failed");
795
        return -EINVAL;
N
Nick Thomas 已提交
796 797
    }
    return 0;
798
}
B
bellard 已提交
799

P
Paolo Bonzini 已提交
800 801
#define MAX_NBD_REQUESTS 16

802
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount++;
}

807
/*
 * Drop one reference on @client.  When the last one goes away the fd
 * handlers are removed, the socket is closed, the client is detached from
 * its export (dropping the export reference) and the structure is freed.
 */
void nbd_client_put(NBDClient *client)
{
    if (--client->refcount == 0) {
        /* The last reference should be dropped by client->close,
         * which is called by nbd_client_close.
         */
        assert(client->closing);

        qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
        close(client->sock);
        client->sock = -1;
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            nbd_export_put(client->exp);
        }
        g_free(client);
    }
}

826
/*
 * Start closing @client: mark it as closing, shut the socket down in both
 * directions and invoke the client's close hook if one is set.  Calling it
 * again on a client that is already closing does nothing.
 */
void nbd_client_close(NBDClient *client)
{
    if (client->closing) {
        return;
    }

    client->closing = true;

    /* Force requests to finish.  They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    shutdown(client->sock, 2);

    /* Also tell the client, so that they release their reference.  */
    if (client->close) {
        client->close(client);
    }
}

845
/*
 * Allocate a zeroed request for @client, count it in client->nb_requests
 * and take a client reference on its behalf.  The caller must not exceed
 * MAX_NBD_REQUESTS requests in flight (asserted).
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDRequest *req;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    req = g_slice_new0(NBDRequest);
    nbd_client_get(client);
    req->client = client;
    return req;
}

858
/*
 * Release @req: free its data buffer and the request itself, decrement
 * the client's in-flight count and drop the client reference taken by
 * nbd_request_get().
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_slice_free(NBDRequest, req);

    /* The count was at its maximum before this release: notify the main
     * loop (presumably so that reading from the socket resumes). */
    if (client->nb_requests-- == MAX_NBD_REQUESTS) {
        qemu_notify_event();
    }
    nbd_client_put(client);
}

P
Paolo Bonzini 已提交
873
/*
 * Create an export for @bs with one reference held by the caller.
 *
 * @dev_offset and @nbdflags are stored as given.  A @size of -1 means
 * "use bdrv_getlength(bs)".  @close, if non-NULL, is invoked by
 * nbd_export_put() just before the export is freed.  A reference on @bs is
 * taken with bdrv_ref().
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags,
                          void (*close)(NBDExport *))
{
    NBDExport *exp = g_malloc0(sizeof(NBDExport));
    exp->refcount = 1;
    QTAILQ_INIT(&exp->clients);
    exp->bs = bs;
    exp->dev_offset = dev_offset;
    exp->nbdflags = nbdflags;
    exp->size = size == -1 ? bdrv_getlength(bs) : size;
    exp->close = close;
    bdrv_ref(bs);
    return exp;
}

P
Paolo Bonzini 已提交
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
/*
 * Look up a named export.  Walks the file-level "exports" list and returns
 * the first entry whose name equals @name, or NULL when none matches.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *candidate;

    QTAILQ_FOREACH(candidate, &exports, next) {
        if (!strcmp(candidate->name, name)) {
            return candidate;
        }
    }

    return NULL;
}

/*
 * Rename, name or un-name an export.
 *
 * A named export is linked into the file-level "exports" list and that
 * membership holds one reference on it.  Passing NULL removes the name,
 * unlinks the export and drops that reference; passing a string stores a
 * private copy (g_strdup) and links the export at the tail of the list.
 * Nothing happens when @name is the very pointer already stored.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    /* Pointer comparison only; this also covers NULL -> NULL. */
    if (exp->name == name) {
        return;
    }

    /* Temporary reference so the export survives dropping the reference
     * owned by its old name below. */
    nbd_export_get(exp);
    if (exp->name != NULL) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }
    if (name != NULL) {
        /* Reference owned by the list membership. */
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }
    /* Drop the temporary reference taken above. */
    nbd_export_put(exp);
}

P
Paolo Bonzini 已提交
922 923
/*
 * Shut an export down: close every attached client, remove its name
 * (which unlinks it from the "exports" list) and release the block device
 * reference taken in nbd_export_new().
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *client, *next;

    /* Temporary reference: keeps exp alive while clients and the name
     * drop theirs. */
    nbd_export_get(exp);
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        nbd_client_close(client);
    }
    nbd_export_set_name(exp, NULL);
    /* Release the block device BEFORE dropping the temporary reference:
     * the nbd_export_put() below may free exp, so it must not be touched
     * afterwards. */
    if (exp->bs) {
        bdrv_unref(exp->bs);
        exp->bs = NULL;
    }
    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be live. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.
 *
 * When the reference being dropped is the last one, the export is first
 * closed with nbd_export_close().  Once the count reaches zero the close
 * hook, if any, is invoked and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    if (--exp->refcount == 0) {
        /* A named export still holds a reference through its list
         * membership, so it cannot reach zero while named. */
        assert(exp->name == NULL);

        if (exp->close) {
            exp->close(exp);
        }

        g_free(exp);
    }
}

P
Paolo Bonzini 已提交
962 963 964 965 966
/* Return the block device stored in @exp (no reference is taken). */
BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockDriverState *device = exp->bs;

    return device;
}

P
Paolo Bonzini 已提交
967 968 969 970 971 972 973 974 975
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

P
Paolo Bonzini 已提交
976
static int nbd_can_read(void *opaque);
P
Paolo Bonzini 已提交
977 978 979
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);

P
Paolo Bonzini 已提交
980 981
/*
 * Send @reply for @req, followed by @len bytes of req->data when @len is
 * non-zero.  Runs under client->send_lock so replies from different
 * coroutines do not interleave.
 *
 * While sending, nbd_restart_write is installed as the socket's write
 * handler and client->send_coroutine points at this coroutine; both are
 * reset before the lock is released.  When a payload follows, the socket
 * is corked around the header and the payload.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the payload was not
 * sent in full.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    ssize_t rc, ret;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (!len) {
        rc = nbd_send_reply(csock, reply);
    } else {
        socket_set_cork(csock, 1);
        rc = nbd_send_reply(csock, reply);
        if (rc >= 0) {
            ret = qemu_co_send(csock, req->data, len);
            if (ret != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);
    return rc;
}

P
Paolo Bonzini 已提交
1012
/*
 * Read one request header from the client that owns @req into @request,
 * validate it, and for a WRITE also read the payload into req->data.
 *
 * req->data is allocated here (via qemu_blockalign) for READ and WRITE
 * commands only.  client->recv_coroutine points at the running coroutine
 * for the duration of the call and is reset to NULL on every exit path.
 *
 * Returns:
 *   0        on success;
 *   -EAGAIN  when nbd_receive_request() reported -EAGAIN;
 *   -EIO     for any other header receive failure, or a short payload read;
 *   -EINVAL  when len exceeds NBD_MAX_BUFFER_SIZE or from + len wraps around.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int fd = client->sock;
    ssize_t status;

    client->recv_coroutine = qemu_coroutine_self();

    status = nbd_receive_request(fd, request);
    if (status < 0) {
        /* Only -EAGAIN is passed through; everything else becomes -EIO. */
        status = (status == -EAGAIN) ? status : -EIO;
        goto finish;
    }

    if (request->len > NBD_MAX_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_MAX_BUFFER_SIZE);
        status = -EINVAL;
        goto finish;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        status = -EINVAL;
        goto finish;
    }

    TRACE("Decoding type");

    switch (request->type & NBD_CMD_MASK_COMMAND) {
    case NBD_CMD_READ:
        /* Buffer only; it is filled later by the caller's read path. */
        req->data = qemu_blockalign(client->exp->bs, request->len);
        break;
    case NBD_CMD_WRITE:
        req->data = qemu_blockalign(client->exp->bs, request->len);

        TRACE("Reading %u byte(s)", request->len);

        if (qemu_co_recv(fd, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            status = -EIO;
            goto finish;
        }
        break;
    default:
        /* Other commands carry no payload and need no buffer. */
        break;
    }
    status = 0;

finish:
    client->recv_coroutine = NULL;
    return status;
}

P
Paolo Bonzini 已提交
1064
static void nbd_trip(void *opaque)
1065
{
P
Paolo Bonzini 已提交
1066
    NBDClient *client = opaque;
1067
    NBDExport *exp = client->exp;
1068
    NBDRequest *req;
N
Nick Thomas 已提交
1069 1070
    struct nbd_request request;
    struct nbd_reply reply;
P
Paolo Bonzini 已提交
1071
    ssize_t ret;
N
Nick Thomas 已提交
1072 1073

    TRACE("Reading request.");
1074 1075 1076
    if (client->closing) {
        return;
    }
N
Nick Thomas 已提交
1077

1078
    req = nbd_request_get(client);
P
Paolo Bonzini 已提交
1079
    ret = nbd_co_receive_request(req, &request);
1080 1081 1082
    if (ret == -EAGAIN) {
        goto done;
    }
1083
    if (ret == -EIO) {
P
Paolo Bonzini 已提交
1084
        goto out;
1085
    }
N
Nick Thomas 已提交
1086

1087 1088 1089
    reply.handle = request.handle;
    reply.error = 0;

1090 1091 1092
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
N
Nick Thomas 已提交
1093 1094
    }

P
Paolo Bonzini 已提交
1095
    if ((request.from + request.len) > exp->size) {
N
Nick Thomas 已提交
1096 1097
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
P
Paolo Bonzini 已提交
1098
                    request.from, request.len,
S
Stefan Weil 已提交
1099
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
N
Nick Thomas 已提交
1100
        LOG("requested operation past EOF--bad client?");
1101
        goto invalid_request;
N
Nick Thomas 已提交
1102 1103
    }

1104
    switch (request.type & NBD_CMD_MASK_COMMAND) {
N
Nick Thomas 已提交
1105 1106 1107
    case NBD_CMD_READ:
        TRACE("Request type is READ");

P
Paolo Bonzini 已提交
1108 1109 1110 1111 1112 1113 1114 1115 1116
        if (request.type & NBD_CMD_FLAG_FUA) {
            ret = bdrv_co_flush(exp->bs);
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

P
Paolo Bonzini 已提交
1117
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
P
Paolo Bonzini 已提交
1118
                        req->data, request.len / 512);
1119
        if (ret < 0) {
N
Nick Thomas 已提交
1120
            LOG("reading from file failed");
1121
            reply.error = -ret;
1122
            goto error_reply;
N
Nick Thomas 已提交
1123 1124 1125
        }

        TRACE("Read %u byte(s)", request.len);
P
Paolo Bonzini 已提交
1126
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
P
Paolo Bonzini 已提交
1127
            goto out;
N
Nick Thomas 已提交
1128 1129 1130 1131
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

P
Paolo Bonzini 已提交
1132
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
N
Nick Thomas 已提交
1133
            TRACE("Server is read-only, return error");
1134 1135 1136 1137 1138 1139
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

P
Paolo Bonzini 已提交
1140
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
P
Paolo Bonzini 已提交
1141
                         req->data, request.len / 512);
1142 1143 1144 1145 1146
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }
N
Nick Thomas 已提交
1147

1148
        if (request.type & NBD_CMD_FLAG_FUA) {
P
Paolo Bonzini 已提交
1149
            ret = bdrv_co_flush(exp->bs);
1150
            if (ret < 0) {
1151
                LOG("flush failed");
1152
                reply.error = -ret;
1153
                goto error_reply;
1154
            }
N
Nick Thomas 已提交
1155 1156
        }

1157
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1158
            goto out;
1159
        }
N
Nick Thomas 已提交
1160 1161 1162 1163
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        errno = 0;
P
Paolo Bonzini 已提交
1164
        goto out;
P
Paolo Bonzini 已提交
1165 1166 1167
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

P
Paolo Bonzini 已提交
1168
        ret = bdrv_co_flush(exp->bs);
P
Paolo Bonzini 已提交
1169 1170 1171 1172
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
1173
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1174
            goto out;
1175
        }
P
Paolo Bonzini 已提交
1176 1177 1178
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
P
Paolo Bonzini 已提交
1179 1180
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
                              request.len / 512);
P
Paolo Bonzini 已提交
1181 1182 1183 1184
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
1185
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1186
            goto out;
1187
        }
P
Paolo Bonzini 已提交
1188
        break;
N
Nick Thomas 已提交
1189 1190
    default:
        LOG("invalid request type (%u) received", request.type);
1191 1192 1193
    invalid_request:
        reply.error = -EINVAL;
    error_reply:
1194
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1195
            goto out;
1196
        }
1197
        break;
N
Nick Thomas 已提交
1198 1199 1200 1201
    }

    TRACE("Request/Reply complete");

1202
done:
P
Paolo Bonzini 已提交
1203 1204 1205
    nbd_request_put(req);
    return;

P
Paolo Bonzini 已提交
1206
out:
1207
    nbd_request_put(req);
P
Paolo Bonzini 已提交
1208
    nbd_client_close(client);
B
bellard 已提交
1209
}
P
Paolo Bonzini 已提交
1210

P
Paolo Bonzini 已提交
1211 1212 1213 1214 1215 1216 1217
/*
 * Tell the caller whether the client in @opaque may be read from: non-zero
 * when a receive coroutine is already set on the client, or when fewer than
 * MAX_NBD_REQUESTS requests are outstanding.
 */
static int nbd_can_read(void *opaque)
{
    const NBDClient *c = opaque;

    if (c->recv_coroutine) {
        return 1;
    }
    return c->nb_requests < MAX_NBD_REQUESTS;
}

1218 1219 1220 1221
static void nbd_read(void *opaque)
{
    NBDClient *client = opaque;

P
Paolo Bonzini 已提交
1222 1223 1224 1225
    if (client->recv_coroutine) {
        qemu_coroutine_enter(client->recv_coroutine, NULL);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1226 1227 1228
    }
}

P
Paolo Bonzini 已提交
1229 1230 1231 1232 1233 1234 1235
static void nbd_restart_write(void *opaque)
{
    NBDClient *client = opaque;

    qemu_coroutine_enter(client->send_coroutine, NULL);
}

1236 1237
/*
 * Allocate a client for socket @csock, optionally bound to export @exp, and
 * run the negotiation on it.
 *
 * @close is stored as the client's close callback.  On success the socket's
 * read handlers are installed and, when @exp is non-NULL, the client is
 * appended to exp->clients and nbd_export_get() is called on the export.
 *
 * Returns the new client (refcount 1), or NULL if nbd_send_negotiate()
 * failed, in which case the allocation has been freed.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *client = g_malloc0(sizeof(*client));

    client->refcount = 1;
    client->exp = exp;
    client->sock = csock;

    if (nbd_send_negotiate(client) < 0) {
        g_free(client);
        return NULL;
    }

    client->close = close;
    qemu_co_mutex_init(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);

    if (!exp) {
        return client;
    }

    QTAILQ_INSERT_TAIL(&exp->clients, client, next);
    nbd_export_get(exp);
    return client;
}