nbd.c 28.9 KB
Newer Older
1
/*
B
bellard 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
 *  Network Block Device
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
16
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
17
 */
B
bellard 已提交
18

19 20
#include "block/nbd.h"
#include "block/block.h"
B
bellard 已提交
21

22
#include "block/coroutine.h"
P
Paolo Bonzini 已提交
23

B
bellard 已提交
24 25
#include <errno.h>
#include <string.h>
26
#ifndef _WIN32
B
bellard 已提交
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30 31
#include <sys/ioccom.h>
#endif
B
bellard 已提交
32 33
#include <ctype.h>
#include <inttypes.h>
34

P
Paolo Bonzini 已提交
35 36 37 38
#ifdef __linux__
#include <linux/fs.h>
#endif

39 40
#include "qemu/sockets.h"
#include "qemu/queue.h"
41
#include "qemu/main-loop.h"
42 43 44 45

//#define DEBUG_NBD

/* Unconditional diagnostic: writes the formatted message to stderr, prefixed
 * with the file, function and line it was emitted from, and appends a newline.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

/* Debug tracing: forwards to LOG() when DEBUG_NBD is defined; otherwise it
 * expands to an empty statement and its arguments are discarded.
 */
#ifdef DEBUG_NBD
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#else
#define TRACE(msg, ...) \
    do { } while (0)
#endif

/* This is all part of the "official" NBD API */

P
Paolo Bonzini 已提交
61
/* Size in bytes of the fixed wire headers. */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4) /* magic, type, handle, from, len */
#define NBD_REPLY_SIZE          (4 + 4 + 8)         /* magic, error, handle */

/* Magic numbers carried in requests and replies. */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698

/* Magic numbers sent after "NBDMAGIC" during negotiation: OPTS when an
 * option exchange follows, CLIENT when the header carries no options.
 */
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
#define NBD_CLIENT_MAGIC        0x0000420281861253LL

/* ioctl request codes issued on the NBD device file descriptor. */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option code the client sends to select an export by name. */
#define NBD_OPT_EXPORT_NAME     (1 << 0)
81

82 83 84 85 86 87 88 89 90 91 92
/* Definitions for opaque data types */

typedef struct NBDRequest NBDRequest;

/* One in-flight request on a client connection. */
struct NBDRequest {
    /* Queue linkage. */
    QSIMPLEQ_ENTRY(NBDRequest) entry;
    /* Owning client; nbd_request_get() takes a reference on it. */
    NBDClient *client;
    /* Payload buffer; allocated for READ/WRITE commands and released by
     * nbd_request_put().  NULL when the request carries no data.
     */
    uint8_t *data;
};

struct NBDExport {
93
    int refcount;
94 95
    void (*close)(NBDExport *exp);

96
    BlockDriverState *bs;
P
Paolo Bonzini 已提交
97
    char *name;
98 99 100
    off_t dev_offset;
    off_t size;
    uint32_t nbdflags;
101
    QTAILQ_HEAD(, NBDClient) clients;
P
Paolo Bonzini 已提交
102
    QTAILQ_ENTRY(NBDExport) next;
103 104
};

P
Paolo Bonzini 已提交
105 106
/* All exports that currently have a name; maintained by
 * nbd_export_set_name() and searched by nbd_export_find().
 */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

107 108 109 110 111 112 113 114 115 116 117 118
/* State of one client connection served by this process. */
struct NBDClient {
    /* Reference count, managed by nbd_client_get()/nbd_client_put(). */
    int refcount;
    /* Optional callback invoked once from nbd_client_close(). */
    void (*close)(NBDClient *client);

    /* Export the client is attached to, or NULL before one is chosen. */
    NBDExport *exp;
    /* Connected socket; closed when the last reference is dropped. */
    int sock;

    /* Coroutine currently receiving a request, if any. */
    Coroutine *recv_coroutine;

    /* Serializes replies; send_coroutine is the coroutine holding it. */
    CoMutex send_lock;
    Coroutine *send_coroutine;

    /* Linkage in the export's list of clients. */
    QTAILQ_ENTRY(NBDClient) next;
    /* Number of requests in flight (bounded by MAX_NBD_REQUESTS). */
    int nb_requests;
    /* Set once nbd_client_close() has run. */
    bool closing;
};

B
bellard 已提交
124 125
/* That's all folks */

126
/*
 * Transfer up to @size bytes between @fd and @buffer.
 *
 * @do_read selects the direction: true receives into @buffer, false sends
 * from it.  Inside a coroutine the transfer is delegated to
 * qemu_co_recv()/qemu_co_send(); otherwise the socket is driven directly
 * until @size bytes are done, EOF is seen, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (possibly short on EOF), or a
 * negative errno value.  EAGAIN is only retried once some data has already
 * been transferred; with nothing transferred it is returned as -EAGAIN.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    char *cursor = buffer;
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;
        int err;

        if (do_read) {
            n = qemu_recv(fd, cursor + done, size - done, 0);
        } else {
            n = send(fd, cursor + done, size - done, 0);
        }

        if (n == 0) {
            /* eof */
            break;
        }

        if (n > 0) {
            done += n;
            continue;
        }

        err = socket_error();
        if (err == EINTR || (done > 0 && err == EAGAIN)) {
            /* recoverable error */
            continue;
        }

        /* unrecoverable error */
        return -err;
    }

    return done;
}

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
/*
 * Receive @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * During negotiation the socket is blocking.  Afterwards a socket with
 * nothing to read just means another thread took our request/reply;
 * recv_coroutine provides the synchronization that keeps this
 * coroutine-safe.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool receiving = true;

    return nbd_wr_sync(fd, buffer, size, receiving);
}

/*
 * Send @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for as
 * long as the call reports -EAGAIN: for writes the socket is expected to
 * become writable.
 *
 * Returns the number of bytes written or a negative errno value.  The
 * result is kept in an ssize_t so that it is not truncated on its way back
 * to the caller.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    ssize_t ret;

    do {
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);

    return ret;
}

191
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/

218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
/*
 * Read the client's option block during negotiation and attach @client to
 * the export it names.
 *
 * On success the client is appended to the export's client list, a
 * reference is taken on the export, and 0 is returned.  Any short read,
 * unexpected field value, over-long name or unknown export yields -EINVAL.
 */
static int nbd_receive_options(NBDClient *client)
{
    int csock = client->sock;
    char name[256];
    uint32_t tmp, length;
    uint64_t magic;
    int rc;

    /* Client sends:
        [ 0 ..   3]   reserved (0)
        [ 4 ..  11]   NBD_OPTS_MAGIC
        [12 ..  15]   NBD_OPT_EXPORT_NAME
        [16 ..  19]   length
        [20 ..  xx]   export name (length bytes)
     */

    rc = -EINVAL;
    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking reserved");
    if (tmp != 0) {
        LOG("Bad reserved received");
        goto fail;
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking magic");
    if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
        LOG("Bad magic received");
        goto fail;
    }

    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking option");
    if (tmp != be32_to_cpu(NBD_OPT_EXPORT_NAME)) {
        LOG("Bad option received");
        goto fail;
    }

    if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking length");
    length = be32_to_cpu(length);
    /* name[] holds at most 255 characters plus the terminator. */
    if (length > 255) {
        LOG("Bad length received");
        goto fail;
    }
    if (read_sync(csock, name, length) != length) {
        LOG("read failed");
        goto fail;
    }
    name[length] = '\0';

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        LOG("export not found");
        goto fail;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);

    TRACE("Option negotiation succeeded.");
    rc = 0;
fail:
    return rc;
}

296
/* Store @value in big-endian order at @dst, which need not be aligned. */
static void nbd_store_be64(char *dst, uint64_t value)
{
    uint64_t be = cpu_to_be64(value);

    memcpy(dst, &be, sizeof(be));
}

/* Store @value in big-endian order at @dst, which need not be aligned. */
static void nbd_store_be16(char *dst, uint16_t value)
{
    uint16_t be = cpu_to_be16(value);

    memcpy(dst, &be, sizeof(be));
}

/*
 * Server side of the negotiation for @client.
 *
 * If the client already has an export, the complete header is sent in one
 * go.  Otherwise the first 18 bytes are sent, the client's option block is
 * read with nbd_receive_options() (which selects the export), and the rest
 * of the header is sent.
 *
 * The socket is put in blocking mode for the exchange and switched back to
 * non-blocking before returning.  Returns 0 on success or a negative errno
 * value; in particular a failed write always yields -EINVAL.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    int csock = client->sock;
    char buf[8 + 8 + 8 + 128];
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);

    /* Negotiation header without options:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       Negotiation header with options, part 1:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)

       part 2 (after options are sent):
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qemu_set_block(csock);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);

    if (client->exp) {
        assert((client->exp->nbdflags & ~65535) == 0);
        nbd_store_be64(buf + 8, NBD_CLIENT_MAGIC);
        nbd_store_be64(buf + 16, client->exp->size);
        nbd_store_be16(buf + 26, client->exp->nbdflags | myflags);

        if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        nbd_store_be64(buf + 8, NBD_OPTS_MAGIC);

        if (write_sync(csock, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_receive_options(client);
        if (rc < 0) {
            LOG("option negotiation failed");
            goto fail;
        }

        assert((client->exp->nbdflags & ~65535) == 0);
        /* buf + 18 is not 8-byte aligned, hence the byte-wise stores. */
        nbd_store_be64(buf + 18, client->exp->size);
        nbd_store_be16(buf + 26, client->exp->nbdflags | myflags);
        if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
            LOG("write failed");
            /* rc is 0 here after the successful option exchange; make sure
             * the failure is reported to the caller.
             */
            rc = -EINVAL;
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    qemu_set_nonblock(csock);
    return rc;
}

370 371
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
                          off_t *size, size_t *blocksize)
B
bellard 已提交
372
{
N
Nick Thomas 已提交
373 374 375
    char buf[256];
    uint64_t magic, s;
    uint16_t tmp;
376
    int rc;
N
Nick Thomas 已提交
377

D
Dong Xu Wang 已提交
378
    TRACE("Receiving negotiation.");
N
Nick Thomas 已提交
379

380 381
    rc = -EINVAL;

N
Nick Thomas 已提交
382 383
    if (read_sync(csock, buf, 8) != 8) {
        LOG("read failed");
384
        goto fail;
N
Nick Thomas 已提交
385 386 387 388 389
    }

    buf[8] = '\0';
    if (strlen(buf) == 0) {
        LOG("server connection closed");
390
        goto fail;
N
Nick Thomas 已提交
391 392 393 394 395 396 397 398 399 400 401 402 403 404
    }

    TRACE("Magic is %c%c%c%c%c%c%c%c",
          qemu_isprint(buf[0]) ? buf[0] : '.',
          qemu_isprint(buf[1]) ? buf[1] : '.',
          qemu_isprint(buf[2]) ? buf[2] : '.',
          qemu_isprint(buf[3]) ? buf[3] : '.',
          qemu_isprint(buf[4]) ? buf[4] : '.',
          qemu_isprint(buf[5]) ? buf[5] : '.',
          qemu_isprint(buf[6]) ? buf[6] : '.',
          qemu_isprint(buf[7]) ? buf[7] : '.');

    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
        LOG("Invalid magic received");
405
        goto fail;
N
Nick Thomas 已提交
406 407 408 409
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
410
        goto fail;
N
Nick Thomas 已提交
411 412 413 414 415 416 417 418 419 420
    }
    magic = be64_to_cpu(magic);
    TRACE("Magic is 0x%" PRIx64, magic);

    if (name) {
        uint32_t reserved = 0;
        uint32_t opt;
        uint32_t namesize;

        TRACE("Checking magic (opts_magic)");
P
Paolo Bonzini 已提交
421
        if (magic != NBD_OPTS_MAGIC) {
N
Nick Thomas 已提交
422
            LOG("Bad magic received");
423
            goto fail;
N
Nick Thomas 已提交
424 425 426
        }
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("flags read failed");
427
            goto fail;
N
Nick Thomas 已提交
428 429 430 431 432 433
        }
        *flags = be16_to_cpu(tmp) << 16;
        /* reserved for future use */
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
            sizeof(reserved)) {
            LOG("write failed (reserved)");
434
            goto fail;
N
Nick Thomas 已提交
435 436 437 438 439
        }
        /* write the export name */
        magic = cpu_to_be64(magic);
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
            LOG("write failed (magic)");
440
            goto fail;
N
Nick Thomas 已提交
441 442 443 444
        }
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
            LOG("write failed (opt)");
445
            goto fail;
N
Nick Thomas 已提交
446 447 448 449 450
        }
        namesize = cpu_to_be32(strlen(name));
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
            sizeof(namesize)) {
            LOG("write failed (namesize)");
451
            goto fail;
N
Nick Thomas 已提交
452 453 454
        }
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
            LOG("write failed (name)");
455
            goto fail;
N
Nick Thomas 已提交
456 457 458 459
        }
    } else {
        TRACE("Checking magic (cli_magic)");

P
Paolo Bonzini 已提交
460
        if (magic != NBD_CLIENT_MAGIC) {
N
Nick Thomas 已提交
461
            LOG("Bad magic received");
462
            goto fail;
N
Nick Thomas 已提交
463 464 465 466 467
        }
    }

    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
        LOG("read failed");
468
        goto fail;
N
Nick Thomas 已提交
469 470 471 472 473 474 475 476
    }
    *size = be64_to_cpu(s);
    *blocksize = 1024;
    TRACE("Size is %" PRIu64, *size);

    if (!name) {
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
            LOG("read failed (flags)");
477
            goto fail;
N
Nick Thomas 已提交
478 479 480 481 482
        }
        *flags = be32_to_cpup(flags);
    } else {
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("read failed (tmp)");
483
            goto fail;
N
Nick Thomas 已提交
484 485 486 487 488
        }
        *flags |= be32_to_cpu(tmp);
    }
    if (read_sync(csock, &buf, 124) != 124) {
        LOG("read failed (buf)");
489
        goto fail;
N
Nick Thomas 已提交
490
    }
491 492 493 494
    rc = 0;

fail:
    return rc;
495
}
B
bellard 已提交
496

P
Paolo Bonzini 已提交
497 498
#ifdef __linux__
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
499
{
500 501
    TRACE("Setting NBD socket");

502
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
503 504
        int serrno = errno;
        LOG("Failed to set NBD socket");
505
        return -serrno;
506 507
    }

N
Nick Thomas 已提交
508
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
B
bellard 已提交
509

510
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
N
Nick Thomas 已提交
511 512
        int serrno = errno;
        LOG("Failed setting NBD block size");
513
        return -serrno;
N
Nick Thomas 已提交
514
    }
B
bellard 已提交
515

B
Blue Swirl 已提交
516
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
B
bellard 已提交
517

518
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
N
Nick Thomas 已提交
519 520
        int serrno = errno;
        LOG("Failed setting size (in blocks)");
521
        return -serrno;
N
Nick Thomas 已提交
522
    }
B
bellard 已提交
523

P
Paolo Bonzini 已提交
524 525 526 527 528 529 530 531 532 533 534
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
        if (errno == ENOTTY) {
            int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
            TRACE("Setting readonly attribute");

            if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
                int serrno = errno;
                LOG("Failed setting read-only attribute");
                return -serrno;
            }
        } else {
P
Paolo Bonzini 已提交
535
            int serrno = errno;
P
Paolo Bonzini 已提交
536
            LOG("Failed setting flags");
537
            return -serrno;
P
Paolo Bonzini 已提交
538 539 540
        }
    }

N
Nick Thomas 已提交
541
    TRACE("Negotiation ended");
B
bellard 已提交
542

N
Nick Thomas 已提交
543
    return 0;
B
bellard 已提交
544 545 546 547
}

int nbd_disconnect(int fd)
{
N
Nick Thomas 已提交
548 549 550 551
    ioctl(fd, NBD_CLEAR_QUE);
    ioctl(fd, NBD_DISCONNECT);
    ioctl(fd, NBD_CLEAR_SOCK);
    return 0;
B
bellard 已提交
552 553
}

554
int nbd_client(int fd)
B
bellard 已提交
555
{
N
Nick Thomas 已提交
556 557
    int ret;
    int serrno;
B
bellard 已提交
558

N
Nick Thomas 已提交
559
    TRACE("Doing NBD loop");
B
bellard 已提交
560

N
Nick Thomas 已提交
561
    ret = ioctl(fd, NBD_DO_IT);
562
    if (ret < 0 && errno == EPIPE) {
563 564 565 566 567 568
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
         * that case.
         */
        ret = 0;
    }
N
Nick Thomas 已提交
569
    serrno = errno;
B
bellard 已提交
570

N
Nick Thomas 已提交
571
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
B
bellard 已提交
572

N
Nick Thomas 已提交
573 574
    TRACE("Clearing NBD queue");
    ioctl(fd, NBD_CLEAR_QUE);
B
bellard 已提交
575

N
Nick Thomas 已提交
576 577
    TRACE("Clearing NBD socket");
    ioctl(fd, NBD_CLEAR_SOCK);
B
bellard 已提交
578

N
Nick Thomas 已提交
579 580
    errno = serrno;
    return ret;
B
bellard 已提交
581
}
582
#else
P
Paolo Bonzini 已提交
583
/* Stub for non-Linux builds: device setup is unavailable, so this always
 * returns -ENOTSUP without touching its arguments.
 */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    (void)fd;
    (void)csock;
    (void)flags;
    (void)size;
    (void)blocksize;

    return -ENOTSUP;
}

/* Stub for non-Linux builds: always returns -ENOTSUP. */
int nbd_disconnect(int fd)
{
    (void)fd;

    return -ENOTSUP;
}

593
/* Stub for non-Linux builds: always returns -ENOTSUP. */
int nbd_client(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
#endif
B
bellard 已提交
598

P
Paolo Bonzini 已提交
599
ssize_t nbd_send_request(int csock, struct nbd_request *request)
B
bellard 已提交
600
{
P
Paolo Bonzini 已提交
601
    uint8_t buf[NBD_REQUEST_SIZE];
602
    ssize_t ret;
N
Nick Thomas 已提交
603 604 605 606 607 608

    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
609

N
Nick Thomas 已提交
610 611 612 613
    TRACE("Sending request to client: "
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
          request->from, request->len, request->handle, request->type);

614 615 616 617 618 619
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
620
        LOG("writing to socket failed");
621
        return -EINVAL;
N
Nick Thomas 已提交
622 623 624
    }
    return 0;
}
625

P
Paolo Bonzini 已提交
626
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
627
{
P
Paolo Bonzini 已提交
628
    uint8_t buf[NBD_REQUEST_SIZE];
N
Nick Thomas 已提交
629
    uint32_t magic;
630
    ssize_t ret;
N
Nick Thomas 已提交
631

632 633 634 635 636 637
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
638
        LOG("read failed");
639
        return -EINVAL;
N
Nick Thomas 已提交
640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    magic = be32_to_cpup((uint32_t*)buf);
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));

    TRACE("Got request: "
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
          magic, request->type, request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
662
        return -EINVAL;
N
Nick Thomas 已提交
663 664
    }
    return 0;
665 666
}

P
Paolo Bonzini 已提交
667
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
668
{
N
Nick Thomas 已提交
669 670
    uint8_t buf[NBD_REPLY_SIZE];
    uint32_t magic;
671
    ssize_t ret;
N
Nick Thomas 已提交
672

673 674 675 676 677 678
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
679
        LOG("read failed");
680
        return -EINVAL;
N
Nick Thomas 已提交
681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
    }

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */

    magic = be32_to_cpup((uint32_t*)buf);
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));

    TRACE("Got reply: "
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
          magic, reply->error, reply->handle);

    if (magic != NBD_REPLY_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
699
        return -EINVAL;
N
Nick Thomas 已提交
700 701
    }
    return 0;
702 703
}

P
Paolo Bonzini 已提交
704
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
705
{
P
Paolo Bonzini 已提交
706
    uint8_t buf[NBD_REPLY_SIZE];
707
    ssize_t ret;
N
Nick Thomas 已提交
708 709 710 711 712 713 714 715 716 717 718 719

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);

    TRACE("Sending response to client");

720 721 722 723 724 725
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
N
Nick Thomas 已提交
726
        LOG("writing to socket failed");
727
        return -EINVAL;
N
Nick Thomas 已提交
728 729
    }
    return 0;
730
}
B
bellard 已提交
731

P
Paolo Bonzini 已提交
732 733
/* Upper bound on the number of requests a single client may have in flight;
 * nbd_request_get() asserts that it is never exceeded.
 */
#define MAX_NBD_REQUESTS 16

734
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}

739
/*
 * Drop one reference on @client.  When the last reference goes away the
 * socket's fd handlers are removed, the socket is closed, the client is
 * detached from its export (dropping the export reference) and freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by nbd_client_close.
     */
    assert(client->closing);

    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
    close(client->sock);
    client->sock = -1;

    if (client->exp) {
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}

758
/*
 * Begin closing @client.  Only the first call has any effect: it marks the
 * client as closing, shuts the socket down in both directions and invokes
 * the client's close callback, if one is set.
 */
void nbd_client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        shutdown(client->sock, 2);

        /* Also tell the client, so that they release their reference.  */
        if (client->close) {
            client->close(client);
        }
    }
}

777
/*
 * Allocate a zeroed request for @client, counting it against the client's
 * in-flight total and taking a client reference that nbd_request_put()
 * releases.  The caller must not already be at MAX_NBD_REQUESTS.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDRequest *request;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests += 1;

    request = g_slice_new0(NBDRequest);
    nbd_client_get(client);
    request->client = client;

    return request;
}

790
/*
 * Release @req: free its data buffer (if any) and the request itself, then
 * decrement the owning client's in-flight count and drop the client
 * reference taken in nbd_request_get().
 */
static void nbd_request_put(NBDRequest *req)
{
    /* Remember the owner: req is gone after g_slice_free(). */
    NBDClient *owner = req->client;
    bool was_at_limit;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_slice_free(NBDRequest, req);

    was_at_limit = (owner->nb_requests == MAX_NBD_REQUESTS);
    owner->nb_requests--;
    if (was_at_limit) {
        /* The client just dropped below its request limit.
         * NOTE(review): presumably this wakes the main loop so that reading
         * from the socket can resume - confirm against nbd_can_read().
         */
        qemu_notify_event();
    }

    nbd_client_put(owner);
}

P
Paolo Bonzini 已提交
805
/*
 * Create an export for @bs.
 *
 * @dev_offset: offset into the device at which the export starts
 * @size:       size of the export, or -1 to use bdrv_getlength(bs)
 * @nbdflags:   export flags
 * @close:      optional callback run when the export is finally released
 *
 * The new export starts with one reference, has no name and no clients,
 * and holds a reference on @bs.
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags,
                          void (*close)(NBDExport *))
{
    NBDExport *export = g_malloc0(sizeof(*export));

    export->refcount = 1;
    QTAILQ_INIT(&export->clients);
    export->bs = bs;
    export->dev_offset = dev_offset;
    export->nbdflags = nbdflags;
    if (size == -1) {
        export->size = bdrv_getlength(bs);
    } else {
        export->size = size;
    }
    export->close = close;
    bdrv_ref(bs);

    return export;
}

P
Paolo Bonzini 已提交
821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853
/* Look up a named export.  Returns the first export in the global list
 * whose name equals @name, or NULL if there is none.  No reference is taken.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *candidate;

    QTAILQ_FOREACH(candidate, &exports, next) {
        if (!strcmp(name, candidate->name)) {
            return candidate;
        }
    }

    return NULL;
}

/*
 * Rename @exp to a copy of @name, or remove its name when @name is NULL.
 *
 * A named export is a member of the global export list, and that
 * membership holds one reference.  Nothing happens when @name is the very
 * pointer already stored in the export.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    if (name == exp->name) {
        return;
    }

    /* Keep the export alive while the list's reference is swapped. */
    nbd_export_get(exp);

    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }

    if (name) {
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }

    nbd_export_put(exp);
}

P
Paolo Bonzini 已提交
854 855
/*
 * Shut down @exp: close every attached client, remove the export's name
 * (and with it the export list's reference) and release the backing block
 * device.
 *
 * The block device is released before the temporary reference is dropped:
 * that final nbd_export_put() may free @exp, so @exp must not be touched
 * after it.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *client, *next_client;

    /* Hold a reference so the export survives the steps below. */
    nbd_export_get(exp);
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next_client) {
        nbd_client_close(client);
    }
    nbd_export_set_name(exp, NULL);

    if (exp->bs) {
        bdrv_unref(exp->bs);
        exp->bs = NULL;
    }

    /* May drop the last reference and free exp; nothing may follow. */
    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.  If it is the last one, the export is first
 * shut down with nbd_export_close(); once the count reaches zero the close
 * callback (if any) runs and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);

    /* About to release the final reference: close the export first. */
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    assert(exp->name == NULL);

    if (exp->close) {
        exp->close(exp);
    }

    g_free(exp);
}

P
Paolo Bonzini 已提交
894 895 896 897 898
/* Return the block device backing @exp (NULL once the export is closed). */
BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockDriverState *backing = exp->bs;

    return backing;
}

P
Paolo Bonzini 已提交
899 900 901 902 903 904 905 906 907
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

P
Paolo Bonzini 已提交
908
static int nbd_can_read(void *opaque);
P
Paolo Bonzini 已提交
909 910 911
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);

P
Paolo Bonzini 已提交
912 913
/*
 * Send @reply for @req, followed by @len bytes of req->data when @len is
 * non-zero.
 *
 * The client's send_lock is held for the duration.  While sending, the
 * socket's write handler is set to nbd_restart_write and send_coroutine
 * records the current coroutine; both are cleared again afterwards.  When
 * there is a payload the socket is corked around header and data.
 *
 * Returns the result of nbd_send_reply(), or -EIO when the header went out
 * but the payload was not sent completely.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    const bool has_payload = (len != 0);
    ssize_t rc;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (has_payload) {
        socket_set_cork(csock, 1);
    }

    rc = nbd_send_reply(csock, reply);

    if (has_payload) {
        if (rc >= 0) {
            ssize_t sent = qemu_co_send(csock, req->data, len);

            if (sent != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);

    return rc;
}

P
Paolo Bonzini 已提交
944
/*
 * Receive and validate the next request for @req's client.
 *
 * recv_coroutine records the current coroutine while this runs.  For READ
 * and WRITE commands a data buffer of request->len bytes is allocated in
 * req->data; for WRITE the payload is then read from the socket into it.
 *
 * Returns 0 on success, -EAGAIN when the header read reported EAGAIN, -EIO
 * for any other header or payload read failure, and -EINVAL when the
 * length exceeds NBD_MAX_BUFFER_SIZE or from + len overflows.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    uint32_t command;
    bool is_write;
    ssize_t rc;

    client->recv_coroutine = qemu_coroutine_self();

    rc = nbd_receive_request(csock, request);
    if (rc < 0) {
        /* Everything except "try again" is reported as an I/O error. */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
        goto out;
    }

    if (request->len > NBD_MAX_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_MAX_BUFFER_SIZE);
        rc = -EINVAL;
        goto out;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
        goto out;
    }

    TRACE("Decoding type");

    command = request->type & NBD_CMD_MASK_COMMAND;
    is_write = (command == NBD_CMD_WRITE);

    if (command == NBD_CMD_READ || is_write) {
        req->data = qemu_blockalign(client->exp->bs, request->len);
    }

    if (is_write) {
        TRACE("Reading %u byte(s)", request->len);

        if (qemu_co_recv(csock, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            rc = -EIO;
            goto out;
        }
    }

    rc = 0;

out:
    client->recv_coroutine = NULL;
    return rc;
}

P
Paolo Bonzini 已提交
996
static void nbd_trip(void *opaque)
997
{
P
Paolo Bonzini 已提交
998
    NBDClient *client = opaque;
999
    NBDExport *exp = client->exp;
1000
    NBDRequest *req;
N
Nick Thomas 已提交
1001 1002
    struct nbd_request request;
    struct nbd_reply reply;
P
Paolo Bonzini 已提交
1003
    ssize_t ret;
N
Nick Thomas 已提交
1004 1005

    TRACE("Reading request.");
1006 1007 1008
    if (client->closing) {
        return;
    }
N
Nick Thomas 已提交
1009

1010
    req = nbd_request_get(client);
P
Paolo Bonzini 已提交
1011
    ret = nbd_co_receive_request(req, &request);
1012 1013 1014
    if (ret == -EAGAIN) {
        goto done;
    }
1015
    if (ret == -EIO) {
P
Paolo Bonzini 已提交
1016
        goto out;
1017
    }
N
Nick Thomas 已提交
1018

1019 1020 1021
    reply.handle = request.handle;
    reply.error = 0;

1022 1023 1024
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
N
Nick Thomas 已提交
1025 1026
    }

P
Paolo Bonzini 已提交
1027
    if ((request.from + request.len) > exp->size) {
N
Nick Thomas 已提交
1028 1029
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
P
Paolo Bonzini 已提交
1030
                    request.from, request.len,
S
Stefan Weil 已提交
1031
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
N
Nick Thomas 已提交
1032
        LOG("requested operation past EOF--bad client?");
1033
        goto invalid_request;
N
Nick Thomas 已提交
1034 1035
    }

1036
    switch (request.type & NBD_CMD_MASK_COMMAND) {
N
Nick Thomas 已提交
1037 1038 1039
    case NBD_CMD_READ:
        TRACE("Request type is READ");

P
Paolo Bonzini 已提交
1040 1041 1042 1043 1044 1045 1046 1047 1048
        if (request.type & NBD_CMD_FLAG_FUA) {
            ret = bdrv_co_flush(exp->bs);
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

P
Paolo Bonzini 已提交
1049
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
P
Paolo Bonzini 已提交
1050
                        req->data, request.len / 512);
1051
        if (ret < 0) {
N
Nick Thomas 已提交
1052
            LOG("reading from file failed");
1053
            reply.error = -ret;
1054
            goto error_reply;
N
Nick Thomas 已提交
1055 1056 1057
        }

        TRACE("Read %u byte(s)", request.len);
P
Paolo Bonzini 已提交
1058
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
P
Paolo Bonzini 已提交
1059
            goto out;
N
Nick Thomas 已提交
1060 1061 1062 1063
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

P
Paolo Bonzini 已提交
1064
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
N
Nick Thomas 已提交
1065
            TRACE("Server is read-only, return error");
1066 1067 1068 1069 1070 1071
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

P
Paolo Bonzini 已提交
1072
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
P
Paolo Bonzini 已提交
1073
                         req->data, request.len / 512);
1074 1075 1076 1077 1078
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }
N
Nick Thomas 已提交
1079

1080
        if (request.type & NBD_CMD_FLAG_FUA) {
P
Paolo Bonzini 已提交
1081
            ret = bdrv_co_flush(exp->bs);
1082
            if (ret < 0) {
1083
                LOG("flush failed");
1084
                reply.error = -ret;
1085
                goto error_reply;
1086
            }
N
Nick Thomas 已提交
1087 1088
        }

1089
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1090
            goto out;
1091
        }
N
Nick Thomas 已提交
1092 1093 1094 1095
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        errno = 0;
P
Paolo Bonzini 已提交
1096
        goto out;
P
Paolo Bonzini 已提交
1097 1098 1099
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

P
Paolo Bonzini 已提交
1100
        ret = bdrv_co_flush(exp->bs);
P
Paolo Bonzini 已提交
1101 1102 1103 1104
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
1105
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1106
            goto out;
1107
        }
P
Paolo Bonzini 已提交
1108 1109 1110
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
P
Paolo Bonzini 已提交
1111 1112
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
                              request.len / 512);
P
Paolo Bonzini 已提交
1113 1114 1115 1116
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
1117
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1118
            goto out;
1119
        }
P
Paolo Bonzini 已提交
1120
        break;
N
Nick Thomas 已提交
1121 1122
    default:
        LOG("invalid request type (%u) received", request.type);
1123 1124 1125
    invalid_request:
        reply.error = -EINVAL;
    error_reply:
1126
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
P
Paolo Bonzini 已提交
1127
            goto out;
1128
        }
1129
        break;
N
Nick Thomas 已提交
1130 1131 1132 1133
    }

    TRACE("Request/Reply complete");

1134
done:
P
Paolo Bonzini 已提交
1135 1136 1137
    nbd_request_put(req);
    return;

P
Paolo Bonzini 已提交
1138
out:
1139
    nbd_request_put(req);
P
Paolo Bonzini 已提交
1140
    nbd_client_close(client);
B
bellard 已提交
1141
}
P
Paolo Bonzini 已提交
1142

P
Paolo Bonzini 已提交
1143 1144 1145 1146 1147 1148 1149
/*
 * Readability predicate registered together with nbd_read() on the client
 * socket.  Returns non-zero when a receive coroutine is already recorded in
 * client->recv_coroutine, or when the client has fewer than
 * MAX_NBD_REQUESTS requests outstanding; returns 0 otherwise.
 */
static int nbd_can_read(void *opaque)
{
    NBDClient *client = opaque;

    if (client->recv_coroutine) {
        return 1;
    }

    return client->nb_requests < MAX_NBD_REQUESTS;
}

1150 1151 1152 1153
static void nbd_read(void *opaque)
{
    NBDClient *client = opaque;

P
Paolo Bonzini 已提交
1154 1155 1156 1157
    if (client->recv_coroutine) {
        qemu_coroutine_enter(client->recv_coroutine, NULL);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1158 1159 1160
    }
}

P
Paolo Bonzini 已提交
1161 1162 1163 1164 1165 1166 1167
static void nbd_restart_write(void *opaque)
{
    NBDClient *client = opaque;

    qemu_coroutine_enter(client->send_coroutine, NULL);
}

1168 1169
/*
 * Allocate a client for socket csock, run the negotiation on it and register
 * the read handlers.
 *
 * exp   - export to attach the client to; may be NULL, in which case the
 *         client is not added to any export's client list.
 * csock - connected client socket.
 * close - callback stored in the client's close field.
 *
 * Returns the new client (refcount 1), or NULL if nbd_send_negotiate()
 * fails.  On failure only the client structure is freed; csock is not
 * closed here.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *c = g_malloc0(sizeof(*c));

    c->refcount = 1;
    c->exp = exp;
    c->sock = csock;

    if (nbd_send_negotiate(c) < 0) {
        g_free(c);
        return NULL;
    }

    c->close = close;
    qemu_co_mutex_init(&c->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, c);

    if (!exp) {
        return c;
    }

    /* Track the client on its export and take a reference on the export. */
    QTAILQ_INSERT_TAIL(&exp->clients, c, next);
    nbd_export_get(exp);
    return c;
}