/*
 * QEMU Block driver for NBD
 *
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu-common.h"
#include "nbd.h"
#include "block_int.h"
#include "module.h"
#include "qemu_socket.h"

#include <sys/types.h>
#include <unistd.h>

#define EN_OPTSTR ":exportname="

/* #define DEBUG_NBD */

#if defined(DEBUG_NBD)
#define logout(fmt, ...) \
                fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
#else
#define logout(fmt, ...) ((void)0)
#endif

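/* In-flight requests each occupy one slot of recv_coroutine[].  The slot
 * index is encoded into the 64-bit NBD handle by XOR-ing it with the
 * BDRVNBDState pointer, so the reply handler can map a reply back to the
 * coroutine that issued the matching request. */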
#define MAX_NBD_REQUESTS	16
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))

typedef struct BDRVNBDState {
    int sock;
    uint32_t nbdflags;
    off_t size;
    size_t blocksize;
    char *export_name; /* An NBD server may export several devices */

    CoMutex send_mutex;
    CoMutex free_sema;
    Coroutine *send_coroutine;
    int in_flight;

    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
    struct nbd_reply reply;

    /* If it begins with  '/', this is a UNIX domain socket. Otherwise,
     * it's a string of the form <hostname|ip4|\[ip6\]>:port
     */
    char *host_spec;
} BDRVNBDState;

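/* Parse an NBD "filename" of the form
 *
 *   nbd:<hostname|ip4|[ip6]>:<port>[:exportname=<export>]
 *   nbd:unix:<absolute path to socket>[:exportname=<export>]
 *
 * into s->host_spec and (optionally) s->export_name.  Returns 0 on success,
 * -EINVAL if the string is malformed. */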
static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
{
    char *file;
    char *export_name;
    const char *host_spec;
    const char *unixpath;
    int err = -EINVAL;

    file = g_strdup(filename);

    export_name = strstr(file, EN_OPTSTR);
    if (export_name) {
        if (export_name[strlen(EN_OPTSTR)] == 0) {
            goto out;
        }
        export_name[0] = 0; /* truncate 'file' */
        export_name += strlen(EN_OPTSTR);
        s->export_name = g_strdup(export_name);
    }

    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
        goto out;
    }

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        if (unixpath[0] != '/') { /* We demand an absolute path */
            goto out;
        }
        s->host_spec = g_strdup(unixpath);
    } else {
        s->host_spec = g_strdup(host_spec);
    }

    err = 0;

out:
    g_free(file);
    if (err != 0) {
        g_free(s->export_name);
        g_free(s->host_spec);
    }
    return err;
}

static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
    int i;

    /* Poor man semaphore.  The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply.  */
    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
        qemu_co_mutex_lock(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i] == NULL) {
            s->recv_coroutine[i] = qemu_coroutine_self();
            break;
        }
    }

    assert(i < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, i);
}

static int nbd_have_request(void *opaque)
{
    BDRVNBDState *s = opaque;

    return s->in_flight > 0;
}

static void nbd_reply_ready(void *opaque)
{
    BDRVNBDState *s = opaque;
    uint64_t i;
    int ret;

    if (s->reply.handle == 0) {
        /* No reply already in flight.  Fetch a header.  It is possible
         * that another thread has done the same thing in parallel, so
         * the socket is not readable anymore.
         */
        ret = nbd_receive_reply(s->sock, &s->reply);
        if (ret == -EAGAIN) {
            return;
        }
        if (ret < 0) {
            s->reply.handle = 0;
            goto fail;
        }
    }

    /* There's no need for a mutex on the receive side, because the
     * handler acts as a synchronization point and ensures that only
     * one coroutine is called until the reply finishes.  */
    i = HANDLE_TO_INDEX(s, s->reply.handle);
    if (i >= MAX_NBD_REQUESTS) {
        goto fail;
    }

    if (s->recv_coroutine[i]) {
        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        return;
    }

fail:
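    /* The reply stream is broken: wake every waiting coroutine so each one
     * sees a mismatched handle and fails its request with EIO. */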
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
    }
}

static void nbd_restart_write(void *opaque)
{
    BDRVNBDState *s = opaque;
    qemu_coroutine_enter(s->send_coroutine, NULL);
}

static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
                               struct iovec *iov, int offset)
{
    int rc, ret;

    qemu_co_mutex_lock(&s->send_mutex);
    s->send_coroutine = qemu_coroutine_self();
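    /* Install nbd_restart_write as the write handler so this coroutine is
     * re-entered once the socket becomes writable again and a partially
     * sent request can be completed. */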
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
                            nbd_have_request, s);
    rc = nbd_send_request(s->sock, request);
    if (rc >= 0 && iov) {
        ret = qemu_co_sendv(s->sock, iov, request->len, offset);
        if (ret != request->len) {
            rc = -EIO;
        }
    }
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, s);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
}

static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
                                 struct nbd_reply *reply,
                                 struct iovec *iov, int offset)
{
    int ret;

    /* Wait until we're woken up by the read handler.  TODO: perhaps
     * peek at the next reply and avoid yielding if it's ours?  */
    qemu_coroutine_yield();
    *reply = s->reply;
    if (reply->handle != request->handle) {
        reply->error = EIO;
    } else {
        if (iov && reply->error == 0) {
            ret = qemu_co_recvv(s->sock, iov, request->len, offset);
            if (ret != request->len) {
                reply->error = EIO;
            }
        }

        /* Tell the read handler to read another header.  */
        s->reply.handle = 0;
    }
}

static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
{
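    /* Release the request slot; if the pipeline was completely full, wake
     * one coroutine blocked on free_sema in nbd_coroutine_start(). */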
    int i = HANDLE_TO_INDEX(s, request->handle);
    s->recv_coroutine[i] = NULL;
    if (s->in_flight-- == MAX_NBD_REQUESTS) {
        qemu_co_mutex_unlock(&s->free_sema);
    }
}

static int nbd_establish_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    int sock;
    int ret;
    off_t size;
    size_t blocksize;

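    /* nbd_config() stored a bare absolute path for Unix domain sockets, so
     * a leading '/' distinguishes them from host:port specs. */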
    if (s->host_spec[0] == '/') {
        sock = unix_socket_outgoing(s->host_spec);
    } else {
        sock = tcp_socket_outgoing_spec(s->host_spec);
    }

    /* Failed to establish connection */
    if (sock < 0) {
        logout("Failed to establish connection to NBD server\n");
        return -errno;
    }

    /* NBD handshake */
    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
                                &blocksize);
    if (ret < 0) {
        logout("Failed to negotiate with the NBD server\n");
        closesocket(sock);
        return ret;
    }

    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
    socket_set_nonblock(sock);
    qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL,
                            nbd_have_request, s);

    s->sock = sock;
    s->size = size;
    s->blocksize = blocksize;

    logout("Established connection with NBD server\n");
    return 0;
}

static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;

    request.type = NBD_CMD_DISC;
    request.from = 0;
    request.len = 0;
    nbd_send_request(s->sock, &request);

    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL);
    closesocket(s->sock);
}

static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
{
    BDRVNBDState *s = bs->opaque;
    int result;

    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_mutex_init(&s->free_sema);

    /* Pop the config into our state object. Exit if invalid. */
    result = nbd_config(s, filename, flags);
    if (result != 0) {
        return result;
    }

    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
    result = nbd_establish_connection(bs);

    return result;
}

static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov,
                          int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;
    ssize_t ret;

    request.type = NBD_CMD_READ;
    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    ret = nbd_co_send_request(s, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
        nbd_co_receive_reply(s, &request, &reply, qiov->iov, offset);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;

}

static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
                           int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;
    ssize_t ret;

    request.type = NBD_CMD_WRITE;
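    /* With the write cache disabled (writethrough) and a server that
     * advertised FUA support, set the FUA flag so the data is on stable
     * storage before the request completes. */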
    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
        request.type |= NBD_CMD_FLAG_FUA;
    }

    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    ret = nbd_co_send_request(s, &request, qiov->iov, offset);
    if (ret < 0) {
        reply.error = -ret;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
 * remain aligned to 4K (2040 sectors * 512 = 1044480 bytes = 255 * 4096). */
#define NBD_MAX_SECTORS 2040

static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    int offset = 0;
    int ret;
    while (nb_sectors > NBD_MAX_SECTORS) {
        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
        if (ret < 0) {
            return ret;
        }
        offset += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
}

static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    int offset = 0;
    int ret;
    while (nb_sectors > NBD_MAX_SECTORS) {
        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
        if (ret < 0) {
            return ret;
        }
        offset += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
}

static int nbd_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;
    ssize_t ret;

    if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
        return 0;
    }

    request.type = NBD_CMD_FLUSH;
    if (s->nbdflags & NBD_FLAG_SEND_FUA) {
        request.type |= NBD_CMD_FLAG_FUA;
    }

    request.from = 0;
    request.len = 0;

    nbd_coroutine_start(s, &request);
    ret = nbd_co_send_request(s, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;
    ssize_t ret;

    if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
        return 0;
    }
    request.type = NBD_CMD_TRIM;
    request.from = sector_num * 512;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    ret = nbd_co_send_request(s, &request, NULL, 0);
    if (ret < 0) {
        reply.error = -ret;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

static void nbd_close(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    g_free(s->export_name);
    g_free(s->host_spec);

    nbd_teardown_connection(bs);
}

static int64_t nbd_getlength(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;

    return s->size;
}

static BlockDriver bdrv_nbd = {
    .format_name         = "nbd",
    .instance_size       = sizeof(BDRVNBDState),
    .bdrv_file_open      = nbd_open,
    .bdrv_co_readv       = nbd_co_readv,
    .bdrv_co_writev      = nbd_co_writev,
    .bdrv_close          = nbd_close,
    .bdrv_co_flush_to_os = nbd_co_flush,
    .bdrv_co_discard     = nbd_co_discard,
    .bdrv_getlength      = nbd_getlength,
    .protocol_name       = "nbd",
};

static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
}

block_init(bdrv_nbd_init);