nbd.c 13.6 KB
Newer Older
1 2 3 4
/*
 * QEMU Block driver for  NBD
 *
 * Copyright (C) 2008 Bull S.A.S.
M
malc 已提交
5
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu-common.h"
#include "nbd.h"
31
#include "block_int.h"
32
#include "module.h"
33
#include "qemu_socket.h"
34 35 36 37

#include <sys/types.h>
#include <unistd.h>

38 39
#define EN_OPTSTR ":exportname="

40 41 42 43 44 45 46 47 48
/* #define DEBUG_NBD */

#if defined(DEBUG_NBD)
#define logout(fmt, ...) \
                fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
#else
#define logout(fmt, ...) ((void)0)
#endif

49 50 51 52
#define MAX_NBD_REQUESTS	16
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))

53 54
typedef struct BDRVNBDState {
    int sock;
P
Paolo Bonzini 已提交
55
    uint32_t nbdflags;
56 57
    off_t size;
    size_t blocksize;
58 59
    char *export_name; /* An NBD server may export several devices */

60 61 62 63
    CoMutex send_mutex;
    CoMutex free_sema;
    Coroutine *send_coroutine;
    int in_flight;
64

65
    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
66 67
    struct nbd_reply reply;

68 69 70 71
    /* If it begins with  '/', this is a UNIX domain socket. Otherwise,
     * it's a string of the form <hostname|ip4|\[ip6\]>:port
     */
    char *host_spec;
72 73
} BDRVNBDState;

74
static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
75
{
76
    char *file;
77 78
    char *export_name;
    const char *host_spec;
79
    const char *unixpath;
80
    int err = -EINVAL;
81

82
    file = g_strdup(filename);
83

84 85 86
    export_name = strstr(file, EN_OPTSTR);
    if (export_name) {
        if (export_name[strlen(EN_OPTSTR)] == 0) {
87 88
            goto out;
        }
89 90
        export_name[0] = 0; /* truncate 'file' */
        export_name += strlen(EN_OPTSTR);
91
        s->export_name = g_strdup(export_name);
92 93
    }

94 95
    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
96 97
        goto out;
    }
98

99 100 101
    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        if (unixpath[0] != '/') { /* We demand  an absolute path*/
102 103
            goto out;
        }
104
        s->host_spec = g_strdup(unixpath);
105
    } else {
106
        s->host_spec = g_strdup(host_spec);
107
    }
108

109
    err = 0;
110

111
out:
112
    g_free(file);
113
    if (err != 0) {
114 115
        g_free(s->export_name);
        g_free(s->host_spec);
116 117 118
    }
    return err;
}
119

120 121
static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
    int i;

    /* Poor man semaphore.  The free_sema is locked when no other request
     * can be accepted, and unlocked after receiving one reply.  */
    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
        qemu_co_mutex_lock(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i] == NULL) {
            s->recv_coroutine[i] = qemu_coroutine_self();
            break;
        }
    }

    assert(i < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, i);
141 142 143 144 145 146
}

static int nbd_have_request(void *opaque)
{
    BDRVNBDState *s = opaque;

147
    return s->in_flight > 0;
148 149 150 151 152
}

static void nbd_reply_ready(void *opaque)
{
    BDRVNBDState *s = opaque;
153
    uint64_t i;
154 155 156 157 158

    if (s->reply.handle == 0) {
        /* No reply already in flight.  Fetch a header.  */
        if (nbd_receive_reply(s->sock, &s->reply) < 0) {
            s->reply.handle = 0;
159
            goto fail;
160 161 162 163 164 165
        }
    }

    /* There's no need for a mutex on the receive side, because the
     * handler acts as a synchronization point and ensures that only
     * one coroutine is called until the reply finishes.  */
166
    i = HANDLE_TO_INDEX(s, s->reply.handle);
167 168 169 170
    if (i >= MAX_NBD_REQUESTS) {
        goto fail;
    }

171 172 173 174 175 176 177 178 179 180
    if (s->recv_coroutine[i]) {
        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        return;
    }

fail:
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
181 182 183 184 185 186
    }
}

static void nbd_restart_write(void *opaque)
{
    BDRVNBDState *s = opaque;
187
    qemu_coroutine_enter(s->send_coroutine, NULL);
188 189 190 191 192 193 194
}

static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
                               struct iovec *iov, int offset)
{
    int rc, ret;

195 196
    qemu_co_mutex_lock(&s->send_mutex);
    s->send_coroutine = qemu_coroutine_self();
197 198 199 200 201 202 203 204 205 206 207 208
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
                            nbd_have_request, NULL, s);
    rc = nbd_send_request(s->sock, request);
    if (rc != -1 && iov) {
        ret = qemu_co_sendv(s->sock, iov, request->len, offset);
        if (ret != request->len) {
            errno = -EIO;
            rc = -1;
        }
    }
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);
209 210
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
211 212 213 214 215 216 217 218 219
    return rc;
}

static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
                                 struct nbd_reply *reply,
                                 struct iovec *iov, int offset)
{
    int ret;

220 221
    /* Wait until we're woken up by the read handler.  TODO: perhaps
     * peek at the next reply and avoid yielding if it's ours?  */
222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
    qemu_coroutine_yield();
    *reply = s->reply;
    if (reply->handle != request->handle) {
        reply->error = EIO;
    } else {
        if (iov && reply->error == 0) {
            ret = qemu_co_recvv(s->sock, iov, request->len, offset);
            if (ret != request->len) {
                reply->error = EIO;
            }
        }

        /* Tell the read handler to read another header.  */
        s->reply.handle = 0;
    }
}

static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
{
241 242 243 244 245
    int i = HANDLE_TO_INDEX(s, request->handle);
    s->recv_coroutine[i] = NULL;
    if (s->in_flight-- == MAX_NBD_REQUESTS) {
        qemu_co_mutex_unlock(&s->free_sema);
    }
246 247
}

248 249 250 251 252 253 254
static int nbd_establish_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    int sock;
    int ret;
    off_t size;
    size_t blocksize;
255

256 257 258 259
    if (s->host_spec[0] == '/') {
        sock = unix_socket_outgoing(s->host_spec);
    } else {
        sock = tcp_socket_outgoing_spec(s->host_spec);
260 261
    }

262
    /* Failed to establish connection */
263
    if (sock == -1) {
264 265
        logout("Failed to establish connection to NBD server\n");
        return -errno;
266
    }
267

268
    /* NBD handshake */
P
Paolo Bonzini 已提交
269
    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
270
                                &blocksize);
271
    if (ret == -1) {
272 273 274
        logout("Failed to negotiate with the NBD server\n");
        closesocket(sock);
        return -errno;
275
    }
276

277 278
    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
279
    socket_set_nonblock(sock);
280 281
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);
282

283 284 285 286
    s->sock = sock;
    s->size = size;
    s->blocksize = blocksize;

287 288 289 290 291 292 293 294 295 296 297 298 299 300
    logout("Established connection with NBD server\n");
    return 0;
}

static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;

    request.type = NBD_CMD_DISC;
    request.from = 0;
    request.len = 0;
    nbd_send_request(s->sock, &request);

301
    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL, NULL);
302 303 304 305 306 307 308 309
    closesocket(s->sock);
}

static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
{
    BDRVNBDState *s = bs->opaque;
    int result;

310 311
    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_mutex_init(&s->free_sema);
312

313 314 315 316 317 318 319 320 321 322 323 324
    /* Pop the config into our state object. Exit if invalid. */
    result = nbd_config(s, filename, flags);
    if (result != 0) {
        return result;
    }

    /* establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
    result = nbd_establish_connection(bs);

    return result;
325 326
}

P
Paolo Bonzini 已提交
327 328 329
static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov,
                          int offset)
330 331 332 333 334 335
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    request.type = NBD_CMD_READ;
336
    request.from = sector_num * 512;
337 338
    request.len = nb_sectors * 512;

339 340 341 342
    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
        reply.error = errno;
    } else {
P
Paolo Bonzini 已提交
343
        nbd_co_receive_reply(s, &request, &reply, qiov->iov, offset);
344 345 346
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
347 348 349

}

P
Paolo Bonzini 已提交
350 351 352
static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
                           int offset)
353 354 355 356 357 358
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    request.type = NBD_CMD_WRITE;
359 360 361 362
    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
        request.type |= NBD_CMD_FLAG_FUA;
    }

363
    request.from = sector_num * 512;
364 365
    request.len = nb_sectors * 512;

366
    nbd_coroutine_start(s, &request);
P
Paolo Bonzini 已提交
367
    if (nbd_co_send_request(s, &request, qiov->iov, offset) == -1) {
368 369 370 371 372 373
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
374 375
}

P
Paolo Bonzini 已提交
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
 * remain aligned to 4K. */
#define NBD_MAX_SECTORS 2040

static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    int offset = 0;
    int ret;
    while (nb_sectors > NBD_MAX_SECTORS) {
        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
        if (ret < 0) {
            return ret;
        }
        offset += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
}

static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    int offset = 0;
    int ret;
    while (nb_sectors > NBD_MAX_SECTORS) {
        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
        if (ret < 0) {
            return ret;
        }
        offset += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
}

P
Paolo Bonzini 已提交
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
static int nbd_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
        return 0;
    }

    request.type = NBD_CMD_FLUSH;
    if (s->nbdflags & NBD_FLAG_SEND_FUA) {
        request.type |= NBD_CMD_FLAG_FUA;
    }

    request.from = 0;
    request.len = 0;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

P
Paolo Bonzini 已提交
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
        return 0;
    }
    request.type = NBD_CMD_TRIM;
    request.from = sector_num * 512;;
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

466 467
static void nbd_close(BlockDriverState *bs)
{
468
    BDRVNBDState *s = bs->opaque;
469 470
    g_free(s->export_name);
    g_free(s->host_spec);
471

472
    nbd_teardown_connection(bs);
473 474 475 476 477 478 479 480 481
}

static int64_t nbd_getlength(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;

    return s->size;
}

482
static BlockDriver bdrv_nbd = {
P
Paolo Bonzini 已提交
483 484 485 486 487 488 489
    .format_name         = "nbd",
    .instance_size       = sizeof(BDRVNBDState),
    .bdrv_file_open      = nbd_open,
    .bdrv_co_readv       = nbd_co_readv,
    .bdrv_co_writev      = nbd_co_writev,
    .bdrv_close          = nbd_close,
    .bdrv_co_flush_to_os = nbd_co_flush,
P
Paolo Bonzini 已提交
490
    .bdrv_co_discard     = nbd_co_discard,
P
Paolo Bonzini 已提交
491 492
    .bdrv_getlength      = nbd_getlength,
    .protocol_name       = "nbd",
493
};
494 495 496 497 498 499 500

static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
}

block_init(bdrv_nbd_init);