/*
 * QEMU Block driver for NBD
 *
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu-common.h"
#include "nbd.h"
31
#include "block_int.h"
32
#include "module.h"
33
#include "qemu_socket.h"
34 35 36 37

#include <sys/types.h>
#include <unistd.h>

38 39
/* Marker that introduces the optional export name inside an "nbd:..."
 * filename; nbd_config() splits the filename at this substring. */
#define EN_OPTSTR ":exportname="

40 41 42 43 44 45 46 47 48
/* Debug tracing is compiled out by default; uncomment the next line to
 * turn it on. */
/* #define DEBUG_NBD */

#ifdef DEBUG_NBD
/* Print a trace line on stderr, prefixed with "nbd" and the name of the
 * calling function. */
#define logout(fmt, ...) \
                fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
#else
/* Tracing disabled: logout() expands to a no-op expression. */
#define logout(fmt, ...) ((void)0)
#endif

49 50 51 52
/* Number of slots in BDRVNBDState.recv_coroutine[], i.e. the upper bound
 * on requests that can be outstanding at the same time. */
#define MAX_NBD_REQUESTS	16

/* A request handle is its recv_coroutine[] slot index XORed with the
 * address passed as 'bs'.  Both directions are the same XOR, so
 * HANDLE_TO_INDEX(bs, INDEX_TO_HANDLE(bs, i)) == i.  Every macro argument
 * is parenthesized so that expression arguments (e.g. "p + 1") are cast
 * as a whole rather than only their first operand. */
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)(bs)))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)(bs)))

53 54
typedef struct BDRVNBDState {
    int sock;
P
Paolo Bonzini 已提交
55
    uint32_t nbdflags;
56 57
    off_t size;
    size_t blocksize;
58 59
    char *export_name; /* An NBD server may export several devices */

60 61 62 63
    CoMutex send_mutex;
    CoMutex free_sema;
    Coroutine *send_coroutine;
    int in_flight;
64

65
    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
66 67
    struct nbd_reply reply;

68 69 70 71
    /* If it begins with  '/', this is a UNIX domain socket. Otherwise,
     * it's a string of the form <hostname|ip4|\[ip6\]>:port
     */
    char *host_spec;
72 73
} BDRVNBDState;

74
/*
 * Parse an NBD filename and store the pieces in @s.
 *
 * Accepted forms:
 *     nbd:<host_spec>[:exportname=<name>]
 *     nbd:unix:<absolute path>[:exportname=<name>]
 *
 * Returns 0 on success, with s->host_spec set and, when an export name was
 * given, s->export_name set; both are heap strings owned by @s.
 * Returns -EINVAL when the filename does not start with "nbd:", when the
 * export name is empty, or when a unix: path is not absolute; in that case
 * both pointers are freed and reset to NULL.
 * @flags is not used.
 */
static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
{
    char *file;
    char *export_name;
    const char *host_spec;
    const char *unixpath;
    int err = -EINVAL;

    /* Work on a private copy because the string is truncated below. */
    file = g_strdup(filename);

    export_name = strstr(file, EN_OPTSTR);
    if (export_name) {
        if (export_name[strlen(EN_OPTSTR)] == 0) {
            /* ":exportname=" with nothing after it */
            goto out;
        }
        export_name[0] = 0; /* truncate 'file' */
        export_name += strlen(EN_OPTSTR);
        s->export_name = g_strdup(export_name);
    }

    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
        goto out;
    }

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        if (unixpath[0] != '/') { /* We demand an absolute path */
            goto out;
        }
        s->host_spec = g_strdup(unixpath);
    } else {
        s->host_spec = g_strdup(host_spec);
    }

    err = 0;

out:
    g_free(file);
    if (err != 0) {
        /* Clear the pointers after freeing so that the state object never
         * holds dangling references that a later free could hit again. */
        g_free(s->export_name);
        s->export_name = NULL;
        g_free(s->host_spec);
        s->host_spec = NULL;
    }
    return err;
}
119

120 121
/*
 * Account for a new request and give it a handle.
 *
 * The calling coroutine is stored in the first empty recv_coroutine[] slot
 * and request->handle is derived from that slot's index.
 */
static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
{
    int slot = 0;

    /* free_sema acts as a crude semaphore: it is taken once the number of
     * requests in flight reaches the limit and released again from
     * nbd_coroutine_end().  */
    if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
        qemu_co_mutex_lock(&s->free_sema);
        assert(s->in_flight < MAX_NBD_REQUESTS);
    }
    s->in_flight++;

    /* Look for the first unused slot. */
    while (slot < MAX_NBD_REQUESTS && s->recv_coroutine[slot] != NULL) {
        slot++;
    }
    if (slot < MAX_NBD_REQUESTS) {
        s->recv_coroutine[slot] = qemu_coroutine_self();
    }

    assert(slot < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, slot);
}

static int nbd_have_request(void *opaque)
{
    BDRVNBDState *s = opaque;

147
    return s->in_flight > 0;
148 149 150 151 152
}

static void nbd_reply_ready(void *opaque)
{
    BDRVNBDState *s = opaque;
153
    int i;
154 155 156 157 158

    if (s->reply.handle == 0) {
        /* No reply already in flight.  Fetch a header.  */
        if (nbd_receive_reply(s->sock, &s->reply) < 0) {
            s->reply.handle = 0;
159
            goto fail;
160 161 162 163 164 165
        }
    }

    /* There's no need for a mutex on the receive side, because the
     * handler acts as a synchronization point and ensures that only
     * one coroutine is called until the reply finishes.  */
166 167 168 169 170 171 172 173 174 175 176
    i = HANDLE_TO_INDEX(s, s->reply.handle);
    if (s->recv_coroutine[i]) {
        qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        return;
    }

fail:
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
177 178 179 180 181 182
    }
}

static void nbd_restart_write(void *opaque)
{
    BDRVNBDState *s = opaque;
183
    qemu_coroutine_enter(s->send_coroutine, NULL);
184 185 186 187 188 189 190
}

/*
 * Send @request, followed by request->len bytes of payload taken from @iov
 * starting at byte @offset when @iov is not NULL.
 *
 * Sending is serialized with send_mutex; nbd_restart_write is registered
 * on the socket only while the send is in progress.
 *
 * Returns the result of nbd_send_request(), or -1 with errno set to EIO if
 * the payload could not be sent completely.  On a -1 return, errno is
 * restored after the cleanup calls so that callers can rely on it.
 */
static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
                               struct iovec *iov, int offset)
{
    int rc, ret;
    int saved_errno = 0;

    qemu_co_mutex_lock(&s->send_mutex);
    s->send_coroutine = qemu_coroutine_self();
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
                            nbd_have_request, NULL, s);
    rc = nbd_send_request(s->sock, request);
    if (rc != -1 && iov) {
        ret = qemu_co_sendv(s->sock, iov, request->len, offset);
        if (ret != request->len) {
            /* errno values are positive; callers negate errno themselves */
            errno = EIO;
            rc = -1;
        }
    }
    if (rc == -1) {
        saved_errno = errno;
    }
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);
    s->send_coroutine = NULL;
    qemu_co_mutex_unlock(&s->send_mutex);
    if (rc == -1) {
        errno = saved_errno;
    }
    return rc;
}

/*
 * Yield until entered again, then copy the pending reply header into
 * @reply.
 *
 * If the header's handle is not the one of @request, reply->error is set
 * to EIO and the pending header is left in place.  Otherwise, when @iov is
 * given and the reply carries no error, request->len bytes are received
 * into @iov at byte @offset (a short read sets reply->error to EIO), and
 * the pending header is cleared.
 */
static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
                                 struct nbd_reply *reply,
                                 struct iovec *iov, int offset)
{
    /* Wait until we're woken up by the read handler.  TODO: perhaps
     * peek at the next reply and avoid yielding if it's ours?  */
    qemu_coroutine_yield();
    *reply = s->reply;

    if (reply->handle != request->handle) {
        reply->error = EIO;
        return;
    }

    if (iov != NULL && reply->error == 0) {
        int nread = qemu_co_recvv(s->sock, iov, request->len, offset);

        if (nread != request->len) {
            reply->error = EIO;
        }
    }

    /* Tell the read handler to read another header.  */
    s->reply.handle = 0;
}

/*
 * Release the slot of a finished request and drop the in-flight count.
 * free_sema is unlocked when the count was at MAX_NBD_REQUESTS before the
 * decrement.
 */
static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
{
    int slot = HANDLE_TO_INDEX(s, request->handle);
    int was_full = (s->in_flight == MAX_NBD_REQUESTS);

    s->recv_coroutine[slot] = NULL;
    s->in_flight--;
    if (was_full) {
        qemu_co_mutex_unlock(&s->free_sema);
    }
}

244 245 246 247 248 249 250
/*
 * Connect to the server named by s->host_spec and run the NBD handshake.
 *
 * A host_spec starting with '/' is opened with unix_socket_outgoing(),
 * anything else with tcp_socket_outgoing_spec().
 *
 * On success the socket, size and block size are stored in the driver
 * state, the socket is made non-blocking, the reply handler is registered
 * on it, and 0 is returned.  On failure -errno is returned (errno is
 * captured immediately after the failing call) and the socket, if one was
 * opened, is closed.
 */
static int nbd_establish_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    int sock;
    int ret;
    int saved_errno;
    off_t size;
    size_t blocksize;

    if (s->host_spec[0] == '/') {
        sock = unix_socket_outgoing(s->host_spec);
    } else {
        sock = tcp_socket_outgoing_spec(s->host_spec);
    }

    /* Failed to establish connection */
    if (sock == -1) {
        saved_errno = errno;
        logout("Failed to establish connection to NBD server\n");
        return -saved_errno;
    }

    /* NBD handshake */
    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
                                &blocksize);
    if (ret == -1) {
        /* Save errno before logging or closing can overwrite it. */
        saved_errno = errno;
        logout("Failed to negotiate with the NBD server\n");
        closesocket(sock);
        return -saved_errno;
    }

    /* Record the connection first, so that the handler below is registered
     * on the new socket rather than on a stale s->sock value.  */
    s->sock = sock;
    s->size = size;
    s->blocksize = blocksize;

    /* Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.  */
    socket_set_nonblock(sock);
    qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
                            nbd_have_request, NULL, s);

    logout("Established connection with NBD server\n");
    return 0;
}

/*
 * Send NBD_CMD_DISC to the server, unregister the socket's fd handlers
 * and close the socket.  The result of the send is not checked.
 */
static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;

    request.type = NBD_CMD_DISC;
    /* Initialize the handle too, so that no indeterminate stack value is
     * handed to nbd_send_request(). */
    request.handle = 0;
    request.from = 0;
    request.len = 0;
    nbd_send_request(s->sock, &request);

    qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL, NULL);
    closesocket(s->sock);
}

/*
 * Open callback of the driver: initialize the coroutine mutexes, parse
 * @filename into the driver state and connect to the server.
 *
 * Returns 0 on success, otherwise the error returned by nbd_config() or
 * nbd_establish_connection().
 */
static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
{
    BDRVNBDState *s = bs->opaque;
    int rc;

    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_mutex_init(&s->free_sema);

    /* Pop the config into our state object. Exit if invalid. */
    rc = nbd_config(s, filename, flags);
    if (rc == 0) {
        /* establish TCP connection, return error if it fails
         * TODO: Configurable retry-until-timeout behaviour.
         */
        rc = nbd_establish_connection(bs);
    }

    return rc;
}

P
Paolo Bonzini 已提交
323 324 325
/*
 * Issue a single NBD_CMD_READ for @nb_sectors 512-byte sectors starting at
 * @sector_num and receive the data into @qiov at byte @offset.
 *
 * Returns 0 on success or a negated error code.
 */
static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov,
                          int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request req;
    struct nbd_reply rsp;

    req.type = NBD_CMD_READ;
    req.from = sector_num * 512;
    req.len = nb_sectors * 512;

    nbd_coroutine_start(s, &req);
    if (nbd_co_send_request(s, &req, NULL, 0) != -1) {
        nbd_co_receive_reply(s, &req, &rsp, qiov->iov, offset);
    } else {
        rsp.error = errno;
    }
    nbd_coroutine_end(s, &req);

    return -rsp.error;
}

P
Paolo Bonzini 已提交
346 347 348
/*
 * Issue a single NBD_CMD_WRITE for @nb_sectors 512-byte sectors starting
 * at @sector_num, taking the data from @qiov at byte @offset.
 *
 * NBD_CMD_FLAG_FUA is added to the command when the write cache is not
 * enabled for @bs and the server advertises NBD_FLAG_SEND_FUA.
 *
 * Returns 0 on success or a negated error code.
 */
static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov,
                           int offset)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request req;
    struct nbd_reply rsp;

    req.type = NBD_CMD_WRITE;
    if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
        req.type |= NBD_CMD_FLAG_FUA;
    }
    req.from = sector_num * 512;
    req.len = nb_sectors * 512;

    nbd_coroutine_start(s, &req);
    if (nbd_co_send_request(s, &req, qiov->iov, offset) != -1) {
        nbd_co_receive_reply(s, &req, &rsp, NULL, 0);
    } else {
        rsp.error = errno;
    }
    nbd_coroutine_end(s, &req);

    return -rsp.error;
}

P
Paolo Bonzini 已提交
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
/* qemu-nbd has a limit of slightly less than 1M per request.  Try to
 * remain aligned to 4K.  2040 sectors of 512 bytes are 1020 KiB, which is
 * a multiple of 4 KiB (2040 = 255 * 8 sectors).  Larger reads and writes
 * are split into chunks of at most this many sectors. */
#define NBD_MAX_SECTORS 2040

/*
 * Read @nb_sectors sectors starting at @sector_num into @qiov, issuing one
 * request per chunk of at most NBD_MAX_SECTORS sectors.
 *
 * Stops at the first chunk that returns a negative value and returns it;
 * otherwise returns the result of the last chunk.
 */
static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    int done_bytes = 0;

    for (;;) {
        int chunk = nb_sectors > NBD_MAX_SECTORS ? NBD_MAX_SECTORS
                                                 : nb_sectors;
        int ret = nbd_co_readv_1(bs, sector_num, chunk, qiov, done_bytes);

        /* Finished on error, or when this chunk was the final one. */
        if (ret < 0 || chunk == nb_sectors) {
            return ret;
        }

        done_bytes += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
}

/*
 * Write @nb_sectors sectors starting at @sector_num from @qiov, issuing
 * one request per chunk of at most NBD_MAX_SECTORS sectors.
 *
 * Stops at the first chunk that returns a negative value and returns it;
 * otherwise returns the result of the last chunk.
 */
static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    int done_bytes = 0;

    for (;;) {
        int chunk = nb_sectors > NBD_MAX_SECTORS ? NBD_MAX_SECTORS
                                                 : nb_sectors;
        int ret = nbd_co_writev_1(bs, sector_num, chunk, qiov, done_bytes);

        /* Finished on error, or when this chunk was the final one. */
        if (ret < 0 || chunk == nb_sectors) {
            return ret;
        }

        done_bytes += NBD_MAX_SECTORS * 512;
        sector_num += NBD_MAX_SECTORS;
        nb_sectors -= NBD_MAX_SECTORS;
    }
}

P
Paolo Bonzini 已提交
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
/*
 * Send NBD_CMD_FLUSH and wait for the reply.
 *
 * Returns 0 without contacting the server when NBD_FLAG_SEND_FLUSH is not
 * set in the negotiated flags.  NBD_CMD_FLAG_FUA is added to the command
 * when NBD_FLAG_SEND_FUA is set.  Otherwise returns 0 on success or a
 * negated error code.
 */
static int nbd_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request req;
    struct nbd_reply rsp;

    if ((s->nbdflags & NBD_FLAG_SEND_FLUSH) == 0) {
        return 0;
    }

    req.type = NBD_CMD_FLUSH;
    if ((s->nbdflags & NBD_FLAG_SEND_FUA) != 0) {
        req.type |= NBD_CMD_FLAG_FUA;
    }
    req.from = 0;
    req.len = 0;

    nbd_coroutine_start(s, &req);
    if (nbd_co_send_request(s, &req, NULL, 0) != -1) {
        nbd_co_receive_reply(s, &req, &rsp, NULL, 0);
    } else {
        rsp.error = errno;
    }
    nbd_coroutine_end(s, &req);

    return -rsp.error;
}

P
Paolo Bonzini 已提交
438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
/*
 * Send NBD_CMD_TRIM for @nb_sectors 512-byte sectors starting at
 * @sector_num and wait for the reply.
 *
 * Returns 0 without contacting the server when NBD_FLAG_SEND_TRIM is not
 * set in the negotiated flags; otherwise 0 on success or a negated error
 * code.
 */
static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors)
{
    BDRVNBDState *s = bs->opaque;
    struct nbd_request request;
    struct nbd_reply reply;

    if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
        return 0;
    }
    request.type = NBD_CMD_TRIM;
    request.from = sector_num * 512;
    /* NOTE(review): unlike the read/write paths, the length is not capped
     * at NBD_MAX_SECTORS here - confirm that callers keep nb_sectors small
     * enough for nb_sectors * 512 not to overflow. */
    request.len = nb_sectors * 512;

    nbd_coroutine_start(s, &request);
    if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
        reply.error = errno;
    } else {
        nbd_co_receive_reply(s, &request, &reply, NULL, 0);
    }
    nbd_coroutine_end(s, &request);
    return -reply.error;
}

462 463
/*
 * Close callback of the driver: free the strings stored by nbd_config()
 * and then shut the connection down.
 */
static void nbd_close(BlockDriverState *bs)
{
    BDRVNBDState *state = bs->opaque;

    /* Strings owned by the driver state. */
    g_free(state->export_name);
    g_free(state->host_spec);

    /* Disconnect from the server and close the socket. */
    nbd_teardown_connection(bs);
}

/* Return the size stored in the driver state by
 * nbd_establish_connection(). */
static int64_t nbd_getlength(BlockDriverState *bs)
{
    const BDRVNBDState *state = bs->opaque;
    int64_t length = state->size;

    return length;
}

478
static BlockDriver bdrv_nbd = {
P
Paolo Bonzini 已提交
479 480 481 482 483 484 485
    .format_name         = "nbd",
    .instance_size       = sizeof(BDRVNBDState),
    .bdrv_file_open      = nbd_open,
    .bdrv_co_readv       = nbd_co_readv,
    .bdrv_co_writev      = nbd_co_writev,
    .bdrv_close          = nbd_close,
    .bdrv_co_flush_to_os = nbd_co_flush,
P
Paolo Bonzini 已提交
486
    .bdrv_co_discard     = nbd_co_discard,
P
Paolo Bonzini 已提交
487 488
    .bdrv_getlength      = nbd_getlength,
    .protocol_name       = "nbd",
489
};
490 491 492 493 494 495 496

/* Hand the NBD driver description (bdrv_nbd) to bdrv_register(). */
static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
}

/* Pass bdrv_nbd_init to the block_init() module-initialization macro. */
block_init(bdrv_nbd_init);