quorum.c 6.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * Quorum Block filter
 *
 * Copyright (C) 2012-2014 Nodalink, EURL.
 *
 * Author:
 *   Benoît Canet <benoit.canet@irqsave.net>
 *
 * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp)
 * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc).
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "block/block_int.h"

18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
/* the following structure holds the state of one quorum instance */
typedef struct BDRVQuorumState {
    BlockDriverState **bs; /* children BlockDriverStates */
    int num_children;      /* children count */
    int threshold;         /* if less than threshold children reads gave the
                            * same result a quorum error occurs.
                            */
    bool is_blkverify;     /* true if the driver is in blkverify mode
                            * Writes are mirrored on two children devices.
                            * On reads the two children devices' contents are
                            * compared and if a difference is spotted its
                            * location is printed and the code aborts.
                            * It is useful to debug other block drivers by
                            * comparing them with a reference one.
                            */
} BDRVQuorumState;

35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
typedef struct QuorumAIOCB QuorumAIOCB;

/* Quorum will create one instance of the following structure per operation it
 * performs on its children.
 * So for each read/write operation coming from the upper layer there will be
 * $children_count QuorumChildRequest.
 */
typedef struct QuorumChildRequest {
    BlockDriverAIOCB *aiocb;
    QEMUIOVector qiov;
    uint8_t *buf;
    int ret;
    QuorumAIOCB *parent;
} QuorumChildRequest;

/* Quorum will use the following structure to track progress of each read/write
 * operation received by the upper layer.
 * This structure hold pointers to the QuorumChildRequest structures instances
 * used to do operations on each children and track overall progress.
 */
struct QuorumAIOCB {
    BlockDriverAIOCB common;

    /* Request metadata */
    uint64_t sector_num;
    int nb_sectors;

    QEMUIOVector *qiov;         /* calling IOV */

    QuorumChildRequest *qcrs;   /* individual child requests */
    int count;                  /* number of completed AIOCB */
    int success_count;          /* number of successfully completed AIOCB */

    bool is_read;
    int vote_ret;
};
71

72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
    BDRVQuorumState *s = acb->common.bs->opaque;
    int i;

    /* cancel all callbacks */
    for (i = 0; i < s->num_children; i++) {
        bdrv_aio_cancel(acb->qcrs[i].aiocb);
    }

    g_free(acb->qcrs);
    qemu_aio_release(acb);
}

static AIOCBInfo quorum_aiocb_info = {
    .aiocb_size         = sizeof(QuorumAIOCB),
    .cancel             = quorum_aio_cancel,
};

static void quorum_aio_finalize(QuorumAIOCB *acb)
{
B
Benoît Canet 已提交
94 95
    BDRVQuorumState *s = acb->common.bs->opaque;
    int i, ret = 0;
96 97 98

    acb->common.cb(acb->common.opaque, ret);

B
Benoît Canet 已提交
99 100 101 102 103 104 105
    if (acb->is_read) {
        for (i = 0; i < s->num_children; i++) {
            qemu_vfree(acb->qcrs[i].buf);
            qemu_iovec_destroy(&acb->qcrs[i].qiov);
        }
    }

106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
    g_free(acb->qcrs);
    qemu_aio_release(acb);
}

static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
                                   BlockDriverState *bs,
                                   QEMUIOVector *qiov,
                                   uint64_t sector_num,
                                   int nb_sectors,
                                   BlockDriverCompletionFunc *cb,
                                   void *opaque)
{
    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque);
    int i;

    acb->common.bs->opaque = s;
    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;
    acb->qiov = qiov;
    acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
    acb->count = 0;
    acb->success_count = 0;
    acb->is_read = false;
    acb->vote_ret = 0;

    for (i = 0; i < s->num_children; i++) {
        acb->qcrs[i].buf = NULL;
        acb->qcrs[i].ret = 0;
        acb->qcrs[i].parent = acb;
    }

    return acb;
}

static void quorum_aio_cb(void *opaque, int ret)
{
    QuorumChildRequest *sacb = opaque;
    QuorumAIOCB *acb = sacb->parent;
    BDRVQuorumState *s = acb->common.bs->opaque;

    sacb->ret = ret;
    acb->count++;
    if (ret == 0) {
        acb->success_count++;
    }
    assert(acb->count <= s->num_children);
    assert(acb->success_count <= s->num_children);
    if (acb->count < s->num_children) {
        return;
    }

    quorum_aio_finalize(acb);
}

B
Benoît Canet 已提交
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BlockDriverCompletionFunc *cb,
                                         void *opaque)
{
    BDRVQuorumState *s = bs->opaque;
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                      nb_sectors, cb, opaque);
    int i;

    acb->is_read = true;

    for (i = 0; i < s->num_children; i++) {
        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size);
        qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov);
        qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf);
    }

    for (i = 0; i < s->num_children; i++) {
        bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors,
                       quorum_aio_cb, &acb->qcrs[i]);
    }

    return &acb->common;
}

188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
                                          int64_t sector_num,
                                          QEMUIOVector *qiov,
                                          int nb_sectors,
                                          BlockDriverCompletionFunc *cb,
                                          void *opaque)
{
    BDRVQuorumState *s = bs->opaque;
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors,
                                      cb, opaque);
    int i;

    for (i = 0; i < s->num_children; i++) {
        acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
                                             nb_sectors, &quorum_aio_cb,
                                             &acb->qcrs[i]);
    }

    return &acb->common;
}

209 210 211 212 213
static BlockDriver bdrv_quorum = {
    .format_name        = "quorum",
    .protocol_name      = "quorum",

    .instance_size      = sizeof(BDRVQuorumState),
214

B
Benoît Canet 已提交
215
    .bdrv_aio_readv     = quorum_aio_readv,
216
    .bdrv_aio_writev    = quorum_aio_writev,
217 218 219 220 221 222 223 224
};

static void bdrv_quorum_init(void)
{
    bdrv_register(&bdrv_quorum);
}

block_init(bdrv_quorum_init);