qed.h 10.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * QEMU Enhanced Disk Format
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#ifndef BLOCK_QED_H
#define BLOCK_QED_H

18
#include "block/block_int.h"
19
#include "qemu/cutils.h"
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46

/* The layout of a QED file is as follows:
 *
 * +--------+----------+----------+----------+-----+
 * | header | L1 table | cluster0 | cluster1 | ... |
 * +--------+----------+----------+----------+-----+
 *
 * There is a 2-level pagetable for cluster allocation:
 *
 *                     +----------+
 *                     | L1 table |
 *                     +----------+
 *                ,------'  |  '------.
 *           +----------+   |    +----------+
 *           | L2 table |  ...   | L2 table |
 *           +----------+        +----------+
 *       ,------'  |  '------.
 *  +----------+   |    +----------+
 *  |   Data   |  ...   |   Data   |
 *  +----------+        +----------+
 *
 * The L1 table is fixed size and always present.  L2 tables are allocated on
 * demand.  The L1 table size determines the maximum possible image size; it
 * can be influenced using the cluster_size and table_size values.
 *
 * All fields are little-endian on disk.
 */
47
#define  QED_DEFAULT_CLUSTER_SIZE  65536
48 49 50 51 52 53
enum {
    QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24,

    /* The image supports a backing file */
    QED_F_BACKING_FILE = 0x01,

S
Stefan Hajnoczi 已提交
54 55 56
    /* The image needs a consistency check before use */
    QED_F_NEED_CHECK = 0x02,

57 58 59 60 61
    /* The backing file format must not be probed, treat as raw image */
    QED_F_BACKING_FORMAT_NO_PROBE = 0x04,

    /* Feature bits must be used when the on-disk format changes */
    QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */
S
Stefan Hajnoczi 已提交
62
                       QED_F_NEED_CHECK |
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
                       QED_F_BACKING_FORMAT_NO_PROBE,
    QED_COMPAT_FEATURE_MASK = 0,            /* supported compat feature bits */
    QED_AUTOCLEAR_FEATURE_MASK = 0,         /* supported autoclear feature bits */

    /* Data is stored in groups of sectors called clusters.  Cluster size must
     * be large to avoid keeping too much metadata.  I/O requests that have
     * sub-cluster size will require read-modify-write.
     */
    QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */
    QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024,

    /* Allocated clusters are tracked using a 2-level pagetable.  Table size is
     * a multiple of clusters so large maximum image sizes can be supported
     * without jacking up the cluster size too much.
     */
    QED_MIN_TABLE_SIZE = 1,        /* in clusters */
    QED_MAX_TABLE_SIZE = 16,
    QED_DEFAULT_TABLE_SIZE = 4,
81 82 83

    /* Delay to flush and clean image after last allocating write completes */
    QED_NEED_CHECK_TIMEOUT = 5,    /* in seconds */
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
};

typedef struct {
    uint32_t magic;                 /* QED\0 */

    uint32_t cluster_size;          /* in bytes */
    uint32_t table_size;            /* for L1 and L2 tables, in clusters */
    uint32_t header_size;           /* in clusters */

    uint64_t features;              /* format feature bits */
    uint64_t compat_features;       /* compatible feature bits */
    uint64_t autoclear_features;    /* self-resetting feature bits */

    uint64_t l1_table_offset;       /* in bytes */
    uint64_t image_size;            /* total logical image size, in bytes */

    /* if (features & QED_F_BACKING_FILE) */
    uint32_t backing_filename_offset; /* in bytes from start of header */
    uint32_t backing_filename_size;   /* in bytes */
103
} QEMU_PACKED QEDHeader;
104

105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
typedef struct {
    uint64_t offsets[0];            /* in bytes */
} QEDTable;

/* The L2 cache is a simple write-through cache for L2 structures */
typedef struct CachedL2Table {
    QEDTable *table;
    uint64_t offset;    /* offset=0 indicates an invalidate entry */
    QTAILQ_ENTRY(CachedL2Table) node;
    int ref;
} CachedL2Table;

typedef struct {
    QTAILQ_HEAD(, CachedL2Table) entries;
    unsigned int n_entries;
} L2TableCache;

typedef struct QEDRequest {
    CachedL2Table *l2_table;
} QEDRequest;

126 127
enum {
    QED_AIOCB_WRITE = 0x0001,       /* read or write? */
128
    QED_AIOCB_ZERO  = 0x0002,       /* zero write, used with QED_AIOCB_WRITE */
129 130
};

S
Stefan Hajnoczi 已提交
131
typedef struct QEDAIOCB {
K
Kevin Wolf 已提交
132
    BlockDriverState *bs;
S
Stefan Hajnoczi 已提交
133
    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
134
    int flags;                      /* QED_AIOCB_* bits ORed together */
S
Stefan Hajnoczi 已提交
135 136 137 138 139 140 141 142
    uint64_t end_pos;               /* request end on block device, in bytes */

    /* User scatter-gather list */
    QEMUIOVector *qiov;
    size_t qiov_offset;             /* byte count already processed */

    /* Current cluster scatter-gather list */
    QEMUIOVector cur_qiov;
143
    QEMUIOVector *backing_qiov;
S
Stefan Hajnoczi 已提交
144 145 146 147 148 149 150 151
    uint64_t cur_pos;               /* position on block device, in bytes */
    uint64_t cur_cluster;           /* cluster offset in image file */
    unsigned int cur_nclusters;     /* number of clusters being accessed */
    int find_cluster_ret;           /* used for L1/L2 update */

    QEDRequest request;
} QEDAIOCB;

152 153 154
typedef struct {
    BlockDriverState *bs;           /* device */

155 156 157
    /* Written only by an allocating write or the timer handler (the latter
     * while allocating reqs are plugged).
     */
158
    QEDHeader header;               /* always cpu-endian */
159 160 161

    /* Protected by table_lock.  */
    CoMutex table_lock;
162 163
    QEDTable *l1_table;
    L2TableCache l2_cache;          /* l2 table cache */
164 165 166 167
    uint32_t table_nelems;
    uint32_t l1_shift;
    uint32_t l2_shift;
    uint32_t l2_mask;
168
    uint64_t file_size;             /* length of image file, in bytes */
S
Stefan Hajnoczi 已提交
169 170

    /* Allocating write request queue */
171 172
    QEDAIOCB *allocating_acb;
    CoQueue allocating_write_reqs;
173 174 175 176
    bool allocating_write_reqs_plugged;

    /* Periodic flush and clear need check flag */
    QEMUTimer *need_check_timer;
177 178
} BDRVQEDState;

179 180
enum {
    QED_CLUSTER_FOUND,         /* cluster found */
181
    QED_CLUSTER_ZERO,          /* zero cluster found */
182 183 184 185
    QED_CLUSTER_L2,            /* cluster missing in L2 */
    QED_CLUSTER_L1,            /* cluster missing in L1 */
};

186 187 188 189 190
/**
 * Header functions
 */
int qed_write_header_sync(BDRVQEDState *s);

191 192 193 194 195 196 197 198 199 200 201 202 203
/**
 * L2 cache functions
 */
void qed_init_l2_cache(L2TableCache *l2_cache);
void qed_free_l2_cache(L2TableCache *l2_cache);
CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache);
void qed_unref_l2_cache_entry(CachedL2Table *entry);
CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset);
void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);

/**
 * Table I/O functions
 */
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
int coroutine_fn qed_read_l1_table_sync(BDRVQEDState *s);
int coroutine_fn qed_write_l1_table(BDRVQEDState *s, unsigned int index,
                                    unsigned int n);
int coroutine_fn qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
                                         unsigned int n);
int coroutine_fn qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                                        uint64_t offset);
int coroutine_fn qed_read_l2_table(BDRVQEDState *s, QEDRequest *request,
                                   uint64_t offset);
int coroutine_fn qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
                                    unsigned int index, unsigned int n,
                                    bool flush);
int coroutine_fn qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
                                         unsigned int index, unsigned int n,
                                         bool flush);
219 220 221 222

/**
 * Cluster functions
 */
223 224 225
int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
                                  uint64_t pos, size_t *len,
                                  uint64_t *img_offset);
226 227 228 229

/**
 * Consistency check
 */
230
int coroutine_fn qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix);
231 232 233

QEDTable *qed_alloc_table(BDRVQEDState *s);

234 235 236 237 238 239 240 241
/**
 * Round down to the start of a cluster
 */
static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset)
{
    return offset & ~(uint64_t)(s->header.cluster_size - 1);
}

242 243 244 245 246
static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset)
{
    return offset & (s->header.cluster_size - 1);
}

247
static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes)
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
{
    return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) /
           (s->header.cluster_size - 1);
}

static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos)
{
    return pos >> s->l1_shift;
}

static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos)
{
    return (pos >> s->l2_shift) & s->l2_mask;
}

263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
/**
 * Test if a cluster offset is valid
 */
static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset)
{
    uint64_t header_size = (uint64_t)s->header.header_size *
                           s->header.cluster_size;

    if (offset & (s->header.cluster_size - 1)) {
        return false;
    }
    return offset >= header_size && offset < s->file_size;
}

/**
 * Test if a table offset is valid
 */
static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset)
{
    uint64_t end_offset = offset + (s->header.table_size - 1) *
                          s->header.cluster_size;

    /* Overflow check */
    if (end_offset <= offset) {
        return false;
    }

    return qed_check_cluster_offset(s, offset) &&
           qed_check_cluster_offset(s, end_offset);
}

294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s,
                                                 uint64_t offset)
{
    if (qed_offset_into_cluster(s, offset)) {
        return false;
    }
    return true;
}

static inline bool qed_offset_is_unalloc_cluster(uint64_t offset)
{
    if (offset == 0) {
        return true;
    }
    return false;
}

static inline bool qed_offset_is_zero_cluster(uint64_t offset)
{
    if (offset == 1) {
        return true;
    }
    return false;
}

319
#endif /* BLOCK_QED_H */