提交 53365383 编写于 作者: L Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
  dm snapshot: use merge origin if snapshot invalid
  dm snapshot: report merge failure in status
  dm snapshot: merge consecutive chunks together
  dm snapshot: trigger exceptions in remaining snapshots during merge
  dm snapshot: delay merging a chunk until writes to it complete
  dm snapshot: queue writes to chunks being merged
  dm snapshot: add merging
  dm snapshot: permit only one merge at once
  dm snapshot: support barriers in snapshot merge target
  dm snapshot: avoid allocating exceptions in merge
  dm snapshot: rework writing to origin
  dm snapshot: add merge target
  dm exception store: add merge specific methods
  dm snapshot: create function for chunk_is_tracked wait
  dm snapshot: make bio optional in __origin_write
  dm mpath: reject messages when device is suspended
  dm: export suspended state to targets
  dm: rename dm_suspended to dm_suspended_md
  dm: swap target postsuspend call and setting suspended flag
  dm crypt: add plain64 iv
  ...
......@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
*) To create device "forks", i.e. multiple different versions of the
same data stream.
*) To merge a snapshot of a block device back into the snapshot's origin
device.
In the first two cases, dm copies only the chunks of data that get
changed and uses a separate copy-on-write (COW) block device for
storage.
In both cases, dm copies only the chunks of data that get changed and
uses a separate copy-on-write (COW) block device for storage.
For snapshot merge the contents of the COW storage are merged back into
the origin device.
There are two dm targets available: snapshot and snapshot-origin.
There are three dm targets available:
snapshot, snapshot-origin, and snapshot-merge.
*) snapshot-origin <origin>
......@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.
How this is used by LVM2
========================
* snapshot-merge <origin> <COW device> <persistent> <chunksize>
takes the same table arguments as the snapshot target except it only
works with persistent snapshots. This target assumes the role of the
"snapshot-origin" target and must not be loaded if the "snapshot-origin"
is still present for <origin>.
Creates a merging snapshot that takes control of the changed chunks
stored in the <COW device> of an existing snapshot, through a handover
procedure, and merges these chunks back into the <origin>. Once merging
has started (in the background) the <origin> may be opened and the merge
will continue while I/O is flowing to it. Changes to the <origin> are
deferred until the merging snapshot's corresponding chunk(s) have been
merged. Once merging has started the snapshot device, associated with
the "snapshot" target, will return -EIO when accessed.
How snapshot is used by LVM2
============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:
1) a device containing the original mapping table of the source volume;
......@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
How snapshot-merge is used by LVM2
==================================
A merging snapshot assumes the role of the "snapshot-origin" while
merging. As such the "snapshot-origin" is replaced with
"snapshot-merge". The "-real" device is not changed and the "-cow"
device is renamed to <origin name>-cow to aid LVM2's cleanup of the
merging snapshot after it completes. The "snapshot" that hands over its
COW device to the "snapshot-merge" is deactivated (unless using lvchange
--refresh); but if it is left active it will simply return I/O errors.
A snapshot will merge into its origin with the following command:
lvconvert --merge volumeGroup/snap
we'll now have this situation:
# dmsetup table|grep volumeGroup
volumeGroup-base-real: 0 2097152 linear 8:19 384
volumeGroup-base-cow: 0 204800 linear 8:19 2097536
volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
# ls -lL /dev/mapper/volumeGroup-*
brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
* Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
......@@ -71,10 +71,21 @@ struct crypt_iv_operations {
int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
const char *opts);
void (*dtr)(struct crypt_config *cc);
const char *(*status)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
};
struct iv_essiv_private {
struct crypto_cipher *tfm;
struct crypto_hash *hash_tfm;
u8 *salt;
};
struct iv_benbi_private {
int shift;
};
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
......@@ -102,8 +113,8 @@ struct crypt_config {
struct crypt_iv_operations *iv_gen_ops;
char *iv_mode;
union {
struct crypto_cipher *essiv_tfm;
int benbi_shift;
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
......@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
* plain: the initial vector is the 32-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
* plain64: the initial vector is the 64-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
* essiv: "encrypted sector|salt initial vector", the sector number is
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
......@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
return 0;
}
static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
sector_t sector)
{
struct crypto_cipher *essiv_tfm;
struct crypto_hash *hash_tfm;
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
return 0;
}
/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
struct hash_desc desc;
struct scatterlist sg;
unsigned int saltsize;
u8 *salt;
int err;
if (opts == NULL) {
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = essiv->hash_tfm;
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
if (err)
return err;
return crypto_cipher_setkey(essiv->tfm, essiv->salt,
crypto_hash_digestsize(essiv->hash_tfm));
}
/* Wipe salt and reset key derived from volume key */
static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
memset(essiv->salt, 0, salt_size);
return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
crypto_free_cipher(essiv->tfm);
essiv->tfm = NULL;
crypto_free_hash(essiv->hash_tfm);
essiv->hash_tfm = NULL;
kzfree(essiv->salt);
essiv->salt = NULL;
}
static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
struct crypto_cipher *essiv_tfm = NULL;
struct crypto_hash *hash_tfm = NULL;
u8 *salt = NULL;
int err;
if (!opts) {
ti->error = "Digest algorithm missing for ESSIV mode";
return -EINVAL;
}
/* Hash the cipher key with the given hash algorithm */
/* Allocate hash algorithm */
hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(hash_tfm)) {
ti->error = "Error initializing ESSIV hash";
return PTR_ERR(hash_tfm);
err = PTR_ERR(hash_tfm);
goto bad;
}
saltsize = crypto_hash_digestsize(hash_tfm);
salt = kmalloc(saltsize, GFP_KERNEL);
if (salt == NULL) {
salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
if (!salt) {
ti->error = "Error kmallocing salt storage in ESSIV";
crypto_free_hash(hash_tfm);
return -ENOMEM;
err = -ENOMEM;
goto bad;
}
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = hash_tfm;
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
crypto_free_hash(hash_tfm);
if (err) {
ti->error = "Error calculating hash in ESSIV";
kfree(salt);
return err;
}
/* Setup the essiv_tfm with the given salt */
/* Allocate essiv_tfm */
essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(essiv_tfm)) {
ti->error = "Error allocating crypto tfm for ESSIV";
kfree(salt);
return PTR_ERR(essiv_tfm);
err = PTR_ERR(essiv_tfm);
goto bad;
}
if (crypto_cipher_blocksize(essiv_tfm) !=
crypto_ablkcipher_ivsize(cc->tfm)) {
ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher";
crypto_free_cipher(essiv_tfm);
kfree(salt);
return -EINVAL;
err = -EINVAL;
goto bad;
}
err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
if (err) {
ti->error = "Failed to set key for ESSIV cipher";
crypto_free_cipher(essiv_tfm);
kfree(salt);
return err;
}
kfree(salt);
cc->iv_gen_private.essiv_tfm = essiv_tfm;
cc->iv_gen_private.essiv.salt = salt;
cc->iv_gen_private.essiv.tfm = essiv_tfm;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
return 0;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
cc->iv_gen_private.essiv_tfm = NULL;
bad:
if (essiv_tfm && !IS_ERR(essiv_tfm))
crypto_free_cipher(essiv_tfm);
if (hash_tfm && !IS_ERR(hash_tfm))
crypto_free_hash(hash_tfm);
kfree(salt);
return err;
}
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
return 0;
}
......@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
cc->iv_gen_private.benbi_shift = 9 - log;
cc->iv_gen_private.benbi.shift = 9 - log;
return 0;
}
......@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
return 0;
......@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
static struct crypt_iv_operations crypt_iv_plain64_ops = {
.generator = crypt_iv_plain64_gen
};
static struct crypt_iv_operations crypt_iv_essiv_ops = {
.ctr = crypt_iv_essiv_ctr,
.dtr = crypt_iv_essiv_dtr,
.init = crypt_iv_essiv_init,
.wipe = crypt_iv_essiv_wipe,
.generator = crypt_iv_essiv_gen
};
......@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
return 0;
return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
static int crypt_wipe_key(struct crypt_config *cc)
{
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
return 0;
return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
/*
......@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
if (crypt_set_key(cc, argv[1])) {
ti->error = "Error decoding key";
goto bad_cipher;
}
/* Compatibility mode for old dm-crypt cipher strings */
if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
chainmode = "cbc";
......@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
strcpy(cc->chainmode, chainmode);
cc->tfm = tfm;
if (crypt_set_key(cc, argv[1]) < 0) {
ti->error = "Error decoding and setting key";
goto bad_ivmode;
}
/*
* Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
* See comments at iv code
......@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = NULL;
else if (strcmp(ivmode, "plain") == 0)
cc->iv_gen_ops = &crypt_iv_plain_ops;
else if (strcmp(ivmode, "plain64") == 0)
cc->iv_gen_ops = &crypt_iv_plain64_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
......@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
goto bad_ivmode;
if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
cc->iv_gen_ops->init(cc) < 0) {
ti->error = "Error initialising IV";
goto bad_slab_pool;
}
cc->iv_size = crypto_ablkcipher_ivsize(tfm);
if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */
......@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_bs;
}
if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
ti->error = "Error setting key";
goto bad_device;
}
if (sscanf(argv[2], "%llu", &tmpll) != 1) {
ti->error = "Invalid iv_offset sector";
goto bad_device;
......@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
{
struct crypt_config *cc = ti->private;
int ret = -EINVAL;
if (argc < 2)
goto error;
......@@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
DMWARN("not suspended during key manipulation.");
return -EINVAL;
}
if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
return crypt_set_key(cc, argv[2]);
if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
ret = crypt_set_key(cc, argv[2]);
if (ret)
return ret;
if (cc->iv_gen_ops && cc->iv_gen_ops->init)
ret = cc->iv_gen_ops->init(cc);
return ret;
}
if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
ret = cc->iv_gen_ops->wipe(cc);
if (ret)
return ret;
}
return crypt_wipe_key(cc);
}
}
error:
......
......@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
/* Validate the chunk size against the device block size */
if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
if (chunk_size %
(bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
*error = "Chunk size is not a multiple of device blocksize";
return -EINVAL;
}
......@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store)
{
......@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_exception_store *tmp_store;
char persistent;
if (argc < 3) {
if (argc < 2) {
ti->error = "Insufficient exception store arguments";
return -EINVAL;
}
......@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
return -ENOMEM;
}
persistent = toupper(*argv[1]);
persistent = toupper(*argv[0]);
if (persistent == 'P')
type = get_type("P");
else if (persistent == 'N')
type = get_type("N");
else {
ti->error = "Persistent flag is not P or N";
return -EINVAL;
r = -EINVAL;
goto bad_type;
}
if (!type) {
......@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
}
tmp_store->type = type;
tmp_store->ti = ti;
r = dm_get_device(ti, argv[0], 0, 0,
FMODE_READ | FMODE_WRITE, &tmp_store->cow);
if (r) {
ti->error = "Cannot get COW device";
goto bad_cow;
}
tmp_store->snap = snap;
r = set_chunk_size(tmp_store, argv[2], &ti->error);
r = set_chunk_size(tmp_store, argv[1], &ti->error);
if (r)
goto bad_ctr;
goto bad;
r = type->ctr(tmp_store, 0, NULL);
if (r) {
ti->error = "Exception store type constructor failed";
goto bad_ctr;
goto bad;
}
*args_used = 3;
*args_used = 2;
*store = tmp_store;
return 0;
bad_ctr:
dm_put_device(ti, tmp_store->cow);
bad_cow:
bad:
put_type(type);
bad_type:
kfree(tmp_store);
......@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
void dm_exception_store_destroy(struct dm_exception_store *store)
{
store->type->dtr(store);
dm_put_device(store->ti, store->cow);
put_type(store->type);
kfree(store);
}
......
......@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
* of chunks that follow contiguously. Remaining bits hold the number of the
* chunk within the device.
*/
struct dm_snap_exception {
struct dm_exception {
struct list_head hash_list;
chunk_t old_chunk;
......@@ -64,16 +64,33 @@ struct dm_exception_store_type {
* Find somewhere to store the next exception.
*/
int (*prepare_exception) (struct dm_exception_store *store,
struct dm_snap_exception *e);
struct dm_exception *e);
/*
* Update the metadata with this exception.
*/
void (*commit_exception) (struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context);
/*
* Returns 0 if the exception store is empty.
*
* If there are exceptions still to be merged, sets
* *last_old_chunk and *last_new_chunk to the most recent
* still-to-be-merged chunk and returns the number of
* consecutive previous ones.
*/
int (*prepare_merge) (struct dm_exception_store *store,
chunk_t *last_old_chunk, chunk_t *last_new_chunk);
/*
* Clear the last n exceptions.
* nr_merged must be <= the value returned by prepare_merge.
*/
int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
/*
* The snapshot is invalid, note this in the metadata.
*/
......@@ -86,19 +103,19 @@ struct dm_exception_store_type {
/*
* Return how full the snapshot is.
*/
void (*fraction_full) (struct dm_exception_store *store,
sector_t *numerator,
sector_t *denominator);
void (*usage) (struct dm_exception_store *store,
sector_t *total_sectors, sector_t *sectors_allocated,
sector_t *metadata_sectors);
/* For internal device-mapper use only. */
struct list_head list;
};
struct dm_snapshot;
struct dm_exception_store {
struct dm_exception_store_type *type;
struct dm_target *ti;
struct dm_dev *cow;
struct dm_snapshot *snap;
/* Size of data blocks saved - must be a power of 2 */
unsigned chunk_size;
......@@ -108,6 +125,11 @@ struct dm_exception_store {
void *context;
};
/*
* Obtain the cow device used by a given snapshot.
*/
struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
/*
* Funtions to manipulate consecutive chunks
*/
......@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
BUG_ON(!dm_consecutive_chunk_count(e));
}
static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
BUG_ON(!dm_consecutive_chunk_count(e));
e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
}
# else
# define DM_CHUNK_CONSECUTIVE_BITS 0
......@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk;
}
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return 0;
}
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
}
static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
}
......@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
sector_t sector)
{
return (sector & ~store->chunk_mask) >> store->chunk_shift;
return sector >> store->chunk_shift;
}
int dm_exception_store_type_register(struct dm_exception_store_type *type);
......@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
char **error);
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);
......
......@@ -5,6 +5,8 @@
* This file is released under the GPL.
*/
#include "dm.h"
#include <linux/device-mapper.h>
#include <linux/bio.h>
......@@ -14,12 +16,19 @@
#include <linux/slab.h>
#include <linux/dm-io.h>
#define DM_MSG_PREFIX "io"
#define DM_IO_MAX_REGIONS BITS_PER_LONG
struct dm_io_client {
mempool_t *pool;
struct bio_set *bios;
};
/* FIXME: can we shrink this ? */
/*
* Aligning 'struct io' reduces the number of bits required to store
* its address. Refer to store_io_and_region_in_bio() below.
*/
struct io {
unsigned long error_bits;
unsigned long eopnotsupp_bits;
......@@ -28,7 +37,9 @@ struct io {
struct dm_io_client *client;
io_notify_fn callback;
void *context;
};
} __attribute__((aligned(DM_IO_MAX_REGIONS)));
static struct kmem_cache *_dm_io_cache;
/*
* io contexts are only dynamically allocated for asynchronous
......@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
if (!client)
return ERR_PTR(-ENOMEM);
client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
if (!client->pool)
goto bad;
......@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
* In order to save a memory allocation we store this the last
* bvec which we know is unused (blech).
* XXX This is ugly and can OOPS with some configs... find another way.
* To avoid a memory allocation to store just 5 or 6 bits, we
* ensure the 'struct io' pointer is aligned so enough low bits are
* always zero and then combine it with the region number directly in
* bi_private.
*---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
unsigned region)
{
bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
DMCRIT("Unaligned struct io pointer %p", io);
BUG();
}
bio->bi_private = (void *)((unsigned long)io | region);
}
static inline unsigned bio_get_region(struct bio *bio)
static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
unsigned *region)
{
return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
unsigned long val = (unsigned long)bio->bi_private;
*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
*region = val & (DM_IO_MAX_REGIONS - 1);
}
/*-----------------------------------------------------------------
......@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
/*
* The bio destructor in bio_put() may use the io object.
*/
io = bio->bi_private;
region = bio_get_region(bio);
retrieve_io_and_region_from_bio(bio, &io, &region);
bio->bi_max_vecs++;
bio_put(bio);
dec_count(io, region, error);
......@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
static void dm_bio_destructor(struct bio *bio)
{
struct io *io = bio->bi_private;
unsigned region;
struct io *io;
retrieve_io_and_region_from_bio(bio, &io, &region);
bio_free(bio, io->client->bios);
}
......@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned num_bvecs;
sector_t remaining = where->count;
while (remaining) {
/*
* where->count may be zero if rw holds a write barrier and we
* need to send a zero-sized barrier.
*/
do {
/*
* Allocate a suitably sized-bio: we add an extra
* bvec for bio_get/set_region() and decrement bi_max_vecs
* to hide it from bio_add_page().
* Allocate a suitably sized-bio.
*/
num_bvecs = dm_sector_div_up(remaining,
(PAGE_SIZE >> SECTOR_SHIFT));
num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
num_bvecs);
if (unlikely(num_bvecs > BIO_MAX_PAGES))
num_bvecs = BIO_MAX_PAGES;
num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
bio->bi_private = io;
bio->bi_destructor = dm_bio_destructor;
bio->bi_max_vecs--;
bio_set_region(bio, region);
store_io_and_region_in_bio(bio, io, region);
/*
* Try and add as many pages as possible.
......@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
atomic_inc(&io->count);
submit_bio(rw, bio);
}
} while (remaining);
}
static void dispatch_io(int rw, unsigned int num_regions,
......@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
int i;
struct dpages old_pages = *dp;
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
if (sync)
rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
......@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count)
if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
do_region(rw, i, where + i, dp, io);
}
......@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, int rw, struct dpages *dp,
unsigned long *error_bits)
{
struct io io;
/*
* gcc <= 4.3 can't do the alignment for stack variables, so we must
* align it on our own.
* volatile prevents the optimizer from removing or reusing
* "io_" field from the stack frame (allowed in ANSI C).
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
......@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
}
retry:
io.error_bits = 0;
io.eopnotsupp_bits = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
io.client = client;
io->error_bits = 0;
io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = current;
io->client = client;
dispatch_io(rw, num_regions, where, dp, &io, 1);
dispatch_io(rw, num_regions, where, dp, io, 1);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!atomic_read(&io.count))
if (!atomic_read(&io->count))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
rw &= ~(1 << BIO_RW_BARRIER);
goto retry;
}
if (error_bits)
*error_bits = io.error_bits;
*error_bits = io->error_bits;
return io.error_bits ? -EIO : 0;
return io->error_bits ? -EIO : 0;
}
static int async_io(struct dm_io_client *client, unsigned int num_regions,
......@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
int __init dm_io_init(void)
{
_dm_io_cache = KMEM_CACHE(io, 0);
if (!_dm_io_cache)
return -ENOMEM;
return 0;
}
void dm_io_exit(void)
{
kmem_cache_destroy(_dm_io_cache);
_dm_io_cache = NULL;
}
......@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
*/
static DECLARE_RWSEM(_hash_lock);
/*
* Protects use of mdptr to obtain hash cell name and uuid from mapped device.
*/
static DEFINE_MUTEX(dm_hash_cells_mutex);
static void init_buckets(struct list_head *buckets)
{
unsigned int i;
......@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
}
dm_get(md);
mutex_lock(&dm_hash_cells_mutex);
dm_set_mdptr(md, cell);
mutex_unlock(&dm_hash_cells_mutex);
up_write(&_hash_lock);
return 0;
......@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
/* remove from the dev hash */
list_del(&hc->uuid_list);
list_del(&hc->name_list);
mutex_lock(&dm_hash_cells_mutex);
dm_set_mdptr(hc->md, NULL);
mutex_unlock(&dm_hash_cells_mutex);
table = dm_get_table(hc->md);
table = dm_get_live_table(hc->md);
if (table) {
dm_table_event(table);
dm_table_put(table);
......@@ -321,13 +330,15 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
*/
list_del(&hc->name_list);
old_name = hc->name;
mutex_lock(&dm_hash_cells_mutex);
hc->name = new_name;
mutex_unlock(&dm_hash_cells_mutex);
list_add(&hc->name_list, _name_buckets + hash_str(new_name));
/*
* Wake up any dm event waiters.
*/
table = dm_get_table(hc->md);
table = dm_get_live_table(hc->md);
if (table) {
dm_table_event(table);
dm_table_put(table);
......@@ -512,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
return 0;
}
static int check_name(const char *name)
{
if (strchr(name, '/')) {
......@@ -524,6 +533,40 @@ static int check_name(const char *name)
return 0;
}
/*
* On successful return, the caller must not attempt to acquire
* _hash_lock without first calling dm_table_put, because dm_table_destroy
* waits for this dm_table_put and could be called under this lock.
*/
static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
{
struct hash_cell *hc;
struct dm_table *table = NULL;
down_read(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
DMWARN("device has been removed from the dev hash table.");
goto out;
}
table = hc->new_map;
if (table)
dm_table_get(table);
out:
up_read(&_hash_lock);
return table;
}
static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
struct dm_ioctl *param)
{
return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
dm_get_inactive_table(md) : dm_get_live_table(md);
}
/*
* Fills in a dm_ioctl structure, ready for sending back to
* userland.
......@@ -536,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
DM_ACTIVE_PRESENT_FLAG);
if (dm_suspended(md))
if (dm_suspended_md(md))
param->flags |= DM_SUSPEND_FLAG;
param->dev = huge_encode_dev(disk_devt(disk));
......@@ -548,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
*/
param->open_count = dm_open_count(md);
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->event_nr = dm_get_event_nr(md);
param->target_count = 0;
table = dm_get_table(md);
table = dm_get_live_table(md);
if (table) {
param->flags |= DM_ACTIVE_PRESENT_FLAG;
param->target_count = dm_table_get_num_targets(table);
if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
}
dm_table_put(table);
} else
param->target_count = 0;
param->flags |= DM_ACTIVE_PRESENT_FLAG;
}
if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
table = dm_get_inactive_table(md);
if (table) {
if (!(dm_table_get_mode(table) & FMODE_WRITE))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
dm_table_put(table);
}
}
return 0;
}
......@@ -634,9 +689,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
* Sneakily write in both the name and the uuid
* while we have the cell.
*/
strncpy(param->name, hc->name, sizeof(param->name));
strlcpy(param->name, hc->name, sizeof(param->name));
if (hc->uuid)
strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1);
strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
else
param->uuid[0] = '\0';
......@@ -784,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
if (!dm_suspended_md(md))
r = dm_suspend(md, suspend_flags);
if (!r)
......@@ -800,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct hash_cell *hc;
struct mapped_device *md;
struct dm_table *new_map;
struct dm_table *new_map, *old_map = NULL;
down_write(&_hash_lock);
......@@ -826,14 +881,14 @@ static int do_resume(struct dm_ioctl *param)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
if (!dm_suspended_md(md))
dm_suspend(md, suspend_flags);
r = dm_swap_table(md, new_map);
if (r) {
old_map = dm_swap_table(md, new_map);
if (IS_ERR(old_map)) {
dm_table_destroy(new_map);
dm_put(md);
return r;
return PTR_ERR(old_map);
}
if (dm_table_get_mode(new_map) & FMODE_WRITE)
......@@ -842,9 +897,11 @@ static int do_resume(struct dm_ioctl *param)
set_disk_ro(dm_disk(md), 1);
}
if (dm_suspended(md))
if (dm_suspended_md(md))
r = dm_resume(md);
if (old_map)
dm_table_destroy(old_map);
if (!r) {
dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
......@@ -982,7 +1039,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_status(table, param, param_size);
dm_table_put(table);
......@@ -1215,7 +1272,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_deps(table, param, param_size);
dm_table_put(table);
......@@ -1244,13 +1301,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_status(table, param, param_size);
dm_table_put(table);
}
out:
out:
dm_put(md);
return r;
}
......@@ -1288,10 +1345,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
goto out;
}
table = dm_get_table(md);
table = dm_get_live_table(md);
if (!table)
goto out_argv;
if (dm_deleting_md(md)) {
r = -ENXIO;
goto out_table;
}
ti = dm_table_find_target(table, tmsg->sector);
if (!dm_target_is_valid(ti)) {
DMWARN("Target message sector outside device.");
......@@ -1303,6 +1365,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
r = -EINVAL;
}
out_table:
dm_table_put(table);
out_argv:
kfree(argv);
......@@ -1582,8 +1645,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
if (!md)
return -ENXIO;
dm_get(md);
down_read(&_hash_lock);
mutex_lock(&dm_hash_cells_mutex);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
r = -ENXIO;
......@@ -1596,8 +1658,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
strcpy(uuid, hc->uuid ? : "");
out:
up_read(&_hash_lock);
dm_put(md);
mutex_unlock(&dm_hash_cells_mutex);
return r;
}
......@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
{
struct dm_kcopyd_client *kc = job->kc;
atomic_inc(&kc->nr_jobs);
push(&kc->pages_jobs, job);
if (unlikely(!job->source.count))
push(&kc->complete_jobs, job);
else
push(&kc->pages_jobs, job);
wake(kc);
}
......
......@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
EXPORT_SYMBOL(dm_dirty_log_type_unregister);
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
struct dm_target *ti,
unsigned int argc, char **argv)
struct dm_target *ti,
int (*flush_callback_fn)(struct dm_target *ti),
unsigned int argc, char **argv)
{
struct dm_dirty_log_type *type;
struct dm_dirty_log *log;
......@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
return NULL;
}
log->flush_callback_fn = flush_callback_fn;
log->type = type;
if (type->ctr(log, ti, argc, argv)) {
kfree(log);
......@@ -208,7 +210,9 @@ struct log_header {
struct log_c {
struct dm_target *ti;
int touched;
int touched_dirtied;
int touched_cleaned;
int flush_failed;
uint32_t region_size;
unsigned int region_count;
region_t sync_count;
......@@ -233,6 +237,7 @@ struct log_c {
* Disk log fields
*/
int log_dev_failed;
int log_dev_flush_failed;
struct dm_dev *log_dev;
struct log_header header;
......@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_set_bit(bit, (unsigned long *) bs);
l->touched = 1;
l->touched_cleaned = 1;
}
static inline void log_clear_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_clear_bit(bit, (unsigned long *) bs);
l->touched = 1;
l->touched_dirtied = 1;
}
/*----------------------------------------------------------------
......@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
}
static int flush_header(struct log_c *lc)
{
struct dm_io_region null_location = {
.bdev = lc->header_location.bdev,
.sector = 0,
.count = 0,
};
lc->io_req.bi_rw = WRITE_BARRIER;
return dm_io(&lc->io_req, 1, &null_location, NULL);
}
static int read_header(struct log_c *log)
{
int r;
......@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
}
lc->ti = ti;
lc->touched = 0;
lc->touched_dirtied = 0;
lc->touched_cleaned = 0;
lc->flush_failed = 0;
lc->region_size = region_size;
lc->region_count = region_count;
lc->sync = sync;
......@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
} else {
lc->log_dev = dev;
lc->log_dev_failed = 0;
lc->log_dev_flush_failed = 0;
lc->header_location.bdev = lc->log_dev->bdev;
lc->header_location.sector = 0;
......@@ -614,6 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
/* write the new header */
r = rw_header(lc, WRITE);
if (!r) {
r = flush_header(lc);
if (r)
lc->log_dev_flush_failed = 1;
}
if (r) {
DMWARN("%s: Failed to write header on dirty region log device",
lc->log_dev->name);
......@@ -656,18 +682,40 @@ static int core_flush(struct dm_dirty_log *log)
static int disk_flush(struct dm_dirty_log *log)
{
int r;
struct log_c *lc = (struct log_c *) log->context;
int r, i;
struct log_c *lc = log->context;
/* only write if the log has changed */
if (!lc->touched)
if (!lc->touched_cleaned && !lc->touched_dirtied)
return 0;
if (lc->touched_cleaned && log->flush_callback_fn &&
log->flush_callback_fn(lc->ti)) {
/*
* At this point it is impossible to determine which
* regions are clean and which are dirty (without
* re-reading the log off disk). So mark all of them
* dirty.
*/
lc->flush_failed = 1;
for (i = 0; i < lc->region_count; i++)
log_clear_bit(lc, lc->clean_bits, i);
}
r = rw_header(lc, WRITE);
if (r)
fail_log_device(lc);
else
lc->touched = 0;
else {
if (lc->touched_dirtied) {
r = flush_header(lc);
if (r) {
lc->log_dev_flush_failed = 1;
fail_log_device(lc);
} else
lc->touched_dirtied = 0;
}
lc->touched_cleaned = 0;
}
return r;
}
......@@ -681,7 +729,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
static void core_clear_region(struct dm_dirty_log *log, region_t region)
{
struct log_c *lc = (struct log_c *) log->context;
log_set_bit(lc, lc->clean_bits, region);
if (likely(!lc->flush_failed))
log_set_bit(lc, lc->clean_bits, region);
}
static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
......@@ -762,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
switch(status) {
case STATUSTYPE_INFO:
DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
lc->log_dev_failed ? 'D' : 'A');
lc->log_dev_flush_failed ? 'F' :
lc->log_dev_failed ? 'D' :
'A');
break;
case STATUSTYPE_TABLE:
......
......@@ -93,6 +93,10 @@ struct multipath {
* can resubmit bios on error.
*/
mempool_t *mpio_pool;
struct mutex work_mutex;
unsigned suspended; /* Don't create new I/O internally when set. */
};
/*
......@@ -198,6 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
mutex_init(&m->work_mutex);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
......@@ -885,13 +890,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
return r;
}
static void multipath_dtr(struct dm_target *ti)
static void flush_multipath_work(void)
{
struct multipath *m = (struct multipath *) ti->private;
flush_workqueue(kmpath_handlerd);
flush_workqueue(kmultipathd);
flush_scheduled_work();
}
static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
flush_multipath_work();
free_multipath(m);
}
......@@ -1261,6 +1271,16 @@ static void multipath_presuspend(struct dm_target *ti)
queue_if_no_path(m, 0, 1);
}
static void multipath_postsuspend(struct dm_target *ti)
{
struct multipath *m = ti->private;
mutex_lock(&m->work_mutex);
m->suspended = 1;
flush_multipath_work();
mutex_unlock(&m->work_mutex);
}
/*
* Restore the queue_if_no_path setting.
*/
......@@ -1269,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti)
struct multipath *m = (struct multipath *) ti->private;
unsigned long flags;
mutex_lock(&m->work_mutex);
m->suspended = 0;
mutex_unlock(&m->work_mutex);
spin_lock_irqsave(&m->lock, flags);
m->queue_if_no_path = m->saved_queue_if_no_path;
spin_unlock_irqrestore(&m->lock, flags);
......@@ -1397,51 +1421,71 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
int r = -EINVAL;
struct dm_dev *dev;
struct multipath *m = (struct multipath *) ti->private;
action_fn action;
mutex_lock(&m->work_mutex);
if (m->suspended) {
r = -EBUSY;
goto out;
}
if (dm_suspended(ti)) {
r = -EBUSY;
goto out;
}
if (argc == 1) {
if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
return queue_if_no_path(m, 1, 0);
else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
return queue_if_no_path(m, 0, 0);
if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
r = queue_if_no_path(m, 1, 0);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
r = queue_if_no_path(m, 0, 0);
goto out;
}
}
if (argc != 2)
goto error;
if (argc != 2) {
DMWARN("Unrecognised multipath message received.");
goto out;
}
if (!strnicmp(argv[0], MESG_STR("disable_group")))
return bypass_pg_num(m, argv[1], 1);
else if (!strnicmp(argv[0], MESG_STR("enable_group")))
return bypass_pg_num(m, argv[1], 0);
else if (!strnicmp(argv[0], MESG_STR("switch_group")))
return switch_pg_num(m, argv[1]);
else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
r = bypass_pg_num(m, argv[1], 1);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
r = bypass_pg_num(m, argv[1], 0);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
r = switch_pg_num(m, argv[1]);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
action = reinstate_path;
else if (!strnicmp(argv[0], MESG_STR("fail_path")))
action = fail_path;
else
goto error;
else {
DMWARN("Unrecognised multipath message received.");
goto out;
}
r = dm_get_device(ti, argv[1], ti->begin, ti->len,
dm_table_get_mode(ti->table), &dev);
if (r) {
DMWARN("message: error getting device %s",
argv[1]);
return -EINVAL;
goto out;
}
r = action_dev(m, dev, action);
dm_put_device(ti, dev);
out:
mutex_unlock(&m->work_mutex);
return r;
error:
DMWARN("Unrecognised multipath message received.");
return -EINVAL;
}
static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
......@@ -1567,13 +1611,14 @@ static int multipath_busy(struct dm_target *ti)
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 1, 0},
.version = {1, 1, 1},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
......
......@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
*---------------------------------------------------------------*/
enum dm_raid1_error {
DM_RAID1_WRITE_ERROR,
DM_RAID1_FLUSH_ERROR,
DM_RAID1_SYNC_ERROR,
DM_RAID1_READ_ERROR
};
......@@ -57,6 +58,7 @@ struct mirror_set {
struct bio_list reads;
struct bio_list writes;
struct bio_list failures;
struct bio_list holds; /* bios are waiting until suspend */
struct dm_region_hash *rh;
struct dm_kcopyd_client *kcopyd_client;
......@@ -67,6 +69,7 @@ struct mirror_set {
region_t nr_regions;
int in_sync;
int log_failure;
int leg_failure;
atomic_t suspend;
atomic_t default_mirror; /* Default mirror */
......@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
atomic_set(&ms->default_mirror, m - m0);
}
static struct mirror *get_valid_mirror(struct mirror_set *ms)
{
struct mirror *m;
for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
if (!atomic_read(&m->error_count))
return m;
return NULL;
}
/* fail_mirror
* @m: mirror device to fail
* @error_type: one of the enum's, DM_RAID1_*_ERROR
......@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
struct mirror_set *ms = m->ms;
struct mirror *new;
ms->leg_failure = 1;
/*
* error_count is used for nothing more than a
* simple way to tell if a device has encountered
......@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
goto out;
}
for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
if (!atomic_read(&new->error_count)) {
set_default_mirror(new);
break;
}
if (unlikely(new == ms->mirror + ms->nr_mirrors))
new = get_valid_mirror(ms);
if (new)
set_default_mirror(new);
else
DMWARN("All sides of mirror have failed.");
out:
schedule_work(&ms->trigger_event);
}
static int mirror_flush(struct dm_target *ti)
{
struct mirror_set *ms = ti->private;
unsigned long error_bits;
unsigned int i;
struct dm_io_region io[ms->nr_mirrors];
struct mirror *m;
struct dm_io_request io_req = {
.bi_rw = WRITE_BARRIER,
.mem.type = DM_IO_KMEM,
.mem.ptr.bvec = NULL,
.client = ms->io_client,
};
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
io[i].bdev = m->dev->bdev;
io[i].sector = 0;
io[i].count = 0;
}
error_bits = -1;
dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
if (unlikely(error_bits != 0)) {
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error_bits))
fail_mirror(ms->mirror + i,
DM_RAID1_FLUSH_ERROR);
return -EIO;
}
return 0;
}
/*-----------------------------------------------------------------
* Recovery.
*
......@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
*/
static sector_t map_sector(struct mirror *m, struct bio *bio)
{
if (unlikely(!bio->bi_size))
return 0;
return m->offset + (bio->bi_sector - m->ms->ti->begin);
}
......@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
io->count = bio->bi_size >> 9;
}
static void hold_bio(struct mirror_set *ms, struct bio *bio)
{
/*
* If device is suspended, complete the bio.
*/
if (atomic_read(&ms->suspend)) {
if (dm_noflush_suspending(ms->ti))
bio_endio(bio, DM_ENDIO_REQUEUE);
else
bio_endio(bio, -EIO);
return;
}
/*
* Hold bio until the suspend is complete.
*/
spin_lock_irq(&ms->lock);
bio_list_add(&ms->holds, bio);
spin_unlock_irq(&ms->lock);
}
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
......@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
unsigned i, ret = 0;
struct bio *bio = (struct bio *) context;
struct mirror_set *ms;
int uptodate = 0;
int should_wake = 0;
unsigned long flags;
......@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
* This way we handle both writes to SYNC and NOSYNC
* regions with the same code.
*/
if (likely(!error))
goto out;
if (likely(!error)) {
bio_endio(bio, ret);
return;
}
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
else
uptodate = 1;
if (unlikely(!uptodate)) {
DMERR("All replicated volumes dead, failing I/O");
/* None of the writes succeeded, fail the I/O. */
ret = -EIO;
} else if (errors_handled(ms)) {
/*
* Need to raise event. Since raising
* events can block, we need to do it in
* the main thread.
*/
spin_lock_irqsave(&ms->lock, flags);
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
return;
}
out:
bio_endio(bio, ret);
/*
* Need to raise event. Since raising
* events can block, we need to do it in
* the main thread.
*/
spin_lock_irqsave(&ms->lock, flags);
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
......@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
struct dm_io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m;
struct dm_io_request io_req = {
.bi_rw = WRITE,
.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
.mem.type = DM_IO_BVEC,
.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
.notify.fn = write_callback,
......@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
if (unlikely(bio_empty_barrier(bio))) {
bio_list_add(&sync, bio);
continue;
}
region = dm_rh_bio_to_region(ms->rh, bio);
if (log->type->is_remote_recovering &&
......@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
dm_rh_delay(ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) {
map_bio(get_default_mirror(ms), bio);
generic_make_request(bio);
if (unlikely(ms->leg_failure) && errors_handled(ms))
hold_bio(ms, bio);
else {
map_bio(get_default_mirror(ms), bio);
generic_make_request(bio);
}
}
}
......@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
{
struct bio *bio;
if (!failures->head)
return;
if (!ms->log_failure) {
while ((bio = bio_list_pop(failures))) {
ms->in_sync = 0;
dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
}
if (likely(!failures->head))
return;
}
/*
* If the log has failed, unattempted writes are being
* put on the failures list. We can't issue those writes
* put on the holds list. We can't issue those writes
* until a log has been marked, so we must store them.
*
* If a 'noflush' suspend is in progress, we can requeue
......@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
* for us to treat them the same and requeue them
* as well.
*/
if (dm_noflush_suspending(ms->ti)) {
while ((bio = bio_list_pop(failures)))
bio_endio(bio, DM_ENDIO_REQUEUE);
return;
}
while ((bio = bio_list_pop(failures))) {
if (!ms->log_failure) {
ms->in_sync = 0;
dm_rh_mark_nosync(ms->rh, bio);
}
if (atomic_read(&ms->suspend)) {
while ((bio = bio_list_pop(failures)))
/*
* If all the legs are dead, fail the I/O.
* If we have been told to handle errors, hold the bio
* and wait for userspace to deal with the problem.
* Otherwise pretend that the I/O succeeded. (This would
* be wrong if the failed leg returned after reboot and
* got replicated back to the good legs.)
*/
if (!get_valid_mirror(ms))
bio_endio(bio, -EIO);
return;
else if (errors_handled(ms))
hold_bio(ms, bio);
else
bio_endio(bio, 0);
}
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->failures, failures);
spin_unlock_irq(&ms->lock);
delayed_wake(ms);
}
static void trigger_event(struct work_struct *work)
......@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
}
spin_lock_init(&ms->lock);
bio_list_init(&ms->reads);
bio_list_init(&ms->writes);
bio_list_init(&ms->failures);
bio_list_init(&ms->holds);
ms->ti = ti;
ms->nr_mirrors = nr_mirrors;
ms->nr_regions = dm_sector_div_up(ti->len, region_size);
ms->in_sync = 0;
ms->log_failure = 0;
ms->leg_failure = 0;
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
......@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
return NULL;
}
dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
argv + 2);
if (!dl) {
ti->error = "Error creating mirror dirty log";
return NULL;
......@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = ms;
ti->split_io = dm_rh_get_region_size(ms->rh);
ti->num_flush_requests = 1;
ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
if (!ms->kmirrord_wq) {
......@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
dm_rh_dec(ms->rh, map_context->ll);
if (likely(!bio_empty_barrier(bio)))
dm_rh_dec(ms->rh, map_context->ll);
return error;
}
......@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
struct bio_list holds;
struct bio *bio;
atomic_set(&ms->suspend, 1);
/*
......@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
* we know that all of our I/O has been pushed.
*/
flush_workqueue(ms->kmirrord_wq);
/*
* Now set ms->suspend is set and the workqueue flushed, no more
* entries can be added to ms->hold list, so process it.
*
* Bios can still arrive concurrently with or after this
* presuspend function, but they cannot join the hold list
* because ms->suspend is set.
*/
spin_lock_irq(&ms->lock);
holds = ms->holds;
bio_list_init(&ms->holds);
spin_unlock_irq(&ms->lock);
while ((bio = bio_list_pop(&holds)))
hold_bio(ms, bio);
}
static void mirror_postsuspend(struct dm_target *ti)
......@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
if (!atomic_read(&(m->error_count)))
return 'A';
return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
}
......
......@@ -79,6 +79,11 @@ struct dm_region_hash {
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
/*
* If there was a barrier failure no regions can be marked clean.
*/
int barrier_failure;
void *context;
sector_t target_begin;
......@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
INIT_LIST_HEAD(&rh->failed_recovered_regions);
rh->barrier_failure = 0;
rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
sizeof(struct dm_region));
......@@ -377,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
/* dm_rh_mark_nosync
* @ms
* @bio
* @done
* @error
*
* The bio was written on some mirror(s) but failed on other mirror(s).
* We can successfully endio the bio but should avoid the region being
......@@ -386,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
*
* This function is _not_ safe in interrupt context!
*/
void dm_rh_mark_nosync(struct dm_region_hash *rh,
struct bio *bio, unsigned done, int error)
void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
{
unsigned long flags;
struct dm_dirty_log *log = rh->log;
......@@ -395,6 +398,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
region_t region = dm_rh_bio_to_region(rh, bio);
int recovering = 0;
if (bio_empty_barrier(bio)) {
rh->barrier_failure = 1;
return;
}
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);
......@@ -419,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
BUG_ON(!list_empty(&reg->list));
spin_unlock_irqrestore(&rh->region_lock, flags);
bio_endio(bio, error);
if (recovering)
complete_resync_work(reg, 0);
}
......@@ -515,8 +522,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
struct bio *bio;
for (bio = bios->head; bio; bio = bio->bi_next)
for (bio = bios->head; bio; bio = bio->bi_next) {
if (bio_empty_barrier(bio))
continue;
rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
......@@ -544,7 +554,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
*/
/* do nothing for DM_RH_NOSYNC */
if (reg->state == DM_RH_RECOVERING) {
if (unlikely(rh->barrier_failure)) {
/*
* If a write barrier failed some time ago, we
* don't know whether or not this write made it
* to the disk, so we must resync the device.
*/
reg->state = DM_RH_NOSYNC;
} else if (reg->state == DM_RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
} else if (reg->state == DM_RH_DIRTY) {
reg->state = DM_RH_CLEAN;
......
......@@ -55,6 +55,8 @@
*/
#define SNAPSHOT_DISK_VERSION 1
#define NUM_SNAPSHOT_HDR_CHUNKS 1
struct disk_header {
uint32_t magic;
......@@ -120,7 +122,22 @@ struct pstore {
/*
* The next free chunk for an exception.
*
* When creating exceptions, all the chunks here and above are
* free. It holds the next chunk to be allocated. On rare
* occasions (e.g. after a system crash) holes can be left in
* the exception store because chunks can be committed out of
* order.
*
* When merging exceptions, it does not necessarily mean all the
* chunks here and above are free. It holds the value it would
* have held if all chunks had been committed in order of
* allocation. Consequently the value may occasionally be
* slightly too low, but since it's only used for 'status' and
* it can never reach its minimum value too early this doesn't
* matter.
*/
chunk_t next_free;
/*
......@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
int metadata)
{
struct dm_io_region where = {
.bdev = ps->store->cow->bdev,
.bdev = dm_snap_cow(ps->store->snap)->bdev,
.sector = ps->store->chunk_size * chunk,
.count = ps->store->chunk_size,
};
......@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
*/
if (!ps->store->chunk_size) {
ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
bdev_logical_block_size(ps->store->cow->bdev) >> 9);
bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
bdev) >> 9);
ps->store->chunk_mask = ps->store->chunk_size - 1;
ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
chunk_size_supplied = 0;
......@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
e->new_chunk = cpu_to_le64(de->new_chunk);
}
static void clear_exception(struct pstore *ps, uint32_t index)
{
struct disk_exception *e = get_exception(ps, index);
/* clear it */
e->old_chunk = 0;
e->new_chunk = 0;
}
/*
* Registers the exceptions that are present in the current area.
* 'full' is filled in to indicate if the area has been
......@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
return (struct pstore *) store->context;
}
static void persistent_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
static void persistent_usage(struct dm_exception_store *store,
sector_t *total_sectors,
sector_t *sectors_allocated,
sector_t *metadata_sectors)
{
*numerator = get_info(store)->next_free * store->chunk_size;
*denominator = get_dev_size(store->cow->bdev);
struct pstore *ps = get_info(store);
*sectors_allocated = ps->next_free * store->chunk_size;
*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
/*
* First chunk is the fixed header.
* Then there are (ps->current_area + 1) metadata chunks, each one
* separated from the next by ps->exceptions_per_area data chunks.
*/
*metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
store->chunk_size;
}
static void persistent_dtr(struct dm_exception_store *store)
......@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
ps->current_area = 0;
zero_memory_area(ps);
r = zero_disk_area(ps, 0);
if (r) {
if (r)
DMWARN("zero_disk_area(0) failed");
return r;
}
} else {
/*
* Sanity checks.
*/
if (ps->version != SNAPSHOT_DISK_VERSION) {
DMWARN("unable to handle snapshot disk version %d",
ps->version);
return -EINVAL;
}
return r;
}
/*
* Sanity checks.
*/
if (ps->version != SNAPSHOT_DISK_VERSION) {
DMWARN("unable to handle snapshot disk version %d",
ps->version);
return -EINVAL;
}
/*
* Metadata are valid, but snapshot is invalidated
*/
if (!ps->valid)
return 1;
/*
* Metadata are valid, but snapshot is invalidated
*/
if (!ps->valid)
return 1;
/*
* Read the metadata.
*/
r = read_exceptions(ps, callback, callback_context);
if (r)
return r;
}
/*
* Read the metadata.
*/
r = read_exceptions(ps, callback, callback_context);
return 0;
return r;
}
static int persistent_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
struct dm_exception *e)
{
struct pstore *ps = get_info(store);
uint32_t stride;
chunk_t next_free;
sector_t size = get_dev_size(store->cow->bdev);
sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
/* Is there enough room ? */
if (size < ((ps->next_free + 1) * store->chunk_size))
......@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
}
static void persistent_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
......@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
ps->callback_count = 0;
}
static int persistent_prepare_merge(struct dm_exception_store *store,
chunk_t *last_old_chunk,
chunk_t *last_new_chunk)
{
struct pstore *ps = get_info(store);
struct disk_exception de;
int nr_consecutive;
int r;
/*
* When current area is empty, move back to preceding area.
*/
if (!ps->current_committed) {
/*
* Have we finished?
*/
if (!ps->current_area)
return 0;
ps->current_area--;
r = area_io(ps, READ);
if (r < 0)
return r;
ps->current_committed = ps->exceptions_per_area;
}
read_exception(ps, ps->current_committed - 1, &de);
*last_old_chunk = de.old_chunk;
*last_new_chunk = de.new_chunk;
/*
* Find number of consecutive chunks within the current area,
* working backwards.
*/
for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
nr_consecutive++) {
read_exception(ps, ps->current_committed - 1 - nr_consecutive,
&de);
if (de.old_chunk != *last_old_chunk - nr_consecutive ||
de.new_chunk != *last_new_chunk - nr_consecutive)
break;
}
return nr_consecutive;
}
static int persistent_commit_merge(struct dm_exception_store *store,
int nr_merged)
{
int r, i;
struct pstore *ps = get_info(store);
BUG_ON(nr_merged > ps->current_committed);
for (i = 0; i < nr_merged; i++)
clear_exception(ps, ps->current_committed - 1 - i);
r = area_io(ps, WRITE);
if (r < 0)
return r;
ps->current_committed -= nr_merged;
/*
* At this stage, only persistent_usage() uses ps->next_free, so
* we make no attempt to keep ps->next_free strictly accurate
* as exceptions may have been committed out-of-order originally.
* Once a snapshot has become merging, we set it to the value it
* would have held had all the exceptions been committed in order.
*
* ps->current_area does not get reduced by prepare_merge() until
* after commit_merge() has removed the nr_merged previous exceptions.
*/
ps->next_free = (area_location(ps, ps->current_area) - 1) +
(ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
return 0;
}
static void persistent_drop_snapshot(struct dm_exception_store *store)
{
struct pstore *ps = get_info(store);
......@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
ps->area = NULL;
ps->zero_area = NULL;
ps->header_area = NULL;
ps->next_free = 2; /* skipping the header and first area */
ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
ps->current_committed = 0;
ps->callback_count = 0;
......@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
DMEMIT(" %s P %llu", store->cow->name,
(unsigned long long)store->chunk_size);
DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
}
return sz;
......@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.prepare_merge = persistent_prepare_merge,
.commit_merge = persistent_commit_merge,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.usage = persistent_usage,
.status = persistent_status,
};
......@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.prepare_merge = persistent_prepare_merge,
.commit_merge = persistent_commit_merge,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.usage = persistent_usage,
.status = persistent_status,
};
......
......@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
}
static int transient_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
struct dm_exception *e)
{
struct transient_c *tc = store->context;
sector_t size = get_dev_size(store->cow->bdev);
sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
if (size < (tc->next_free + store->chunk_size))
return -1;
......@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
}
static void transient_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
......@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
callback(callback_context, 1);
}
static void transient_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
static void transient_usage(struct dm_exception_store *store,
sector_t *total_sectors,
sector_t *sectors_allocated,
sector_t *metadata_sectors)
{
*numerator = ((struct transient_c *) store->context)->next_free;
*denominator = get_dev_size(store->cow->bdev);
*sectors_allocated = ((struct transient_c *) store->context)->next_free;
*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
*metadata_sectors = 0;
}
static int transient_ctr(struct dm_exception_store *store,
......@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
DMEMIT(" %s N %llu", store->cow->name,
(unsigned long long)store->chunk_size);
DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
}
return sz;
......@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.usage = transient_usage,
.status = transient_status,
};
......@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.usage = transient_usage,
.status = transient_status,
};
......
此差异已折叠。
......@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
sprintf(buf, "%d\n", dm_suspended(md));
sprintf(buf, "%d\n", dm_suspended_md(md));
return strlen(buf);
}
......@@ -79,6 +79,13 @@ static struct sysfs_ops dm_sysfs_ops = {
.show = dm_attr_show,
};
/*
* The sysfs structure is embedded in md struct, nothing to do here
*/
static void dm_sysfs_release(struct kobject *kobj)
{
}
/*
* dm kobject is embedded in mapped_device structure
* no need to define release function here
......@@ -86,6 +93,7 @@ static struct sysfs_ops dm_sysfs_ops = {
static struct kobj_type dm_ktype = {
.sysfs_ops = &dm_sysfs_ops,
.default_attrs = dm_attrs,
.release = dm_sysfs_release
};
/*
......
......@@ -238,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
{
unsigned int i;
if (!t)
return;
while (atomic_read(&t->holders))
msleep(1);
smp_mb();
......
......@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
list_del_init(&event->elist);
/*
* Need to call dm_copy_name_and_uuid from here for now.
* Context of previous var adds and locking used for
* hash_cell not compatable.
* When a device is being removed this copy fails and we
* discard these unsent events.
*/
if (dm_copy_name_and_uuid(event->md, event->name,
event->uuid)) {
DMERR("%s: dm_copy_name_and_uuid() failed",
__func__);
DMINFO("%s: skipping sending uevent for lost device",
__func__);
goto uevent_free;
}
......
此差异已折叠。
......@@ -88,6 +88,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
int dm_split_args(int *argc, char ***argvp, char *input);
/*
* Is this mapped_device being deleted?
*/
int dm_deleting_md(struct mapped_device *md);
/*
* Is this mapped_device suspended?
*/
int dm_suspended_md(struct mapped_device *md);
/*
* The device-mapper can be driven through one of two interfaces;
* ioctl or filesystem, depending which patch you have applied.
......@@ -118,6 +128,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie);
int dm_io_init(void);
void dm_io_exit(void);
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);
......
......@@ -235,7 +235,7 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist);
const char *dm_device_name(struct mapped_device *md);
int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
struct gendisk *dm_disk(struct mapped_device *md);
int dm_suspended(struct mapped_device *md);
int dm_suspended(struct dm_target *ti);
int dm_noflush_suspending(struct dm_target *ti);
union map_info *dm_get_mapinfo(struct bio *bio);
union map_info *dm_get_rq_mapinfo(struct request *rq);
......@@ -276,7 +276,7 @@ void dm_table_unplug_all(struct dm_table *t);
/*
* Table reference counting.
*/
struct dm_table *dm_get_table(struct mapped_device *md);
struct dm_table *dm_get_live_table(struct mapped_device *md);
void dm_table_get(struct dm_table *t);
void dm_table_put(struct dm_table *t);
......@@ -295,8 +295,10 @@ void dm_table_event(struct dm_table *t);
/*
* The device must be suspended before calling this method.
* Returns the previous table, which the caller must destroy.
*/
int dm_swap_table(struct mapped_device *md, struct dm_table *t);
struct dm_table *dm_swap_table(struct mapped_device *md,
struct dm_table *t);
/*
* A wrapper around vmalloc.
......
......@@ -21,6 +21,7 @@ struct dm_dirty_log_type;
struct dm_dirty_log {
struct dm_dirty_log_type *type;
int (*flush_callback_fn)(struct dm_target *ti);
void *context;
};
......@@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type);
* type->constructor/destructor() directly.
*/
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
struct dm_target *ti,
unsigned argc, char **argv);
struct dm_target *ti,
int (*flush_callback_fn)(struct dm_target *ti),
unsigned argc, char **argv);
void dm_dirty_log_destroy(struct dm_dirty_log *log);
#endif /* __KERNEL__ */
......
/*
* Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
* Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
* Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
......@@ -266,9 +266,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
#define DM_VERSION_MINOR 15
#define DM_VERSION_MINOR 16
#define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2009-04-01)"
#define DM_VERSION_EXTRA "-ioctl (2009-11-05)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
......@@ -309,4 +309,11 @@ enum {
*/
#define DM_NOFLUSH_FLAG (1 << 11) /* In */
/*
* If set, any table information returned will relate to the inactive
* table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG
* is set before using the data returned.
*/
#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */
#endif /* _LINUX_DM_IOCTL_H */
......@@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region);
/* Delay bios on regions. */
void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
void dm_rh_mark_nosync(struct dm_region_hash *rh,
struct bio *bio, unsigned done, int error);
void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio);
/*
* Region recovery control.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册