提交 ba368991 编写于 作者: L Linus Torvalds

Merge tag 'dm-3.17-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper changes from Mike Snitzer:

 - Allow the thin target to paired with any size external origin; also
   allow thin snapshots to be larger than the external origin.

 - Add support for quickly loading a repetitive pattern into the
   dm-switch target.

 - Use per-bio data in the dm-crypt target instead of always using a
   mempool for each allocation.  Required switching to kmalloc alignment
   for the bio slab.

 - Fix DM core to properly stack the QUEUE_FLAG_NO_SG_MERGE flag

 - Fix the dm-cache and dm-thin targets' export of the minimum_io_size
   to match the data block size -- this fixes an issue where mkfs.xfs
   would improperly infer raid striping was in place on the underlying
   storage.

 - Small cleanups in dm-io, dm-mpath and dm-cache

* tag 'dm-3.17-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm table: propagate QUEUE_FLAG_NO_SG_MERGE
  dm switch: efficiently support repetitive patterns
  dm switch: factor out switch_region_table_read
  dm cache: set minimum_io_size to cache's data block size
  dm thin: set minimum_io_size to pool's data block size
  dm crypt: use per-bio data
  block: use kmalloc alignment for bio slab
  dm table: make dm_table_supports_discards static
  dm cache metadata: use dm-space-map-metadata.h defined size limits
  dm cache: fail migrations in the do_worker error path
  dm cache: simplify deferred set reference count increments
  dm thin: relax external origin size constraints
  dm thin: switch to an atomic_t for tracking pending new block preparations
  dm mpath: eliminate pg_ready() wrapper
  dm io: simplify dec_count and sync_io
......@@ -106,6 +106,11 @@ which paths.
The path number in the range 0 ... (<num_paths> - 1).
Expressed in hexadecimal (WITHOUT any prefix like 0x).
R<n>,<m>
This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
slots.
Status
======
......@@ -124,3 +129,10 @@ Create a switch device with 64kB region size:
Set mappings for the first 7 entries to point to devices switch0, switch1,
switch2, switch0, switch1, switch2, switch1:
dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
Set repetitive mapping. This command:
dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
is equivalent to:
dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
:1 :2 :1 :2 :1 :2 :1 :2 :1 :2
......@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
bslab = &bio_slabs[entry];
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
SLAB_HWCACHE_ALIGN, NULL);
if (!slab)
goto out_unlock;
......
......@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
......@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
bool may_format_device)
{
int r;
cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_METADATA_CACHE_SIZE,
CACHE_MAX_CONCURRENT_LOCKS);
if (IS_ERR(cmd->bm)) {
......
......@@ -9,19 +9,17 @@
#include "dm-cache-block-types.h"
#include "dm-cache-policy-internal.h"
#include "persistent-data/dm-space-map-metadata.h"
/*----------------------------------------------------------------*/
#define DM_CACHE_METADATA_BLOCK_SIZE 4096
#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
/* FIXME: remove this restriction */
/*
* The metadata device is currently limited in size.
*
* We have one block of index, which can hold 255 index entries. Each
* index entry contains allocation info about 16k metadata blocks.
*/
#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
/*
* A metadata device larger than 16GB triggers a warning.
......
......@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}
/*
* You must increment the deferred set whilst the prison cell is held. To
* encourage this, we ask for 'cell' to be passed in.
*/
static void inc_ds(struct cache *cache, struct bio *bio,
struct dm_bio_prison_cell *cell)
{
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
BUG_ON(!cell);
BUG_ON(pb->all_io_entry);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}
static void issue(struct cache *cache, struct bio *bio)
{
unsigned long flags;
......@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
spin_unlock_irqrestore(&cache->lock, flags);
}
static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
inc_ds(cache, bio, cell);
issue(cache, bio);
}
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
unsigned long flags;
......@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
/*
* No need to inc_ds() here, since the cell will be held for the
* duration of the io.
*/
generic_make_request(bio);
}
......@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
return;
INIT_LIST_HEAD(&work);
if (pb->all_io_entry)
dm_deferred_entry_dec(pb->all_io_entry, &work);
dm_deferred_entry_dec(pb->all_io_entry, &work);
if (!list_empty(&work))
queue_quiesced_migrations(cache, &work);
......@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
else
remap_to_cache(cache, bio, 0);
/*
* REQ_FLUSH is not directed at any particular block so we don't
* need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
* by dm-core.
*/
issue(cache, bio);
}
......@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
&cache->stats.read_miss : &cache->stats.write_miss);
}
static void issue_cache_bio(struct cache *cache, struct bio *bio,
struct per_bio_data *pb,
dm_oblock_t oblock, dm_cblock_t cblock)
{
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_cache_dirty(cache, bio, oblock, cblock);
issue(cache, bio);
}
static void process_bio(struct cache *cache, struct prealloc *structs,
struct bio *bio)
{
......@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
dm_oblock_t block = get_bio_block(cache, bio);
struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
bool passthrough = passthrough_mode(&cache->features);
bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
......@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
} else {
/* FIXME: factor out issue_origin() */
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_clear_discard(cache, bio, block);
issue(cache, bio);
inc_and_issue(cache, bio, new_ocell);
}
} else {
inc_hit_counter(cache, bio);
......@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
if (bio_data_dir(bio) == WRITE &&
writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock)) {
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
issue(cache, bio);
} else
issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
inc_and_issue(cache, bio, new_ocell);
} else {
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
inc_and_issue(cache, bio, new_ocell);
}
}
break;
case POLICY_MISS:
inc_miss_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_clear_discard(cache, bio, block);
issue(cache, bio);
inc_and_issue(cache, bio, new_ocell);
break;
case POLICY_NEW:
......@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
bio_list_init(&cache->deferred_flush_bios);
spin_unlock_irqrestore(&cache->lock, flags);
/*
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}
......@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
bio_list_init(&cache->deferred_writethrough_bios);
spin_unlock_irqrestore(&cache->lock, flags);
/*
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
}
......@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
process_migrations(cache, &cache->need_commit_migrations, migration_failure);
/*
* FIXME: rollback metadata or just go into a
......@@ -2406,16 +2433,13 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
return r;
}
static int cache_map(struct dm_target *ti, struct bio *bio)
static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
{
struct cache *cache = ti->private;
int r;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
bool discarded_block;
struct dm_bio_prison_cell *cell;
struct policy_result lookup_result;
struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
......@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
/*
* Check to see if that block is currently migrating.
*/
cell = alloc_prison_cell(cache);
if (!cell) {
*cell = alloc_prison_cell(cache);
if (!*cell) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
r = bio_detain(cache, block, bio, cell,
r = bio_detain(cache, block, bio, *cell,
(cell_free_fn) free_prison_cell,
cache, &cell);
cache, cell);
if (r) {
if (r < 0)
defer_bio(cache, bio);
......@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
bio, &lookup_result);
if (r == -EWOULDBLOCK) {
cell_defer(cache, cell, true);
cell_defer(cache, *cell, true);
return DM_MAPIO_SUBMITTED;
} else if (r) {
DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
cell_defer(cache, *cell, false);
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
}
......@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
* We need to invalidate this block, so
* defer for the worker thread.
*/
cell_defer(cache, cell, true);
cell_defer(cache, *cell, true);
r = DM_MAPIO_SUBMITTED;
} else {
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
inc_miss_counter(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
cell_defer(cache, cell, false);
}
} else {
inc_hit_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock))
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
cell_defer(cache, cell, false);
}
break;
case POLICY_MISS:
inc_miss_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
if (pb->req_nr != 0) {
/*
* This is a duplicate writethrough io that is no
* longer needed because the block has been demoted.
*/
bio_endio(bio, 0);
cell_defer(cache, cell, false);
return DM_MAPIO_SUBMITTED;
} else {
cell_defer(cache, *cell, false);
r = DM_MAPIO_SUBMITTED;
} else
remap_to_origin_clear_discard(cache, bio, block);
cell_defer(cache, cell, false);
}
break;
default:
DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
(unsigned) lookup_result.op);
cell_defer(cache, *cell, false);
bio_io_error(bio);
r = DM_MAPIO_SUBMITTED;
}
......@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return r;
}
static int cache_map(struct dm_target *ti, struct bio *bio)
{
int r;
struct dm_bio_prison_cell *cell;
struct cache *cache = ti->private;
r = __cache_map(cache, bio, &cell);
if (r == DM_MAPIO_REMAPPED) {
inc_ds(cache, bio, cell);
cell_defer(cache, cell, false);
}
return r;
}
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
......@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
cache->sectors_per_block,
......@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < cache->sectors_per_block ||
do_div(io_opt_sectors, cache->sectors_per_block)) {
blk_limits_io_min(limits, 0);
blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
}
set_discard_limits(cache, limits);
......@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
.version = {1, 4, 0},
.version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
......
......@@ -59,7 +59,7 @@ struct dm_crypt_io {
int error;
sector_t sector;
struct dm_crypt_io *base_io;
};
} CRYPTO_MINALIGN_ATTR;
struct dm_crypt_request {
struct convert_context *ctx;
......@@ -162,6 +162,8 @@ struct crypt_config {
*/
unsigned int dmreq_start;
unsigned int per_bio_data_size;
unsigned long flags;
unsigned int key_size;
unsigned int key_parts; /* independent parts in key buffer */
......@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
static void crypt_free_req(struct crypt_config *cc,
struct ablkcipher_request *req, struct bio *base_bio)
{
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
if ((struct ablkcipher_request *)(io + 1) != req)
mempool_free(req, cc->req_pool);
}
/*
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
......@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
}
}
static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
struct bio *bio, sector_t sector)
static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
struct bio *bio, sector_t sector)
{
struct dm_crypt_io *io;
io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->cc = cc;
io->base_bio = bio;
io->sector = sector;
......@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->base_io = NULL;
io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
return io;
}
static void crypt_inc_pending(struct dm_crypt_io *io)
......@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
return;
if (io->ctx.req)
mempool_free(io->ctx.req, cc->req_pool);
mempool_free(io, cc->io_pool);
crypt_free_req(cc, io->ctx.req, base_bio);
if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
mempool_free(io, cc->io_pool);
if (likely(!base_io))
bio_endio(base_bio, error);
......@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
* between fragments, so switch to a new dm_crypt_io structure.
*/
if (unlikely(!crypt_finished && remaining)) {
new_io = crypt_io_alloc(io->cc, io->base_bio,
sector);
new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
crypt_io_init(new_io, io->cc, io->base_bio, sector);
crypt_inc_pending(new_io);
crypt_convert_init(cc, &new_io->ctx, NULL,
io->base_bio, sector);
......@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (error < 0)
io->error = -EIO;
mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
if (!atomic_dec_and_test(&ctx->cc_pending))
return;
......@@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
cc->per_bio_data_size = ti->per_bio_data_size =
sizeof(struct dm_crypt_io) + cc->dmreq_start +
sizeof(struct dm_crypt_request) + cc->iv_size;
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
......@@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
io = dm_per_bio_data(bio, cc->per_bio_data_size);
crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
io->ctx.req = (struct ablkcipher_request *)(io + 1);
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
......
......@@ -33,7 +33,6 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
......@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
* We need an io object to keep track of the number of bios that
* have been dispatched for a particular io.
*---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
static void complete_io(struct io *io)
{
if (error)
set_bit(region, &io->error_bits);
unsigned long error_bits = io->error_bits;
io_notify_fn fn = io->callback;
void *context = io->context;
if (atomic_dec_and_test(&io->count)) {
if (io->vma_invalidate_size)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
if (io->vma_invalidate_size)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
if (io->wait)
complete(io->wait);
mempool_free(io, io->client->pool);
fn(error_bits, context);
}
else {
unsigned long r = io->error_bits;
io_notify_fn fn = io->callback;
void *context = io->context;
static void dec_count(struct io *io, unsigned int region, int error)
{
if (error)
set_bit(region, &io->error_bits);
mempool_free(io, io->client->pool);
fn(r, context);
}
}
if (atomic_dec_and_test(&io->count))
complete_io(io);
}
static void endio(struct bio *bio, int error)
......@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
dec_count(io, 0, 0);
}
struct sync_io {
unsigned long error_bits;
struct completion wait;
};
static void sync_io_complete(unsigned long error, void *context)
{
struct sync_io *sio = context;
sio->error_bits = error;
complete(&sio->wait);
}
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, int rw, struct dpages *dp,
unsigned long *error_bits)
{
/*
* gcc <= 4.3 can't do the alignment for stack variables, so we must
* align it on our own.
* volatile prevents the optimizer from removing or reusing
* "io_" field from the stack frame (allowed in ANSI C).
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
DECLARE_COMPLETION_ONSTACK(wait);
struct io *io;
struct sync_io sio;
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
return -EIO;
}
init_completion(&sio.wait);
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->wait = &wait;
io->client = client;
io->callback = sync_io_complete;
io->context = &sio;
io->vma_invalidate_address = dp->vma_invalidate_address;
io->vma_invalidate_size = dp->vma_invalidate_size;
dispatch_io(rw, num_regions, where, dp, io, 1);
wait_for_completion_io(&wait);
wait_for_completion_io(&sio.wait);
if (error_bits)
*error_bits = io->error_bits;
*error_bits = sio.error_bits;
return io->error_bits ? -EIO : 0;
return sio.error_bits ? -EIO : 0;
}
static int async_io(struct dm_io_client *client, unsigned int num_regions,
......@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
......@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
* New collapsed (a)synchronous interface.
*
* If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
* the queue with blk_unplug() some time later or set REQ_SYNC in
io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
* the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
* the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
* If you fail to do one of these, the IO will be submitted to the disk after
* q->unplug_delay, which defaults to 3ms in blk-settings.c.
*/
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
struct dm_io_region *where, unsigned long *sync_error_bits)
......
......@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
dm_noflush_suspending(m->ti)));
}
#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
/*
* Map cloned requests
*/
......@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
if (!__must_push_back(m))
r = -EIO; /* Failed */
goto out_unlock;
}
if (!pg_ready(m)) {
} else if (m->queue_io || m->pg_init_required) {
__pg_init_all_paths(m);
goto out_unlock;
}
if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
goto out_unlock;
......
......@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
*bit *= sctx->region_table_entry_bits;
}
static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
unsigned long region_index;
unsigned bit;
switch_get_position(sctx, region_nr, &region_index, &bit);
return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
((1 << sctx->region_table_entry_bits) - 1);
}
/*
* Find which path to use at given offset.
*/
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
unsigned long region_index;
unsigned bit, path_nr;
unsigned path_nr;
sector_t p;
p = offset;
......@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
else
sector_div(p, sctx->region_size);
switch_get_position(sctx, p, &region_index, &bit);
path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
((1 << sctx->region_table_entry_bits) - 1);
path_nr = switch_region_table_read(sctx, p);
/* This can only happen if the processor uses non-atomic stores. */
if (unlikely(path_nr >= sctx->nr_paths))
......@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
}
static int process_set_region_mappings(struct switch_ctx *sctx,
unsigned argc, char **argv)
unsigned argc, char **argv)
{
unsigned i;
unsigned long region_index = 0;
......@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
unsigned long path_nr;
const char *string = argv[i];
if ((*string & 0xdf) == 'R') {
unsigned long cycle_length, num_write;
string++;
if (unlikely(*string == ',')) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
cycle_length = parse_hex(&string);
if (unlikely(*string != ',')) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
string++;
if (unlikely(!*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
num_write = parse_hex(&string);
if (unlikely(*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
cycle_length - 1, region_index);
return -EINVAL;
}
if (unlikely(region_index + num_write < region_index) ||
unlikely(region_index + num_write >= sctx->nr_regions)) {
DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
region_index, num_write, sctx->nr_regions);
return -EINVAL;
}
while (num_write--) {
region_index++;
path_nr = switch_region_table_read(sctx, region_index - cycle_length);
switch_region_table_write(sctx, region_index, path_nr);
}
continue;
}
if (*string == ':')
region_index++;
else {
......@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
static struct target_type switch_target = {
.name = "switch",
.version = {1, 0, 0},
.version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = switch_ctr,
.dtr = switch_dtr,
......
......@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q);
}
static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
......@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
return true;
}
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
return q && blk_queue_discard(q);
}
static bool dm_table_supports_discards(struct dm_table *t)
{
struct dm_target *ti;
unsigned i = 0;
/*
* Unless any target used by the table set discards_supported,
* require at least one underlying device to support discards.
* t->devices includes internal dm devices such as mirror logs
* so we need to use iterate_devices here, which targets
* supporting discard selectively must provide.
*/
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
if (!ti->num_discard_bios)
continue;
if (ti->discards_supported)
return 1;
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti, device_discard_capable, NULL))
return 1;
}
return 0;
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
......@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
dm_table_set_integrity(t);
/*
......@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
return q && blk_queue_discard(q);
}
bool dm_table_supports_discards(struct dm_table *t)
{
struct dm_target *ti;
unsigned i = 0;
/*
* Unless any target used by the table set discards_supported,
* require at least one underlying device to support discards.
* t->devices includes internal dm devices such as mirror logs
* so we need to use iterate_devices here, which targets
* supporting discard selectively must provide.
*/
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
if (!ti->num_discard_bios)
continue;
if (ti->discards_supported)
return 1;
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti, device_discard_capable, NULL))
return 1;
}
return 0;
}
......@@ -227,6 +227,7 @@ struct thin_c {
struct list_head list;
struct dm_dev *pool_dev;
struct dm_dev *origin_dev;
sector_t origin_size;
dm_thin_id dev_id;
struct pool *pool;
......@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
struct dm_thin_new_mapping {
struct list_head list;
bool quiesced:1;
bool prepared:1;
bool pass_discard:1;
bool definitely_not_shared:1;
/*
* Track quiescing, copying and zeroing preparation actions. When this
* counter hits zero the block is prepared and can be inserted into the
* btree.
*/
atomic_t prepare_actions;
int err;
struct thin_c *tc;
dm_block_t virt_block;
......@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
bio_end_io_t *saved_bi_end_io;
};
static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
struct pool *pool = m->tc->pool;
if (m->quiesced && m->prepared) {
if (atomic_dec_and_test(&m->prepare_actions)) {
list_add_tail(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
}
static void copy_complete(int read_err, unsigned long write_err, void *context)
static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
unsigned long flags;
struct dm_thin_new_mapping *m = context;
struct pool *pool = m->tc->pool;
m->err = read_err || write_err ? -EIO : 0;
spin_lock_irqsave(&pool->lock, flags);
m->prepared = true;
__maybe_add_mapping(m);
__complete_mapping_preparation(m);
spin_unlock_irqrestore(&pool->lock, flags);
}
static void copy_complete(int read_err, unsigned long write_err, void *context)
{
struct dm_thin_new_mapping *m = context;
m->err = read_err || write_err ? -EIO : 0;
complete_mapping_preparation(m);
}
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
spin_lock_irqsave(&pool->lock, flags);
m->prepared = true;
__maybe_add_mapping(m);
spin_unlock_irqrestore(&pool->lock, flags);
complete_mapping_preparation(m);
}
/*----------------------------------------------------------------*/
......@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
return m;
}
static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
sector_t begin, sector_t end)
{
int r;
struct dm_io_region to;
to.bdev = tc->pool_dev->bdev;
to.sector = begin;
to.count = end - begin;
r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
if (r < 0) {
DMERR_LIMIT("dm_kcopyd_zero() failed");
copy_complete(1, 1, m);
}
}
/*
* A partial copy also needs to zero the uncopied region.
*/
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_dev *origin, dm_block_t data_origin,
dm_block_t data_dest,
struct dm_bio_prison_cell *cell, struct bio *bio)
struct dm_bio_prison_cell *cell, struct bio *bio,
sector_t len)
{
int r;
struct pool *pool = tc->pool;
......@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->data_block = data_dest;
m->cell = cell;
/*
* quiesce action + copy action + an extra reference held for the
* duration of this function (we may need to inc later for a
* partial zero).
*/
atomic_set(&m->prepare_actions, 3);
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
m->quiesced = true;
complete_mapping_preparation(m); /* already quiesced */
/*
* IO to pool_dev remaps to the pool target's data_dev.
......@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
from.count = len;
to.bdev = tc->pool_dev->bdev;
to.sector = data_dest * pool->sectors_per_block;
to.count = pool->sectors_per_block;
to.count = len;
r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
0, copy_complete, m);
if (r < 0) {
mempool_free(m, pool->mapping_pool);
DMERR_LIMIT("dm_kcopyd_copy() failed");
cell_error(pool, cell);
copy_complete(1, 1, m);
/*
* We allow the zero to be issued, to simplify the
* error path. Otherwise we'd need to start
* worrying about decrementing the prepare_actions
* counter.
*/
}
/*
* Do we need to zero a tail region?
*/
if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
atomic_inc(&m->prepare_actions);
ll_zero(tc, m,
data_dest * pool->sectors_per_block + len,
(data_dest + 1) * pool->sectors_per_block);
}
}
complete_mapping_preparation(m); /* drop our ref */
}
static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
......@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_bio_prison_cell *cell, struct bio *bio)
{
schedule_copy(tc, virt_block, tc->pool_dev,
data_origin, data_dest, cell, bio);
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_dest,
struct dm_bio_prison_cell *cell, struct bio *bio)
{
schedule_copy(tc, virt_block, tc->origin_dev,
virt_block, data_dest, cell, bio);
data_origin, data_dest, cell, bio,
tc->pool->sectors_per_block);
}
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
......@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
m->quiesced = true;
m->prepared = false;
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_block;
......@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
remap_and_issue(tc, bio, data_block);
} else {
int r;
struct dm_io_region to;
to.bdev = tc->pool_dev->bdev;
to.sector = data_block * pool->sectors_per_block;
to.count = pool->sectors_per_block;
} else
ll_zero(tc, m,
data_block * pool->sectors_per_block,
(data_block + 1) * pool->sectors_per_block);
}
r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
if (r < 0) {
mempool_free(m, pool->mapping_pool);
DMERR_LIMIT("dm_kcopyd_zero() failed");
cell_error(pool, cell);
}
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_dest,
struct dm_bio_prison_cell *cell, struct bio *bio)
{
struct pool *pool = tc->pool;
sector_t virt_block_begin = virt_block * pool->sectors_per_block;
sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
if (virt_block_end <= tc->origin_size)
schedule_copy(tc, virt_block, tc->origin_dev,
virt_block, data_dest, cell, bio,
pool->sectors_per_block);
else if (virt_block_begin < tc->origin_size)
schedule_copy(tc, virt_block, tc->origin_dev,
virt_block, data_dest, cell, bio,
tc->origin_size - virt_block_begin);
else
schedule_zero(tc, virt_block, data_dest, cell, bio);
}
/*
......@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_to_origin_and_issue(tc, bio);
if (bio_end_sector(bio) <= tc->origin_size)
remap_to_origin_and_issue(tc, bio);
else if (bio->bi_iter.bi_sector < tc->origin_size) {
zero_fill_bio(bio);
bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
remap_to_origin_and_issue(tc, bio);
} else {
zero_fill_bio(bio);
bio_endio(bio, 0);
}
} else
provision_block(tc, bio, block, cell);
break;
......@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < pool->sectors_per_block ||
do_div(io_opt_sectors, pool->sectors_per_block)) {
blk_limits_io_min(limits, 0);
blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}
......@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
.version = {1, 12, 0},
.version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
......@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &work, list) {
list_del(&m->list);
m->quiesced = true;
__maybe_add_mapping(m);
__complete_mapping_preparation(m);
}
spin_unlock_irqrestore(&pool->lock, flags);
}
......@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
noflush_work(tc, do_noflush_stop);
}
static int thin_preresume(struct dm_target *ti)
{
struct thin_c *tc = ti->private;
if (tc->origin_dev)
tc->origin_size = get_dev_size(tc->origin_dev->bdev);
return 0;
}
/*
* <nr mapped sectors> <highest mapped sector>
*/
......@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
.version = {1, 12, 0},
.version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
.end_io = thin_endio,
.preresume = thin_preresume,
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
......
......@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
bool dm_table_supports_discards(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册