dm.c 38.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
M
Milan Broz 已提交
3
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
L
Linus Torvalds 已提交
4 5 6 7 8
 *
 * This file is released under the GPL.
 */

#include "dm.h"
M
Mike Anderson 已提交
9
#include "dm-uevent.h"
L
Linus Torvalds 已提交
10 11 12

#include <linux/init.h>
#include <linux/module.h>
A
Arjan van de Ven 已提交
13
#include <linux/mutex.h>
L
Linus Torvalds 已提交
14 15 16 17 18 19 20
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
D
Darrick J. Wong 已提交
21
#include <linux/hdreg.h>
22 23

#include <trace/events/block.h>
L
Linus Torvalds 已提交
24

25 26
#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

/* Major number requested at load time; 0 means "dynamically allocate". */
static unsigned int major = 0;
/* Major number actually registered (set in local_init()). */
static unsigned int _major = 0;

/* Protects the minor-number IDR and the open/deleting state checks. */
static DEFINE_SPINLOCK(_minor_lock);
L
Linus Torvalds 已提交
40
/*
 * For bio-based dm.
 * One of these is allocated per original bio submitted to the device.
 */
struct dm_io {
	struct mapped_device *md;	/* device this io belongs to */
	int error;			/* consolidated error for the whole bio */
	atomic_t io_count;		/* outstanding clone completions + 1 */
	struct bio *bio;		/* the original, unsplit bio */
	unsigned long start_time;	/* jiffies when accounting started */
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;	/* parent per-bio state */
	struct dm_target *ti;	/* target this clone was mapped to */
	union map_info info;	/* per-target private mapping info */
};

K
Kiyoshi Ueda 已提交
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;	/* original request and its embedded clone */
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;	/* bio in the original request */
	struct request *rq;	/* clone request carrying this bio */
};

L
Linus Torvalds 已提交
84 85
union map_info *dm_get_mapinfo(struct bio *bio)
{
A
Alasdair G Kergon 已提交
86
	if (bio && bio->bi_private)
A
Alasdair G Kergon 已提交
87
		return &((struct dm_target_io *)bio->bi_private)->info;
A
Alasdair G Kergon 已提交
88
	return NULL;
L
Linus Torvalds 已提交
89 90
}

91 92
/* Placeholder pointer stored in the minor IDR while allocation completes. */
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0	/* fail/queue new io for suspend */
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3			/* md is being torn down */
#define DMF_DELETING 4			/* deletion requested; refuse opens */
#define DMF_NOFLUSH_SUSPENDING 5	/* noflush suspend in progress */
#define DMF_QUEUE_IO_TO_THREAD 6	/* route all io through md->wq worker */
L
Linus Torvalds 已提交
103

104 105 106
/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;	/* read: bio submission; write: deferring io */
	struct mutex suspend_lock;	/* serialises suspend/resume */
	rwlock_t map_lock;		/* protects ->map */
	atomic_t holders;		/* dm_get()/dm_put() reference count */
	atomic_t open_count;		/* block-device open count */

	unsigned long flags;		/* DMF_* bits */

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;		/* in-flight io count (see *_io_acct) */
	wait_queue_head_t wait;		/* woken when pending drops to zero */
	struct work_struct work;
	struct bio_list deferred;	/* bios queued for the worker thread */
	spinlock_t deferred_lock;	/* protects ->deferred */

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;		/* bio clones are allocated from here */

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};

/* Minimum number of io/tio objects reserved in the mempools (used at
 * pool-creation time — presumably in alloc_dev(); not visible here). */
#define MIN_IOS 256

/* Slab caches backing the per-bio mempools (created in local_init()). */
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
/* Slab caches for request-based dm objects. */
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;
L
Linus Torvalds 已提交
184 185 186

/*
 * Module-local initialisation: create the four slab caches, register
 * for uevents and grab a block major number.  Unwinds in strict
 * reverse order on failure (goto-cleanup chain).
 *
 * Returns 0 on success or a negative errno.
 */
static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	/* register_blkdev() returns the allocated major when asked for 0 */
	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

/*
 * Tear down everything local_init() set up, in reverse order of
 * creation.
 */
static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

249
/*
 * Sub-module init/exit tables.  The two arrays must stay in the same
 * order: dm_init() unwinds a partial init by calling _exits[] entries
 * for every _inits[] entry that already succeeded.
 */
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

/*
 * Module entry point: run every sub-module initialiser in order.
 * On failure, call the matching exit routines for the ones that
 * already ran and propagate the error.
 */
static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

      bad:
	while (i--)
		_exits[i]();

	return r;
}

/*
 * Module exit point: run every sub-module exit routine in reverse
 * registration order.
 */
static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */

/*
 * Open the mapped device.  Fails with -ENXIO if the md is gone or is
 * being freed/deleted.  _minor_lock guards against racing with
 * deletion; on success a holder reference and the open count are
 * taken.
 */
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	/* refuse opens once teardown or deletion has begun */
	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

A
Al Viro 已提交
324
/*
 * Release the device: drop the open count and the holder reference
 * taken in dm_blk_open().
 */
static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;
	atomic_dec(&md->open_count);
	dm_put(md);
	return 0;
}

332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
/* Current number of opens on this mapped device. */
int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 * Sets DMF_DELETING under _minor_lock so dm_blk_open() cannot race a
 * new open in; returns -EBUSY if the device is currently open.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

D
Darrick J. Wong 已提交
356 357 358 359 360 361 362
/*
 * HDIO_GETGEO: report the (possibly forced) geometry of the mapped
 * device backing @bdev.
 */
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	return dm_get_geometry(bdev->bd_disk->private_data, geo);
}

A
Al Viro 已提交
363
/*
 * Pass an ioctl through to the (single) target of the current table.
 * Returns -ENOTTY when there is no usable table or more than one
 * target, -EAGAIN while suspended, otherwise whatever the target's
 * ioctl hook returns.
 */
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	/* dm_table_put(NULL) must be tolerated by the table code; map may
	 * be NULL here when dm_get_table() found no table. */
	dm_table_put(map);

	return r;
}

A
Alasdair G Kergon 已提交
394
/* Allocate a per-bio dm_io from the device's mempool (never fails;
 * GFP_NOIO may block until an object is returned to the pool). */
static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

/* Return a dm_io to its mempool. */
static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

/* Return a per-target dm_target_io to its mempool. */
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

409 410 411
/*
 * Begin disk-statistics accounting for @io: stamp the start time,
 * update the part0 round statistics and bump the in-flight count.
 */
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
}

422
/*
 * Finish disk-statistics accounting for @io: record the duration,
 * drop the in-flight count, and wake any suspend waiter once the
 * device goes idle.
 */
static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight = pending =
		atomic_dec_return(&md->pending);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

L
Linus Torvalds 已提交
447 448 449
/*
 * Add the bio to the list of deferred io.
 *
 * Taken with io_lock held for write so submission (which holds it for
 * read) is excluded while the bio is queued; the work item is only
 * scheduled the first time DMF_QUEUE_IO_TO_THREAD transitions to set.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 *
 * Returns the current table with an extra reference, or NULL if no
 * table is loaded.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}

D
Darrick J. Wong 已提交
482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 * Rejects a start sector that lies beyond the device size implied by
 * cylinders * heads * sectors.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

L
Linus Torvalds 已提交
509 510 511 512 513 514 515 516 517
/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

518 519 520 521 522
/* Non-zero while a noflush suspend is in progress on @md. */
static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

L
Linus Torvalds 已提交
523 524 525 526
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 *
 * DM_ENDIO_REQUEUE pushes the bio back onto the deferred list during
 * a noflush suspend; barrier errors are recorded in md->barrier_error
 * instead of being reported on the bio.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (error && !(io->error > 0 && __noflush_suspending(md)))
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!bio_barrier(io->bio))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio_barrier(bio)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
		} else {
			end_io_acct(io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}

		free_io(md, io);
	}
}

580
/*
 * Completion handler for every cloned bio.  Gives the target's
 * end_io hook a chance to translate or retry the error, then frees
 * the clone and drops one reference on the parent dm_io.
 */
static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Maximum number of sectors that may be issued at @sector without
 * crossing the end of target @ti or (if the target sets split_io)
 * a split_io boundary.  split_io is assumed to be a power of two
 * (the boundary is computed with mask arithmetic).
 */
static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

/*
 * Hand a cloned bio to target @ti.  Takes an io_count reference that
 * is dropped either by clone_endio() (normal completion) or right
 * here on error/requeue.
 */
static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;	/* saved: map() may change it */
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

/* Cursor state threaded through __clone_and_map() while an original
 * bio is carved into per-target clones. */
struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;		/* table the bio is mapped against */
	struct bio *bio;		/* the original bio */
	struct dm_io *io;		/* per-bio completion tracking */
	sector_t sector;		/* next sector to map */
	sector_t sector_count;		/* sectors still to map */
	unsigned short idx;		/* current bvec index in bio */
};

P
Peter Osterlund 已提交
689 690
/* Destructor for cloned bios: bi_private holds the owning bio_set
 * (stored there before the clone is freed). */
static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

L
Linus Torvalds 已提交
696 697 698 699 700
/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	/* barrier handling is done on the original bio, not the clones */
	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	/* barrier handling is done on the original bio, not the clones */
	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);

		/* trim only when the clone covers less than the original */
		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}

758 759
/* Allocate and initialise a per-target tio for the current clone
 * operation (GFP_NOIO: may block, never fails). */
static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti)
{
	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	return tio;
}

/* Send one numbered flush request (an empty clone of the barrier bio)
 * to target @ti. */
static void __flush_target(struct clone_info *ci, struct dm_target *ti,
			  unsigned flush_nr)
{
	struct dm_target_io *tio = alloc_tio(ci, ti);
	struct bio *clone;

	tio->info.flush_request = flush_nr;

	/* zero payload: the clone carries only the barrier semantics */
	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	__bio_clone(clone, ci->bio);
	clone->bi_destructor = dm_bio_destructor;

	__map_bio(ti, clone, tio);
}

/*
 * An empty barrier is sent to every target, num_flush_requests times
 * each.  Always succeeds; clears sector_count so the caller's clone
 * loop terminates.
 */
static int __clone_and_map_empty_barrier(struct clone_info *ci)
{
	unsigned target_nr = 0, flush_nr;
	struct dm_target *ti;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
		     flush_nr++)
			__flush_target(ci, ti, flush_nr);

	ci->sector_count = 0;

	return 0;
}

800
/*
 * Map as much of ci->bio as possible in a single pass, advancing the
 * clone_info cursor.  Three cases, checked in order:
 *   1. the remainder fits in the current target      -> one clone_bio()
 *   2. some whole bvecs fit                          -> clone_bio() of those
 *   3. a single bvec spans targets                   -> split_bvec() pieces
 * Returns 0 on progress, -EIO if the sector maps to no valid target.
 */
static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	if (unlikely(bio_empty_barrier(bio)))
		return __clone_and_map_empty_barrier(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				/* moved past the first target: look up the
				 * next one and start a fresh tio for it */
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci, ti);
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}

/*
 * Split the bio into several clones and submit it to targets.
 *
 * With no table loaded, ordinary bios are errored immediately while a
 * barrier's failure is recorded in md->barrier_error (it is completed
 * elsewhere).  The dm_io starts with io_count == 1; the final
 * dec_pending() drops that extra reference once all clones are issued.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_table(md);
	if (unlikely(!ci.map)) {
		if (!bio_barrier(bio))
			bio_io_error(bio);
		else
			if (!md->barrier_error)
				md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	/* an empty barrier carries no sectors but must still be processed */
	if (unlikely(bio_empty_barrier(bio)))
		ci.sector_count = 1;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

M
Milan Broz 已提交
940 941 942 943 944 945 946 947
/*
 * merge_bvec_fn for the mapped device: report how many bytes may be
 * added to a bio at bvm->bi_sector without forcing a later split,
 * consulting the target's own merge hook when it has one.
 */
static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking at
	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
	 * entries.  So always set max_size to 0, and the code below allows
	 * just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)

		max_size = 0;

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

L
Linus Torvalds 已提交
997 998 999 1000
/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 *
 * io_lock is held for read across submission; if io must be deferred
 * the read lock is dropped first because queue_io() takes it for
 * write.  Readahead is simply errored while blocked for suspend.
 */
static int dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio_barrier(bio))) {
		up_read(&md->io_lock);

		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

1038
static void dm_unplug_all(struct request_queue *q)
L
Linus Torvalds 已提交
1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
1051 1052 1053
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;
L
Linus Torvalds 已提交
1054

1055
	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1056 1057 1058 1059 1060 1061 1062
		map = dm_get_table(md);
		if (map) {
			r = dm_table_any_congested(map, bdi_bits);
			dm_table_put(map);
		}
	}

L
Linus Torvalds 已提交
1063 1064 1065 1066 1067 1068 1069 1070
	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

1071
/* Return a minor number to the allocator. */
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
1081
/*
 * See if the device with a specific minor # is free.
 *
 * Reserves @minor in the IDR with the MINOR_ALLOCED placeholder
 * (the real md pointer is installed later by alloc_dev()).
 * Returns 0 on success, -EINVAL for an out-of-range minor,
 * -EBUSY if the minor is taken, -ENOMEM on allocation failure.
 */
static int specific_minor(int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	/* Pre-allocate IDR memory outside the spinlock (may sleep). */
	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	/*
	 * idr_get_new_above() may hand back a higher id than requested;
	 * that means @minor itself was raced away - undo and fail.
	 */
	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

1114
/*
 * Allocate the next free minor number, reserving it in the IDR with
 * the MINOR_ALLOCED placeholder.  On success *minor holds the id.
 * Returns 0, -ENOSPC when the minor space is exhausted, or -ENOMEM.
 */
static int next_free_minor(int *minor)
{
	int r, m;

	/* Pre-allocate IDR memory outside the spinlock (may sleep). */
	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	/* The IDR can allocate ids beyond the valid minor range. */
	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static struct block_device_operations dm_blk_dops;

1143 1144
static void dm_wq_work(struct work_struct *work);

L
Linus Torvalds 已提交
1145 1146 1147
/*
 * Allocate and initialise a blank device with a given minor.
 */
1148
/*
 * Allocate and initialise a blank device with a given minor.
 *
 * Builds the mapped_device step by step (minor, locks, request queue,
 * mempools, bioset, gendisk, workqueue, bdev) and unwinds in strict
 * reverse order through the bad_* labels on any failure.
 * Returns the new md, or NULL on failure.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	/* Pin the module while any device exists. */
	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	init_rwsem(&md->io_lock);
	mutex_init(&md->suspend_lock);
	spin_lock_init(&md->deferred_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad_queue;

	/* bio-based queue: we supply the make_request function. */
	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);

	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
	if (!md->io_pool)
		goto bad_io_pool;

	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
	if (!md->tio_pool)
		goto bad_tio_pool;

	md->bs = bioset_create(16, 0);
	if (!md->bs)
		goto bad_no_bioset;

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	/* The minor must still hold the placeholder we reserved. */
	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

	/* Unwind in exact reverse order of construction. */
bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	put_disk(md->disk);
bad_disk:
	bioset_free(md->bs);
bad_no_bioset:
	mempool_destroy(md->tio_pool);
bad_tio_pool:
	mempool_destroy(md->io_pool);
bad_io_pool:
	blk_cleanup_queue(md->queue);
bad_queue:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

J
Jun'ichi Nomura 已提交
1262 1263
static void unlock_fs(struct mapped_device *md);

L
Linus Torvalds 已提交
1264 1265
/*
 * Tear down a mapped_device - the mirror image of alloc_dev().
 * Caller must guarantee no more I/O or lookups reference md
 * (dm_put() arranges this via DMF_FREEING before calling us).
 */
static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	/*
	 * Clear the backlink before the disk is released, so a racing
	 * open cannot find a stale md through disk->private_data.
	 */
	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}

/*
 * Bind a table to the device.
 */
/*
 * Table event callback: deliver queued uevents and wake anyone
 * sleeping in dm_wait_event().
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	/* Detach the pending uevent list under the lock, send outside it. */
	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

1307
/*
 * Update both the gendisk capacity and the block device inode size
 * (in bytes) to @size sectors.
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->bdev->bd_inode->i_mutex);
	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->bdev->bd_inode->i_mutex);
}

1316 1317
/*
 * Bind table @t to the device and apply @limits to the queue.
 * A zero-sized table is treated as "no table": it is destroyed and
 * the device is left unbound.  Takes ownership of @t.
 * Caller holds md->suspend_lock (per the swap/resume call sites).
 */
static int __bind(struct mapped_device *md, struct dm_table *t,
		  struct queue_limits *limits)
{
	struct request_queue *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	if (!size) {
		dm_table_destroy(t);
		return 0;
	}

	dm_table_event_callback(t, event_callback, md);

	/* Publish the new map under the map_lock writer side. */
	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q, limits);
	write_unlock(&md->map_lock);

	return 0;
}

/*
 * Detach and destroy the device's current table, if any.
 * The event callback is cleared first so the dying table cannot
 * fire events during teardown.
 */
static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_destroy(map);
}

/*
 * Constructor for a new device.
 */
1364
int dm_create(int minor, struct mapped_device **result)
L
Linus Torvalds 已提交
1365 1366 1367
{
	struct mapped_device *md;

1368
	md = alloc_dev(minor);
L
Linus Torvalds 已提交
1369 1370 1371
	if (!md)
		return -ENXIO;

M
Milan Broz 已提交
1372 1373
	dm_sysfs_init(md);

L
Linus Torvalds 已提交
1374 1375 1376 1377
	*result = md;
	return 0;
}

1378
/*
 * Look up a mapped_device by dev_t.  Returns NULL for foreign majors,
 * out-of-range minors, the MINOR_ALLOCED placeholder, a stale IDR
 * entry whose disk minor no longer matches, or a device that is
 * being freed.  No reference is taken - see dm_get_md() for that.
 */
static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md &&
	    (md == MINOR_ALLOCED ||
	     MINOR(disk_devt(dm_disk(md))) != minor ||
	     test_bit(DMF_FREEING, &md->flags)))
		md = NULL;

	spin_unlock(&_minor_lock);

	return md;
}

1402 1403 1404 1405 1406 1407 1408 1409 1410 1411
/*
 * Like dm_find_md() but takes a holder reference on success.
 * The caller must balance with dm_put().
 */
struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;

	md = dm_find_md(dev);
	if (md)
		dm_get(md);

	return md;
}

A
Alasdair G Kergon 已提交
1412
/* Fetch the opaque per-interface pointer set by dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

/* Stash an opaque pointer for the ioctl interface layer. */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

/* Take a holder reference; paired with dm_put(). */
void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

1427 1428 1429 1430 1431 1432
/* Return the "major:minor"-formatted name set at allocation time. */
const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

L
Linus Torvalds 已提交
1433 1434
/*
 * Drop a holder reference.  The final dropper tears the device down:
 * the IDR slot reverts to the MINOR_ALLOCED placeholder and
 * DMF_FREEING is set atomically with the ref hitting zero (via
 * atomic_dec_and_lock on _minor_lock), so no new lookup can resurrect
 * the device.  A live (non-suspended) table gets the pre/post-suspend
 * target hooks before everything is freed.
 */
void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED,
			    MINOR(disk_devt(dm_disk(md))));
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		dm_sysfs_exit(md);
		dm_table_put(map);
		__unbind(md);
		free_dev(md);
	}
}
EXPORT_SYMBOL_GPL(dm_put);
L
Linus Torvalds 已提交
1456

1457
/*
 * Sleep until all in-flight I/O (md->pending) has drained.
 * @interruptible is the task state to sleep in (TASK_INTERRUPTIBLE or
 * TASK_UNINTERRUPTIBLE); in the interruptible case a pending signal
 * aborts the wait with -EINTR.  Returns 0 when fully quiesced.
 */
static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	/* Kick any plugged I/O so the pending count can reach zero. */
	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		/* Pair with the wakeup side before sampling the counter. */
		smp_mb();
		if (!atomic_read(&md->pending))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

M
Mikulas Patocka 已提交
1488
/*
 * Drain all outstanding I/O, issue an empty WRITE_BARRIER bio through
 * the normal mapping path, then drain again so the barrier itself has
 * completed before we return.
 */
static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

/*
 * Handle a barrier bio from the deferred queue: flush, process the
 * payload (if any) between flushes, then complete the bio with the
 * accumulated barrier_error.  DM_ENDIO_REQUEUE pushes the bio back to
 * the head of the deferred list to be retried instead of completed.
 */
static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

L
Linus Torvalds 已提交
1520 1521 1522
/*
 * Process the deferred bios
 */
1523
/*
 * Workqueue handler: process the deferred bios.
 *
 * io_lock is held (write) while manipulating the queue state but
 * dropped around each bio so submission paths are not blocked for the
 * duration of the I/O.  DMF_BLOCK_IO_FOR_SUSPEND stops processing;
 * an empty queue clears DMF_QUEUE_IO_TO_THREAD so dm_request resumes
 * submitting directly.
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		/* Drop the lock while actually mapping the bio. */
		up_write(&md->io_lock);

		if (bio_barrier(c))
			process_barrier(md, c);
		else
			__split_and_process_bio(md, c);

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

1554
/*
 * Re-enable deferred-bio processing and schedule the worker.
 * The memory barrier after clearing DMF_BLOCK_IO_FOR_SUSPEND makes
 * the cleared bit visible before dm_wq_work can run and test it.
 */
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

L
Linus Torvalds 已提交
1561 1562 1563 1564 1565
/*
 * Swap in a new table (destroying old one).
 */
/*
 * Swap in a new table (destroying old one).
 * The device must already be suspended; queue limits are computed
 * from the new table before the old one is unbound.
 * Returns 0 or a negative errno.
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct queue_limits limits;
	int r = -EINVAL;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r)
		goto out;

	__unbind(md);
	r = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
1591
/*
 * Freeze any filesystem mounted on the device (flushes its I/O) and
 * remember the frozen superblock for unlock_fs().  Sets DMF_FROZEN on
 * success; returns 0 or the freeze_bdev() error.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

1609
/*
 * Undo lock_fs(): thaw the filesystem if (and only if) we froze it.
 * Safe to call unconditionally - a no-op unless DMF_FROZEN is set.
 */
static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
1626
/*
 * Suspend the device: quiesce all submission paths, drain in-flight
 * I/O and mark the device DMF_SUSPENDED so the table can be swapped.
 *
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG freezes a mounted fs first;
 *                 DM_SUSPEND_NOFLUSH_FLAG defers in-flight I/O instead
 *                 of flushing it (and supersedes lockfs).
 * Returns 0, -EINVAL if already suspended, or -EINTR if the drain was
 * interrupted by a signal.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device. noflush supersedes do_lockfs,
	 * because lock_fs() needs to flush I/Os.
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

/*
 * Resume a suspended device: resume the targets, restart deferred-bio
 * processing, thaw the filesystem (if lock_fs froze it) and clear
 * DMF_SUSPENDED.  Fails with -EINVAL if the device is not suspended
 * or has no (non-empty) table bound.
 */
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
M
Milan Broz 已提交
1758 1759
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		       unsigned cookie)
1760
{
M
Milan Broz 已提交
1761 1762 1763 1764 1765 1766 1767 1768 1769 1770
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
	}
1771 1772
}

M
Mike Anderson 已提交
1773 1774 1775 1776 1777
/* Atomically allocate the next uevent sequence number. */
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

L
Linus Torvalds 已提交
1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788
/* Read the current event counter (incremented by event_callback). */
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

/*
 * Sleep until the event counter moves past @event_nr.
 * Returns 0, or -ERESTARTSYS if interrupted by a signal.
 */
int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

M
Mike Anderson 已提交
1789 1790 1791 1792 1793 1794 1795 1796 1797
/*
 * Queue a uevent list entry for later delivery by event_callback().
 */
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

L
Linus Torvalds 已提交
1798 1799 1800 1801 1802 1803 1804 1805 1806
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

M
Milan Broz 已提交
1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823
/* Expose the device's embedded kobject (sysfs representation). */
struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	/* Refuse devices that are on their way out. */
	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags))
		return NULL;

	/* Hand back a reference the caller must dm_put(). */
	dm_get(md);
	return md;
}

L
Linus Torvalds 已提交
1832 1833 1834 1835 1836
/* Non-zero iff the device is currently suspended. */
int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847
/*
 * For targets: non-zero iff the owning device is in the middle of a
 * noflush suspend (so I/O should be requeued rather than errored).
 */
int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

L
Linus Torvalds 已提交
1848 1849 1850
/* Block-device entry points for /dev/dm-N nodes. */
static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

/* "major" is the module parameter declared near the top of this file. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");