dm.c 34.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
L
Linus Torvalds 已提交
4 5 6 7 8 9
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"
M
Mike Anderson 已提交
10
#include "dm-uevent.h"
L
Linus Torvalds 已提交
11 12 13

#include <linux/init.h>
#include <linux/module.h>
A
Arjan van de Ven 已提交
14
#include <linux/mutex.h>
L
Linus Torvalds 已提交
15 16 17 18 19 20 21
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
D
Darrick J. Wong 已提交
22
#include <linux/hdreg.h>
23
#include <linux/blktrace_api.h>
24
#include <trace/block.h>
L
Linus Torvalds 已提交
25

26 27
#define DM_MSG_PREFIX "core"

L
Linus Torvalds 已提交
28 29 30 31 32
static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

33
static DEFINE_SPINLOCK(_minor_lock);
L
Linus Torvalds 已提交
34 35 36 37 38 39 40
/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
R
Richard Kennedy 已提交
41
	struct bio *bio;
42
	unsigned long start_time;
L
Linus Torvalds 已提交
43 44 45 46 47 48
};

/*
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
A
Alasdair G Kergon 已提交
49
struct dm_target_io {
L
Linus Torvalds 已提交
50 51 52 53 54
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

55 56
DEFINE_TRACE(block_bio_complete);

L
Linus Torvalds 已提交
57 58
union map_info *dm_get_mapinfo(struct bio *bio)
{
A
Alasdair G Kergon 已提交
59
	if (bio && bio->bi_private)
A
Alasdair G Kergon 已提交
60
		return &((struct dm_target_io *)bio->bi_private)->info;
A
Alasdair G Kergon 已提交
61
	return NULL;
L
Linus Torvalds 已提交
62 63
}

64 65
#define MINOR_ALLOCED ((void *)-1)

L
Linus Torvalds 已提交
66 67 68 69 70
/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
71
#define DMF_FROZEN 2
J
Jeff Mahoney 已提交
72
#define DMF_FREEING 3
73
#define DMF_DELETING 4
74
#define DMF_NOFLUSH_SUSPENDING 5
L
Linus Torvalds 已提交
75

76 77 78 79 80 81 82 83 84 85 86 87
/*
 * Work processed by per-device workqueue.
 */
struct dm_wq_req {
	enum {
		DM_WQ_FLUSH_DEFERRED,
	} type;
	struct work_struct work;
	struct mapped_device *md;
	void *context;
};

L
Linus Torvalds 已提交
88
struct mapped_device {
89
	struct rw_semaphore io_lock;
90
	struct mutex suspend_lock;
91
	spinlock_t pushback_lock;
L
Linus Torvalds 已提交
92 93
	rwlock_t map_lock;
	atomic_t holders;
94
	atomic_t open_count;
L
Linus Torvalds 已提交
95 96 97

	unsigned long flags;

98
	struct request_queue *queue;
L
Linus Torvalds 已提交
99
	struct gendisk *disk;
M
Mike Anderson 已提交
100
	char name[16];
L
Linus Torvalds 已提交
101 102 103 104 105 106 107 108

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
K
Kiyoshi Ueda 已提交
109
	struct bio_list deferred;
110
	struct bio_list pushback;
L
Linus Torvalds 已提交
111

112 113 114 115 116
	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

L
Linus Torvalds 已提交
117 118 119 120 121 122 123 124 125 126 127
	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

S
Stefan Bader 已提交
128 129
	struct bio_set *bs;

L
Linus Torvalds 已提交
130 131 132 133 134
	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
M
Mike Anderson 已提交
135 136 137
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */
L
Linus Torvalds 已提交
138 139 140 141 142

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
143
	struct block_device *suspended_bdev;
D
Darrick J. Wong 已提交
144 145 146

	/* forced geometry settings */
	struct hd_geometry geometry;
L
Linus Torvalds 已提交
147 148 149
};

#define MIN_IOS 256
150 151
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
L
Linus Torvalds 已提交
152 153 154

static int __init local_init(void)
{
K
Kiyoshi Ueda 已提交
155
	int r = -ENOMEM;
L
Linus Torvalds 已提交
156 157

	/* allocate a slab for the dm_ios */
A
Alasdair G Kergon 已提交
158
	_io_cache = KMEM_CACHE(dm_io, 0);
L
Linus Torvalds 已提交
159
	if (!_io_cache)
K
Kiyoshi Ueda 已提交
160
		return r;
L
Linus Torvalds 已提交
161 162

	/* allocate a slab for the target ios */
A
Alasdair G Kergon 已提交
163
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
K
Kiyoshi Ueda 已提交
164 165
	if (!_tio_cache)
		goto out_free_io_cache;
L
Linus Torvalds 已提交
166

M
Mike Anderson 已提交
167
	r = dm_uevent_init();
K
Kiyoshi Ueda 已提交
168 169
	if (r)
		goto out_free_tio_cache;
M
Mike Anderson 已提交
170

L
Linus Torvalds 已提交
171 172
	_major = major;
	r = register_blkdev(_major, _name);
K
Kiyoshi Ueda 已提交
173 174
	if (r < 0)
		goto out_uevent_exit;
L
Linus Torvalds 已提交
175 176 177 178 179

	if (!_major)
		_major = r;

	return 0;
K
Kiyoshi Ueda 已提交
180 181 182 183 184 185 186 187 188

out_uevent_exit:
	dm_uevent_exit();
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
L
Linus Torvalds 已提交
189 190 191 192 193 194
}

static void local_exit(void)
{
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
195
	unregister_blkdev(_major, _name);
M
Mike Anderson 已提交
196
	dm_uevent_exit();
L
Linus Torvalds 已提交
197 198 199 200 201 202

	_major = 0;

	DMINFO("cleaned up");
}

203
static int (*_inits[])(void) __initdata = {
L
Linus Torvalds 已提交
204 205 206 207
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
208
	dm_kcopyd_init,
L
Linus Torvalds 已提交
209 210 211
	dm_interface_init,
};

212
static void (*_exits[])(void) = {
L
Linus Torvalds 已提交
213 214 215 216
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
217
	dm_kcopyd_exit,
L
Linus Torvalds 已提交
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

      bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
A
Al Viro 已提交
253
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
L
Linus Torvalds 已提交
254 255 256
{
	struct mapped_device *md;

J
Jeff Mahoney 已提交
257 258
	spin_lock(&_minor_lock);

A
Al Viro 已提交
259
	md = bdev->bd_disk->private_data;
J
Jeff Mahoney 已提交
260 261 262
	if (!md)
		goto out;

263 264
	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
J
Jeff Mahoney 已提交
265 266 267 268
		md = NULL;
		goto out;
	}

L
Linus Torvalds 已提交
269
	dm_get(md);
270
	atomic_inc(&md->open_count);
J
Jeff Mahoney 已提交
271 272 273 274 275

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
L
Linus Torvalds 已提交
276 277
}

A
Al Viro 已提交
278
static int dm_blk_close(struct gendisk *disk, fmode_t mode)
L
Linus Torvalds 已提交
279
{
A
Al Viro 已提交
280
	struct mapped_device *md = disk->private_data;
281
	atomic_dec(&md->open_count);
L
Linus Torvalds 已提交
282 283 284 285
	dm_put(md);
	return 0;
}

286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

D
Darrick J. Wong 已提交
310 311 312 313 314 315 316
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

A
Al Viro 已提交
317
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
318 319
			unsigned int cmd, unsigned long arg)
{
A
Al Viro 已提交
320 321
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_table(md);
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
340
		r = tgt->type->ioctl(tgt, cmd, arg);
341 342 343 344 345 346 347

out:
	dm_table_put(map);

	return r;
}

A
Alasdair G Kergon 已提交
348
static struct dm_io *alloc_io(struct mapped_device *md)
L
Linus Torvalds 已提交
349 350 351 352
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

A
Alasdair G Kergon 已提交
353
static void free_io(struct mapped_device *md, struct dm_io *io)
L
Linus Torvalds 已提交
354 355 356 357
{
	mempool_free(io, md->io_pool);
}

A
Alasdair G Kergon 已提交
358
static struct dm_target_io *alloc_tio(struct mapped_device *md)
L
Linus Torvalds 已提交
359 360 361 362
{
	return mempool_alloc(md->tio_pool, GFP_NOIO);
}

A
Alasdair G Kergon 已提交
363
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
L
Linus Torvalds 已提交
364 365 366 367
{
	mempool_free(tio, md->tio_pool);
}

368 369 370
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
T
Tejun Heo 已提交
371
	int cpu;
372 373 374

	io->start_time = jiffies;

T
Tejun Heo 已提交
375 376 377 378
	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
379 380
}

381
static void end_io_acct(struct dm_io *io)
382 383 384 385
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
T
Tejun Heo 已提交
386
	int pending, cpu;
387 388
	int rw = bio_data_dir(bio);

T
Tejun Heo 已提交
389 390 391 392
	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();
393

T
Tejun Heo 已提交
394 395
	dm_disk(md)->part0.in_flight = pending =
		atomic_dec_return(&md->pending);
396

397 398 399
	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
400 401
}

L
Linus Torvalds 已提交
402 403 404 405 406
/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
407
	down_write(&md->io_lock);
L
Linus Torvalds 已提交
408 409

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
410
		up_write(&md->io_lock);
L
Linus Torvalds 已提交
411 412 413 414 415
		return 1;
	}

	bio_list_add(&md->deferred, bio);

416
	up_write(&md->io_lock);
L
Linus Torvalds 已提交
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
	return 0;		/* deferred successfully */
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}

D
Darrick J. Wong 已提交
438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

L
Linus Torvalds 已提交
465 466 467 468 469 470 471 472 473
/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

474 475 476 477 478
static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

L
Linus Torvalds 已提交
479 480 481 482
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
483
static void dec_pending(struct dm_io *io, int error)
L
Linus Torvalds 已提交
484
{
485 486 487 488
	unsigned long flags;

	/* Push-back supersedes any I/O errors */
	if (error && !(io->error > 0 && __noflush_suspending(io->md)))
L
Linus Torvalds 已提交
489 490 491
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 * This must be handled before the sleeper on
			 * suspend queue merges the pushback list.
			 */
			spin_lock_irqsave(&io->md->pushback_lock, flags);
			if (__noflush_suspending(io->md))
				bio_list_add(&io->md->pushback, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&io->md->pushback_lock, flags);
		}

507
		end_io_acct(io);
L
Linus Torvalds 已提交
508

509
		if (io->error != DM_ENDIO_REQUEUE) {
510
			trace_block_bio_complete(io->md->queue, io->bio);
511

512
			bio_endio(io->bio, io->error);
513
		}
514

L
Linus Torvalds 已提交
515 516 517 518
		free_io(io->md, io);
	}
}

519
static void clone_endio(struct bio *bio, int error)
L
Linus Torvalds 已提交
520 521
{
	int r = 0;
A
Alasdair G Kergon 已提交
522
	struct dm_target_io *tio = bio->bi_private;
S
Stefan Bader 已提交
523
	struct mapped_device *md = tio->io->md;
L
Linus Torvalds 已提交
524 525 526 527 528 529 530
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
531 532 533 534 535
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
L
Linus Torvalds 已提交
536
			error = r;
537 538
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
539
			return;
540 541 542 543
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
L
Linus Torvalds 已提交
544 545
	}

S
Stefan Bader 已提交
546 547 548 549 550 551 552
	dec_pending(tio->io, error);

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

L
Linus Torvalds 已提交
553
	bio_put(bio);
S
Stefan Bader 已提交
554
	free_tio(md, tio);
L
Linus Torvalds 已提交
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
A
Alasdair G Kergon 已提交
578
		      struct dm_target_io *tio)
L
Linus Torvalds 已提交
579 580
{
	int r;
581
	sector_t sector;
S
Stefan Bader 已提交
582
	struct mapped_device *md;
L
Linus Torvalds 已提交
583 584 585 586 587 588 589 590 591 592 593 594 595 596 597

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
598
	sector = clone->bi_sector;
L
Linus Torvalds 已提交
599
	r = ti->type->map(ti, clone, &tio->info);
600
	if (r == DM_MAPIO_REMAPPED) {
L
Linus Torvalds 已提交
601
		/* the bio has been remapped so dispatch it */
602

603
		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
604 605
				    tio->io->bio->bi_bdev->bd_dev,
				    clone->bi_sector, sector);
606

L
Linus Torvalds 已提交
607
		generic_make_request(clone);
608 609
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
S
Stefan Bader 已提交
610 611 612 613 614 615
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
L
Linus Torvalds 已提交
616
		bio_put(clone);
S
Stefan Bader 已提交
617
		free_tio(md, tio);
618 619 620
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
L
Linus Torvalds 已提交
621 622 623 624 625 626 627 628 629 630 631 632 633
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

P
Peter Osterlund 已提交
634 635
static void dm_bio_destructor(struct bio *bio)
{
S
Stefan Bader 已提交
636 637 638
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
P
Peter Osterlund 已提交
639 640
}

L
Linus Torvalds 已提交
641 642 643 644 645
/*
 * Creates a little bio that is just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
S
Stefan Bader 已提交
646
			      unsigned int len, struct bio_set *bs)
L
Linus Torvalds 已提交
647 648 649 650
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

S
Stefan Bader 已提交
651
	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
P
Peter Osterlund 已提交
652
	clone->bi_destructor = dm_bio_destructor;
L
Linus Torvalds 已提交
653 654 655 656 657 658 659 660 661
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
M
Martin K. Petersen 已提交
662
	clone->bi_flags |= 1 << BIO_CLONED;
L
Linus Torvalds 已提交
663 664 665 666 667 668 669 670 671

	return clone;
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
S
Stefan Bader 已提交
672
			     unsigned int len, struct bio_set *bs)
L
Linus Torvalds 已提交
673 674 675
{
	struct bio *clone;

S
Stefan Bader 已提交
676 677 678
	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_destructor = dm_bio_destructor;
L
Linus Torvalds 已提交
679 680 681 682 683 684 685 686 687
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	return clone;
}

688
static int __clone_and_map(struct clone_info *ci)
L
Linus Torvalds 已提交
689 690
{
	struct bio *clone, *bio = ci->bio;
691 692
	struct dm_target *ti;
	sector_t len = 0, max;
A
Alasdair G Kergon 已提交
693
	struct dm_target_io *tio;
L
Linus Torvalds 已提交
694

695 696 697 698 699 700
	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

L
Linus Torvalds 已提交
701 702 703 704 705 706 707 708 709 710 711 712 713 714
	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
S
Stefan Bader 已提交
715 716
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
L
Linus Torvalds 已提交
717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

S
Stefan Bader 已提交
739 740
		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
L
Linus Torvalds 已提交
741 742 743 744 745 746 747 748
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
749
		 * Handle a bvec that must be split between two or more targets.
L
Linus Torvalds 已提交
750 751
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
752 753
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;
L
Linus Torvalds 已提交
754

755 756 757
		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
758 759 760
				if (!dm_target_is_valid(ti))
					return -EIO;

761
				max = max_io_len(ci->md, ci->sector, ti);
L
Linus Torvalds 已提交
762

763 764 765 766 767 768 769 770 771
				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
S
Stefan Bader 已提交
772 773
					   bv->bv_offset + offset, len,
					   ci->md->bs);
774 775 776 777 778 779 780

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);
L
Linus Torvalds 已提交
781 782 783

		ci->idx++;
	}
784 785

	return 0;
L
Linus Torvalds 已提交
786 787 788 789 790
}

/*
 * Split the bio into several clones.
 */
M
Milan Broz 已提交
791
static int __split_bio(struct mapped_device *md, struct bio *bio)
L
Linus Torvalds 已提交
792 793
{
	struct clone_info ci;
794
	int error = 0;
L
Linus Torvalds 已提交
795 796

	ci.map = dm_get_table(md);
M
Milan Broz 已提交
797 798
	if (unlikely(!ci.map))
		return -EIO;
L
Linus Torvalds 已提交
799 800 801 802 803 804 805 806 807 808 809 810

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

811
	start_io_acct(ci.io);
812 813
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);
L
Linus Torvalds 已提交
814 815

	/* drop the extra reference count */
816
	dec_pending(ci.io, error);
L
Linus Torvalds 已提交
817
	dm_table_put(ci.map);
M
Milan Broz 已提交
818 819

	return 0;
L
Linus Torvalds 已提交
820 821 822 823 824
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

M
Milan Broz 已提交
825 826 827 828 829 830 831 832
static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
833
	int max_size = 0;
M
Milan Broz 已提交
834 835

	if (unlikely(!map))
836
		goto out;
M
Milan Broz 已提交
837 838

	ti = dm_table_find_target(map, bvm->bi_sector);
839 840
	if (!dm_target_is_valid(ti))
		goto out_table;
M
Milan Broz 已提交
841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);

859
out_table:
860 861 862
	dm_table_put(map);

out:
M
Milan Broz 已提交
863 864 865 866 867 868 869 870 871
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

L
Linus Torvalds 已提交
872 873 874 875
/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
876
static int dm_request(struct request_queue *q, struct bio *bio)
L
Linus Torvalds 已提交
877
{
M
Milan Broz 已提交
878
	int r = -EIO;
879
	int rw = bio_data_dir(bio);
L
Linus Torvalds 已提交
880
	struct mapped_device *md = q->queuedata;
T
Tejun Heo 已提交
881
	int cpu;
L
Linus Torvalds 已提交
882

S
Stefan Bader 已提交
883 884 885 886 887
	/*
	 * There is no use in forwarding any barrier request since we can't
	 * guarantee it is (or can be) handled by the targets correctly.
	 */
	if (unlikely(bio_barrier(bio))) {
888
		bio_endio(bio, -EOPNOTSUPP);
S
Stefan Bader 已提交
889 890 891
		return 0;
	}

892
	down_read(&md->io_lock);
L
Linus Torvalds 已提交
893

T
Tejun Heo 已提交
894 895 896 897
	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();
898

L
Linus Torvalds 已提交
899 900 901 902 903
	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
904
		up_read(&md->io_lock);
L
Linus Torvalds 已提交
905

M
Milan Broz 已提交
906 907
		if (bio_rw(bio) != READA)
			r = queue_io(md, bio);
L
Linus Torvalds 已提交
908

M
Milan Broz 已提交
909 910
		if (r <= 0)
			goto out_req;
L
Linus Torvalds 已提交
911 912 913 914 915

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
		 */
916
		down_read(&md->io_lock);
L
Linus Torvalds 已提交
917 918
	}

M
Milan Broz 已提交
919
	r = __split_bio(md, bio);
920
	up_read(&md->io_lock);
M
Milan Broz 已提交
921 922 923 924 925

out_req:
	if (r < 0)
		bio_io_error(bio);

L
Linus Torvalds 已提交
926 927 928
	return 0;
}

929
static void dm_unplug_all(struct request_queue *q)
L
Linus Torvalds 已提交
930 931 932 933 934 935 936 937 938 939 940 941
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
942 943 944
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;
L
Linus Torvalds 已提交
945

946 947 948 949 950 951 952 953 954 955 956 957 958
	atomic_inc(&md->pending);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		map = dm_get_table(md);
		if (map) {
			r = dm_table_any_congested(map, bdi_bits);
			dm_table_put(map);
		}
	}

	if (!atomic_dec_return(&md->pending))
		/* nudge anyone waiting on suspend queue */
		wake_up(&md->wait);
L
Linus Torvalds 已提交
959 960 961 962 963 964 965 966 967

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

968
static void free_minor(int minor)
L
Linus Torvalds 已提交
969
{
970
	spin_lock(&_minor_lock);
L
Linus Torvalds 已提交
971
	idr_remove(&_minor_idr, minor);
972
	spin_unlock(&_minor_lock);
L
Linus Torvalds 已提交
973 974 975 976 977
}

/*
 * See if the device with a specific minor # is free.
 */
978
static int specific_minor(int minor)
L
Linus Torvalds 已提交
979 980 981 982 983 984
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

J
Jeff Mahoney 已提交
985 986 987 988
	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

989
	spin_lock(&_minor_lock);
L
Linus Torvalds 已提交
990 991 992 993 994 995

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

996
	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
J
Jeff Mahoney 已提交
997
	if (r)
L
Linus Torvalds 已提交
998 999 1000 1001 1002 1003 1004 1005 1006
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
1007
	spin_unlock(&_minor_lock);
L
Linus Torvalds 已提交
1008 1009 1010
	return r;
}

1011
static int next_free_minor(int *minor)
L
Linus Torvalds 已提交
1012
{
1013
	int r, m;
L
Linus Torvalds 已提交
1014 1015

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
J
Jeff Mahoney 已提交
1016 1017 1018
	if (!r)
		return -ENOMEM;

1019
	spin_lock(&_minor_lock);
L
Linus Torvalds 已提交
1020

1021
	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1022
	if (r)
L
Linus Torvalds 已提交
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
1034
	spin_unlock(&_minor_lock);
L
Linus Torvalds 已提交
1035 1036 1037 1038 1039 1040 1041 1042
	return r;
}

static struct block_device_operations dm_blk_dops;

/*
 * Allocate and initialise a blank device with a given minor.
 */
1043
static struct mapped_device *alloc_dev(int minor)
L
Linus Torvalds 已提交
1044 1045
{
	int r;
1046
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1047
	void *old_md;
L
Linus Torvalds 已提交
1048 1049 1050 1051 1052 1053

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

1054
	if (!try_module_get(THIS_MODULE))
M
Milan Broz 已提交
1055
		goto bad_module_get;
1056

L
Linus Torvalds 已提交
1057
	/* get a minor number for the dev */
1058
	if (minor == DM_ANY_MINOR)
1059
		r = next_free_minor(&minor);
1060
	else
1061
		r = specific_minor(minor);
L
Linus Torvalds 已提交
1062
	if (r < 0)
M
Milan Broz 已提交
1063
		goto bad_minor;
L
Linus Torvalds 已提交
1064

1065
	init_rwsem(&md->io_lock);
1066
	mutex_init(&md->suspend_lock);
1067
	spin_lock_init(&md->pushback_lock);
L
Linus Torvalds 已提交
1068 1069
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
1070
	atomic_set(&md->open_count, 0);
L
Linus Torvalds 已提交
1071
	atomic_set(&md->event_nr, 0);
M
Mike Anderson 已提交
1072 1073 1074
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);
L
Linus Torvalds 已提交
1075 1076 1077

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
M
Milan Broz 已提交
1078
		goto bad_queue;
L
Linus Torvalds 已提交
1079 1080 1081 1082 1083

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
1084
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
L
Linus Torvalds 已提交
1085
	md->queue->unplug_fn = dm_unplug_all;
M
Milan Broz 已提交
1086
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
L
Linus Torvalds 已提交
1087

1088
	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
K
Kiyoshi Ueda 已提交
1089
	if (!md->io_pool)
M
Milan Broz 已提交
1090
		goto bad_io_pool;
L
Linus Torvalds 已提交
1091

1092
	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
L
Linus Torvalds 已提交
1093
	if (!md->tio_pool)
M
Milan Broz 已提交
1094
		goto bad_tio_pool;
L
Linus Torvalds 已提交
1095

1096
	md->bs = bioset_create(16, 16);
S
Stefan Bader 已提交
1097 1098 1099
	if (!md->bs)
		goto bad_no_bioset;

L
Linus Torvalds 已提交
1100 1101
	md->disk = alloc_disk(1);
	if (!md->disk)
M
Milan Broz 已提交
1102
		goto bad_disk;
L
Linus Torvalds 已提交
1103

1104 1105 1106 1107
	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	init_waitqueue_head(&md->eventq);

L
Linus Torvalds 已提交
1108 1109 1110 1111 1112 1113 1114
	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
M
Mike Anderson 已提交
1115
	format_dev_t(md->name, MKDEV(_major, minor));
L
Linus Torvalds 已提交
1116

1117 1118 1119 1120
	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

1121
	/* Populate the mapping, nobody knows we exist yet */
1122
	spin_lock(&_minor_lock);
1123
	old_md = idr_replace(&_minor_idr, md, minor);
1124
	spin_unlock(&_minor_lock);
1125 1126 1127

	BUG_ON(old_md != MINOR_ALLOCED);

L
Linus Torvalds 已提交
1128 1129
	return md;

1130 1131
bad_thread:
	put_disk(md->disk);
M
Milan Broz 已提交
1132
bad_disk:
S
Stefan Bader 已提交
1133
	bioset_free(md->bs);
M
Milan Broz 已提交
1134
bad_no_bioset:
L
Linus Torvalds 已提交
1135
	mempool_destroy(md->tio_pool);
M
Milan Broz 已提交
1136
bad_tio_pool:
L
Linus Torvalds 已提交
1137
	mempool_destroy(md->io_pool);
M
Milan Broz 已提交
1138
bad_io_pool:
1139
	blk_cleanup_queue(md->queue);
M
Milan Broz 已提交
1140
bad_queue:
L
Linus Torvalds 已提交
1141
	free_minor(minor);
M
Milan Broz 已提交
1142
bad_minor:
1143
	module_put(THIS_MODULE);
M
Milan Broz 已提交
1144
bad_module_get:
L
Linus Torvalds 已提交
1145 1146 1147 1148
	kfree(md);
	return NULL;
}

J
Jun'ichi Nomura 已提交
1149 1150
static void unlock_fs(struct mapped_device *md);

L
Linus Torvalds 已提交
1151 1152
static void free_dev(struct mapped_device *md)
{
1153
	int minor = MINOR(disk_devt(md->disk));
1154

1155
	if (md->suspended_bdev) {
J
Jun'ichi Nomura 已提交
1156
		unlock_fs(md);
1157 1158
		bdput(md->suspended_bdev);
	}
1159
	destroy_workqueue(md->wq);
L
Linus Torvalds 已提交
1160 1161
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
S
Stefan Bader 已提交
1162
	bioset_free(md->bs);
L
Linus Torvalds 已提交
1163
	del_gendisk(md->disk);
1164
	free_minor(minor);
J
Jeff Mahoney 已提交
1165 1166 1167 1168 1169

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

L
Linus Torvalds 已提交
1170
	put_disk(md->disk);
1171
	blk_cleanup_queue(md->queue);
1172
	module_put(THIS_MODULE);
L
Linus Torvalds 已提交
1173 1174 1175 1176 1177 1178 1179 1180
	kfree(md);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
M
Mike Anderson 已提交
1181 1182
	unsigned long flags;
	LIST_HEAD(uevents);
L
Linus Torvalds 已提交
1183 1184
	struct mapped_device *md = (struct mapped_device *) context;

M
Mike Anderson 已提交
1185 1186 1187 1188
	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

1189
	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
M
Mike Anderson 已提交
1190

L
Linus Torvalds 已提交
1191 1192 1193 1194
	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

1195
static void __set_size(struct mapped_device *md, sector_t size)
L
Linus Torvalds 已提交
1196
{
1197
	set_capacity(md->disk, size);
L
Linus Torvalds 已提交
1198

1199
	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
1200
	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1201
	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
L
Linus Torvalds 已提交
1202 1203 1204 1205
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
1206
	struct request_queue *q = md->queue;
L
Linus Torvalds 已提交
1207 1208 1209
	sector_t size;

	size = dm_table_get_size(t);
D
Darrick J. Wong 已提交
1210 1211 1212 1213 1214 1215 1216

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

1217 1218
	if (md->suspended_bdev)
		__set_size(md, size);
L
Linus Torvalds 已提交
1219 1220 1221
	if (size == 0)
		return 0;

1222 1223 1224
	dm_table_get(t);
	dm_table_event_callback(t, event_callback, md);

L
Linus Torvalds 已提交
1225 1226
	write_lock(&md->map_lock);
	md->map = t;
1227
	dm_table_set_restrictions(t, q);
L
Linus Torvalds 已提交
1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
	write_unlock(&md->map_lock);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_put(map);
}

/*
 * Constructor for a new device.
 */
1250
int dm_create(int minor, struct mapped_device **result)
L
Linus Torvalds 已提交
1251 1252 1253
{
	struct mapped_device *md;

1254
	md = alloc_dev(minor);
L
Linus Torvalds 已提交
1255 1256 1257 1258 1259 1260 1261
	if (!md)
		return -ENXIO;

	*result = md;
	return 0;
}

1262
static struct mapped_device *dm_find_md(dev_t dev)
L
Linus Torvalds 已提交
1263 1264 1265 1266 1267 1268 1269
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

1270
	spin_lock(&_minor_lock);
L
Linus Torvalds 已提交
1271 1272

	md = idr_find(&_minor_idr, minor);
J
Jeff Mahoney 已提交
1273
	if (md && (md == MINOR_ALLOCED ||
1274
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
A
Alasdair G Kergon 已提交
1275
		   test_bit(DMF_FREEING, &md->flags))) {
1276
		md = NULL;
J
Jeff Mahoney 已提交
1277 1278
		goto out;
	}
L
Linus Torvalds 已提交
1279

J
Jeff Mahoney 已提交
1280
out:
1281
	spin_unlock(&_minor_lock);
L
Linus Torvalds 已提交
1282

1283 1284 1285
	return md;
}

1286 1287 1288 1289 1290 1291 1292 1293 1294 1295
struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

A
Alasdair G Kergon 已提交
1296
void *dm_get_mdptr(struct mapped_device *md)
1297
{
A
Alasdair G Kergon 已提交
1298
	return md->interface_ptr;
L
Linus Torvalds 已提交
1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

1311 1312 1313 1314 1315 1316
const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

L
Linus Torvalds 已提交
1317 1318
void dm_put(struct mapped_device *md)
{
M
Mike Anderson 已提交
1319
	struct dm_table *map;
L
Linus Torvalds 已提交
1320

J
Jeff Mahoney 已提交
1321 1322
	BUG_ON(test_bit(DMF_FREEING, &md->flags));

1323
	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
M
Mike Anderson 已提交
1324
		map = dm_get_table(md);
1325 1326
		idr_replace(&_minor_idr, MINOR_ALLOCED,
			    MINOR(disk_devt(dm_disk(md))));
J
Jeff Mahoney 已提交
1327
		set_bit(DMF_FREEING, &md->flags);
1328
		spin_unlock(&_minor_lock);
1329
		if (!dm_suspended(md)) {
L
Linus Torvalds 已提交
1330 1331 1332 1333
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		__unbind(md);
M
Mike Anderson 已提交
1334
		dm_table_put(map);
L
Linus Torvalds 已提交
1335 1336 1337
		free_dev(md);
	}
}
E
Edward Goggin 已提交
1338
EXPORT_SYMBOL_GPL(dm_put);
L
Linus Torvalds 已提交
1339

1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
static int dm_wait_for_completion(struct mapped_device *md)
{
	int r = 0;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		smp_mb();
		if (!atomic_read(&md->pending))
			break;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	return r;
}

L
Linus Torvalds 已提交
1363 1364 1365
/*
 * Process the deferred bios
 */
1366
static void __flush_deferred_io(struct mapped_device *md)
L
Linus Torvalds 已提交
1367
{
1368
	struct bio *c;
L
Linus Torvalds 已提交
1369

1370
	while ((c = bio_list_pop(&md->deferred))) {
M
Milan Broz 已提交
1371 1372
		if (__split_bio(md, c))
			bio_io_error(c);
L
Linus Torvalds 已提交
1373
	}
M
Milan Broz 已提交
1374 1375

	clear_bit(DMF_BLOCK_IO, &md->flags);
L
Linus Torvalds 已提交
1376 1377
}

1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388
static void __merge_pushback_list(struct mapped_device *md)
{
	unsigned long flags;

	spin_lock_irqsave(&md->pushback_lock, flags);
	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	bio_list_merge_head(&md->deferred, &md->pushback);
	bio_list_init(&md->pushback);
	spin_unlock_irqrestore(&md->pushback_lock, flags);
}

1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423
static void dm_wq_work(struct work_struct *work)
{
	struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
	struct mapped_device *md = req->md;

	down_write(&md->io_lock);
	switch (req->type) {
	case DM_WQ_FLUSH_DEFERRED:
		__flush_deferred_io(md);
		break;
	default:
		DMERR("dm_wq_work: unrecognised work type %d", req->type);
		BUG();
	}
	up_write(&md->io_lock);
}

static void dm_wq_queue(struct mapped_device *md, int type, void *context,
			struct dm_wq_req *req)
{
	req->type = type;
	req->md = md;
	req->context = context;
	INIT_WORK(&req->work, dm_wq_work);
	queue_work(md->wq, &req->work);
}

static void dm_queue_flush(struct mapped_device *md, int type, void *context)
{
	struct dm_wq_req req;

	dm_wq_queue(md, type, context, &req);
	flush_workqueue(md->wq);
}

L
Linus Torvalds 已提交
1424 1425 1426 1427 1428
/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
1429
	int r = -EINVAL;
L
Linus Torvalds 已提交
1430

1431
	mutex_lock(&md->suspend_lock);
L
Linus Torvalds 已提交
1432 1433

	/* device must be suspended */
1434
	if (!dm_suspended(md))
1435
		goto out;
L
Linus Torvalds 已提交
1436

1437 1438 1439 1440 1441
	/* without bdev, the device size cannot be changed */
	if (!md->suspended_bdev)
		if (get_capacity(md->disk) != dm_table_get_size(table))
			goto out;

L
Linus Torvalds 已提交
1442 1443 1444
	__unbind(md);
	r = __bind(md, table);

1445
out:
1446
	mutex_unlock(&md->suspend_lock);
1447
	return r;
L
Linus Torvalds 已提交
1448 1449 1450 1451 1452 1453
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
1454
static int lock_fs(struct mapped_device *md)
L
Linus Torvalds 已提交
1455
{
1456
	int r;
L
Linus Torvalds 已提交
1457 1458

	WARN_ON(md->frozen_sb);
1459

1460
	md->frozen_sb = freeze_bdev(md->suspended_bdev);
1461
	if (IS_ERR(md->frozen_sb)) {
1462
		r = PTR_ERR(md->frozen_sb);
1463 1464
		md->frozen_sb = NULL;
		return r;
1465 1466
	}

1467 1468
	set_bit(DMF_FROZEN, &md->flags);

L
Linus Torvalds 已提交
1469
	/* don't bdput right now, we don't want the bdev
1470
	 * to go away while it is locked.
L
Linus Torvalds 已提交
1471 1472 1473 1474
	 */
	return 0;
}

1475
static void unlock_fs(struct mapped_device *md)
L
Linus Torvalds 已提交
1476
{
1477 1478 1479
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

1480
	thaw_bdev(md->suspended_bdev, md->frozen_sb);
L
Linus Torvalds 已提交
1481
	md->frozen_sb = NULL;
1482
	clear_bit(DMF_FROZEN, &md->flags);
L
Linus Torvalds 已提交
1483 1484 1485 1486 1487 1488 1489 1490 1491
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
1492
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
L
Linus Torvalds 已提交
1493
{
1494
	struct dm_table *map = NULL;
L
Linus Torvalds 已提交
1495
	DECLARE_WAITQUEUE(wait, current);
1496
	int r = 0;
1497
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
1498
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
L
Linus Torvalds 已提交
1499

1500
	mutex_lock(&md->suspend_lock);
1501

M
Milan Broz 已提交
1502 1503
	if (dm_suspended(md)) {
		r = -EINVAL;
1504
		goto out_unlock;
M
Milan Broz 已提交
1505
	}
L
Linus Torvalds 已提交
1506 1507 1508

	map = dm_get_table(md);

1509 1510 1511 1512 1513 1514 1515
	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

1516 1517 1518
	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

1519 1520 1521 1522 1523 1524
	/* bdget() can stall if the pending I/Os are not flushed */
	if (!noflush) {
		md->suspended_bdev = bdget_disk(md->disk, 0);
		if (!md->suspended_bdev) {
			DMWARN("bdget failed in dm_suspend");
			r = -ENOMEM;
K
Kiyoshi Ueda 已提交
1525
			goto out;
1526
		}
1527

1528 1529 1530 1531 1532 1533 1534 1535 1536
		/*
		 * Flush I/O to the device. noflush supersedes do_lockfs,
		 * because lock_fs() needs to flush I/Os.
		 */
		if (do_lockfs) {
			r = lock_fs(md);
			if (r)
				goto out;
		}
1537
	}
L
Linus Torvalds 已提交
1538 1539

	/*
1540
	 * First we set the BLOCK_IO flag so no more ios will be mapped.
L
Linus Torvalds 已提交
1541
	 */
1542 1543
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO, &md->flags);
L
Linus Torvalds 已提交
1544 1545

	add_wait_queue(&md->wait, &wait);
1546
	up_write(&md->io_lock);
L
Linus Torvalds 已提交
1547 1548

	/* unplug */
1549
	if (map)
L
Linus Torvalds 已提交
1550 1551 1552
		dm_table_unplug_all(map);

	/*
1553
	 * Wait for the already-mapped ios to complete.
L
Linus Torvalds 已提交
1554
	 */
1555
	r = dm_wait_for_completion(md);
L
Linus Torvalds 已提交
1556

1557
	down_write(&md->io_lock);
L
Linus Torvalds 已提交
1558 1559
	remove_wait_queue(&md->wait, &wait);

1560 1561
	if (noflush)
		__merge_pushback_list(md);
1562
	up_write(&md->io_lock);
1563

L
Linus Torvalds 已提交
1564
	/* were we interrupted ? */
1565
	if (r < 0) {
1566
		dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
M
Milan Broz 已提交
1567

1568
		unlock_fs(md);
1569
		goto out; /* pushback list is already flushed, so skip flush */
1570
	}
L
Linus Torvalds 已提交
1571

1572
	dm_table_postsuspend_targets(map);
L
Linus Torvalds 已提交
1573

1574
	set_bit(DMF_SUSPENDED, &md->flags);
1575

1576
out:
1577 1578 1579 1580 1581
	if (r && md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

1582
	dm_table_put(map);
1583 1584

out_unlock:
1585
	mutex_unlock(&md->suspend_lock);
1586
	return r;
L
Linus Torvalds 已提交
1587 1588 1589 1590
}

int dm_resume(struct mapped_device *md)
{
1591 1592
	int r = -EINVAL;
	struct dm_table *map = NULL;
L
Linus Torvalds 已提交
1593

1594
	mutex_lock(&md->suspend_lock);
1595
	if (!dm_suspended(md))
1596 1597 1598
		goto out;

	map = dm_get_table(md);
1599
	if (!map || !dm_table_get_size(map))
1600
		goto out;
L
Linus Torvalds 已提交
1601

1602 1603 1604
	r = dm_table_resume_targets(map);
	if (r)
		goto out;
1605

1606
	dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
1607 1608 1609

	unlock_fs(md);

1610 1611 1612 1613
	if (md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}
1614

1615 1616
	clear_bit(DMF_SUSPENDED, &md->flags);

L
Linus Torvalds 已提交
1617 1618
	dm_table_unplug_all(map);

1619
	dm_kobject_uevent(md);
1620

1621
	r = 0;
1622

1623 1624
out:
	dm_table_put(map);
1625
	mutex_unlock(&md->suspend_lock);
1626

1627
	return r;
L
Linus Torvalds 已提交
1628 1629 1630 1631 1632
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
1633 1634
void dm_kobject_uevent(struct mapped_device *md)
{
1635
	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
1636 1637
}

M
Mike Anderson 已提交
1638 1639 1640 1641 1642
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

L
Linus Torvalds 已提交
1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

M
Mike Anderson 已提交
1654 1655 1656 1657 1658 1659 1660 1661 1662
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

L
Linus Torvalds 已提交
1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687
int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

L
Linus Torvalds 已提交
1688 1689 1690
static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
1691
	.ioctl = dm_blk_ioctl,
D
Darrick J. Wong 已提交
1692
	.getgeo = dm_blk_getgeo,
L
Linus Torvalds 已提交
1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");