/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/device-mapper.h>

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>

#include <linux/dm-io.h>

19 20 21
#define DM_MSG_PREFIX "io"

/* One bit of io->error_bits per region, so at most BITS_PER_LONG regions. */
#define DM_IO_MAX_REGIONS	BITS_PER_LONG

/* Minimum reserves for the per-client io mempool and bioset. */
#define MIN_IOS		16
#define MIN_BIOS	16
24

25 26 27 28 29
/*
 * Per-client resources: a mempool of 'struct io' objects and a bioset
 * from which the data-carrying bios are allocated.
 */
struct dm_io_client {
	mempool_t *pool;	/* backs mempool_alloc() of struct io */
	struct bio_set *bios;	/* backs bio_alloc_bioset() in do_region() */
};

30 31 32 33
/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
L
Linus Torvalds 已提交
34
struct io {
35
	unsigned long error_bits;
L
Linus Torvalds 已提交
36 37
	atomic_t count;
	struct task_struct *sleeper;
38
	struct dm_io_client *client;
L
Linus Torvalds 已提交
39 40
	io_notify_fn callback;
	void *context;
41
} __attribute__((aligned(DM_IO_MAX_REGIONS)));
L
Linus Torvalds 已提交
42

M
Mikulas Patocka 已提交
43 44
static struct kmem_cache *_dm_io_cache;

H
Heinz Mauelshagen 已提交
45 46 47
/*
 * Create a client with mempool and bioset.
 */
48
struct dm_io_client *dm_io_client_create(void)
H
Heinz Mauelshagen 已提交
49 50 51 52 53 54 55
{
	struct dm_io_client *client;

	client = kmalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

56
	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
H
Heinz Mauelshagen 已提交
57 58 59
	if (!client->pool)
		goto bad;

60
	client->bios = bioset_create(MIN_BIOS, 0);
H
Heinz Mauelshagen 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
	if (!client->bios)
		goto bad;

	return client;

   bad:
	if (client->pool)
		mempool_destroy(client->pool);
	kfree(client);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

/* Release a client's mempool, bioset and the client itself. */
void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_destroy(client->pool);
	bioset_free(client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);

L
Linus Torvalds 已提交
82 83
/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
84 85 86 87
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * always zero and then combine it with the region number directly in
 * bi_private.
L
Linus Torvalds 已提交
88
 *---------------------------------------------------------------*/
89 90
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
				       unsigned region)
L
Linus Torvalds 已提交
91
{
92 93 94 95 96 97
	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
		DMCRIT("Unaligned struct io pointer %p", io);
		BUG();
	}

	bio->bi_private = (void *)((unsigned long)io | region);
L
Linus Torvalds 已提交
98 99
}

100 101
static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
				       unsigned *region)
L
Linus Torvalds 已提交
102
{
103 104 105 106
	unsigned long val = (unsigned long)bio->bi_private;

	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
	*region = val & (DM_IO_MAX_REGIONS - 1);
L
Linus Torvalds 已提交
107 108 109 110 111 112 113 114
}

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
115
	if (error)
116
		set_bit(region, &io->error_bits);
L
Linus Torvalds 已提交
117 118 119 120 121 122

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
123
			unsigned long r = io->error_bits;
L
Linus Torvalds 已提交
124 125 126
			io_notify_fn fn = io->callback;
			void *context = io->context;

M
Milan Broz 已提交
127
			mempool_free(io, io->client->pool);
L
Linus Torvalds 已提交
128 129 130 131 132
			fn(r, context);
		}
	}
}

133
/* Per-bio completion handler: route the result back to the owning io. */
static void endio(struct bio *bio, int error)
{
	struct io *io;
	unsigned region;

	/* Don't let a failed read hand stale buffer contents to the caller. */
	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio_put(bio);

	dec_count(io, region, error);
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	/* Yield the current page plus the usable length and start offset. */
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	/* Advance the cursor to the next page. */
	void (*next_page)(struct dpages *dp);

	unsigned context_u;	/* per-source scratch, usually a byte offset */
	void *context_ptr;	/* per-source cursor */
};

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct page_list *pl = dp->context_ptr;
	unsigned o = dp->context_u;

	*p = pl->page;
	*offset = o;
	*len = PAGE_SIZE - o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = dp->context_ptr;

	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
		  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bv = dp->context_ptr;

	*p = bv->bv_page;
	*offset = bv->bv_offset;
	*len = bv->bv_len;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bv = dp->context_ptr;

	dp->context_ptr = bv + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}

H
Heinz Mauelshagen 已提交
218 219 220
/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
		 struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	/* Step the virtual address up to the next page boundary. */
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

P
Peter Osterlund 已提交
243 244
/* Return a bio to the bioset of the client that allocated it. */
static void dm_bio_destructor(struct bio *bio)
{
	unsigned region;
	struct io *io;

	/* bi_private still holds the packed io pointer at destruction time. */
	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio_free(bio, io->client->bios);
}

H
Heinz Mauelshagen 已提交
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned *offset)
{
	*p = virt_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
	/* Step the kernel virtual address up to the next page boundary. */
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = km_get_page;
	dp->next_page = km_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

L
Linus Torvalds 已提交
278 279 280
/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
/*
 * Issue the io for one region, splitting it into as many bios as
 * needed.  Each bio takes a reference on io->count; dec_count() in
 * endio() releases it.
 */
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	/*
	 * where->count may be zero if rw holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized-bio.
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}

/*
 * Send io to every region, rewinding the page cursor between regions,
 * then drop the dispatch reference so completion can fire.
 */
static void dispatch_io(int rw, unsigned int num_regions,
			struct dm_io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	/* More regions than error_bits can record would corrupt results. */
	BUG_ON(num_regions > DM_IO_MAX_REGIONS);

	if (sync)
		rw |= REQ_SYNC;

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		/* Zero-length regions are only submitted for flushes. */
		if (where[i].count || (rw & REQ_FLUSH))
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

357
/*
 * Issue io on all regions and sleep until every bio completes.
 * Returns 0 on success or -EIO if any region failed; the per-region
 * failure bits are also reported via *error_bits when non-NULL.
 */
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	/*
	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
	 * align it on our own.
	 * volatile prevents the optimizer from removing or reusing
	 * "io_" field from the stack frame (allowed in ANSI C).
	 */
	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));

	/* Multi-region requests are only meaningful for writes. */
	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

	io->error_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = current;	/* dec_count() wakes us when count hits 0 */
	io->client = client;

	dispatch_io(rw, num_regions, where, dp, io, 1);

	/*
	 * NOTE(review): io lives on this stack frame and dec_count() runs
	 * from bio completion context; later kernels replaced this
	 * sleeper/wake pattern with a struct completion — worth confirming
	 * against upstream if backporting fixes here.
	 */
	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io->count))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (error_bits)
		*error_bits = io->error_bits;

	return io->error_bits ? -EIO : 0;
}

398
/*
 * Issue io on all regions and return immediately; fn(error_bits, context)
 * is invoked from completion context once every bio has finished.
 */
static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct dm_io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *io;

	/* Multi-region requests are only meaningful for writes. */
	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(client->pool, GFP_NOIO);
	io->error_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;	/* async: completion goes through io->callback */
	io->client = client;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

H
Heinz Mauelshagen 已提交
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
/*
 * Initialise the dpages cursor from the memory descriptor carried in a
 * dm_io_request.  Returns 0 on success, -EINVAL for an unknown type.
 */
static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		return 0;

	case DM_IO_BVEC:
		bvec_dp_init(dp, io_req->mem.ptr.bvec);
		return 0;

	case DM_IO_VMA:
		vm_dp_init(dp, io_req->mem.ptr.vma);
		return 0;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		return 0;

	default:
		return -EINVAL;
	}
}

/*
 * New collapsed (a)synchronous interface.
 *
 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
 * the queue with blk_unplug() some time later or set REQ_SYNC in
 * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
 * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
	  struct dm_io_region *where, unsigned long *sync_error_bits)
{
	int r;
	struct dpages dp;

	r = dp_init(io_req, &dp);
	if (r)
		return r;

	/* No notify callback means the caller wants to block. */
	if (!io_req->notify.fn)
		return sync_io(io_req->client, num_regions, where,
			       io_req->bi_rw, &dp, sync_error_bits);

	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
			&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
M
Mikulas Patocka 已提交
475 476 477 478 479 480 481 482 483 484 485 486 487 488 489

/* Module init: create the slab cache backing all io mempools. */
int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);

	return _dm_io_cache ? 0 : -ENOMEM;
}

/* Module exit: tear down the slab cache. */
void dm_io_exit(void)
{
	kmem_cache_destroy(_dm_io_cache);
	_dm_io_cache = NULL;
}