// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>

#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"

static struct biovec_slab {
	int nr_vecs;
	char *name;
	struct kmem_cache *slab;
} bvec_slabs[] __read_mostly = {
	{ .nr_vecs = 16, .name = "biovec-16" },
	{ .nr_vecs = 64, .name = "biovec-64" },
	{ .nr_vecs = 128, .name = "biovec-128" },
	{ .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
};

static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
{
	switch (nr_vecs) {
	/* smaller bios use inline vecs */
	case 5 ... 16:
		return &bvec_slabs[0];
	case 17 ... 64:
		return &bvec_slabs[1];
	case 65 ... 128:
		return &bvec_slabs[2];
	case 129 ... BIO_MAX_VECS:
		return &bvec_slabs[3];
	default:
		BUG();
		return NULL;
	}
}

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

/*
 * Our slab pool management
 */
struct bio_slab {
	struct kmem_cache *slab;
	unsigned int slab_ref;
	unsigned int slab_size;
	char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);

static struct bio_slab *create_bio_slab(unsigned int size)
{
	struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);

	if (!bslab)
		return NULL;

	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
	bslab->slab = kmem_cache_create(bslab->name, size,
			ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
	if (!bslab->slab)
		goto fail_alloc_slab;

	bslab->slab_ref = 1;
	bslab->slab_size = size;

	if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
		return bslab;

	kmem_cache_destroy(bslab->slab);

fail_alloc_slab:
	kfree(bslab);
	return NULL;
}

static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
{
	return bs->front_pad + sizeof(struct bio) + bs->back_pad;
}

static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
{
	unsigned int size = bs_bio_slab_size(bs);
	struct bio_slab *bslab;

	mutex_lock(&bio_slab_lock);
	bslab = xa_load(&bio_slabs, size);
	if (bslab)
		bslab->slab_ref++;
	else
		bslab = create_bio_slab(size);
	mutex_unlock(&bio_slab_lock);

	if (bslab)
		return bslab->slab;
	return NULL;
}

static void bio_put_slab(struct bio_set *bs)
{
	struct bio_slab *bslab = NULL;
	unsigned int slab_size = bs_bio_slab_size(bs);

	mutex_lock(&bio_slab_lock);

	bslab = xa_load(&bio_slabs, slab_size);
	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
		goto out;

	WARN_ON_ONCE(bslab->slab != bs->bio_slab);

	WARN_ON(!bslab->slab_ref);

	if (--bslab->slab_ref)
		goto out;

	xa_erase(&bio_slabs, slab_size);

	kmem_cache_destroy(bslab->slab);
	kfree(bslab);

out:
	mutex_unlock(&bio_slab_lock);
}

void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
	BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);

	if (nr_vecs == BIO_MAX_VECS)
		mempool_free(bv, pool);
	else if (nr_vecs > BIO_INLINE_VECS)
		kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}

/*
 * Make the first allocation restricted and don't dump info on allocation
 * failures, since we'll fall back to the mempool in case of failure.
 */
static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
{
	return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
		__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}

struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
		gfp_t gfp_mask)
{
	struct biovec_slab *bvs = biovec_slab(*nr_vecs);

	if (WARN_ON_ONCE(!bvs))
		return NULL;

	/*
	 * Upgrade the nr_vecs request to take full advantage of the allocation.
	 * We also rely on this in the bvec_free path.
	 */
	*nr_vecs = bvs->nr_vecs;

	/*
	 * Try a slab allocation first for all smaller allocations.  If that
	 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
	 * The mempool is sized to handle up to BIO_MAX_VECS entries.
	 */
	if (*nr_vecs < BIO_MAX_VECS) {
		struct bio_vec *bvl;

		bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
			return bvl;
		*nr_vecs = BIO_MAX_VECS;
	}

	return mempool_alloc(pool, gfp_mask);
}

void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (bio->bi_blkg) {
		blkg_put(bio->bi_blkg);
		bio->bi_blkg = NULL;
	}
#endif
	if (bio_integrity(bio))
		bio_integrity_free(bio);

	bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);

static void bio_free(struct bio *bio)
{
	struct bio_set *bs = bio->bi_pool;
	void *p;

	bio_uninit(bio);

	if (bs) {
		bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);

		/*
		 * If we have front padding, adjust the bio pointer before freeing
		 */
		p = bio;
		p -= bs->front_pad;

		mempool_free(p, &bs->bio_pool);
	} else {
		/* Bio was allocated by bio_kmalloc() */
		kfree(bio);
	}
}

/*
 * Users of this function have their own bio allocation. Subsequently,
 * they must remember to pair any call to bio_init() with bio_uninit()
 * when IO has completed, or when the bio is released.
 */
void bio_init(struct bio *bio, struct bio_vec *table,
	      unsigned short max_vecs)
{
	memset(bio, 0, sizeof(*bio));
	atomic_set(&bio->__bi_remaining, 1);
	atomic_set(&bio->__bi_cnt, 1);

	bio->bi_io_vec = table;
	bio->bi_max_vecs = max_vecs;
}
EXPORT_SYMBOL(bio_init);
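
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): the pattern the comment above bio_init() describes.  A driver
 * that embeds a bio in its own allocation initialises it with bio_init()
 * and pairs that with bio_uninit() once the I/O is done.  "struct my_request"
 * and the helpers are hypothetical names used only for this example.
 */
#if 0
struct my_request {
	int			status;
	struct bio		bio;	/* keep the bio at the end */
	struct bio_vec		inline_vecs[4];
};

static void my_request_init(struct my_request *req)
{
	bio_init(&req->bio, req->inline_vecs, ARRAY_SIZE(req->inline_vecs));
}

static void my_request_done(struct my_request *req)
{
	req->status = blk_status_to_errno(req->bio.bi_status);
	bio_uninit(&req->bio);		/* pairs with bio_init() above */
}
#endif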

/**
 * bio_reset - reinitialize a bio
 * @bio:	bio to reset
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned by bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio)
{
	bio_uninit(bio);
	memset(bio, 0, BIO_RESET_BYTES);
	atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);

static struct bio *__bio_chain_endio(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	if (!parent->bi_status)
		parent->bi_status = bio->bi_status;
	bio_put(bio);
	return parent;
}

static void bio_chain_endio(struct bio *bio)
{
	bio_endio(__bio_chain_endio(bio));
}

/**
 * bio_chain - chain bio completions
 * @bio: the target bio
 * @parent: the parent bio of @bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
 * completed; the chained bio will also be freed when it completes.
 *
 * The caller must not set bi_private or bi_end_io in @bio.
 */
void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private = parent;
	bio->bi_end_io	= bio_chain_endio;
	bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);
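
/*
 * Illustrative sketch (not part of bio.c; kept out of the build by the
 * #if 0 guard): the usual way bio_chain() is combined with bio_split() when
 * a bio is too large to issue in one piece.  Each front piece is chained to
 * the remainder, so the original bi_end_io only runs once every piece has
 * completed.  "max_sectors" is a hypothetical per-device limit and the use
 * of fs_bio_set is an arbitrary choice for the example.
 */
#if 0
static void my_submit_in_pieces(struct bio *bio, unsigned int max_sectors)
{
	while (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO,
					      &fs_bio_set);

		bio_chain(split, bio);	/* bio completes only after split */
		submit_bio(split);
	}
	submit_bio(bio);
}
#endif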

static void bio_alloc_rescue(struct work_struct *work)
{
	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
	struct bio *bio;

	while (1) {
		spin_lock(&bs->rescue_lock);
		bio = bio_list_pop(&bs->rescue_list);
		spin_unlock(&bs->rescue_lock);

		if (!bio)
			break;

		submit_bio_noacct(bio);
	}
}

static void punt_bios_to_rescuer(struct bio_set *bs)
{
	struct bio_list punt, nopunt;
	struct bio *bio;

	if (WARN_ON_ONCE(!bs->rescue_workqueue))
		return;
	/*
	 * In order to guarantee forward progress we must punt only bios that
	 * were allocated from this bio_set; otherwise, if there was a bio on
	 * there for a stacking driver higher up in the stack, processing it
	 * could require allocating bios from this bio_set, and doing that from
	 * our own rescuer would be bad.
	 *
	 * Since bio lists are singly linked, pop them all instead of trying to
	 * remove from the middle of the list:
	 */

	bio_list_init(&punt);
	bio_list_init(&nopunt);

	while ((bio = bio_list_pop(&current->bio_list[0])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[0] = nopunt;

	bio_list_init(&nopunt);
	while ((bio = bio_list_pop(&current->bio_list[1])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[1] = nopunt;

	spin_lock(&bs->rescue_lock);
	bio_list_merge(&bs->rescue_list, &punt);
	spin_unlock(&bs->rescue_lock);

	queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @nr_iovecs:	number of iovecs to pre-allocate
 * @bs:		the bio_set to allocate from.
 *
 * Allocate a bio from the mempools in @bs.
 *
 * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
 * allocate a bio.  This is due to the mempool guarantees.  To make this work,
 * callers must never allocate more than 1 bio at a time from the general pool.
 * Callers that need to allocate more than 1 bio must always submit the
 * previously allocated bio for IO before attempting to allocate a new one.
 * Failure to do so can cause deadlocks under memory pressure.
 *
 * Note that when running under submit_bio_noacct() (i.e. any block driver),
 * bios are not submitted until after you return - see the code in
 * submit_bio_noacct() that converts recursion into iteration, to prevent
 * stack overflows.
 *
 * This would normally mean allocating multiple bios under submit_bio_noacct()
 * would be susceptible to deadlocks, but we have deadlock avoidance code that
 * resubmits any blocked bios from a rescuer thread.
 *
 * However, we do not guarantee forward progress for allocations from other
 * mempools. Doing multiple allocations from the same mempool under
 * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
 * for per bio allocations.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
			     struct bio_set *bs)
{
	gfp_t saved_gfp = gfp_mask;
	struct bio *bio;
	void *p;

	/* should not use nobvec bioset for nr_iovecs > 0 */
	if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
		return NULL;

	/*
	 * submit_bio_noacct() converts recursion to iteration; this means if
	 * we're running beneath it, any bios we allocate and submit will not be
	 * submitted (and thus freed) until after we return.
	 *
	 * This exposes us to a potential deadlock if we allocate multiple bios
	 * from the same bio_set() while running underneath submit_bio_noacct().
	 * If we were to allocate multiple bios (say a stacking block driver
	 * that was splitting bios), we would deadlock if we exhausted the
	 * mempool's reserve.
	 *
	 * We solve this, and guarantee forward progress, with a rescuer
	 * workqueue per bio_set. If we go to allocate and there are bios on
	 * current->bio_list, we first try the allocation without
	 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
	 * blocking to the rescuer workqueue before we retry with the original
	 * gfp_flags.
	 */
	if (current->bio_list &&
	    (!bio_list_empty(&current->bio_list[0]) ||
	     !bio_list_empty(&current->bio_list[1])) &&
	    bs->rescue_workqueue)
		gfp_mask &= ~__GFP_DIRECT_RECLAIM;

	p = mempool_alloc(&bs->bio_pool, gfp_mask);
	if (!p && gfp_mask != saved_gfp) {
		punt_bios_to_rescuer(bs);
		gfp_mask = saved_gfp;
		p = mempool_alloc(&bs->bio_pool, gfp_mask);
	}
	if (unlikely(!p))
		return NULL;

	bio = p + bs->front_pad;
	if (nr_iovecs > BIO_INLINE_VECS) {
		struct bio_vec *bvl = NULL;

		bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
		if (!bvl && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
		}
		if (unlikely(!bvl))
			goto err_free;

		bio_init(bio, bvl, nr_iovecs);
	} else if (nr_iovecs) {
		bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
	} else {
		bio_init(bio, NULL, 0);
	}

	bio->bi_pool = bs;
	return bio;

err_free:
	mempool_free(p, &bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
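
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): a minimal caller of bio_alloc_bioset() that follows the rule
 * documented above.  With a reclaim-capable mask such as GFP_NOIO the
 * allocation cannot fail, but only one bio may be held back per allocation
 * site, so each bio is submitted before another would be allocated.
 * "bdev", "page" and "sector" are assumed to come from a hypothetical
 * caller, and completion handling is omitted.
 */
#if 0
static void my_write_one_page(struct block_device *bdev, struct page *page,
			      sector_t sector)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &fs_bio_set);

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_WRITE;
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);	/* room is guaranteed */
	/* a real caller would set bio->bi_end_io before submitting */
	submit_bio(bio);
}
#endif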

/**
 * bio_kmalloc - kmalloc a bio for I/O
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @nr_iovecs:	number of iovecs to pre-allocate
 *
 * Use kmalloc to allocate and initialize a bio.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
{
	struct bio *bio;

	if (nr_iovecs > UIO_MAXIOV)
		return NULL;

	bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
	if (unlikely(!bio))
		return NULL;
	bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
	bio->bi_pool = NULL;
	return bio;
}
EXPORT_SYMBOL(bio_kmalloc);

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_segment(bv, bio, iter) {
		char *data = bvec_kmap_irq(&bv, &flags);
		memset(data, 0, bv.bv_len);
		flush_dcache_page(bv.bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_truncate - truncate the bio to the smaller size @new_size
 * @bio:	the bio to be truncated
 * @new_size:	new size for truncating the bio
 *
 * Description:
 *   Truncate the bio to the new size @new_size. If bio_op(bio) is
 *   REQ_OP_READ, zero the truncated part. This function should only
 *   be used for handling corner cases, such as bio eod.
 */
void bio_truncate(struct bio *bio, unsigned new_size)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int done = 0;
	bool truncated = false;

	if (new_size >= bio->bi_iter.bi_size)
		return;

	if (bio_op(bio) != REQ_OP_READ)
		goto exit;

	bio_for_each_segment(bv, bio, iter) {
		if (done + bv.bv_len > new_size) {
			unsigned offset;

			if (!truncated)
				offset = new_size - done;
			else
				offset = 0;
			zero_user(bv.bv_page, offset, bv.bv_len - offset);
			truncated = true;
		}
		done += bv.bv_len;
	}

 exit:
	/*
	 * Don't touch the bvec table here; keep it effectively immutable,
	 * since an fs bio user has to retrieve all pages via
	 * bio_for_each_segment_all in its .bi_end_io() callback.
	 *
	 * It is enough to truncate the bio by updating .bi_size, since
	 * drivers can build a correct bvec from the updated .bi_size.
	 */
	bio->bi_iter.bi_size = new_size;
}

/**
 * guard_bio_eod - truncate a BIO to fit the block device
 * @bio:	bio to truncate
 *
 * This allows us to do IO even on the odd last sectors of a device, even if the
 * block size is some multiple of the physical sector size.
 *
 * We'll just truncate the bio to the size of the device, and clear the end of
 * the buffer head manually.  Truly out-of-range accesses will turn into actual
 * I/O errors, this only handles the "we need to be able to do I/O at the final
 * sector" case.
 */
void guard_bio_eod(struct bio *bio)
{
	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);

	if (!maxsector)
		return;

	/*
	 * If the *whole* IO is past the end of the device,
	 * let it through, and the IO layer will turn it into
	 * an EIO.
	 */
	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
		return;

	maxsector -= bio->bi_iter.bi_sector;
	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
		return;

	bio_truncate(bio, maxsector << 9);
}

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	if (!bio_flagged(bio, BIO_REFFED))
		bio_free(bio);
	else {
		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));

		/*
		 * last put frees it
		 */
		if (atomic_dec_and_test(&bio->__bi_cnt))
			bio_free(bio);
	}
}
EXPORT_SYMBOL(bio_put);

/**
 * 	__bio_clone_fast - clone a bio that shares the original bio's biovec
 * 	@bio: destination bio
 * 	@bio_src: bio to clone
 *
 *	Clone a &bio. Caller will own the returned bio, but not
 *	the actual data it points to. Reference count of returned
 * 	bio will be one.
 *
 * 	Caller must ensure that @bio_src is not freed before @bio.
 */
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
	WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);

	/*
	 * most users will be overriding ->bi_bdev with a new target,
	 * so we don't set nor calculate new physical/hw segment counts here
	 */
	bio->bi_bdev = bio_src->bi_bdev;
	bio_set_flag(bio, BIO_CLONED);
	if (bio_flagged(bio_src, BIO_THROTTLED))
		bio_set_flag(bio, BIO_THROTTLED);
	if (bio_flagged(bio_src, BIO_REMAPPED))
		bio_set_flag(bio, BIO_REMAPPED);
	bio->bi_opf = bio_src->bi_opf;
	bio->bi_ioprio = bio_src->bi_ioprio;
	bio->bi_write_hint = bio_src->bi_write_hint;
	bio->bi_iter = bio_src->bi_iter;
	bio->bi_io_vec = bio_src->bi_io_vec;

	bio_clone_blkg_association(bio, bio_src);
	blkcg_bio_issue_init(bio);
}
EXPORT_SYMBOL(__bio_clone_fast);

/**
 *	bio_clone_fast - clone a bio that shares the original bio's biovec
 *	@bio: bio to clone
 *	@gfp_mask: allocation priority
 *	@bs: bio_set to allocate from
 *
 * 	Like __bio_clone_fast, only also allocates the returned bio
 */
struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
{
	struct bio *b;

	b = bio_alloc_bioset(gfp_mask, 0, bs);
	if (!b)
		return NULL;

	__bio_clone_fast(b, bio);

	if (bio_crypt_clone(b, bio, gfp_mask) < 0)
		goto err_put;

	if (bio_integrity(bio) &&
	    bio_integrity_clone(b, bio, gfp_mask) < 0)
		goto err_put;

	return b;

err_put:
	bio_put(b);
	return NULL;
}
EXPORT_SYMBOL(bio_clone_fast);

const char *bio_devname(struct bio *bio, char *buf)
{
	return bdevname(bio->bi_bdev, buf);
}
EXPORT_SYMBOL(bio_devname);

static inline bool page_is_mergeable(const struct bio_vec *bv,
		struct page *page, unsigned int len, unsigned int off,
		bool *same_page)
{
	size_t bv_end = bv->bv_offset + bv->bv_len;
	phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
	phys_addr_t page_addr = page_to_phys(page);

	if (vec_end_addr + 1 != page_addr + off)
		return false;
	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
		return false;

	*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
	if (*same_page)
		return true;
	return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
}

/*
 * Try to merge a page into a segment, while obeying the hardware segment
 * size limit.  This is not for normal read/write bios, but for passthrough
 * or Zone Append operations that we can't split.
 */
static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
				 struct page *page, unsigned len,
				 unsigned offset, bool *same_page)
{
	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
	unsigned long mask = queue_segment_boundary(q);
	phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
	phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;

	if ((addr1 | mask) != (addr2 | mask))
		return false;
	if (bv->bv_len + len > queue_max_segment_size(q))
		return false;
	return __bio_try_merge_page(bio, page, len, offset, same_page);
}

/**
 * bio_add_hw_page - attempt to add a page to a bio with hw constraints
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 * @max_sectors: maximum number of sectors that can be added
 * @same_page: return if the segment has been merged inside the same page
 *
 * Add a page to a bio while respecting the hardware max_sectors, max_segment
 * and gap limitations.
 */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
		struct page *page, unsigned int len, unsigned int offset,
		unsigned int max_sectors, bool *same_page)
{
	struct bio_vec *bvec;

	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
		return 0;

	if (bio->bi_vcnt > 0) {
		if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
			return len;

		/*
		 * If the queue doesn't support SG gaps and adding this segment
		 * would create a gap, disallow it.
		 */
		bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
		if (bvec_gap_to_prev(q, bvec, offset))
			return 0;
	}

	if (bio_full(bio, len))
		return 0;

	if (bio->bi_vcnt >= queue_max_segments(q))
		return 0;

	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;
	bio->bi_vcnt++;
	bio->bi_iter.bi_size += len;
	return len;
}

/**
 * bio_add_pc_page	- attempt to add page to passthrough bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 *
 * This should only be used by passthrough bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
		struct page *page, unsigned int len, unsigned int offset)
{
	bool same_page = false;
	return bio_add_hw_page(q, bio, page, len, offset,
			queue_max_hw_sectors(q), &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * bio_add_zone_append_page - attempt to add page to zone-append bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
 * for a zone-append request. This can fail for a number of reasons, such as the
 * bio being full or the target block device is not a zoned block device or
 * other limitations of the target block device. The target block device must
 * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
 * to an empty bio.
 *
 * Returns: number of bytes added to the bio, or 0 in case of a failure.
 */
int bio_add_zone_append_page(struct bio *bio, struct page *page,
			     unsigned int len, unsigned int offset)
{
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
	bool same_page = false;

	if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
		return 0;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return 0;

	return bio_add_hw_page(q, bio, page, len, offset,
			       queue_max_zone_append_sectors(q), &same_page);
}
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);

/**
 * __bio_try_merge_page - try appending data to an existing bvec.
 * @bio: destination bio
 * @page: start page to add
 * @len: length of the data to add
 * @off: offset of the data relative to @page
 * @same_page: return if the segment has been merged inside the same page
 *
 * Try to add the data at @page + @off to the last bvec of @bio.  This is a
 * useful optimisation for file systems with a block size smaller than the
 * page size.
 *
 * Warn if (@len, @off) crosses pages in case that @same_page is true.
 *
 * Return %true on success or %false on failure.
 */
bool __bio_try_merge_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off, bool *same_page)
{
	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return false;

	if (bio->bi_vcnt > 0) {
		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page_is_mergeable(bv, page, len, off, same_page)) {
			if (bio->bi_iter.bi_size > UINT_MAX - len) {
				*same_page = false;
				return false;
			}
			bv->bv_len += len;
			bio->bi_iter.bi_size += len;
			return true;
		}
	}
	return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);

/**
 * __bio_add_page - add page(s) to a bio in a new segment
 * @bio: destination bio
 * @page: start page to add
 * @len: length of the data to add, may cross pages
 * @off: offset of the data relative to @page, may cross pages
 *
 * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
 * that @bio has space for another bvec.
 */
void __bio_add_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
{
	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];

	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
	WARN_ON_ONCE(bio_full(bio, len));

	bv->bv_page = page;
	bv->bv_offset = off;
	bv->bv_len = len;

	bio->bi_iter.bi_size += len;
	bio->bi_vcnt++;

	if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
		bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);

/**
 *	bio_add_page	-	attempt to add page(s) to bio
 *	@bio: destination bio
 *	@page: start page to add
 *	@len: vec entry length, may cross pages
 *	@offset: vec entry offset relative to @page, may cross pages
 *
 *	Attempt to add page(s) to the bio_vec maplist. This will only fail
 *	if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
 */
int bio_add_page(struct bio *bio, struct page *page,
		 unsigned int len, unsigned int offset)
{
	bool same_page = false;

	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
		if (bio_full(bio, len))
			return 0;
		__bio_add_page(bio, page, len, offset);
	}
	return len;
}
EXPORT_SYMBOL(bio_add_page);
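
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): the common fill loop built on bio_add_page().  When the bio is
 * full, the current bio is submitted and a new one is started.  "bdev",
 * "pages", "npages" and "first_sector" are hypothetical parameters, and
 * completion handling is omitted.
 */
#if 0
static void my_write_pages(struct block_device *bdev, struct page **pages,
			   unsigned int npages, sector_t first_sector)
{
	struct bio *bio = NULL;
	unsigned int i;

	for (i = 0; i < npages; i++) {
		if (bio && bio_add_page(bio, pages[i], PAGE_SIZE, 0))
			continue;
		if (bio)
			submit_bio(bio);	/* current bio is full */
		bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &fs_bio_set);
		bio_set_dev(bio, bdev);
		bio->bi_opf = REQ_OP_WRITE;
		bio->bi_iter.bi_sector = first_sector +
				(i << (PAGE_SHIFT - 9));
		__bio_add_page(bio, pages[i], PAGE_SIZE, 0);
	}
	if (bio)
		submit_bio(bio);
}
#endif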

void bio_release_pages(struct bio *bio, bool mark_dirty)
{
	struct bvec_iter_all iter_all;
	struct bio_vec *bvec;

	if (bio_flagged(bio, BIO_NO_PAGE_REF))
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		if (mark_dirty && !PageCompound(bvec->bv_page))
			set_page_dirty_lock(bvec->bv_page);
		put_page(bvec->bv_page);
	}
}
EXPORT_SYMBOL_GPL(bio_release_pages);

static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
	WARN_ON_ONCE(bio->bi_max_vecs);

	bio->bi_vcnt = iter->nr_segs;
	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
	bio->bi_iter.bi_bvec_done = iter->iov_offset;
	bio->bi_iter.bi_size = iter->count;
	bio_set_flag(bio, BIO_NO_PAGE_REF);
	bio_set_flag(bio, BIO_CLONED);

	iov_iter_advance(iter, iter->count);
	return 0;
}

#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Pins pages from *iter and appends them to @bio's bvec array. The
 * pages will have to be released using put_page() when done.
 * For multi-segment *iter, this function only adds pages from the
 * next non-empty segment of the iov iterator.
 */
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	bool same_page = false;
	ssize_t size, left;
	unsigned len, i;
	size_t offset;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	for (left = size, i = 0; left > 0; left -= len, i++) {
		struct page *page = pages[i];

		len = min_t(size_t, PAGE_SIZE - offset, left);

		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
			if (same_page)
				put_page(page);
		} else {
			if (WARN_ON_ONCE(bio_full(bio, len)))
				return -EINVAL;
			__bio_add_page(bio, page, len, offset);
		}
		offset = 0;
	}

	iov_iter_advance(iter, size);
	return 0;
}

static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	ssize_t size, left;
	unsigned len, i;
	size_t offset;
	int ret = 0;

	if (WARN_ON_ONCE(!max_append_sectors))
		return 0;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	for (left = size, i = 0; left > 0; left -= len, i++) {
		struct page *page = pages[i];
		bool same_page = false;

		len = min_t(size_t, PAGE_SIZE - offset, left);
		if (bio_add_hw_page(q, bio, page, len, offset,
				max_append_sectors, &same_page) != len) {
			ret = -EINVAL;
			break;
		}
		if (same_page)
			put_page(page);
		offset = 0;
	}

	iov_iter_advance(iter, size - left);
	return ret;
}

/**
 * bio_iov_iter_get_pages - add user or kernel pages to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be added
 *
 * This takes either an iterator pointing to user memory, or one pointing to
 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
 * map them into the kernel. On IO completion, the caller should put those
 * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
 * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
 * to ensure the bvecs and pages stay referenced until the submitted I/O is
 * completed by a call to ->ki_complete() or returns with an error other than
 * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
 * on IO completion. If it isn't, then pages should be released.
 *
 * The function tries, but does not guarantee, to pin as many pages as
 * fit into the bio, or are requested in @iter, whatever is smaller. If
 * MM encounters an error pinning the requested pages, it stops. Error
 * is returned only if 0 pages could be pinned.
 *
 * It's intended for direct IO, so doesn't do PSI tracking, the caller is
 * responsible for setting BIO_WORKINGSET if necessary.
 */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	int ret = 0;

	if (iov_iter_is_bvec(iter)) {
		if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
			return -EINVAL;
		return bio_iov_bvec_set(bio, iter);
	}

	do {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
			ret = __bio_iov_append_get_pages(bio, iter);
		else
			ret = __bio_iov_iter_get_pages(bio, iter);
	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));

	/* don't account direct I/O as memory stall */
	bio_clear_flag(bio, BIO_WORKINGSET);
	return bio->bi_vcnt ? 0 : ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
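
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): feeding a caller-owned kernel bvec through
 * bio_iov_iter_get_pages(), i.e. the BVEC-iterator case described above.
 * No page references are taken in this path, so @bvec and its page must
 * stay alive until the I/O completes, and @bio must have been allocated
 * with zero vecs (see bio_iov_bvec_set()).  All names here are hypothetical.
 */
#if 0
static int my_attach_kernel_bvec(struct bio *bio, struct bio_vec *bvec,
				 unsigned int len)
{
	struct iov_iter iter;

	iov_iter_bvec(&iter, WRITE, bvec, 1, len);
	return bio_iov_iter_get_pages(bio, &iter);
}
#endif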

static void submit_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

/**
 * submit_bio_wait - submit a bio, and wait until it completes
 * @bio: The &struct bio which describes the I/O
 *
 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
 * bio_endio() on failure.
 *
 * WARNING: Unlike how submit_bio() is usually used, this function does not
 * consume the bio reference. The caller must drop the reference on their own.
 */
int submit_bio_wait(struct bio *bio)
{
	DECLARE_COMPLETION_ONSTACK_MAP(done,
			bio->bi_bdev->bd_disk->lockdep_map);
	unsigned long hang_check;

	bio->bi_private = &done;
	bio->bi_end_io = submit_bio_wait_endio;
	bio->bi_opf |= REQ_SYNC;
	submit_bio(bio);

	/* Prevent hang_check timer from firing at us during very long I/O */
	hang_check = sysctl_hung_task_timeout_secs;
	if (hang_check)
		while (!wait_for_completion_io_timeout(&done,
					hang_check * (HZ/2)))
			;
	else
		wait_for_completion_io(&done);

	return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
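
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): a synchronous one-page read built on submit_bio_wait().  As
 * the WARNING above says, the bio reference is not consumed, so bio_put()
 * is still required.  "bdev", "page" and "sector" are hypothetical
 * caller-provided values.
 */
#if 0
static int my_read_page_sync(struct block_device *bdev, struct page *page,
			     sector_t sector)
{
	struct bio *bio = bio_alloc_bioset(GFP_KERNEL, 1, &fs_bio_set);
	int ret;

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(bio);	/* blocks until bio_endio() */
	bio_put(bio);			/* reference is not consumed */
	return ret;
}
#endif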

/**
 * bio_advance - increment/complete a bio by some number of bytes
 * @bio:	bio to advance
 * @bytes:	number of bytes to complete
 *
 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
 * be updated on the last bvec as well.
 *
 * @bio will then represent the remaining, uncompleted portion of the io.
 */
void bio_advance(struct bio *bio, unsigned bytes)
{
	if (bio_integrity(bio))
		bio_integrity_advance(bio, bytes);

	bio_crypt_advance(bio, bytes);
	bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);

void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
			struct bio *src, struct bvec_iter *src_iter)
{
	struct bio_vec src_bv, dst_bv;
	void *src_p, *dst_p;
	unsigned bytes;

	while (src_iter->bi_size && dst_iter->bi_size) {
		src_bv = bio_iter_iovec(src, *src_iter);
		dst_bv = bio_iter_iovec(dst, *dst_iter);

		bytes = min(src_bv.bv_len, dst_bv.bv_len);

		src_p = kmap_atomic(src_bv.bv_page);
		dst_p = kmap_atomic(dst_bv.bv_page);

		memcpy(dst_p + dst_bv.bv_offset,
		       src_p + src_bv.bv_offset,
		       bytes);

		kunmap_atomic(dst_p);
		kunmap_atomic(src_p);

		flush_dcache_page(dst_bv.bv_page);

		bio_advance_iter_single(src, src_iter, bytes);
		bio_advance_iter_single(dst, dst_iter, bytes);
	}
}
EXPORT_SYMBOL(bio_copy_data_iter);

/**
 * bio_copy_data - copy contents of data buffers from one bio to another
 * @src: source bio
 * @dst: destination bio
 *
 * Stops when it reaches the end of either @src or @dst - that is, copies
 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
 */
void bio_copy_data(struct bio *dst, struct bio *src)
{
	struct bvec_iter src_iter = src->bi_iter;
	struct bvec_iter dst_iter = dst->bi_iter;

	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
EXPORT_SYMBOL(bio_copy_data);

void bio_free_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all)
		__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.   If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		if (!PageCompound(bvec->bv_page))
			set_page_dirty_lock(bvec->bv_page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one put_page() against each page and will run one
 * bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
	struct bio *bio, *next;

	spin_lock_irq(&bio_dirty_lock);
	next = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irq(&bio_dirty_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_private;

		bio_release_pages(bio, true);
		bio_put(bio);
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned long flags;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
			goto defer;
	}

	bio_release_pages(bio, false);
	bio_put(bio);
	return;
defer:
	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio->bi_private = bio_dirty_list;
	bio_dirty_list = bio;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);
	schedule_work(&bio_dirty_work);
}

static inline bool bio_remaining_done(struct bio *bio)
{
	/*
	 * If we're not chaining, then ->__bi_remaining is always 1 and
	 * we always end io on the first invocation.
	 */
	if (!bio_flagged(bio, BIO_CHAIN))
		return true;

	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);

	if (atomic_dec_and_test(&bio->__bi_remaining)) {
		bio_clear_flag(bio, BIO_CHAIN);
		return true;
	}

	return false;
}

/**
 * bio_endio - end I/O on a bio
 * @bio:	bio
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
 *   way to end I/O on a bio. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io function.
 *
 *   bio_endio() can be called several times on a bio that has been chained
 *   using bio_chain().  The ->bi_end_io() function will only be called the
 *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
 *   generated if BIO_TRACE_COMPLETION is set.
 **/
void bio_endio(struct bio *bio)
{
again:
	if (!bio_remaining_done(bio))
		return;
	if (!bio_integrity_endio(bio))
		return;

	if (bio->bi_bdev)
		rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);

	/*
	 * Need to have a real endio function for chained bios, otherwise
	 * various corner cases will break (like stacking block devices that
	 * save/restore bi_end_io) - however, we want to avoid unbounded
	 * recursion and blowing the stack. Tail call optimization would
	 * handle this, but compiling with frame pointers also disables
	 * gcc's sibling call optimization.
	 */
	if (bio->bi_end_io == bio_chain_endio) {
		bio = __bio_chain_endio(bio);
		goto again;
	}

	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
		trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
	}

	blk_throtl_bio_endio(bio);
	/* release cgroup info */
	bio_uninit(bio);
	if (bio->bi_end_io)
		bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);

/**
 * bio_split - split a bio
 * @bio:	bio to split
 * @sectors:	number of sectors to split from the front of @bio
 * @gfp:	gfp mask
 * @bs:		bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors.
 *
 * Unless this is a discard request the newly allocated bio will point
 * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
 * neither @bio nor @bs are freed before the split bio.
 */
struct bio *bio_split(struct bio *bio, int sectors,
		      gfp_t gfp, struct bio_set *bs)
{
	struct bio *split;

	BUG_ON(sectors <= 0);
	BUG_ON(sectors >= bio_sectors(bio));

	/* Zone append commands cannot be split */
	if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
		return NULL;

	split = bio_clone_fast(bio, gfp, bs);
	if (!split)
		return NULL;

	split->bi_iter.bi_size = sectors << 9;

	if (bio_integrity(split))
		bio_integrity_trim(split);

	bio_advance(bio, split->bi_iter.bi_size);

	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
		bio_set_flag(split, BIO_TRACE_COMPLETION);

	return split;
}
EXPORT_SYMBOL(bio_split);
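
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): how a stacking driver typically uses bio_split() in its
 * ->submit_bio handler to keep a bio inside a per-device boundary.  The
 * split bio shares the parent's bvecs, so the parent and the bio_set must
 * outlive it, which the chaining below guarantees.  "struct my_dev",
 * "boundary_sectors()" and the "split_set" member are hypothetical.
 */
#if 0
static void my_submit_bio(struct my_dev *dev, struct bio *bio)
{
	unsigned int max = boundary_sectors(dev, bio->bi_iter.bi_sector);

	if (bio_sectors(bio) > max) {
		struct bio *split = bio_split(bio, max, GFP_NOIO,
					      &dev->split_set);

		bio_chain(split, bio);
		submit_bio_noacct(bio);		/* requeue the remainder */
		bio = split;
	}
	/* ... remap and issue "bio" to the lower device ... */
}
#endif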

/**
 * bio_trim - trim a bio
 * @bio:	bio to trim
 * @offset:	number of sectors to trim from the front of @bio
 * @size:	size we want to trim @bio to, in sectors
 */
void bio_trim(struct bio *bio, int offset, int size)
{
	/* 'bio' is a cloned bio which we need to trim to match
	 * the given offset and size.
	 */

	size <<= 9;
	if (offset == 0 && size == bio->bi_iter.bi_size)
		return;

	bio_advance(bio, offset << 9);
	bio->bi_iter.bi_size = size;

	if (bio_integrity(bio))
		bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;

	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}

/*
 * bioset_exit - exit a bioset initialized with bioset_init()
 *
 * May be called on a zeroed but uninitialized bioset (i.e. allocated with
 * kzalloc()).
 */
void bioset_exit(struct bio_set *bs)
{
	if (bs->rescue_workqueue)
		destroy_workqueue(bs->rescue_workqueue);
	bs->rescue_workqueue = NULL;

	mempool_exit(&bs->bio_pool);
	mempool_exit(&bs->bvec_pool);

	bioset_integrity_free(bs);
	if (bs->bio_slab)
		bio_put_slab(bs);
	bs->bio_slab = NULL;
}
EXPORT_SYMBOL(bioset_exit);

/**
 * bioset_init - Initialize a bio_set
 * @bs:		pool to initialize
 * @pool_size:	Number of bio and bio_vecs to cache in the mempool
 * @front_pad:	Number of bytes to allocate in front of the returned bio
 * @flags:	Flags to modify behavior, currently %BIOSET_NEED_BVECS
 *              and %BIOSET_NEED_RESCUER
 *
 * Description:
 *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
 *    to ask for a number of bytes to be allocated in front of the bio.
 *    Front pad allocation is useful for embedding the bio inside
 *    another structure, to avoid allocating extra data to go with the bio.
 *    Note that the bio must be embedded at the END of that structure always,
 *    or things will break badly.
 *    If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
 *    for allocating iovecs.  This pool is not needed e.g. for bio_clone_fast().
 *    If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
 *    dispatch queued requests when the mempool runs out of space.
 *
 */
int bioset_init(struct bio_set *bs,
		unsigned int pool_size,
		unsigned int front_pad,
		int flags)
{
	bs->front_pad = front_pad;
	if (flags & BIOSET_NEED_BVECS)
		bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
	else
		bs->back_pad = 0;

	spin_lock_init(&bs->rescue_lock);
	bio_list_init(&bs->rescue_list);
	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);

	bs->bio_slab = bio_find_or_create_slab(bs);
	if (!bs->bio_slab)
		return -ENOMEM;

	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
		goto bad;

	if ((flags & BIOSET_NEED_BVECS) &&
	    biovec_init_pool(&bs->bvec_pool, pool_size))
		goto bad;

	if (!(flags & BIOSET_NEED_RESCUER))
		return 0;

	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
	if (!bs->rescue_workqueue)
		goto bad;

	return 0;
bad:
	bioset_exit(bs);
	return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);
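
/*
 * Illustrative sketch (not part of bio.c; the #if 0 guard keeps it out of
 * the build): a driver-private bio_set using front_pad so that every bio
 * allocated from it is preceded by a per-I/O context, recovered with
 * container_of() in the completion handler.  "struct my_io", "my_bio_set"
 * and "my_end_io" are hypothetical names.
 */
#if 0
struct my_io {
	void		*private;
	struct bio	bio;	/* must be the last field */
};

static struct bio_set my_bio_set;

static int my_driver_init(void)
{
	return bioset_init(&my_bio_set, BIO_POOL_SIZE,
			   offsetof(struct my_io, bio), BIOSET_NEED_BVECS);
}

static void my_end_io(struct bio *bio)
{
	struct my_io *io = container_of(bio, struct my_io, bio);

	/* ... complete "io" here ... */
	bio_put(bio);
}
#endif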

/*
 * Initialize and setup a new bio_set, based on the settings from
 * another bio_set.
 */
int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
{
	int flags;

	flags = 0;
	if (src->bvec_pool.min_nr)
		flags |= BIOSET_NEED_BVECS;
	if (src->rescue_workqueue)
		flags |= BIOSET_NEED_RESCUER;

	return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
}
EXPORT_SYMBOL(bioset_init_from_src);

static int __init init_bio(void)
{
	int i;

	bio_integrity_init();

	for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
		struct biovec_slab *bvs = bvec_slabs + i;

		bvs->slab = kmem_cache_create(bvs->name,
				bvs->nr_vecs * sizeof(struct bio_vec), 0,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
	}

	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
		panic("bio: can't allocate bios\n");

	if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
		panic("bio: can't create integrity pool\n");

	return 0;
}
subsys_initcall(init_bio);