// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

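/*
 * Write the bdev inode back to disk and wait for it, retrying while the
 * inode remains dirty.  Failures are only reported with a ratelimited
 * warning, since this helper has no way to return an error to its caller.
 */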
static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret) {
			char name[BDEVNAME_SIZE];
			pr_warn_ratelimited("VFS: Dirty inode writeback failed "
					    "for block device %s (err=%d).\n",
					    bdevname(bdev, name), ret);
		}
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious.
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for given bdev range. This function bails
 * with error if bdev has other exclusive owner (such as filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under live filesystem.
	 */
	if (!(mode & FMODE_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
	if (!(mode & FMODE_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating instead.
	 * The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

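/*
 * Set the initial soft block size: the largest power of two, up to
 * PAGE_SIZE, that is a multiple of the logical block size and still
 * divides the current device size evenly.
 */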
static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(bdev->bd_inode);

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

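/*
 * Request flags for direct-I/O write bios.  REQ_FUA is set for O_DSYNC
 * writes so no separate flush, and thus no completion work item, is needed
 * after the data lands.
 */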
static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

	/* avoid the need for an I/O completion work item */
	if (iocb->ki_flags & IOCB_DSYNC)
		op |= REQ_FUA;
	return op;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	blk_wake_io_task(waiter);
}

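/*
 * Fast path for small synchronous direct I/O: build a single on-stack bio
 * (with an inline bio_vec array when the request fits in
 * DIO_INLINE_BIO_VECS segments), submit it and wait for completion inline,
 * avoiding a blkdev_dio allocation.
 */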
static ssize_t
__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
		unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
	loff_t pos = iocb->ki_pos;
	bool should_dirty = false;
	struct bio bio;
	ssize_t ret;
	blk_qc_t qc;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	if (nr_pages <= DIO_INLINE_BIO_VECS)
		vecs = inline_vecs;
	else {
		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				     GFP_KERNEL);
		if (!vecs)
			return -ENOMEM;
	}

	bio_init(&bio, vecs, nr_pages);
	bio_set_dev(&bio, bdev);
	bio.bi_iter.bi_sector = pos >> 9;
	bio.bi_write_hint = iocb->ki_hint;
	bio.bi_private = current;
	bio.bi_end_io = blkdev_bio_end_io_simple;
	bio.bi_ioprio = iocb->ki_ioprio;

	ret = bio_iov_iter_get_pages(&bio, iter);
	if (unlikely(ret))
		goto out;
	ret = bio.bi_iter.bi_size;

	if (iov_iter_rw(iter) == READ) {
		bio.bi_opf = REQ_OP_READ;
		if (iter_is_iovec(iter))
			should_dirty = true;
	} else {
		bio.bi_opf = dio_bio_write_op(iocb);
		task_io_account_write(ret);
	}
	if (iocb->ki_flags & IOCB_NOWAIT)
		bio.bi_opf |= REQ_NOWAIT;
	if (iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(&bio, iocb);

	qc = submit_bio(&bio);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio.bi_private))
			break;
		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(bdev), qc, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	bio_release_pages(&bio, should_dirty);
	if (unlikely(bio.bi_status))
		ret = blk_status_to_errno(bio.bi_status);

out:
	if (vecs != inline_vecs)
		kfree(vecs);

	bio_uninit(&bio);

	return ret;
}

struct blkdev_dio {
	union {
		struct kiocb		*iocb;
		struct task_struct	*waiter;
	};
	size_t			size;
	atomic_t		ref;
	bool			multi_bio : 1;
	bool			should_dirty : 1;
	bool			is_sync : 1;
	struct bio		bio;
};

static struct bio_set blkdev_dio_pool;

static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}

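/*
 * Completion handler for the multi-bio direct I/O path.  The first error
 * seen is recorded in the parent blkdev_dio; the last bio to complete
 * either finishes the iocb (async) or wakes the waiting submitter (sync).
 */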
static void blkdev_bio_end_io(struct bio *bio)
{
	struct blkdev_dio *dio = bio->bi_private;
	bool should_dirty = dio->should_dirty;

	if (bio->bi_status && !dio->bio.bi_status)
		dio->bio.bi_status = bio->bi_status;

	if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
		if (!dio->is_sync) {
			struct kiocb *iocb = dio->iocb;
			ssize_t ret;

			if (likely(!dio->bio.bi_status)) {
				ret = dio->size;
				iocb->ki_pos += ret;
			} else {
				ret = blk_status_to_errno(dio->bio.bi_status);
			}

			dio->iocb->ki_complete(iocb, ret, 0);
			if (dio->multi_bio)
				bio_put(&dio->bio);
		} else {
			struct task_struct *waiter = dio->waiter;

			WRITE_ONCE(dio->waiter, NULL);
			blk_wake_io_task(waiter);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

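/*
 * General direct I/O path: split the request into as many bios as needed,
 * tracked by a blkdev_dio allocated from blkdev_dio_pool.  Synchronous
 * requests are waited for here; async requests complete the iocb from
 * blkdev_bio_end_io().
 */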
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
		unsigned int nr_pages)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);
	struct block_device *bdev = I_BDEV(inode);
	struct blk_plug plug;
	struct blkdev_dio *dio;
	struct bio *bio;
	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
	loff_t pos = iocb->ki_pos;
	blk_qc_t qc = BLK_QC_T_NONE;
	int ret = 0;

	if ((pos | iov_iter_alignment(iter)) &
	    (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);

	dio = container_of(bio, struct blkdev_dio, bio);
	dio->is_sync = is_sync = is_sync_kiocb(iocb);
	if (dio->is_sync) {
		dio->waiter = current;
		bio_get(bio);
	} else {
		dio->iocb = iocb;
	}

	dio->size = 0;
	dio->multi_bio = false;
	dio->should_dirty = is_read && iter_is_iovec(iter);

	/*
	 * Don't plug for HIPRI/polled IO, as those should go straight
	 * to issue
	 */
	if (!is_poll)
		blk_start_plug(&plug);

	for (;;) {
		bio_set_dev(bio, bdev);
		bio->bi_iter.bi_sector = pos >> 9;
		bio->bi_write_hint = iocb->ki_hint;
		bio->bi_private = dio;
		bio->bi_end_io = blkdev_bio_end_io;
		bio->bi_ioprio = iocb->ki_ioprio;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (unlikely(ret)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
			break;
		}

		if (is_read) {
			bio->bi_opf = REQ_OP_READ;
			if (dio->should_dirty)
				bio_set_pages_dirty(bio);
		} else {
			bio->bi_opf = dio_bio_write_op(iocb);
			task_io_account_write(bio->bi_iter.bi_size);
		}
		if (iocb->ki_flags & IOCB_NOWAIT)
			bio->bi_opf |= REQ_NOWAIT;

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
		if (!nr_pages) {
			bool polled = false;

			if (iocb->ki_flags & IOCB_HIPRI) {
				bio_set_polled(bio, iocb);
				polled = true;
			}

			qc = submit_bio(bio);

			if (polled)
				WRITE_ONCE(iocb->ki_cookie, qc);
			break;
		}

		if (!dio->multi_bio) {
			/*
			 * AIO needs an extra reference to ensure the dio
			 * structure which is embedded into the first bio
			 * stays around.
			 */
			if (!is_sync)
				bio_get(bio);
			dio->multi_bio = true;
			atomic_set(&dio->ref, 2);
		} else {
			atomic_inc(&dio->ref);
		}

		submit_bio(bio);
		bio = bio_alloc(GFP_KERNEL, nr_pages);
	}

	if (!is_poll)
		blk_finish_plug(&plug);

	if (!is_sync)
		return -EIOCBQUEUED;

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(dio->waiter))
			break;

		if (!(iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(bdev), qc, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);

	if (!ret)
		ret = blk_status_to_errno(dio->bio.bi_status);
	if (likely(!ret))
		ret = dio->size;

	bio_put(&dio->bio);
	return ret;
}

static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	unsigned int nr_pages;

	if (!iov_iter_count(iter))
		return 0;

	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

static __init int blkdev_init(void)
{
	return bioset_init(&blkdev_dio_pool, 4,
			offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
}
module_init(blkdev_init);

520 521 522 523 524 525 526 527 528
int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

N
Nick Piggin 已提交
529 530 531 532 533 534
/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
535
	return __sync_blockdev(bdev, 1);
N
Nick Piggin 已提交
536 537 538 539 540 541 542 543 544 545 546 547
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.   Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
548
		int res = sync_filesystem(sb);
N
Nick Piggin 已提交
549 550 551 552 553
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
554
EXPORT_SYMBOL(fsync_bdev);
N
Nick Piggin 已提交
555 556 557 558 559 560 561 562 563 564 565 566 567

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 * actually.
 */
568
int freeze_bdev(struct block_device *bdev)
N
Nick Piggin 已提交
569 570 571 572 573
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
574 575
	if (++bdev->bd_fsfreeze_count > 1)
		goto done;
576 577 578

	sb = get_active_super(bdev);
	if (!sb)
579
		goto sync;
580 581 582 583
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb);
	else
		error = freeze_super(sb);
584 585
	deactivate_super(sb);

586 587
	if (error) {
		bdev->bd_fsfreeze_count--;
588
		goto done;
N
Nick Piggin 已提交
589
	}
590 591 592
	bdev->bd_fsfreeze_sb = sb;

sync:
N
Nick Piggin 已提交
593
	sync_blockdev(bdev);
594
done:
N
Nick Piggin 已提交
595
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
596
	return error;
N
Nick Piggin 已提交
597 598 599 600 601 602 603 604 605
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
606
int thaw_bdev(struct block_device *bdev)
N
Nick Piggin 已提交
607
{
608
	struct super_block *sb;
609
	int error = -EINVAL;
N
Nick Piggin 已提交
610 611

	mutex_lock(&bdev->bd_fsfreeze_mutex);
612
	if (!bdev->bd_fsfreeze_count)
613
		goto out;
614 615 616

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
617
		goto out;
618

619
	sb = bdev->bd_fsfreeze_sb;
620
	if (!sb)
621
		goto out;
622

623 624 625 626
	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb);
	else
		error = thaw_super(sb);
627
	if (error)
628
		bdev->bd_fsfreeze_count++;
629 630
	else
		bdev->bd_fsfreeze_sb = NULL;
631
out:
N
Nick Piggin 已提交
632
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
633
	return error;
N
Nick Piggin 已提交
634 635 636
}
EXPORT_SYMBOL(thaw_bdev);

L
Linus Torvalds 已提交
637 638 639 640 641 642 643 644 645 646
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

647
static void blkdev_readahead(struct readahead_control *rac)
648
{
649
	mpage_readahead(rac, blkdev_get_block);
650 651
}

N
Nick Piggin 已提交
652 653 654
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
L
Linus Torvalds 已提交
655
{
656 657
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
L
Linus Torvalds 已提交
658 659
}

N
Nick Piggin 已提交
660 661 662
static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
L
Linus Torvalds 已提交
663
{
N
Nick Piggin 已提交
664 665 666 667
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
668
	put_page(page);
N
Nick Piggin 已提交
669 670

	return ret;
L
Linus Torvalds 已提交
671 672 673 674
}

/*
 * private llseek:
A
Al Viro 已提交
675
 * for a block special file file_inode(file)->i_size is zero
L
Linus Torvalds 已提交
676 677
 * so we compute the size by hand (just as in block_read/write above)
 */
678
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
L
Linus Torvalds 已提交
679
{
680
	struct inode *bd_inode = bdev_file_inode(file);
L
Linus Torvalds 已提交
681 682
	loff_t retval;

A
Al Viro 已提交
683
	inode_lock(bd_inode);
684
	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
A
Al Viro 已提交
685
	inode_unlock(bd_inode);
L
Linus Torvalds 已提交
686 687 688
	return retval;
}
	
689
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
L
Linus Torvalds 已提交
690
{
691
	struct inode *bd_inode = bdev_file_inode(filp);
692
	struct block_device *bdev = I_BDEV(bd_inode);
693
	int error;
694
	
695
	error = file_write_and_wait_range(filp, start, end);
696 697
	if (error)
		return error;
698

699 700 701 702 703
	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
704
	error = blkdev_issue_flush(bdev);
705 706
	if (error == -EOPNOTSUPP)
		error = 0;
707

708
	return error;
L
Linus Torvalds 已提交
709
}
710
EXPORT_SYMBOL(blkdev_fsync);
L
Linus Torvalds 已提交
711

712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731
/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *
 * On entry, the page should be locked.  It will be unlocked when the page
 * has been read.  If the block driver implements rw_page synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to read this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
			struct page *page)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
732 733
	int result = -EOPNOTSUPP;

734
	if (!ops->rw_page || bdev_get_integrity(bdev))
735 736
		return result;

737
	result = blk_queue_enter(bdev->bd_disk->queue, 0);
738 739
	if (result)
		return result;
740 741
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			      REQ_OP_READ);
742
	blk_queue_exit(bdev->bd_disk->queue);
743
	return result;
744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769
}

/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (eg the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
			struct page *page, struct writeback_control *wbc)
{
	int result;
	const struct block_device_operations *ops = bdev->bd_disk->fops;
770

771
	if (!ops->rw_page || bdev_get_integrity(bdev))
772
		return -EOPNOTSUPP;
773
	result = blk_queue_enter(bdev->bd_disk->queue, 0);
774 775 776
	if (result)
		return result;

777
	set_page_writeback(page);
778 779
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			      REQ_OP_WRITE);
780
	if (result) {
781
		end_page_writeback(page);
782 783
	} else {
		clean_page_buffers(page);
784
		unlock_page(page);
785
	}
786
	blk_queue_exit(bdev->bd_disk->queue);
787 788 789
	return result;
}

L
Linus Torvalds 已提交
790 791 792 793 794
/*
 * pseudo-fs
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
795
static struct kmem_cache * bdev_cachep __read_mostly;
L
Linus Torvalds 已提交
796 797 798

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
799
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
800

L
Linus Torvalds 已提交
801 802
	if (!ei)
		return NULL;
803 804
	memset(&ei->bdev, 0, sizeof(ei->bdev));
	ei->bdev.bd_bdi = &noop_backing_dev_info;
L
Linus Torvalds 已提交
805 806 807
	return &ei->vfs_inode;
}

A
Al Viro 已提交
808
static void bdev_free_inode(struct inode *inode)
L
Linus Torvalds 已提交
809
{
810 811 812
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
813
	kfree(bdev->bd_meta_info);
814

815 816
	if (!bdev_is_partition(bdev))
		kfree(bdev->bd_disk);
A
Al Viro 已提交
817
	kmem_cache_free(bdev_cachep, BDEV_I(inode));
N
Nick Piggin 已提交
818 819
}

820
static void init_once(void *data)
L
Linus Torvalds 已提交
821
{
822
	struct bdev_inode *ei = data;
L
Linus Torvalds 已提交
823

C
Christoph Lameter 已提交
824
	inode_init_once(&ei->vfs_inode);
L
Linus Torvalds 已提交
825 826
}

827
static void bdev_evict_inode(struct inode *inode)
L
Linus Torvalds 已提交
828 829
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
830
	truncate_inode_pages_final(&inode->i_data);
831
	invalidate_inode_buffers(inode); /* is it needed here? */
832
	clear_inode(inode);
833 834
	/* Detach inode from wb early as bdi_put() may free bdi->wb */
	inode_detach_wb(inode);
835
	if (bdev->bd_bdi != &noop_backing_dev_info) {
836
		bdi_put(bdev->bd_bdi);
837 838
		bdev->bd_bdi = &noop_backing_dev_info;
	}
L
Linus Torvalds 已提交
839 840
}

841
static const struct super_operations bdev_sops = {
L
Linus Torvalds 已提交
842 843
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
A
Al Viro 已提交
844
	.free_inode = bdev_free_inode,
L
Linus Torvalds 已提交
845
	.drop_inode = generic_delete_inode,
846
	.evict_inode = bdev_evict_inode,
L
Linus Torvalds 已提交
847 848
};

849
static int bd_init_fs_context(struct fs_context *fc)
L
Linus Torvalds 已提交
850
{
851 852 853 854 855 856
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
L
Linus Torvalds 已提交
857 858 859 860
}

static struct file_system_type bd_type = {
	.name		= "bdev",
861
	.init_fs_context = bd_init_fs_context,
L
Linus Torvalds 已提交
862 863 864
	.kill_sb	= kill_anon_super,
};

T
Tejun Heo 已提交
865 866
struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);
L
Linus Torvalds 已提交
867 868 869 870

void __init bdev_cache_init(void)
{
	int err;
871
	static struct vfsmount *bd_mnt;
872

L
Linus Torvalds 已提交
873
	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
874
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
875
				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
876
			init_once);
L
Linus Torvalds 已提交
877 878 879 880 881 882
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
883
	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
L
Linus Torvalds 已提交
884 885
}

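/*
 * Allocate a new block_device together with its backing inode on the bdev
 * pseudo filesystem.  The device number is assigned later by bdev_add(),
 * which also hashes the inode.
 */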
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
L
Linus Torvalds 已提交
887 888 889 890
{
	struct block_device *bdev;
	struct inode *inode;

891
	inode = new_inode(blockdev_superblock);
L
Linus Torvalds 已提交
892 893
	if (!inode)
		return NULL;
894 895 896 897 898 899
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
900
	mutex_init(&bdev->bd_fsfreeze_mutex);
901 902 903 904
	spin_lock_init(&bdev->bd_size_lock);
	bdev->bd_disk = disk;
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
905
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
906 907
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
908 909 910 911 912
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
913 914
	return bdev;
}
L
Linus Torvalds 已提交
915

916 917 918 919 920 921 922
void bdev_add(struct block_device *bdev, dev_t dev)
{
	bdev->bd_dev = dev;
	bdev->bd_inode->i_rdev = dev;
	bdev->bd_inode->i_ino = dev;
	insert_inode_hash(bdev->bd_inode);
}
L
Linus Torvalds 已提交
923 924 925

long nr_blockdev_pages(void)
{
926
	struct inode *inode;
L
Linus Torvalds 已提交
927
	long ret = 0;
928 929 930 931 932 933

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

L
Linus Torvalds 已提交
934 935 936
	return ret;
}

T
Tejun Heo 已提交
937 938 939 940 941 942
/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
L
Lucas De Marchi 已提交
943
 * Test whether @bdev can be claimed by @holder.
T
Tejun Heo 已提交
944 945 946 947 948 949 950 951 952
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
L
Linus Torvalds 已提交
953 954
{
	if (bdev->bd_holder == holder)
T
Tejun Heo 已提交
955
		return true;	 /* already a holder */
L
Linus Torvalds 已提交
956
	else if (bdev->bd_holder != NULL)
T
Tejun Heo 已提交
957
		return false; 	 /* held by someone else */
958
	else if (whole == bdev)
T
Tejun Heo 已提交
959
		return true;  	 /* is a whole device which isn't held */
L
Linus Torvalds 已提交
960

961
	else if (whole->bd_holder == bd_may_claim)
T
Tejun Heo 已提交
962 963 964
		return true; 	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
L
Linus Torvalds 已提交
965
	else
T
Tejun Heo 已提交
966 967 968
		return true;	 /* is a partition of an un-held device */
}

969
/**
970
 * bd_prepare_to_claim - claim a block device
971 972 973
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * Claim @bdev.  This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress.  On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
981
int bd_prepare_to_claim(struct block_device *bdev, void *holder)
982
{
983 984 985 986
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
987
retry:
988
	spin_lock(&bdev_lock);
989
	/* if someone else claimed, fail */
990 991
	if (!bd_may_claim(bdev, whole, holder)) {
		spin_unlock(&bdev_lock);
992
		return -EBUSY;
993
	}
994

995 996
	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
1008 1009
	whole->bd_claiming = holder;
	spin_unlock(&bdev_lock);
1010 1011
	return 0;
}
1012
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
1013

1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
1031
static void bd_finish_claiming(struct block_device *bdev, void *holder)
1032
{
1033 1034
	struct block_device *whole = bdev_whole(bdev);

1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
	spin_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, whole, holder));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	bdev->bd_holder = holder;
	bd_clear_claiming(whole, holder);
	spin_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can be
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
1058
void bd_abort_claiming(struct block_device *bdev, void *holder)
1059 1060
{
	spin_lock(&bdev_lock);
1061
	bd_clear_claiming(bdev_whole(bdev), holder);
1062 1063 1064
	spin_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

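/*
 * Flush and throw away all pagecache and buffers of a block device that
 * just lost its last opener, then write back the bdev inode itself.
 */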
static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

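/*
 * Open the whole device: call the driver's ->open method, set up the
 * initial block size and bdi on the first open, and rescan the partition
 * table if the disk asked for it.
 */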
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
1075
{
1076
	struct gendisk *disk = bdev->bd_disk;
1077
	int ret = 0;
1078

C
Christoph Hellwig 已提交
1079 1080 1081 1082 1083 1084
	if (disk->fops->open) {
		ret = disk->fops->open(bdev, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			     test_bit(GD_NEED_PART_SCAN, &disk->state))
1085
				bdev_disk_changed(disk, true);
C
Christoph Hellwig 已提交
1086 1087
			return ret;
		}
1088
	}
1089

L
Linus Torvalds 已提交
1090
	if (!bdev->bd_openers) {
C
Christoph Hellwig 已提交
1091 1092 1093
		set_init_blocksize(bdev);
		if (bdev->bd_bdi == &noop_backing_dev_info)
			bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
1094
	}
C
Christoph Hellwig 已提交
1095
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
1096
		bdev_disk_changed(disk, false);
C
Christoph Hellwig 已提交
	bdev->bd_openers++;
	return 0;
}
1100

C
Christoph Hellwig 已提交
1101 1102 1103 1104 1105 1106
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
{
	if (!--bdev->bd_openers)
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk, mode);
}

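/*
 * Open a partition: the whole device is opened first, then the partition
 * is checked to be non-empty and accounted in disk->open_partitions.
 */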
static int blkdev_get_part(struct block_device *part, fmode_t mode)
L
Linus Torvalds 已提交
1110
{
C
Christoph Hellwig 已提交
1111 1112
	struct gendisk *disk = part->bd_disk;
	int ret;
1113

C
Christoph Hellwig 已提交
1114 1115
	if (part->bd_openers)
		goto done;
1116

1117
	ret = blkdev_get_whole(bdev_whole(part), mode);
1118
	if (ret)
1119
		return ret;
1120

C
Christoph Hellwig 已提交
1121 1122 1123
	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;
1124

1125
	disk->open_partitions++;
C
Christoph Hellwig 已提交
1126 1127 1128 1129 1130
	set_init_blocksize(part);
	if (part->bd_bdi == &noop_backing_dev_info)
		part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
done:
	part->bd_openers++;
L
Linus Torvalds 已提交
1131
	return 0;
1132

C
Christoph Hellwig 已提交
1133
out_blkdev_put:
1134
	blkdev_put_whole(bdev_whole(part), mode);
C
Christoph Hellwig 已提交
1135
	return ret;
L
Linus Torvalds 已提交
1136
}
C
Christoph Hellwig 已提交
1137

C
Christoph Hellwig 已提交
1138 1139 1140
static void blkdev_put_part(struct block_device *part, fmode_t mode)
{
	struct block_device *whole = bdev_whole(part);
1141

C
Christoph Hellwig 已提交
1142 1143 1144
	if (--part->bd_openers)
		return;
	blkdev_flush_mapping(part);
1145
	whole->bd_disk->open_partitions--;
C
Christoph Hellwig 已提交
1146
	blkdev_put_whole(whole, mode);
L
Linus Torvalds 已提交
1147 1148
}

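/*
 * Look up the block_device for @dev, loading the driver module on demand,
 * and acquire a reference to it without actually opening the device.
 * Returns NULL if the device does not exist or is hidden.
 */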
struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
1152
	struct inode *inode;
1153

1154 1155
	inode = ilookup(blockdev_superblock, dev);
	if (!inode) {
1156
		blk_request_module(dev);
1157 1158
		inode = ilookup(blockdev_superblock, dev);
		if (!inode)
1159
			return NULL;
1160 1161
	}

1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
	/* switch from the inode reference to a device mode one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);

	if (!bdev)
		return NULL;
	if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
	    !try_module_get(bdev->bd_disk->fops->owner)) {
		put_device(&bdev->bd_device);
		return NULL;
	}

1176 1177 1178 1179 1180 1181
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	module_put(bdev->bd_disk->fops->owner);
1182
	put_device(&bdev->bd_device);
1183 1184
}

1185
/**
C
Christoph Hellwig 已提交
1186 1187
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
1188 1189 1190
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
C
Christoph Hellwig 已提交
1191 1192 1193 1194
 * Open the block device described by device number @dev. If @mode includes
 * %FMODE_EXCL, the block device is opened with exclusive access.  Specifying
 * %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may nest for
 * the same @holder.
1195
 *
C
Christoph Hellwig 已提交
1196 1197 1198
 * Use this interface ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a device
 * number.  Everything else should use blkdev_get_by_path().
1199 1200 1201 1202 1203
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
C
Christoph Hellwig 已提交
1204
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
1205
 */
C
Christoph Hellwig 已提交
1206
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
L
Linus Torvalds 已提交
1207
{
C
Christoph Hellwig 已提交
1208
	bool unblock_events = true;
C
Christoph Hellwig 已提交
1209
	struct block_device *bdev;
C
Christoph Hellwig 已提交
1210 1211
	struct gendisk *disk;
	int ret;
1212

1213
	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
C
Christoph Hellwig 已提交
1214
			MAJOR(dev), MINOR(dev),
1215 1216
			((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
1217
	if (ret)
C
Christoph Hellwig 已提交
1218 1219
		return ERR_PTR(ret);

1220 1221 1222 1223
	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);
	disk = bdev->bd_disk;
1224

C
Christoph Hellwig 已提交
1225
	if (mode & FMODE_EXCL) {
1226
		ret = bd_prepare_to_claim(bdev, holder);
C
Christoph Hellwig 已提交
1227
		if (ret)
1228
			goto put_blkdev;
C
Christoph Hellwig 已提交
1229 1230 1231 1232
	}

	disk_block_events(disk);

1233
	mutex_lock(&disk->open_mutex);
C
Christoph Hellwig 已提交
1234 1235 1236 1237 1238 1239 1240
	ret = -ENXIO;
	if (!(disk->flags & GENHD_FL_UP))
		goto abort_claiming;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
1241 1242 1243
	if (ret)
		goto abort_claiming;
	if (mode & FMODE_EXCL) {
1244
		bd_finish_claiming(bdev, holder);
C
Christoph Hellwig 已提交
1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258

		/*
		 * Block event polling for write claims if requested.  Any write
		 * holder makes the write_holder state stick until all are
		 * released.  This is good enough and tracking individual
		 * writeable reference is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			unblock_events = false;
		}
	}
1259
	mutex_unlock(&disk->open_mutex);
C
Christoph Hellwig 已提交
1260 1261 1262

	if (unblock_events)
		disk_unblock_events(disk);
1263
	return bdev;
C
Christoph Hellwig 已提交
1264

1265 1266
abort_claiming:
	if (mode & FMODE_EXCL)
1267
		bd_abort_claiming(bdev, holder);
1268
	mutex_unlock(&disk->open_mutex);
1269 1270 1271 1272
	disk_unblock_events(disk);
put_blkdev:
	blkdev_put_no_open(bdev);
	return ERR_PTR(ret);
1273
}
C
Christoph Hellwig 已提交
1274
EXPORT_SYMBOL(blkdev_get_by_dev);
L
Linus Torvalds 已提交
1275

1276 1277 1278 1279 1280 1281
/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
C
Christoph Hellwig 已提交
1282 1283 1284 1285
 * Open the block device described by the device file at @path.  If @mode
 * includes %FMODE_EXCL, the block device is opened with exclusive access.
 * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
 * nest for the same @holder.
1286 1287 1288 1289 1290
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
C
Christoph Hellwig 已提交
1291
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
1292 1293 1294 1295 1296
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
					void *holder)
{
	struct block_device *bdev;
C
Christoph Hellwig 已提交
1297 1298
	dev_t dev;
	int error;
1299

C
Christoph Hellwig 已提交
1300 1301 1302
	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);
1303

C
Christoph Hellwig 已提交
1304 1305
	bdev = blkdev_get_by_dev(dev, mode, holder);
	if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1306 1307 1308 1309
		blkdev_put(bdev, mode);
		return ERR_PTR(-EACCES);
	}

1310 1311 1312 1313
	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);

L
Linus Torvalds 已提交
1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325
static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

1326
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
1327

1328 1329 1330 1331 1332 1333 1334
	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

C
Christoph Hellwig 已提交
1335 1336 1337
	bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
1338
	filp->f_mapping = bdev->bd_inode->i_mapping;
1339
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
C
Christoph Hellwig 已提交
1340
	return 0;
L
Linus Torvalds 已提交
1341 1342
}

A
Al Viro 已提交
1343
void blkdev_put(struct block_device *bdev, fmode_t mode)
1344 1345 1346
{
	struct gendisk *disk = bdev->bd_disk;

1347 1348 1349 1350 1351 1352 1353 1354 1355 1356
	/*
	 * Sync early if it looks like we're the last one.  If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minute)
	 * syncs while holding the mutex.
	 */
	if (bdev->bd_openers == 1)
		sync_blockdev(bdev);

1357
	mutex_lock(&disk->open_mutex);
1358
	if (mode & FMODE_EXCL) {
C
Christoph Hellwig 已提交
1359
		struct block_device *whole = bdev_whole(bdev);
1360 1361 1362 1363
		bool bdev_free;

		/*
		 * Release a claim on the device.  The holder fields
1364
		 * are protected with bdev_lock.  open_mutex is to
1365 1366 1367 1368 1369
		 * synchronize disk_holder unlinking.
		 */
		spin_lock(&bdev_lock);

		WARN_ON_ONCE(--bdev->bd_holders < 0);
C
Christoph Hellwig 已提交
1370
		WARN_ON_ONCE(--whole->bd_holders < 0);
1371 1372 1373

		if ((bdev_free = !bdev->bd_holders))
			bdev->bd_holder = NULL;
C
Christoph Hellwig 已提交
1374 1375
		if (!whole->bd_holders)
			whole->bd_holder = NULL;
1376 1377 1378

		spin_unlock(&bdev_lock);

1379 1380 1381 1382
		/*
		 * If this was the last claim, remove holder link and
		 * unblock evpoll if it was a write holder.
		 */
1383
		if (bdev_free && bdev->bd_write_holder) {
C
Christoph Hellwig 已提交
1384
			disk_unblock_events(disk);
1385
			bdev->bd_write_holder = false;
1386
		}
1387
	}
1388

1389 1390 1391 1392 1393
	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
C
Christoph Hellwig 已提交
1394
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
1395

C
Christoph Hellwig 已提交
1396 1397 1398 1399
	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev, mode);
	else
		blkdev_put_whole(bdev, mode);
1400 1401
	mutex_unlock(&disk->open_mutex);

1402
	blkdev_put_no_open(bdev);
1403
}
1404 1405
EXPORT_SYMBOL(blkdev_put);

L
Linus Torvalds 已提交
1406 1407
static int blkdev_close(struct inode * inode, struct file * filp)
{
1408
	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
A
Al Viro 已提交
1409 1410
	blkdev_put(bdev, filp->f_mode);
	return 0;
L
Linus Torvalds 已提交
1411 1412
}

1413
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
L
Linus Torvalds 已提交
1414
{
1415
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
1416
	fmode_t mode = file->f_mode;
1417 1418 1419 1420 1421

	/*
	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	 * to update it before every ioctl.
	 */
1422
	if (file->f_flags & O_NDELAY)
1423 1424 1425 1426
		mode |= FMODE_NDELAY;
	else
		mode &= ~FMODE_NDELAY;

1427
	return blkdev_ioctl(bdev, mode, cmd, arg);
L
Linus Torvalds 已提交
1428 1429
}

1430 1431 1432 1433 1434 1435 1436
/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
C
Christoph Hellwig 已提交
1437
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
1438 1439
{
	struct file *file = iocb->ki_filp;
1440
	struct inode *bd_inode = bdev_file_inode(file);
1441
	loff_t size = i_size_read(bd_inode);
1442
	struct blk_plug plug;
1443
	size_t shorted = 0;
1444
	ssize_t ret;
1445

1446 1447
	if (bdev_read_only(I_BDEV(bd_inode)))
		return -EPERM;
1448

1449
	if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
1450 1451
		return -ETXTBSY;

1452
	if (!iov_iter_count(from))
1453 1454
		return 0;

1455 1456 1457
	if (iocb->ki_pos >= size)
		return -ENOSPC;

1458 1459 1460
	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
		return -EOPNOTSUPP;

1461 1462 1463 1464 1465
	size -= iocb->ki_pos;
	if (iov_iter_count(from) > size) {
		shorted = iov_iter_count(from) - size;
		iov_iter_truncate(from, size);
	}
1466

1467
	blk_start_plug(&plug);
1468
	ret = __generic_file_write_iter(iocb, from);
1469 1470
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
1471
	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
1472
	blk_finish_plug(&plug);
1473 1474 1475
	return ret;
}

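/*
 * Read from the block device, clamping the iterator to the device size so
 * that reads at or past the end of the device return a short count (or 0)
 * instead of an error.
 */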
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	loff_t pos = iocb->ki_pos;
	size_t shorted = 0;
	ssize_t ret;

	if (pos >= size)
		return 0;

	size -= pos;
	if (iov_iter_count(to) > size) {
		shorted = iov_iter_count(to) - size;
		iov_iter_truncate(to, size);
	}

	ret = generic_file_read_iter(iocb, to);
	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
	return ret;
}

1499 1500 1501 1502 1503 1504
static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	return generic_writepages(mapping, wbc);
}

A
Adrian Bunk 已提交
1505
static const struct address_space_operations def_blk_aops = {
1506
	.set_page_dirty	= __set_page_dirty_buffers,
L
Linus Torvalds 已提交
1507
	.readpage	= blkdev_readpage,
1508
	.readahead	= blkdev_readahead,
L
Linus Torvalds 已提交
1509
	.writepage	= blkdev_writepage,
N
Nick Piggin 已提交
1510 1511
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
1512
	.writepages	= blkdev_writepages,
L
Linus Torvalds 已提交
1513
	.direct_IO	= blkdev_direct_IO,
1514
	.migratepage	= buffer_migrate_page_norefs,
1515
	.is_dirty_writeback = buffer_check_dirty_writeback,
L
Linus Torvalds 已提交
1516 1517
};

1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552
#define	BLKDEV_FALLOC_FL_SUPPORTED					\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

static long blkdev_fallocate(struct file *file, int mode, loff_t start,
			     loff_t len)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	loff_t end = start + len - 1;
	loff_t isize;
	int error;

	/* Fail if we don't recognize the flags. */
	if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/* Don't go off the end of the device. */
	isize = i_size_read(bdev->bd_inode);
	if (start >= isize)
		return -EINVAL;
	if (end >= isize) {
		if (mode & FALLOC_FL_KEEP_SIZE) {
			len = isize - start;
			end = start + len - 1;
		} else
			return -EINVAL;
	}

	/*
	 * Don't allow IO that isn't aligned to logical block size.
	 */
	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	/* Invalidate the page cache, including dirty pages. */
1553 1554 1555
	error = truncate_bdev_range(bdev, file->f_mode, start, end);
	if (error)
		return error;
1556 1557 1558 1559 1560

	switch (mode) {
	case FALLOC_FL_ZERO_RANGE:
	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
1561
					    GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
1562 1563
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
1564 1565
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, 0);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (error)
		return error;

	/*
1578 1579 1580
	 * Invalidate the page cache again; if someone wandered in and dirtied
	 * a page, we just discard it - userspace has no way of knowing whether
	 * the write happened before or after discard completing...
1581
	 */
1582
	return truncate_bdev_range(bdev, file->f_mode, start, end);
1583 1584
}

1585
const struct file_operations def_blk_fops = {
L
Linus Torvalds 已提交
1586 1587 1588
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
1589
	.read_iter	= blkdev_read_iter,
1590
	.write_iter	= blkdev_write_iter,
1591
	.iopoll		= blkdev_iopoll,
1592
	.mmap		= generic_file_mmap,
1593
	.fsync		= blkdev_fsync,
1594
	.unlocked_ioctl	= block_ioctl,
L
Linus Torvalds 已提交
1595 1596 1597
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
1598
	.splice_read	= generic_file_splice_read,
A
Al Viro 已提交
1599
	.splice_write	= iter_file_splice_write,
1600
	.fallocate	= blkdev_fallocate,
L
Linus Torvalds 已提交
1601 1602 1603 1604
};

/**
 * lookup_bdev  - lookup a struct block_device by name
1605
 * @pathname:	special file representing the block device
1606
 * @dev:	return value of the block device's dev_t
L
Linus Torvalds 已提交
1607
 *
1608
 * Get a reference to the blockdevice at @pathname in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
L
Linus Torvalds 已提交
1613 1614
{
	struct inode *inode;
1615
	struct path path;
L
Linus Torvalds 已提交
1616 1617
	int error;

1618
	if (!pathname || !*pathname)
C
Christoph Hellwig 已提交
1619
		return -EINVAL;
L
Linus Torvalds 已提交
1620

1621
	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
L
Linus Torvalds 已提交
1622
	if (error)
C
Christoph Hellwig 已提交
1623
		return error;
L
Linus Torvalds 已提交
1624

1625
	inode = d_backing_inode(path.dentry);
L
Linus Torvalds 已提交
1626 1627
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
C
Christoph Hellwig 已提交
1628
		goto out_path_put;
L
Linus Torvalds 已提交
1629
	error = -EACCES;
1630
	if (!may_open_dev(&path))
C
Christoph Hellwig 已提交
1631 1632 1633 1634 1635
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
1636
	path_put(&path);
C
Christoph Hellwig 已提交
1637
	return error;
L
Linus Torvalds 已提交
1638
}
1639
EXPORT_SYMBOL(lookup_bdev);
L
Linus Torvalds 已提交
1640

1641
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * no need to lock the super, get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * hold).
		 */
		shrink_dcache_sb(sb);
1654
		res = invalidate_inodes(sb, kill_dirty);
1655 1656
		drop_super(sb);
	}
1657
	invalidate_bdev(bdev);
1658 1659 1660
	return res;
}
EXPORT_SYMBOL(__invalidate_device);
1661 1662 1663 1664 1665

void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
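/*
 * Walk all bdev inodes that currently have page cache and call @func on
 * every block device that is still open.  A reference is held on each
 * inode so s_inode_list_lock can be dropped while @func runs.
 */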
{
	struct inode *inode, *old_inode = NULL;

1666
	spin_lock(&blockdev_superblock->s_inode_list_lock);
1667 1668
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
1669
		struct block_device *bdev;
1670 1671 1672 1673 1674 1675 1676 1677 1678

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
1679
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
1680 1681 1682
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
1683
		 * s_inode_list_lock  We cannot iput the inode now as we can
1684
		 * be holding the last reference and we cannot iput it under
1685
		 * s_inode_list_lock. So we keep the reference and iput it
1686 1687 1688 1689
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
1690
		bdev = I_BDEV(inode);
1691

1692
		mutex_lock(&bdev->bd_disk->open_mutex);
1693 1694
		if (bdev->bd_openers)
			func(bdev, arg);
1695
		mutex_unlock(&bdev->bd_disk->open_mutex);
1696

1697
		spin_lock(&blockdev_superblock->s_inode_list_lock);
1698
	}
1699
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
1700 1701
	iput(old_inode);
}