genhd.c 57.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6
/*
 *  gendisk handling
 */

#include <linux/module.h>
7
#include <linux/ctype.h>
L
Linus Torvalds 已提交
8 9
#include <linux/fs.h>
#include <linux/genhd.h>
10
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
11 12
#include <linux/kernel.h>
#include <linux/blkdev.h>
13
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
14 15
#include <linux/init.h>
#include <linux/spinlock.h>
16
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
17 18 19
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
20
#include <linux/mutex.h>
T
Tejun Heo 已提交
21
#include <linux/idr.h>
22
#include <linux/log2.h>
23
#include <linux/pm_runtime.h>
24
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
25

26 27
#include "blk.h"

28
static struct kobject *block_depr;
L
Linus Torvalds 已提交
29

30
static DEFINE_XARRAY(bdev_map);
C
Christoph Hellwig 已提交
31
static DEFINE_MUTEX(bdev_map_lock);
32

T
Tejun Heo 已提交
33
/* for extended dynamic devt allocation, currently only one major is used */
34
#define NR_EXT_DEVT		(1 << MINORBITS)
T
Tejun Heo 已提交
35

36
/* For extended devt allocation.  ext_devt_lock prevents look up
T
Tejun Heo 已提交
37 38
 * results from going away underneath its user.
 */
39
static DEFINE_SPINLOCK(ext_devt_lock);
T
Tejun Heo 已提交
40 41
static DEFINE_IDR(ext_devt_idr);

D
Derek Basehore 已提交
42 43
static void disk_check_events(struct disk_events *ev,
			      unsigned int *clearing_ptr);
44
static void disk_alloc_events(struct gendisk *disk);
45 46 47 48
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);

49 50 51 52
/*
 * Set disk capacity and notify if the size is not currently
 * zero and will not be set to zero
 */
53
bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
					bool update_bdev)
{
	char *resize_env[] = { "RESIZE=1", NULL };
	sector_t old_size = get_capacity(disk);

	set_capacity(disk, size);
	if (update_bdev)
		revalidate_disk_size(disk, true);

	/* Nothing to announce unless one non-zero size replaced another. */
	if (old_size == size || old_size == 0 || size == 0)
		return false;

	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, resize_env);
	return true;
}

EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
/*
 * Format the device name of the indicated disk into the supplied buffer and
 * return a pointer to that same buffer for convenience.
 *
 * @hd:     disk whose ->disk_name is the base of the result
 * @partno: partition number; 0 yields the bare disk name
 * @buf:    output buffer, written with at most BDEVNAME_SIZE bytes
 *
 * For a non-zero @partno the number is appended to the disk name, separated
 * by a 'p' when the disk name itself ends in a digit.
 */
char *disk_name(struct gendisk *hd, int partno, char *buf)
{
	size_t len = strlen(hd->disk_name);

	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	/* len check: an empty name must not index disk_name[-1] */
	else if (len && isdigit(hd->disk_name[len - 1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}

const char *bdevname(struct block_device *bdev, char *buf)
{
92
	return disk_name(bdev->bd_disk, bdev->bd_partno, buf);
93 94
}
EXPORT_SYMBOL(bdevname);
95

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
/*
 * Zero @stat, then accumulate the per-CPU counters of @part into it for
 * every possible CPU.
 */
static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
{
	int cpu;

	memset(stat, 0, sizeof(*stat));
	for_each_possible_cpu(cpu) {
		struct disk_stats *pcpu = per_cpu_ptr(part->dkstats, cpu);
		int grp = 0;

		while (grp < NR_STAT_GROUPS) {
			stat->ios[grp] += pcpu->ios[grp];
			stat->merges[grp] += pcpu->merges[grp];
			stat->sectors[grp] += pcpu->sectors[grp];
			stat->nsecs[grp] += pcpu->nsecs[grp];
			grp++;
		}

		stat->io_ticks += pcpu->io_ticks;
	}
}

116
static unsigned int part_in_flight(struct hd_struct *part)
117
{
118
	unsigned int inflight = 0;
119
	int cpu;
120

121
	for_each_possible_cpu(cpu) {
122 123
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
124
	}
125 126
	if ((int)inflight < 0)
		inflight = 0;
127

128
	return inflight;
129 130
}

131
static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2])
132
{
133 134 135 136 137 138 139 140 141 142 143 144
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
145 146
}

147 148 149 150 151 152 153 154 155
/*
 * Look up partition @partno in the partition table of @disk.  Returns NULL
 * when @partno is outside the table; no reference is taken here.  The table
 * is accessed with rcu_dereference().
 */
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
{
	struct disk_part_tbl *tbl = rcu_dereference(disk->part_tbl);

	if (likely(partno >= 0 && partno < tbl->len))
		return rcu_dereference(tbl->part[partno]);

	return NULL;
}

156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
/**
 * disk_get_part - get partition
 * @disk: disk to look partition from
 * @partno: partition number
 *
 * Look for partition @partno from @disk.  If found, increment
 * reference count and return it.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * Pointer to the found partition on success, NULL if not found.
 */
struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
{
	struct hd_struct *found;

	rcu_read_lock();
	found = __disk_get_part(disk, partno);
	/* pin the partition's device before leaving the RCU section */
	if (found != NULL)
		get_device(part_to_dev(found));
	rcu_read_unlock();

	return found;
}

/**
 * disk_part_iter_init - initialize partition iterator
 * @piter: iterator to initialize
 * @disk: disk to iterate over
 * @flags: DISK_PITER_* flags
 *
 * Initialize @piter so that it iterates over partitions of @disk.
 *
 * CONTEXT:
 * Don't care.
 */
void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
			  unsigned int flags)
{
T
Tejun Heo 已提交
197 198 199 200 201
	struct disk_part_tbl *ptbl;

	rcu_read_lock();
	ptbl = rcu_dereference(disk->part_tbl);

202 203 204 205
	piter->disk = disk;
	piter->part = NULL;

	if (flags & DISK_PITER_REVERSE)
T
Tejun Heo 已提交
206
		piter->idx = ptbl->len - 1;
207
	else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
208
		piter->idx = 0;
T
Tejun Heo 已提交
209 210
	else
		piter->idx = 1;
211 212

	piter->flags = flags;
T
Tejun Heo 已提交
213 214

	rcu_read_unlock();
215 216 217 218 219 220 221 222 223 224 225 226 227 228
}
EXPORT_SYMBOL_GPL(disk_part_iter_init);

/**
 * disk_part_iter_next - proceed iterator to the next partition and return it
 * @piter: iterator of interest
 *
 * Proceed @piter to the next partition and return it.
 *
 * CONTEXT:
 * Don't care.
 */
struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
{
T
Tejun Heo 已提交
229
	struct disk_part_tbl *ptbl;
230 231 232 233 234 235
	int inc, end;

	/* put the last partition */
	disk_put_part(piter->part);
	piter->part = NULL;

T
Tejun Heo 已提交
236
	/* get part_tbl */
237
	rcu_read_lock();
T
Tejun Heo 已提交
238
	ptbl = rcu_dereference(piter->disk->part_tbl);
239 240 241 242

	/* determine iteration parameters */
	if (piter->flags & DISK_PITER_REVERSE) {
		inc = -1;
243 244
		if (piter->flags & (DISK_PITER_INCL_PART0 |
				    DISK_PITER_INCL_EMPTY_PART0))
T
Tejun Heo 已提交
245 246 247
			end = -1;
		else
			end = 0;
248 249
	} else {
		inc = 1;
T
Tejun Heo 已提交
250
		end = ptbl->len;
251 252 253 254 255 256
	}

	/* iterate to the next partition */
	for (; piter->idx != end; piter->idx += inc) {
		struct hd_struct *part;

T
Tejun Heo 已提交
257
		part = rcu_dereference(ptbl->part[piter->idx]);
258 259
		if (!part)
			continue;
260
		if (!part_nr_sects_read(part) &&
261 262 263
		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
		      piter->idx == 0))
264 265
			continue;

266
		get_device(part_to_dev(part));
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
		piter->part = part;
		piter->idx += inc;
		break;
	}

	rcu_read_unlock();

	return piter->part;
}
EXPORT_SYMBOL_GPL(disk_part_iter_next);

/**
 * disk_part_iter_exit - finish up partition iteration
 * @piter: iter of interest
 *
 * Called when iteration is over.  Cleans up @piter.
 *
 * CONTEXT:
 * Don't care.
 */
void disk_part_iter_exit(struct disk_part_iter *piter)
{
	struct hd_struct *last = piter->part;

	/* release whatever partition the iterator still holds */
	disk_put_part(last);
	piter->part = NULL;
}
EXPORT_SYMBOL_GPL(disk_part_iter_exit);

294 295 296
/*
 * Return non-zero when @sector lies in the half-open range
 * [start_sect, start_sect + number of sectors) of @part.
 */
static inline int sector_in_part(struct hd_struct *part, sector_t sector)
{
	sector_t first = part->start_sect;

	if (sector < first)
		return 0;

	return sector < first + part_nr_sects_read(part);
}

300 301 302 303 304 305 306 307 308
/**
 * disk_map_sector_rcu - map sector to partition
 * @disk: gendisk of interest
 * @sector: sector to map
 *
 * Find out which partition @sector maps to on @disk.  This is
 * primarily used for stats accounting.
 *
 * CONTEXT:
309
 * RCU read locked.  The returned partition pointer is always valid
310 311
 * because its refcount is grabbed, except for part0, whose lifetime
 * is the same as that of the disk.
312 313
 *
 * RETURNS:
T
Tejun Heo 已提交
314
 * Found partition on success, part0 is returned if no partition matches
315
 * or the matched partition is being deleted.
316 317 318
 */
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
{
	struct disk_part_tbl *tbl;
	struct hd_struct *hit;
	int idx;

	rcu_read_lock();
	tbl = rcu_dereference(disk->part_tbl);

	/* try the partition cached by the previous lookup first */
	hit = rcu_dereference(tbl->last_lookup);
	if (hit && sector_in_part(hit, sector) && hd_struct_try_get(hit))
		goto done;

	for (idx = 1; idx < tbl->len; idx++) {
		hit = rcu_dereference(tbl->part[idx]);
		if (!hit || !sector_in_part(hit, sector))
			continue;

		/*
		 * Only a partition whose reference could be taken is cached,
		 * so a cached entry that is being deleted cannot cause a
		 * use-after-free.  If the reference cannot be taken, give up
		 * and fall back to part0.
		 */
		if (!hd_struct_try_get(hit))
			break;

		rcu_assign_pointer(tbl->last_lookup, hit);
		goto done;
	}

	hit = &disk->part0;
done:
	rcu_read_unlock();
	return hit;
}

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
/**
 * disk_has_partitions
 * @disk: gendisk of interest
 *
 * Walk through the partition table and check if valid partition exists.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * True if the gendisk has at least one valid non-zero size partition.
 * Otherwise false.
 */
bool disk_has_partitions(struct gendisk *disk)
{
	struct disk_part_tbl *tbl;
	bool found = false;
	int idx;

	rcu_read_lock();
	tbl = rcu_dereference(disk->part_tbl);

	/* index 0 is the whole device, so start at 1; stop at the first hit */
	for (idx = 1; idx < tbl->len && !found; idx++) {
		if (rcu_dereference(tbl->part[idx]))
			found = true;
	}

	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL_GPL(disk_has_partitions);

L
Linus Torvalds 已提交
388 389 390 391
/*
 * Can be deleted altogether. Later.
 *
 */
392
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
393 394 395 396
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
397
	void (*probe)(dev_t devt);
398
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
399
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
400 401

/* index in the above - for now: assume no multimajor ranges */
402
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
403
{
404
	return major % BLKDEV_MAJOR_HASH_SIZE;
405 406
}

407
#ifdef CONFIG_PROC_FS
/*
 * Print to @seqf one "major name" line for every registered entry whose
 * major number equals @offset.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
419

420
/**
421
 * __register_blkdev - register a new block device
422
 *
423 424
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
425
 * @name: the name of the new block device as a zero terminated string
426
 * @probe: callback that is called on access to any minor number of @major
427 428 429
 *
 * The @name must be unique within the system.
 *
430 431
 * The return value depends on the @major input parameter:
 *
432 433
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
434
 *  - if any unused major number was requested with @major = 0 parameter
435
 *    then the return value is the allocated major number in range
436 437 438 439
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
440 441
 *
 * Use register_blkdev instead for any new code.
442
 */
443 444
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		/*
		 * Dynamic allocation: take the highest hash slot that is
		 * still empty.  Slot 0 is never handed out.
		 */
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			pr_err("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		/* on success the allocated major is the return value */
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	p->probe = probe;
	/*
	 * strscpy() always NUL-terminates and, unlike strlcpy(), does not
	 * read the source beyond the destination size.
	 */
	strscpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	/* walk the hash chain; stop early if @major is already registered */
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
		pr_err("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
507

A
Akinobu Mita 已提交
508
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
509 510 511 512 513
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
514
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
515 516 517
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
518 519 520
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
521 522 523
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
524
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
525 526 527 528 529
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
 * is enabled.  Mangling twice gives the original value.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int lo_pos;

	/* swap bit lo_pos with its mirror image hi_pos, working inwards */
	for (lo_pos = 0; lo_pos < MINORBITS / 2; lo_pos++) {
		int hi_pos = MINORBITS - 1 - lo_pos;
		int span = hi_pos - lo_pos;
		int lo_bit = minor & (1 << lo_pos);
		int hi_bit = minor & (1 << hi_pos);

		/* both values are subsets of minor, so xor clears them */
		minor ^= lo_bit | hi_bit;
		/* put each bit back at the other one's position */
		minor |= (lo_bit << span) | (hi_bit >> span);
	}
#endif
	return minor;
}

T
Tejun Heo 已提交
562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
/**
 * blk_alloc_devt - allocate a dev_t for a partition
 * @part: partition to allocate dev_t for
 * @devt: out parameter for resulting dev_t
 *
 * Allocate a dev_t for block device.
 *
 * RETURNS:
 * 0 on success, allocated dev_t is returned in *@devt.  -errno on
 * failure.
 *
 * CONTEXT:
 * Might sleep.
 */
int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
{
	struct gendisk *disk = part_to_disk(part);
	int slot;

	/* partitions inside the disk's own minor range need no allocation */
	if (part->partno < disk->minors) {
		*devt = MKDEV(disk->major, disk->first_minor + part->partno);
		return 0;
	}

	/* otherwise take a slot from the extended devt idr */
	idr_preload(GFP_KERNEL);

	spin_lock_bh(&ext_devt_lock);
	slot = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
	spin_unlock_bh(&ext_devt_lock);

	idr_preload_end();

	if (slot >= 0) {
		*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(slot));
		return 0;
	}

	/* a full idr is reported to the caller as -EBUSY */
	if (slot == -ENOSPC)
		return -EBUSY;

	return slot;
}

/**
 * blk_free_devt - free a dev_t
 * @devt: dev_t to free
 *
 * Free @devt which was allocated using blk_alloc_devt().
 *
 * CONTEXT:
 * Might sleep.
 */
void blk_free_devt(dev_t devt)
{
	/* only devts on the extended major hold an idr slot to give back */
	if (devt == MKDEV(0, 0) || MAJOR(devt) != BLOCK_EXT_MAJOR)
		return;

	spin_lock_bh(&ext_devt_lock);
	idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
	spin_unlock_bh(&ext_devt_lock);
}

623 624
/*
 * We invalidate devt by assigning NULL pointer for devt in idr.
Y
Yufen Yu 已提交
625 626 627 628 629 630 631 632 633 634
 */
void blk_invalidate_devt(dev_t devt)
{
	/* devts outside the extended major have no idr entry */
	if (MAJOR(devt) != BLOCK_EXT_MAJOR)
		return;

	spin_lock_bh(&ext_devt_lock);
	idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt)));
	spin_unlock_bh(&ext_devt_lock);
}

635 636 637 638 639 640 641 642 643 644 645 646
/*
 * Format @devt in hexadecimal into @buf (at most BDEVT_SIZE bytes) and
 * return @buf.  When both major and minor fit in one byte the result is
 * "MMmm" left-justified in a 9-character field; otherwise it is
 * "MMM:mmmmm".
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);

	return buf;
}

647 648 649
static void blk_register_region(struct gendisk *disk)
{
	int i;
L
Linus Torvalds 已提交
650

C
Christoph Hellwig 已提交
651
	mutex_lock(&bdev_map_lock);
652 653 654
	for (i = 0; i < disk->minors; i++) {
		if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL))
			WARN_ON_ONCE(1);
655
	}
C
Christoph Hellwig 已提交
656
	mutex_unlock(&bdev_map_lock);
657
}
L
Linus Torvalds 已提交
658

659
static void blk_unregister_region(struct gendisk *disk)
L
Linus Torvalds 已提交
660
{
661
	int i;
L
Linus Torvalds 已提交
662

C
Christoph Hellwig 已提交
663
	mutex_lock(&bdev_map_lock);
664 665
	for (i = 0; i < disk->minors; i++)
		xa_erase(&bdev_map, disk_devt(disk) + i);
C
Christoph Hellwig 已提交
666
	mutex_unlock(&bdev_map_lock);
L
Linus Torvalds 已提交
667 668
}

669 670 671 672 673 674 675 676 677 678 679 680 681
/*
 * Flag @disk as needing a partition scan and then open and immediately
 * release its whole-device block device.  Does nothing for a disk with
 * zero capacity or with partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *bdev;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return;

	blkdev_put(bdev, FMODE_READ);
}

682 683
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
684 685 686 687 688 689
{
	struct device *ddev = disk_to_dev(disk);
	struct disk_part_iter piter;
	struct hd_struct *part;
	int err;

690
	ddev->parent = parent;
691

692
	dev_set_name(ddev, "%s", disk->disk_name);
693 694 695 696

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

697 698 699 700
	if (groups) {
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
701 702 703 704 705 706 707 708 709 710
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}
711 712 713 714 715 716 717 718

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

719 720 721
	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

722 723 724 725 726
	if (disk->flags & GENHD_FL_HIDDEN) {
		dev_set_uevent_suppress(ddev, 0);
		return;
	}

727
	disk_scan_partitions(disk);
728 729 730 731 732 733 734 735 736 737

	/* announce disk after possible partitions are created */
	dev_set_uevent_suppress(ddev, 0);
	kobject_uevent(&ddev->kobj, KOBJ_ADD);

	/* announce possible partitions */
	disk_part_iter_init(&piter, disk, 0);
	while ((part = disk_part_iter_next(&piter)))
		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
	disk_part_iter_exit(&piter);
738

739 740 741 742 743 744
	if (disk->queue->backing_dev_info->dev) {
		err = sysfs_create_link(&ddev->kobj,
			  &disk->queue->backing_dev_info->dev->kobj,
			  "bdi");
		WARN_ON(err);
	}
745 746
}

L
Linus Torvalds 已提交
747
/**
748
 * __device_add_disk - add disk information to kernel list
749
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
750
 * @disk: per-device partitioning information
751
 * @groups: Additional per-device sysfs groups
752
 * @register_queue: register the queue if set to true
L
Linus Torvalds 已提交
753 754 755
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
756 757
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
758
 */
759
static void __device_add_disk(struct device *parent, struct gendisk *disk,
760
			      const struct attribute_group **groups,
761
			      bool register_queue)
L
Linus Torvalds 已提交
762
{
763
	dev_t devt;
764
	int retval;
765

766 767 768 769 770 771 772 773 774
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
	if (register_queue)
		elevator_init_mq(disk->queue);

775 776 777 778 779
	/* minors == 0 indicates to use ext devt from part0 and should
	 * be accompanied with EXT_DEVT flag.  Make sure all
	 * parameters make sense.
	 */
	WARN_ON(disk->minors && !(disk->major || disk->first_minor));
780 781
	WARN_ON(!disk->minors &&
		!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
782

L
Linus Torvalds 已提交
783
	disk->flags |= GENHD_FL_UP;
784 785 786 787 788 789 790 791 792

	retval = blk_alloc_devt(&disk->part0, &devt);
	if (retval) {
		WARN_ON(1);
		return;
	}
	disk->major = MAJOR(devt);
	disk->first_minor = MINOR(devt);

793 794
	disk_alloc_events(disk);

795 796 797 798 799 800 801 802
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
803 804
		struct backing_dev_info *bdi = disk->queue->backing_dev_info;
		struct device *dev = disk_to_dev(disk);
805 806
		int ret;

807
		/* Register BDI before referencing it from bdev */
808 809
		dev->devt = devt;
		ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
810
		WARN_ON(ret);
811
		bdi_set_owner(bdi, dev);
812
		blk_register_region(disk);
813
	}
814
	register_disk(parent, disk, groups);
815 816
	if (register_queue)
		blk_register_queue(disk);
817

818 819 820 821
	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
T
Tejun Heo 已提交
822
	WARN_ON_ONCE(!blk_get_queue(disk->queue));
823

824
	disk_add_events(disk);
825
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
826
}
827

828 829 830
/*
 * Add @disk under @parent with the extra sysfs @groups, registering its
 * queue as well (register_queue == true).
 */
void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)
{
	const bool register_queue = true;

	__device_add_disk(parent, disk, groups, register_queue);
}
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
835

836 837
/*
 * Add @disk under @parent without extra sysfs groups and without
 * registering its queue (register_queue == false).
 */
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
	const bool register_queue = false;

	__device_add_disk(parent, disk, NULL, register_queue);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);

842 843 844 845 846 847 848 849 850 851
static void invalidate_partition(struct gendisk *disk, int partno)
{
	struct block_device *bdev;

	bdev = bdget_disk(disk, partno);
	if (!bdev)
		return;

	fsync_bdev(bdev);
	__invalidate_device(bdev, true);
852 853 854 855 856 857

	/*
	 * Unhash the bdev inode for this device so that it gets evicted as soon
	 * as last inode reference is dropped.
	 */
	remove_inode_hash(bdev->bd_inode);
858 859 860
	bdput(bdev);
}

861 862 863 864 865 866 867 868 869 870 871 872 873
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
874 875 876 877 878
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
879
 */
880
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
881
{
882 883 884
	struct disk_part_iter piter;
	struct hd_struct *part;

885 886
	might_sleep();

887 888 889
	if (WARN_ON_ONCE(!disk->queue))
		return;

890
	blk_integrity_del(disk);
891 892
	disk_del_events(disk);

J
Jan Kara 已提交
893 894 895 896 897
	/*
	 * Block lookups of the disk until all bdevs are unhashed and the
	 * disk is marked as dead (GENHD_FL_UP cleared).
	 */
	down_write(&disk->lookup_sem);
898 899 900 901 902
	/* invalidate stuff */
	disk_part_iter_init(&piter, disk,
			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
	while ((part = disk_part_iter_next(&piter))) {
		invalidate_partition(disk, part->partno);
903
		delete_partition(part);
904 905 906 907 908 909
	}
	disk_part_iter_exit(&piter);

	invalidate_partition(disk, 0);
	set_capacity(disk, 0);
	disk->flags &= ~GENHD_FL_UP;
J
Jan Kara 已提交
910
	up_write(&disk->lookup_sem);
911

912
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
913
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
914

915 916 917 918
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
919
		bdi_unregister(disk->queue->backing_dev_info);
920
	}
921

922 923
	blk_unregister_queue(disk);
	
924
	if (!(disk->flags & GENHD_FL_HIDDEN))
925
		blk_unregister_region(disk);
Y
Yufen Yu 已提交
926 927 928 929 930 931 932
	/*
	 * Remove gendisk pointer from idr so that it cannot be looked up
	 * while RCU period before freeing gendisk is running to prevent
	 * use-after-free issues. Note that the device number stays
	 * "in-use" until we really free the gendisk.
	 */
	blk_invalidate_devt(disk_devt(disk));
933 934 935 936 937 938 939 940

	kobject_put(disk->part0.holder_dir);
	kobject_put(disk->slave_dir);

	part_stat_set_all(&disk->part0, 0);
	disk->part0.stamp = 0;
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
941
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
942
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
943
}
944
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
945

946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
/* sysfs access to bad-blocks list. */
/*
 * sysfs show handler: emit the bad-blocks list of the disk behind @dev
 * into @page, or a lone newline when the disk has no bad-blocks list.
 */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/*
 * sysfs store handler: pass @page (@len bytes) to badblocks_store() for the
 * disk behind @dev.  Returns -ENXIO when the disk has no bad-blocks list.
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

971 972
/*
 * Try to get a driver for @devt.  If the major of @devt is registered with
 * a probe callback, call it (under major_names_lock) and return.  Otherwise
 * request a module by its "block-major-M-m" alias, and by "block-major-M"
 * when the first request returns a positive value.
 */
static void request_gendisk_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	for (entry = major_names[major_to_index(major)]; entry;
	     entry = entry->next) {
		if (entry->major != major || !entry->probe)
			continue;

		entry->probe(devt);
		mutex_unlock(&major_names_lock);
		return;
	}
	mutex_unlock(&major_names_lock);

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

991
static bool get_disk_and_module(struct gendisk *disk)
992
{
993
	struct module *owner;
994

995 996 997 998 999 1000
	if (!disk->fops)
		return false;
	owner = disk->fops->owner;
	if (owner && !try_module_get(owner))
		return false;
	if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) {
1001
		module_put(owner);
1002
		return false;
1003
	}
1004
	return true;
1005

1006
}
1007

L
Linus Torvalds 已提交
1008 1009
/**
 * get_gendisk - get partitioning information for a given device
1010
 * @devt: device to get partitioning information for
1011
 * @partno: returned partition index
L
Linus Torvalds 已提交
1012 1013
 *
 * This function gets the structure containing partitioning
1014
 * information for the given device @devt.
1015 1016
 *
 * Context: can sleep
L
Linus Torvalds 已提交
1017
 */
1018
struct gendisk *get_gendisk(dev_t devt, int *partno)
{
	struct gendisk *disk = NULL;
	bool unusable;

	might_sleep();

	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
		struct hd_struct *part;

		/* extended devts are resolved through the idr */
		spin_lock_bh(&ext_devt_lock);
		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
		if (part && get_disk_and_module(part_to_disk(part))) {
			*partno = part->partno;
			disk = part_to_disk(part);
		}
		spin_unlock_bh(&ext_devt_lock);
	} else {
		mutex_lock(&bdev_map_lock);
		disk = xa_load(&bdev_map, devt);
		if (!disk) {
			/*
			 * Not mapped yet: release the map lock around the
			 * probe/module request, then look again.
			 */
			mutex_unlock(&bdev_map_lock);
			request_gendisk_module(devt);
			mutex_lock(&bdev_map_lock);
			disk = xa_load(&bdev_map, devt);
		}
		if (disk) {
			if (get_disk_and_module(disk))
				*partno = devt - disk_devt(disk);
			else
				disk = NULL;
		}
		mutex_unlock(&bdev_map_lock);
	}

	if (!disk)
		return NULL;

	/*
	 * Synchronize with del_gendisk() to not return disk that is being
	 * destroyed.
	 */
	down_read(&disk->lookup_sem);
	unusable = (disk->flags & GENHD_FL_HIDDEN) ||
		   !(disk->flags & GENHD_FL_UP);
	up_read(&disk->lookup_sem);

	if (unlikely(unusable)) {
		put_disk_and_module(disk);
		disk = NULL;
	}

	return disk;
}

1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
/**
 * bdget_disk - do bdget() by gendisk and partition number
 * @disk: gendisk of interest
 * @partno: partition number
 *
 * Find partition @partno from @disk, do bdget() on it.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * Resulting block_device on success, NULL on failure.
 */
1082
struct block_device *bdget_disk(struct gendisk *disk, int partno)
1083
{
T
Tejun Heo 已提交
1084 1085
	struct hd_struct *part;
	struct block_device *bdev = NULL;
1086

T
Tejun Heo 已提交
1087
	part = disk_get_part(disk, partno);
1088
	if (part)
C
Christoph Hellwig 已提交
1089
		bdev = bdget_part(part);
T
Tejun Heo 已提交
1090
	disk_put_part(part);
1091

T
Tejun Heo 已提交
1092
	return bdev;
1093 1094 1095
}
EXPORT_SYMBOL(bdget_disk);

1096 1097 1098 1099 1100 1101 1102
/*
 * Print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong.
 *
 * Each partition is reported with exactly one printk() call.  Building a
 * line from several printk() calls without KERN_CONT lets the log split it
 * into separate records (and interleave it with other messages), so the
 * optional driver suffix is computed first and emitted together with the
 * rest of the line.
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct disk_part_iter piter;
		struct hd_struct *part;
		char name_buf[BDEVNAME_SIZE];
		char devt_buf[BDEVT_SIZE];

		/*
		 * Don't show empty devices or things that have been
		 * suppressed
		 */
		if (get_capacity(disk) == 0 ||
		    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
			continue;

		/*
		 * Note, unlike /proc/partitions, I am showing the
		 * numbers in hex - the same format as the root=
		 * option takes.
		 */
		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
		while ((part = disk_part_iter_next(&piter))) {
			bool is_part0 = part == &disk->part0;
			const char *drv_prefix = "";
			const char *drv_name = "";

			/* only the whole-disk line carries driver information */
			if (is_part0) {
				if (dev->parent && dev->parent->driver) {
					drv_prefix = " driver: ";
					drv_name = dev->parent->driver->name;
				} else {
					drv_prefix = " (driver?)";
				}
			}

			/* sector count >> 1 matches the original output unit */
			printk("%s%s %10llu %s %s%s%s\n",
			       is_part0 ? "" : "  ",
			       bdevt_str(part_devt(part), devt_buf),
			       (unsigned long long)part_nr_sects_read(part) >> 1,
			       disk_name(disk, part->partno, name_buf),
			       part->info ? part->info->uuid : "",
			       drv_prefix, drv_name);
		}
		disk_part_iter_exit(&piter);
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
1150 1151
#ifdef CONFIG_PROC_FS
/* iterator */
1152
/*
 * seq_file ->start: allocate a class iterator over all "disk" devices,
 * remember it in seqf->private and advance it to entry number *pos.
 *
 * Returns the gendisk at that position, NULL if there are not that many
 * disks, or ERR_PTR(-ENOMEM).  The iterator is not released here on the
 * NULL path; disk_seqf_stop() takes care of it.
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter;
	struct device *dev;
	loff_t skip = *pos;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);

	for (;;) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!skip--)
			break;
	}

	return dev_to_disk(dev);
}

1173
/*
 * seq_file ->next: bump the position and return the next disk from the
 * iterator stored in seqf->private, or NULL when the iteration is over.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	++*pos;
	dev = class_dev_iter_next(seqf->private);

	return dev ? dev_to_disk(dev) : NULL;
}

1185
static void disk_seqf_stop(struct seq_file *seqf, void *v)
1186
{
1187
	struct class_dev_iter *iter = seqf->private;
1188

1189 1190 1191 1192
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
1193
		seqf->private = NULL;
1194
	}
L
Linus Torvalds 已提交
1195 1196
}

1197
/*
 * ->start for /proc/partitions: same as disk_seqf_start(), but additionally
 * prints the column header when a valid first entry (position 0) is
 * returned.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *first = disk_seqf_start(seqf, pos);

	if (IS_ERR_OR_NULL(first) || *pos)
		return first;

	seq_puts(seqf, "major minor  #blocks  name\n\n");
	return first;
}

1207
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1208 1209
{
	struct gendisk *sgp = v;
1210 1211
	struct disk_part_iter piter;
	struct hd_struct *part;
L
Linus Torvalds 已提交
1212 1213 1214
	char buf[BDEVNAME_SIZE];

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
1215
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
1216
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
1217 1218 1219 1220 1221
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

	/* show the full disk and all non-0 size partitions of it */
T
Tejun Heo 已提交
1222
	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
1223
	while ((part = disk_part_iter_next(&piter)))
1224
		seq_printf(seqf, "%4d  %7d %10llu %s\n",
1225
			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
1226
			   (unsigned long long)part_nr_sects_read(part) >> 1,
1227
			   disk_name(sgp, part->partno, buf));
1228
	disk_part_iter_exit(&piter);
L
Linus Torvalds 已提交
1229 1230 1231 1232

	return 0;
}

1233
static const struct seq_operations partitions_op = {
1234 1235 1236
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
1237
	.show	= show_partition
L
Linus Torvalds 已提交
1238 1239 1240 1241 1242
};
#endif

/*
 * Boot-time setup: register the block class, run blk_dev_init(), claim
 * the extended block major and, unless sysfs_deprecated is set, create
 * the top-level "block" kobject.
 *
 * Returns 0, or the error from class_register().
 */
static int __init genhd_device_init(void)
{
	int ret;

	block_class.dev_kobj = sysfs_dev_block_kobj;

	ret = class_register(&block_class);
	if (unlikely(ret))
		return ret;

	blk_dev_init();

	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);

	return 0;
}

subsys_initcall(genhd_device_init);

1261 1262
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
1263
{
1264
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
1265

1266
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
1267 1268
}

1269 1270 1271 1272 1273
/* sysfs "ext_range": the value of disk_max_parts() for this disk */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int max_parts = disk_max_parts(gd);

	return sprintf(buf, "%d\n", max_parts);
}

1277 1278
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
1279
{
1280
	struct gendisk *disk = dev_to_disk(dev);
1281

1282 1283
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
1284 1285
}

1286 1287 1288 1289 1290 1291 1292 1293 1294
/* sysfs "hidden": 1 if GENHD_FL_HIDDEN is set, else 0 */
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int hidden = !!(gd->flags & GENHD_FL_HIDDEN);

	return sprintf(buf, "%d\n", hidden);
}

K
Kay Sievers 已提交
1295 1296 1297 1298 1299
/* sysfs "ro": 1 if get_disk_ro() reports the disk read-only, else 0 */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int ro = !!get_disk_ro(gd);

	return sprintf(buf, "%d\n", ro);
}

1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316
/* sysfs "size": the partition's sector count as read by part_nr_sects_read() */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct hd_struct *part = dev_to_part(dev);
	unsigned long long nr_sects = part_nr_sects_read(part);

	return sprintf(buf, "%llu\n", nr_sects);
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct hd_struct *p = dev_to_part(dev);
	struct request_queue *q = part_to_disk(p)->queue;
1317
	struct disk_stats stat;
1318 1319
	unsigned int inflight;

1320
	part_stat_read_all(p, &stat);
1321 1322 1323
	if (queue_is_mq(q))
		inflight = blk_mq_in_flight(q, p);
	else
1324
		inflight = part_in_flight(p);
1325

1326 1327 1328 1329 1330 1331 1332
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
1333 1334 1335 1336 1337 1338 1339 1340
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
1341
		inflight,
1342
		jiffies_to_msecs(stat.io_ticks),
1343 1344 1345 1346 1347
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
1348 1349 1350 1351 1352 1353
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
1354 1355 1356 1357 1358 1359 1360 1361 1362
}

/*
 * sysfs "inflight": the two in-flight counters filled in by
 * blk_mq_in_flight_rw() (blk-mq queues) or part_in_flight_rw().
 */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	struct hd_struct *part = dev_to_part(dev);
	struct request_queue *queue = part_to_disk(part)->queue;
	unsigned int counts[2];

	if (!queue_is_mq(queue))
		part_in_flight_rw(part, counts);
	else
		blk_mq_in_flight_rw(queue, part, counts);

	return sprintf(buf, "%8u %8u\n", counts[0], counts[1]);
}

1371 1372
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
1373
{
1374 1375 1376
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
1377
}
1378

1379 1380 1381 1382 1383 1384 1385 1386 1387
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}

1388 1389 1390 1391 1392 1393
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

1394
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
1395 1396
}

1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408
/*
 * Whole-disk sysfs attributes.  All of them are read-only (0444, no store
 * handler) except "badblocks", which is 0644 and has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
1409

1410
#ifdef CONFIG_FAIL_MAKE_REQUEST
1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431
/* sysfs "make-it-fail" read side: the partition's make_it_fail value */
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct hd_struct *part = dev_to_part(dev);
	int enabled = part->make_it_fail;

	return sprintf(buf, "%d\n", enabled);
}

/*
 * sysfs "make-it-fail" write side: parse a decimal integer from @buf and
 * store it, normalised to 0 or 1, in the partition's make_it_fail.  Input
 * that does not parse is silently ignored.  Always returns @count.
 */
ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	struct hd_struct *part = dev_to_part(dev);
	int val;

	if (count == 0)
		return count;

	if (sscanf(buf, "%d", &val) > 0)
		part->make_it_fail = !!val;

	return count;
}

1432
static struct device_attribute dev_attr_fail =
1433
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1434 1435
#endif /* CONFIG_FAIL_MAKE_REQUEST */

1436 1437
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
1438
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
1439
#endif
1440 1441 1442

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1443
	&dev_attr_ext_range.attr,
1444
	&dev_attr_removable.attr,
1445
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1446
	&dev_attr_ro.attr,
1447
	&dev_attr_size.attr,
1448
	&dev_attr_alignment_offset.attr,
1449
	&dev_attr_discard_alignment.attr,
1450 1451
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1452
	&dev_attr_inflight.attr,
1453
	&dev_attr_badblocks.attr,
1454 1455
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1456 1457 1458
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1459 1460 1461 1462
#endif
	NULL
};

1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a == &dev_attr_badblocks.attr && !disk->bb)
		return 0;
	return a->mode;
}

1473 1474
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1475
	.is_visible = disk_visible,
1476 1477
};

1478
static const struct attribute_group *disk_attr_groups[] = {
1479 1480
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1481 1482
};

T
Tejun Heo 已提交
1483 1484 1485 1486 1487 1488 1489 1490 1491
/**
 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
 * @disk: disk to replace part_tbl for
 * @new_ptbl: new part_tbl to install
 *
 * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
 * original ptbl is freed using RCU callback.
 *
 * LOCKING:
1492
 * Matching bd_mutex locked or the caller is the only user of @disk.
T
Tejun Heo 已提交
1493 1494 1495 1496
 */
static void disk_replace_part_tbl(struct gendisk *disk,
				  struct disk_part_tbl *new_ptbl)
{
1497 1498
	struct disk_part_tbl *old_ptbl =
		rcu_dereference_protected(disk->part_tbl, 1);
T
Tejun Heo 已提交
1499 1500

	rcu_assign_pointer(disk->part_tbl, new_ptbl);
1501 1502 1503

	if (old_ptbl) {
		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1504
		kfree_rcu(old_ptbl, rcu_head);
1505
	}
T
Tejun Heo 已提交
1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516
}

/**
 * disk_expand_part_tbl - expand disk->part_tbl
 * @disk: disk to expand part_tbl for
 * @partno: expand such that this partno can fit in
 *
 * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
 * uses RCU to allow unlocked dereferencing for stats and other stuff.
 *
 * LOCKING:
1517 1518
 * Matching bd_mutex locked or the caller is the only user of @disk.
 * Might sleep.
T
Tejun Heo 已提交
1519 1520 1521 1522 1523 1524
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int disk_expand_part_tbl(struct gendisk *disk, int partno)
{
1525 1526
	struct disk_part_tbl *old_ptbl =
		rcu_dereference_protected(disk->part_tbl, 1);
T
Tejun Heo 已提交
1527 1528
	struct disk_part_tbl *new_ptbl;
	int len = old_ptbl ? old_ptbl->len : 0;
1529 1530 1531 1532 1533 1534 1535 1536 1537
	int i, target;

	/*
	 * check for int overflow, since we can get here from blkpg_ioctl()
	 * with a user passed 'partno'.
	 */
	target = partno + 1;
	if (target < 0)
		return -EINVAL;
T
Tejun Heo 已提交
1538 1539 1540 1541 1542 1543 1544 1545

	/* disk_max_parts() is zero during initialization, ignore if so */
	if (disk_max_parts(disk) && target > disk_max_parts(disk))
		return -EINVAL;

	if (target <= len)
		return 0;

1546 1547
	new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
				disk->node_id);
T
Tejun Heo 已提交
1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559
	if (!new_ptbl)
		return -ENOMEM;

	new_ptbl->len = target;

	for (i = 0; i < len; i++)
		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);

	disk_replace_part_tbl(disk, new_ptbl);
	return 0;
}

1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * The struct gendisk refcount is incremented with get_gendisk() or
 * get_disk_and_module(), and its refcount is decremented with
 * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this
 * function is called.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1576 1577
 *
 * Context: can sleep
1578
 */
1579
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1580
{
1581 1582
	struct gendisk *disk = dev_to_disk(dev);

1583 1584
	might_sleep();

1585
	blk_free_devt(dev->devt);
1586
	disk_release_events(disk);
L
Linus Torvalds 已提交
1587
	kfree(disk->random);
T
Tejun Heo 已提交
1588
	disk_replace_part_tbl(disk, NULL);
1589
	hd_free_part(&disk->part0);
1590 1591
	if (disk->queue)
		blk_put_queue(disk->queue);
L
Linus Torvalds 已提交
1592 1593
	kfree(disk);
}
1594 1595
/* device class under which all disks are registered */
struct class block_class = {
	.name = "block",
};

1598
static char *block_devnode(struct device *dev, umode_t *mode,
1599
			   kuid_t *uid, kgid_t *gid)
1600 1601 1602
{
	struct gendisk *disk = dev_to_disk(dev);

1603 1604
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1605 1606 1607
	return NULL;
}

1608
const struct device_type disk_type = {
1609 1610 1611
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1612
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1613 1614
};

1615
#ifdef CONFIG_PROC_FS
1616 1617 1618 1619 1620 1621 1622 1623
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1624 1625
{
	struct gendisk *gp = v;
1626 1627
	struct disk_part_iter piter;
	struct hd_struct *hd;
L
Linus Torvalds 已提交
1628
	char buf[BDEVNAME_SIZE];
1629
	unsigned int inflight;
1630
	struct disk_stats stat;
L
Linus Torvalds 已提交
1631 1632

	/*
1633
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1634
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1635 1636 1637 1638
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1639

1640
	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1641
	while ((hd = disk_part_iter_next(&piter))) {
1642
		part_stat_read_all(hd, &stat);
1643 1644 1645
		if (queue_is_mq(gp->queue))
			inflight = blk_mq_in_flight(gp->queue, hd);
		else
1646
			inflight = part_in_flight(hd);
1647

1648 1649 1650 1651
		seq_printf(seqf, "%4d %7d %s "
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1652 1653 1654
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1655 1656
			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
			   disk_name(gp, hd->partno, buf),
1657 1658 1659 1660 1661 1662 1663 1664 1665 1666
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1667
			   inflight,
1668
			   jiffies_to_msecs(stat.io_ticks),
1669 1670 1671 1672 1673
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1674 1675 1676 1677 1678 1679 1680 1681
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1682
			);
L
Linus Torvalds 已提交
1683
	}
1684
	disk_part_iter_exit(&piter);
1685

L
Linus Torvalds 已提交
1686 1687 1688
	return 0;
}

1689
static const struct seq_operations diskstats_op = {
1690 1691 1692
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1693 1694
	.show	= diskstats_show
};
1695 1696 1697

/*
 * Create the "diskstats" and "partitions" seq_file entries at the top of
 * procfs.  The results of proc_create_seq() are not checked; always
 * returns 0.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);

	return 0;
}
module_init(proc_genhd_init);
1703
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1704

1705
dev_t blk_lookup_devt(const char *name, int partno)
1706
{
1707 1708 1709
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1710

1711 1712
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1713
		struct gendisk *disk = dev_to_disk(dev);
T
Tejun Heo 已提交
1714
		struct hd_struct *part;
1715

1716
		if (strcmp(dev_name(dev), name))
1717 1718
			continue;

1719 1720 1721 1722 1723 1724 1725 1726
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
			break;
		}
T
Tejun Heo 已提交
1727
		part = disk_get_part(disk, partno);
1728
		if (part) {
1729
			devt = part_devt(part);
1730
			disk_put_part(part);
T
Tejun Heo 已提交
1731
			break;
1732
		}
T
Tejun Heo 已提交
1733
		disk_put_part(part);
1734
	}
1735
	class_dev_iter_exit(&iter);
1736 1737 1738
	return devt;
}

1739
struct gendisk *__alloc_disk_node(int minors, int node_id)
1740 1741
{
	struct gendisk *disk;
1742
	struct disk_part_tbl *ptbl;
1743

1744 1745
	if (minors > DISK_MAX_PARTS) {
		printk(KERN_ERR
R
Randy Dunlap 已提交
1746
			"block: can't allocate more than %d partitions\n",
1747 1748 1749
			DISK_MAX_PARTS);
		minors = DISK_MAX_PARTS;
	}
1750

1751
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1752 1753
	if (!disk)
		return NULL;
1754

1755 1756 1757
	disk->part0.dkstats = alloc_percpu(struct disk_stats);
	if (!disk->part0.dkstats)
		goto out_free_disk;
T
Tejun Heo 已提交
1758

1759 1760 1761 1762 1763
	init_rwsem(&disk->lookup_sem);
	disk->node_id = node_id;
	if (disk_expand_part_tbl(disk, 0)) {
		free_percpu(disk->part0.dkstats);
		goto out_free_disk;
L
Linus Torvalds 已提交
1764
	}
1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786

	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
	rcu_assign_pointer(ptbl->part[0], &disk->part0);

	/*
	 * set_capacity() and get_capacity() currently don't use
	 * seqcounter to read/update the part0->nr_sects. Still init
	 * the counter as we can read the sectors in IO submission
	 * patch using seqence counters.
	 *
	 * TODO: Ideally set_capacity() and get_capacity() should be
	 * converted to make use of bd_mutex and sequence counters.
	 */
	hd_sects_seq_init(&disk->part0);
	if (hd_ref_init(&disk->part0))
		goto out_free_part0;

	disk->minors = minors;
	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
L
Linus Torvalds 已提交
1787
	return disk;
1788 1789 1790 1791 1792 1793

out_free_part0:
	hd_free_part(&disk->part0);
out_free_disk:
	kfree(disk);
	return NULL;
L
Linus Torvalds 已提交
1794
}
1795
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1796

1797 1798
/**
 * put_disk - decrements the gendisk refcount
1799
 * @disk: the struct gendisk to decrement the refcount for
1800 1801 1802
 *
 * This decrements the refcount for the struct gendisk. When this reaches 0
 * we'll have disk_release() called.
1803 1804 1805
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
1806
 */
L
Linus Torvalds 已提交
1807 1808 1809
void put_disk(struct gendisk *disk)
{
	if (disk)
1810
		kobject_put(&disk_to_dev(disk)->kobj);
L
Linus Torvalds 已提交
1811 1812 1813
}
EXPORT_SYMBOL(put_disk);

1814 1815
/**
 * put_disk_and_module - decrements the module and gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for (NULL is ignored)
 *
 * This is a counterpart of get_disk_and_module() and thus also of
 * get_gendisk().
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk_and_module(struct gendisk *disk)
{
	struct module *owner;

	if (!disk)
		return;

	/* read the owner first: @disk may be gone after put_disk() */
	owner = disk->fops->owner;

	put_disk(disk);
	module_put(owner);
}

1834 1835 1836 1837 1838 1839 1840 1841 1842 1843
/*
 * Send a KOBJ_CHANGE uevent for the disk carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=1";
	char *envp[] = { event, NULL };

	/* the digit is the last character before the terminating NUL */
	event[sizeof(event) - 2] = ro ? '1' : '0';

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

L
Linus Torvalds 已提交
1844 1845
void set_disk_ro(struct gendisk *disk, int flag)
{
1846 1847 1848
	struct disk_part_iter piter;
	struct hd_struct *part;

1849 1850 1851 1852 1853 1854
	if (disk->part0.policy != flag) {
		set_disk_ro_uevent(disk, flag);
		disk->part0.policy = flag;
	}

	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1855 1856 1857
	while ((part = disk_part_iter_next(&piter)))
		part->policy = flag;
	disk_part_iter_exit(&piter);
L
Linus Torvalds 已提交
1858 1859 1860 1861 1862 1863 1864 1865
}

EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
	if (!bdev)
		return 0;
T
Tejun Heo 已提交
1866
	return bdev->bd_part->policy;
L
Linus Torvalds 已提交
1867 1868 1869 1870
}

EXPORT_SYMBOL(bdev_read_only);

1871 1872 1873 1874 1875 1876 1877 1878
/*
 * Disk events - monitor disk events like media change and eject request.
 */
struct disk_events {
	struct list_head	node;		/* all disk_event's */
	struct gendisk		*disk;		/* the associated disk */
	spinlock_t		lock;

1879
	struct mutex		block_mutex;	/* protects blocking */
1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902
	int			block;		/* event blocking depth */
	unsigned int		pending;	/* events already sent out */
	unsigned int		clearing;	/* events being cleared */

	long			poll_msecs;	/* interval, -1 for default */
	struct delayed_work	dwork;
};

/* event names, indexed by the bit number of the event flag */
static const char *disk_events_strs[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
};

/* uevent environment strings, indexed the same way */
static char *disk_uevents[] = {
	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
};

/* global list of every disk_events instance, and the mutex declared for it */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* default poll interval; zero-initialised, i.e. in-kernel polling is off */
static unsigned long disk_events_dfl_poll_msecs;
1904 1905 1906 1907 1908 1909 1910 1911

static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
	struct disk_events *ev = disk->ev;
	long intv_msecs = 0;

	/*
	 * If device-specific poll interval is set, always use it.  If
1912
	 * the default is being used, poll if the POLL flag is set.
1913 1914 1915
	 */
	if (ev->poll_msecs >= 0)
		intv_msecs = ev->poll_msecs;
1916
	else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
1917 1918 1919 1920 1921
		intv_msecs = disk_events_dfl_poll_msecs;

	return msecs_to_jiffies(intv_msecs);
}

1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937
/**
 * disk_block_events - block and flush disk event checking
 * @disk: disk to block events for
 *
 * On return from this function, it is guaranteed that event checking
 * isn't in progress and won't happen until unblocked by
 * disk_unblock_events().  Events blocking is counted and the actual
 * unblocking happens after the matching number of unblocks are done.
 *
 * Note that this intentionally does not block event checking from
 * disk_clear_events().
 *
 * CONTEXT:
 * Might sleep.
 */
void disk_block_events(struct gendisk *disk)
1938 1939 1940 1941 1942
{
	struct disk_events *ev = disk->ev;
	unsigned long flags;
	bool cancel;

1943 1944 1945
	if (!ev)
		return;

1946 1947 1948 1949 1950 1951
	/*
	 * Outer mutex ensures that the first blocker completes canceling
	 * the event work before further blockers are allowed to finish.
	 */
	mutex_lock(&ev->block_mutex);

1952 1953 1954 1955
	spin_lock_irqsave(&ev->lock, flags);
	cancel = !ev->block++;
	spin_unlock_irqrestore(&ev->lock, flags);

1956 1957
	if (cancel)
		cancel_delayed_work_sync(&disk->ev->dwork);
1958 1959

	mutex_unlock(&ev->block_mutex);
1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
}

/*
 * Drop one level of event blocking.  When the depth reaches zero the
 * event work is queued: immediately if @check_now, otherwise after the
 * poll interval if polling is enabled.  Warns once and does nothing if
 * events are not blocked.  The caller must ensure disk->ev is non-NULL.
 */
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
	struct disk_events *ev = disk->ev;
	unsigned long flags;
	unsigned long intv;

	spin_lock_irqsave(&ev->lock, flags);

	if (WARN_ON_ONCE(ev->block <= 0))
		goto out_unlock;

	/* still blocked by someone else */
	if (--ev->block)
		goto out_unlock;

	intv = disk_events_poll_jiffies(disk);
	if (check_now || intv)
		queue_delayed_work(system_freezable_power_efficient_wq,
				&ev->dwork, check_now ? 0 : intv);

out_unlock:
	spin_unlock_irqrestore(&ev->lock, flags);
}

/**
 * disk_unblock_events - unblock disk event checking
 * @disk: disk to unblock events for
 *
 * Undo disk_block_events().  When the block count reaches zero, it
 * starts events polling if configured.
 *
 * CONTEXT:
 * Don't care.  Safe to call from irq context.
 */
void disk_unblock_events(struct gendisk *disk)
{
	if (disk->ev)
2000
		__disk_unblock_events(disk, false);
2001 2002 2003
}

/**
2004 2005 2006
 * disk_flush_events - schedule immediate event checking and flushing
 * @disk: disk to check and flush events for
 * @mask: events to flush
2007
 *
2008 2009 2010
 * Schedule immediate event checking on @disk if not blocked.  Events in
 * @mask are scheduled to be cleared from the driver.  Note that this
 * doesn't clear the events from @disk->ev.
2011 2012
 *
 * CONTEXT:
2013
 * If @mask is non-zero must be called with bdev->bd_mutex held.
2014
 */
2015
void disk_flush_events(struct gendisk *disk, unsigned int mask)
2016
{
2017 2018 2019 2020 2021
	struct disk_events *ev = disk->ev;

	if (!ev)
		return;

2022 2023
	spin_lock_irq(&ev->lock);
	ev->clearing |= mask;
2024
	if (!ev->block)
2025 2026
		mod_delayed_work(system_freezable_power_efficient_wq,
				&ev->dwork, 0);
2027
	spin_unlock_irq(&ev->lock);
2028 2029 2030 2031 2032
}

/**
 * disk_clear_events - synchronously check, clear and return pending events
 * @disk: disk to fetch and clear events from
2033
 * @mask: mask of events to be fetched and cleared
2034 2035 2036 2037 2038 2039 2040
 *
 * Disk events are synchronously checked and pending events in @mask
 * are cleared and returned.  This ignores the block count.
 *
 * CONTEXT:
 * Might sleep.
 */
2041
static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
2042 2043 2044
{
	struct disk_events *ev = disk->ev;
	unsigned int pending;
D
Derek Basehore 已提交
2045
	unsigned int clearing = mask;
2046

2047
	if (!ev)
2048 2049
		return 0;

D
Derek Basehore 已提交
2050 2051 2052 2053 2054 2055 2056
	disk_block_events(disk);

	/*
	 * store the union of mask and ev->clearing on the stack so that the
	 * race with disk_flush_events does not cause ambiguity (ev->clearing
	 * can still be modified even if events are blocked).
	 */
2057
	spin_lock_irq(&ev->lock);
D
Derek Basehore 已提交
2058 2059
	clearing |= ev->clearing;
	ev->clearing = 0;
2060 2061
	spin_unlock_irq(&ev->lock);

D
Derek Basehore 已提交
2062
	disk_check_events(ev, &clearing);
2063
	/*
D
Derek Basehore 已提交
2064 2065
	 * if ev->clearing is not 0, the disk_flush_events got called in the
	 * middle of this function, so we want to run the workfn without delay.
2066
	 */
D
Derek Basehore 已提交
2067
	__disk_unblock_events(disk, ev->clearing ? true : false);
2068 2069 2070 2071 2072 2073

	/* then, fetch and clear pending events */
	spin_lock_irq(&ev->lock);
	pending = ev->pending & mask;
	ev->pending &= ~mask;
	spin_unlock_irq(&ev->lock);
D
Derek Basehore 已提交
2074
	WARN_ON_ONCE(clearing & mask);
2075 2076 2077 2078

	return pending;
}

2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100
/**
 * bdev_check_media_change - check if a removable media has been changed
 * @bdev: block device to check
 *
 * Check whether a removable media has been changed, and attempt to free all
 * dentries and inodes and invalidates all block device page cache entries in
 * that case.
 *
 * Returns %true if the block device changed, or %false if not.
 */
bool bdev_check_media_change(struct block_device *bdev)
{
	unsigned int events;

	events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
				   DISK_EVENT_EJECT_REQUEST);
	if (!(events & DISK_EVENT_MEDIA_CHANGE))
		return false;

	if (__invalidate_device(bdev, true))
		pr_warn("VFS: busy inodes on changed media %s\n",
			bdev->bd_disk->disk_name);
2101
	set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
2102 2103 2104 2105
	return true;
}
EXPORT_SYMBOL(bdev_check_media_change);

/*
 * Work function of the event polling delayed work.  The actual checking is
 * separated out into disk_check_events() so that a different pointer for
 * clearing_ptr can be passed in by disk_clear_events().
 */
static void disk_events_workfn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
D
Derek Basehore 已提交
2114 2115 2116 2117 2118 2119 2120

	disk_check_events(ev, &ev->clearing);
}

static void disk_check_events(struct disk_events *ev,
			      unsigned int *clearing_ptr)
{
2121 2122
	struct gendisk *disk = ev->disk;
	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
D
Derek Basehore 已提交
2123
	unsigned int clearing = *clearing_ptr;
2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135
	unsigned int events;
	unsigned long intv;
	int nr_events = 0, i;

	/* check events */
	events = disk->fops->check_events(disk, clearing);

	/* accumulate pending events and schedule next poll if necessary */
	spin_lock_irq(&ev->lock);

	events &= ~ev->pending;
	ev->pending |= events;
D
Derek Basehore 已提交
2136
	*clearing_ptr &= ~clearing;
2137 2138 2139

	intv = disk_events_poll_jiffies(disk);
	if (!ev->block && intv)
2140 2141
		queue_delayed_work(system_freezable_power_efficient_wq,
				&ev->dwork, intv);
2142 2143 2144

	spin_unlock_irq(&ev->lock);

2145 2146
	/*
	 * Tell userland about new events.  Only the events listed in
2147 2148 2149
	 * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
	 * is set. Otherwise, events are processed internally but never
	 * get reported to userland.
2150
	 */
2151
	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
2152 2153
		if ((events & disk->events & (1 << i)) &&
		    (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165
			envp[nr_events++] = disk_uevents[i];

	if (nr_events)
		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}

/*
 * A disk events enabled device has the following sysfs nodes under
 * its /sys/block/X/ directory.
 *
 * events		: list of all supported events
 * events_async		: list of events which can be detected w/o polling
 *			  (always empty, only for backwards compatibility)
 * events_poll_msecs	: polling interval, 0: disable, -1: system default
 */
/*
 * Format the names of the events set in @events into @buf as a single
 * space-separated line terminated by a newline.  Nothing is written when
 * no known event bit is set.  Returns the number of bytes written.
 */
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
	const char *sep = "";
	ssize_t len = 0;
	int bit;

	for (bit = 0; bit < ARRAY_SIZE(disk_events_strs); bit++) {
		if (!(events & (1 << bit)))
			continue;

		len += sprintf(buf + len, "%s%s", sep, disk_events_strs[bit]);
		/* every name after the first one is preceded by a space */
		sep = " ";
	}

	/* terminate the line only if at least one name was emitted */
	if (len)
		len += sprintf(buf + len, "\n");

	return len;
}

static ssize_t disk_events_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

2191 2192 2193
	if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
		return 0;

2194 2195 2196 2197 2198 2199
	return __disk_events_show(disk->events, buf);
}

/*
 * sysfs show handler for "events_async": always produces empty output.
 */
static ssize_t disk_events_async_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
{
	return 0;
}

static ssize_t disk_events_poll_msecs_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

2209 2210 2211
	if (!disk->ev)
		return sprintf(buf, "-1\n");

2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227
	return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}

/*
 * sysfs store handler for "events_poll_msecs".
 *
 * Accepts a decimal interval that is either non-negative or exactly -1.
 * Returns @count on success, -EINVAL for unparsable or out-of-range input
 * and -ENODEV when the disk has no event state.
 */
static ssize_t disk_events_poll_msecs_store(struct device *dev,
					    struct device_attribute *attr,
					    const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	long intv;

	/* require exactly one successful conversion */
	if (!count || sscanf(buf, "%ld", &intv) != 1)
		return -EINVAL;

	if (intv < 0 && intv != -1)
		return -EINVAL;

	if (!disk->ev)
		return -ENODEV;

	/* update the interval while events are blocked, then unblock */
	disk_block_events(disk);
	disk->ev->poll_msecs = intv;
	__disk_unblock_events(disk, true);

	return count;
}

2238 2239 2240
/* sysfs attributes of the disk events interface */
static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, 0644,
			 disk_events_poll_msecs_show,
			 disk_events_poll_msecs_store);

/* NULL-terminated list handed to sysfs_create_files()/sysfs_remove_files() */
static const struct attribute *disk_events_attrs[] = {
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
	NULL,
};

/*
 * The default polling interval can be specified by the kernel
 * parameter block.events_dfl_poll_msecs which defaults to 0
 * (disable).  This can also be modified at runtime by writing to
 * /sys/module/block/parameters/events_dfl_poll_msecs.
 */
static int disk_events_set_dfl_poll_msecs(const char *val,
					  const struct kernel_param *kp)
{
	struct disk_events *ev;
	int ret;

	ret = param_set_ulong(val, kp);
	if (ret < 0)
		return ret;

	mutex_lock(&disk_events_mutex);

	list_for_each_entry(ev, &disk_events, node)
2270
		disk_flush_events(ev->disk, 0);
2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288

	mutex_unlock(&disk_events_mutex);

	return 0;
}

/*
 * Parameter operations for events_dfl_poll_msecs: writes go through
 * disk_events_set_dfl_poll_msecs(), reads use the stock unsigned long getter.
 */
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
	.set	= disk_events_set_dfl_poll_msecs,
	.get	= param_get_ulong,
};

/* register the parameter under the "block." prefix */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX	"block."

module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
		&disk_events_dfl_poll_msecs, 0644);

/*
 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
 */
static void disk_alloc_events(struct gendisk *disk)
2292 2293 2294
{
	struct disk_events *ev;

2295
	if (!disk->fops->check_events || !disk->events)
2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306
		return;

	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev) {
		pr_warn("%s: failed to initialize events\n", disk->disk_name);
		return;
	}

	INIT_LIST_HEAD(&ev->node);
	ev->disk = disk;
	spin_lock_init(&ev->lock);
2307
	mutex_init(&ev->block_mutex);
2308 2309 2310 2311
	ev->block = 1;
	ev->poll_msecs = -1;
	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);

2312 2313 2314 2315 2316 2317 2318 2319 2320 2321
	disk->ev = ev;
}

static void disk_add_events(struct gendisk *disk)
{
	/* FIXME: error handling */
	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
		pr_warn("%s: failed to create sysfs files for events\n",
			disk->disk_name);

2322 2323 2324
	if (!disk->ev)
		return;

2325
	mutex_lock(&disk_events_mutex);
2326
	list_add_tail(&disk->ev->node, &disk_events);
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337
	mutex_unlock(&disk_events_mutex);

	/*
	 * Block count is initialized to 1 and the following initial
	 * unblock kicks it into action.
	 */
	__disk_unblock_events(disk, true);
}

static void disk_del_events(struct gendisk *disk)
{
2338 2339
	if (disk->ev) {
		disk_block_events(disk);
2340

2341 2342 2343 2344
		mutex_lock(&disk_events_mutex);
		list_del_init(&disk->ev->node);
		mutex_unlock(&disk_events_mutex);
	}
2345 2346 2347 2348 2349 2350 2351 2352 2353 2354

	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
}

static void disk_release_events(struct gendisk *disk)
{
	/* the block count should be 1 from disk_del_events() */
	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
	kfree(disk->ev);
}