genhd.c 34.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3
/*
 *  gendisk handling
C
Christoph Hellwig 已提交
4 5
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
L
Linus Torvalds 已提交
6 7 8
 */

#include <linux/module.h>
9
#include <linux/ctype.h>
L
Linus Torvalds 已提交
10 11
#include <linux/fs.h>
#include <linux/genhd.h>
12
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
13 14
#include <linux/kernel.h>
#include <linux/blkdev.h>
15
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
16 17
#include <linux/init.h>
#include <linux/spinlock.h>
18
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
19 20 21
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
22
#include <linux/mutex.h>
T
Tejun Heo 已提交
23
#include <linux/idr.h>
24
#include <linux/log2.h>
25
#include <linux/pm_runtime.h>
26
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
27

28 29
#include "blk.h"

30
static struct kobject *block_depr;
L
Linus Torvalds 已提交
31

M
Matteo Croce 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*
 * Unique, monotonically increasing sequential number associated with block
 * devices instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

T
Tejun Heo 已提交
49
/* for extended dynamic devt allocation, currently only one major is used */
50
#define NR_EXT_DEVT		(1 << MINORBITS)
51
static DEFINE_IDA(ext_devt_ida);
T
Tejun Heo 已提交
52

53 54
/*
 * Record a new capacity for @disk.  @sectors is converted to bytes by
 * shifting with SECTOR_SHIFT and written as the size of the inode behind
 * the disk's part0 block device, under that device's bd_size_lock.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
	struct block_device *whole = disk->part0;
	loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&whole->bd_size_lock);
	i_size_write(whole->bd_inode, bytes);
	spin_unlock(&whole->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);

63
/*
64 65
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
66
 */
67
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
	char *envp[] = { "RESIZE=1", NULL };
	sector_t old_size = get_capacity(disk);

	/* The new capacity is always stored, whether or not we notify. */
	set_capacity(disk, size);

	/*
	 * Stay silent (no log line, no uevent) when the size did not change,
	 * or when the gendisk is not live or is hidden.  This keeps the log
	 * and udev quiet while the initial capacity is set during probing.
	 */
	if (size == old_size)
		return false;
	if (!disk_live(disk))
		return false;
	if (disk->flags & GENHD_FL_HIDDEN)
		return false;

	pr_info("%s: detected capacity change from %lld to %lld\n",
		disk->disk_name, old_size, size);

	/*
	 * Historically no uevent was sent for changes to or from an empty
	 * device, so those transitions are only logged.
	 */
	if (old_size && size) {
		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
		return true;
	}
	return false;
}
96
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
97

98
/*
C
Christoph Hellwig 已提交
99 100 101 102 103
 * Format the device name of the indicated block device into the supplied buffer
 * and return a pointer to that same buffer for convenience.
 *
 * Note: do not use this in new code, use the %pg specifier to sprintf and
 * printk instead.
104
 */
C
Christoph Hellwig 已提交
105
const char *bdevname(struct block_device *bdev, char *buf)
106
{
C
Christoph Hellwig 已提交
107 108 109
	struct gendisk *hd = bdev->bd_disk;
	int partno = bdev->bd_partno;

110 111 112 113 114 115 116 117 118 119
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}
EXPORT_SYMBOL(bdevname);
120

121 122
static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
123 124 125 126 127
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
128
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
129 130 131 132 133 134 135 136 137 138 139 140 141
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

142
static unsigned int part_in_flight(struct block_device *part)
143
{
144
	unsigned int inflight = 0;
145
	int cpu;
146

147
	for_each_possible_cpu(cpu) {
148 149
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
150
	}
151 152
	if ((int)inflight < 0)
		inflight = 0;
153

154
	return inflight;
155 156
}

157 158
static void part_in_flight_rw(struct block_device *part,
		unsigned int inflight[2])
159
{
160 161 162 163 164 165 166 167 168 169 170 171
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
172 173
}

L
Linus Torvalds 已提交
174 175 176 177
/*
 * Can be deleted altogether. Later.
 *
 */
178
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
179 180 181 182
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
183
	void (*probe)(dev_t devt);
184
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
185
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
186 187

/* index in the above - for now: assume no multimajor ranges */
188
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
189
{
190
	return major % BLKDEV_MAJOR_HASH_SIZE;
191 192
}

193
#ifdef CONFIG_PROC_FS
194
/*
 * Print a "major name" line to @seqf for every registered block major equal
 * to @offset.  The hash chain is walked with major_names_lock held.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
204
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
205

206
/**
207
 * __register_blkdev - register a new block device
208
 *
209 210
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
211
 * @name: the name of the new block device as a zero terminated string
212
 * @probe: callback that is called on access to any minor number of @major
213 214 215
 *
 * The @name must be unique within the system.
 *
216 217
 * The return value depends on the @major input parameter:
 *
218 219
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
220
 *  - if any unused major number was requested with @major = 0 parameter
221
 *    then the return value is the allocated major number in range
222 223 224 225
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
226 227
 *
 * Use register_blkdev instead for any new code.
228
 */
229 230
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		/*
		 * Dynamic allocation: scan the hash table from the top for an
		 * empty slot.  Slot 0 is never handed out, so ending the scan
		 * at index 0 means no free major was found.
		 */
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			printk("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		/* For dynamic allocation the major itself is the return value. */
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	p->probe = probe;
	/*
	 * strscpy() always NUL-terminates and, unlike the deprecated
	 * strlcpy(), does not read @name beyond what fits in p->name.
	 */
	strscpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	/* Walk the hash chain; stop early if this major is already taken. */
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;		/* reached the end of the chain: append */
	else
		ret = -EBUSY;	/* major already registered */

	if (ret < 0) {
		printk("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
292
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
293

A
Akinobu Mita 已提交
294
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
295 296 297 298 299
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
300
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
301 302 303
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
304 305 306
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
307 308 309
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
310
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
311 312 313 314 315
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
 * is enabled.  Mangling twice gives the original value.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int bit;

	/*
	 * Mirror the low MINORBITS bits: swap bit 'bit' with bit
	 * 'MINORBITS - 1 - bit' for each bit in the lower half.
	 */
	for (bit = 0; bit < MINORBITS / 2; bit++) {
		int mirror = MINORBITS - 1 - bit;
		int shift = mirror - bit;
		int lo = minor & (1 << bit);
		int hi = minor & (1 << mirror);

		minor &= ~(lo | hi);			/* drop both bits */
		minor |= (lo << shift) | (hi >> shift);	/* put them back swapped */
	}
#endif
	/* Without CONFIG_DEBUG_BLOCK_EXT_DEVT the minor is returned unchanged. */
	return minor;
}

348
int blk_alloc_ext_minor(void)
T
Tejun Heo 已提交
349
{
T
Tejun Heo 已提交
350
	int idx;
T
Tejun Heo 已提交
351

352
	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
353 354 355 356 357 358
	if (idx < 0) {
		if (idx == -ENOSPC)
			return -EBUSY;
		return idx;
	}
	return blk_mangle_minor(idx);
T
Tejun Heo 已提交
359 360
}

361
void blk_free_ext_minor(unsigned int minor)
T
Tejun Heo 已提交
362
{
363
	ida_free(&ext_devt_ida, blk_mangle_minor(minor));
Y
Yufen Yu 已提交
364 365
}

366 367 368 369 370 371 372 373 374 375 376 377
/*
 * Format @devt as hex into @buf (at most BDEVT_SIZE bytes) and return @buf.
 *
 * When both major and minor fit in 8 bits the short "MMmm" form is used,
 * left-justified and padded to 9 characters; otherwise the "MMM:mmmmm" form
 * is used.
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
	return buf;
}

378 379 380
/*
 * Send the uevent @action for every block device in @disk's partition table,
 * skipping partitions that have zero sectors.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *bdev;
	unsigned long index;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, index, bdev) {
		/* Zero-sized partitions are not announced. */
		if (bdev_is_partition(bdev) && !bdev_nr_sectors(bdev))
			continue;
		/* Skip entries for which no kobject reference can be taken. */
		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;

		/*
		 * The reference taken above is held while the RCU read lock
		 * is dropped around the uevent call, and released afterwards.
		 */
		rcu_read_unlock();
		kobject_uevent(bdev_kobj(bdev), action);
		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

399 400 401 402 403 404 405 406 407 408 409 410 411
/*
 * Request a partition scan of @disk: set GD_NEED_PART_SCAN, then open the
 * whole device for reading and release it again straight away.  Nothing is
 * done for a disk with zero capacity or with partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *bdev;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);

	/* NOTE(review): presumably the open path performs the scan — confirm. */
	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return;

	blkdev_put(bdev, FMODE_READ);
}

412 413
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
414 415 416 417
{
	struct device *ddev = disk_to_dev(disk);
	int err;

418
	ddev->parent = parent;
419

420
	dev_set_name(ddev, "%s", disk->disk_name);
421 422 423 424

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

425 426 427 428
	if (groups) {
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
429 430 431 432 433 434 435 436 437 438
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}
439 440 441 442 443 444 445 446

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

447 448
	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
449 450
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

451 452 453 454 455 456 457 458 459 460
	/*
	 * XXX: this is a mess, can't wait for real error handling in add_disk.
	 * Make sure ->slave_dir is NULL if we failed some of the registration
	 * so that the cleanup in bd_unlink_disk_holder works properly.
	 */
	if (bd_register_pending_holders(disk) < 0) {
		kobject_put(disk->slave_dir);
		disk->slave_dir = NULL;
	}

461
	if (disk->flags & GENHD_FL_HIDDEN)
462 463
		return;

464
	disk_scan_partitions(disk);
465

466
	/* announce the disk and partitions after all partitions are created */
467
	dev_set_uevent_suppress(ddev, 0);
468
	disk_uevent(disk, KOBJ_ADD);
469

470 471 472
	if (disk->bdi->dev) {
		err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj,
					"bdi");
473 474
		WARN_ON(err);
	}
475 476
}

L
Linus Torvalds 已提交
477
/**
478
 * device_add_disk - add disk information to kernel list
479
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
480
 * @disk: per-device partitioning information
481
 * @groups: Additional per-device sysfs groups
L
Linus Torvalds 已提交
482 483 484
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
485 486
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
487
 */
488 489 490 491

void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

L
Linus Torvalds 已提交
492
{
493
	int ret;
494

495 496 497 498 499 500
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
501
	elevator_init_mq(disk->queue);
502

503 504 505 506 507 508
	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minors numbers supported, and those will be used to
	 * setup the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
509
	 */
510 511
	if (disk->major) {
		WARN_ON(!disk->minors);
512 513 514 515 516 517

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
518 519
	} else {
		WARN_ON(disk->minors);
520

521 522 523 524 525 526 527
		ret = blk_alloc_ext_minor();
		if (ret < 0) {
			WARN_ON(1);
			return;
		}
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
528
		disk->flags |= GENHD_FL_EXT_DEVT;
529
	}
530

531 532
	disk_alloc_events(disk);

533 534 535 536 537 538 539 540
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
541
		struct device *dev = disk_to_dev(disk);
542

543
		/* Register BDI before referencing it from bdev */
544
		dev->devt = MKDEV(disk->major, disk->first_minor);
545
		ret = bdi_register(disk->bdi, "%u:%u",
546
				   disk->major, disk->first_minor);
547
		WARN_ON(ret);
548
		bdi_set_owner(disk->bdi, dev);
549
		bdev_add(disk->part0, dev->devt);
550
	}
551
	register_disk(parent, disk, groups);
552
	blk_register_queue(disk);
553

554 555 556 557
	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
558 559 560 561
	if (blk_get_queue(disk->queue))
		set_bit(GD_QUEUE_REF, &disk->state);
	else
		WARN_ON_ONCE(1);
562

563
	disk_add_events(disk);
564
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
565
}
566
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
567

568 569 570 571 572 573 574 575 576 577 578 579 580
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
581 582 583 584 585
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
586
 */
587
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
588
{
589 590
	might_sleep();

591 592 593
	if (WARN_ON_ONCE(!disk->queue))
		return;

594
	blk_integrity_del(disk);
595 596
	disk_del_events(disk);

597
	mutex_lock(&disk->open_mutex);
598
	remove_inode_hash(disk->part0->bd_inode);
599
	blk_drop_partitions(disk);
600
	mutex_unlock(&disk->open_mutex);
601

602 603 604
	fsync_bdev(disk->part0);
	__invalidate_device(disk->part0, true);

605 606
	set_capacity(disk, 0);

607
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
608
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
609

610 611 612 613
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
614
		bdi_unregister(disk->bdi);
615
	}
616

617
	blk_unregister_queue(disk);
618

619
	kobject_put(disk->part0->bd_holder_dir);
620 621
	kobject_put(disk->slave_dir);

622
	part_stat_set_all(disk->part0, 0);
623
	disk->part0->bd_stamp = 0;
624 625
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
626
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
627
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
628
}
629
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
630

631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655
/* sysfs access to bad-blocks list. */
/*
 * sysfs show handler for "badblocks".  A disk without a bad-blocks list
 * yields a single newline; otherwise the list is rendered by
 * badblocks_show().
 */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/*
 * sysfs store handler for "badblocks".  Returns -ENXIO when the disk has no
 * bad-blocks list; otherwise the input is handed to badblocks_store().
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

656
/*
 * Try to make a driver for @devt available.  If the major of @devt is
 * registered with a probe callback, that callback is invoked (with
 * major_names_lock held) and nothing else is done.  Otherwise a module is
 * requested by alias.
 */
void blk_request_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	for (entry = major_names[major_to_index(major)]; entry;
	     entry = entry->next) {
		if (entry->major != major || !entry->probe)
			continue;

		entry->probe(devt);
		mutex_unlock(&major_names_lock);
		return;
	}
	mutex_unlock(&major_names_lock);

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

676 677 678 679 680 681 682
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct block_device *part;
		char devt_buf[BDEVT_SIZE];
		unsigned long idx;

		/* Don't show empty devices or things that have been suppressed. */
		if (get_capacity(disk) == 0)
			continue;
		if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
			continue;

		/*
		 * Note, unlike /proc/partitions, the numbers are shown in
		 * hex - the same format as the root= option takes.
		 */
		rcu_read_lock();
		xa_for_each(&disk->part_tbl, idx, part) {
			bool is_part;

			if (!bdev_nr_sectors(part))
				continue;

			is_part = bdev_is_partition(part);

			/* Partitions are indented below their whole disk. */
			printk("%s%s %10llu %pg %s",
			       is_part ? "  " : "",
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1, part,
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");

			/* Only the whole-disk line carries the driver name. */
			if (is_part)
				printk("\n");
			else if (dev->parent && dev->parent->driver)
				printk(" driver: %s\n",
					dev->parent->driver->name);
			else
				printk(" (driver?)\n");
		}
		rcu_read_unlock();
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
728 729
#ifdef CONFIG_PROC_FS
/* iterator */
730
/*
 * seq_file ->start: allocate a class iterator over the disks in block_class,
 * store it in seqf->private and advance it to the entry at position *pos.
 * Returns the gendisk at that position, NULL if the iterator runs out first,
 * or ERR_PTR(-ENOMEM) if the iterator cannot be allocated.
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter;
	struct device *dev;
	loff_t skip = *pos;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	/* Stored before iterating so disk_seqf_stop() can free it. */
	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);

	/* Fetch one entry, then one more for each position to skip. */
	for (;;) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!skip--)
			break;
	}

	return dev_to_disk(dev);
}

751
/*
 * seq_file ->next: bump *pos and return the next gendisk from the iterator
 * kept in seqf->private, or NULL when there are no more devices.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	(*pos)++;
	dev = class_dev_iter_next(seqf->private);
	if (!dev)
		return NULL;

	return dev_to_disk(dev);
}

763
static void disk_seqf_stop(struct seq_file *seqf, void *v)
764
{
765
	struct class_dev_iter *iter = seqf->private;
766

767 768 769 770
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
771
		seqf->private = NULL;
772
	}
L
Linus Torvalds 已提交
773 774
}

775
/*
 * seq_file ->start for the partition listing: same as disk_seqf_start(), and
 * it also prints the column header when a valid entry is returned at
 * position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *first = disk_seqf_start(seqf, pos);

	if (!IS_ERR_OR_NULL(first) && !*pos)
		seq_puts(seqf, "major minor  #blocks  name\n\n");

	return first;
}

785
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
786 787
{
	struct gendisk *sgp = v;
788
	struct block_device *part;
789
	unsigned long idx;
L
Linus Torvalds 已提交
790 791

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
792
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
793
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
794 795 796 797
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

798 799 800 801
	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
802
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
803
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
804
			   bdev_nr_sectors(part) >> 1, part);
805 806
	}
	rcu_read_unlock();
L
Linus Torvalds 已提交
807 808 809
	return 0;
}

810
static const struct seq_operations partitions_op = {
811 812 813
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
814
	.show	= show_partition
L
Linus Torvalds 已提交
815 816 817 818 819
};
#endif

/*
 * Subsystem init: register block_class, run blk_dev_init(), register the
 * "blkext" major and, unless sysfs_deprecated is set, create the top-level
 * "block" kobject.  Returns the class_register() error if that fails,
 * otherwise 0.
 */
static int __init genhd_device_init(void)
{
	int error;

	block_class.dev_kobj = sysfs_dev_block_kobj;
	error = class_register(&block_class);
	if (unlikely(error))
		return error;

	blk_dev_init();

	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (sysfs_deprecated)
		return 0;

	block_depr = kobject_create_and_add("block", NULL);
	return 0;
}

subsys_initcall(genhd_device_init);

838 839
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
840
{
841
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
842

843
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
844 845
}

846 847 848 849 850
/* sysfs "ext_range": prints the value of disk_max_parts() for the disk. */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n", disk_max_parts(disk));
}

854 855
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
856
{
857
	struct gendisk *disk = dev_to_disk(dev);
858

859 860
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
861 862
}

863 864 865 866 867 868 869 870 871
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}

K
Kay Sievers 已提交
872 873 874 875 876
/* sysfs "ro": prints 1 if get_disk_ro() is non-zero for the disk, else 0. */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int read_only = !!get_disk_ro(disk);

	return sprintf(buf, "%d\n", read_only);
}

880 881 882
/* sysfs "size": prints the block device's size in sectors. */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
889 890
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
891
	struct disk_stats stat;
892 893
	unsigned int inflight;

894
	part_stat_read_all(bdev, &stat);
895
	if (queue_is_mq(q))
896
		inflight = blk_mq_in_flight(q, bdev);
897
	else
898
		inflight = part_in_flight(bdev);
899

900 901 902 903 904 905 906
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
907 908 909 910 911 912 913 914
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
915
		inflight,
916
		jiffies_to_msecs(stat.io_ticks),
917 918 919 920 921
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
922 923 924 925 926 927
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
928 929 930 931 932
}

ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
933 934
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
935 936
	unsigned int inflight[2];

937
	if (queue_is_mq(q))
938
		blk_mq_in_flight_rw(q, bdev, inflight);
939
	else
940
		part_in_flight_rw(bdev, inflight);
941

942 943 944
	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

945 946
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
947
{
948 949 950
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
951
}
952

953 954 955 956 957 958 959 960 961
/* sysfs "alignment_offset": prints queue_alignment_offset() of the disk's queue. */
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int offset = queue_alignment_offset(disk->queue);

	return sprintf(buf, "%d\n", offset);
}

962 963 964 965 966 967
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

968
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
969 970
}

M
Matteo Croce 已提交
971 972 973 974 975 976 977 978
static ssize_t diskseq_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%llu\n", disk->diskseq);
}

979 980 981 982 983 984 985 986 987 988 989 990
/*
 * sysfs attributes of a disk device.  All are read-only (0444) except
 * "badblocks", which is 0644 and also has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
M
Matteo Croce 已提交
991
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
992

993
#ifdef CONFIG_FAIL_MAKE_REQUEST
994 995 996
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
997
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
998 999 1000 1001 1002 1003 1004 1005 1006
}

ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int i;

	if (count > 0 && sscanf(buf, "%d", &i) > 0)
1007
		dev_to_bdev(dev)->bd_make_it_fail = i;
1008 1009 1010 1011

	return count;
}

1012
static struct device_attribute dev_attr_fail =
1013
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1014 1015
#endif /* CONFIG_FAIL_MAKE_REQUEST */

1016 1017
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
1018
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
1019
#endif
1020 1021 1022

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1023
	&dev_attr_ext_range.attr,
1024
	&dev_attr_removable.attr,
1025
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1026
	&dev_attr_ro.attr,
1027
	&dev_attr_size.attr,
1028
	&dev_attr_alignment_offset.attr,
1029
	&dev_attr_discard_alignment.attr,
1030 1031
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1032
	&dev_attr_inflight.attr,
1033
	&dev_attr_badblocks.attr,
1034 1035 1036
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
M
Matteo Croce 已提交
1037
	&dev_attr_diskseq.attr,
1038 1039
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1040 1041 1042
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1043 1044 1045 1046
#endif
	NULL
};

1047 1048 1049 1050 1051 1052 1053 1054 1055 1056
/*
 * .is_visible callback of disk_attr_group.
 *
 * Returns 0 (attribute hidden) for the "badblocks" attribute when the disk
 * has no disk->bb set; every other attribute keeps the mode it was declared
 * with.
 */
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	return (a == &dev_attr_badblocks.attr && !disk->bb) ? 0 : a->mode;
}

1057 1058
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1059
	.is_visible = disk_visible,
1060 1061
};

1062
static const struct attribute_group *disk_attr_groups[] = {
1063 1064
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1065 1066
};

1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1078 1079
 *
 * Context: can sleep
1080
 */
1081
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1082
{
1083 1084
	struct gendisk *disk = dev_to_disk(dev);

1085 1086
	might_sleep();

1087
	disk_release_events(disk);
L
Linus Torvalds 已提交
1088
	kfree(disk->random);
1089
	xa_destroy(&disk->part_tbl);
1090
	if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
1091
		blk_put_queue(disk->queue);
C
Christoph Hellwig 已提交
1092
	iput(disk->part0->bd_inode);	/* frees the disk */
L
Linus Torvalds 已提交
1093
}
1094 1095 1096 1097 1098 1099 1100 1101

static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

1102 1103
struct class block_class = {
	.name		= "block",
1104
	.dev_uevent	= block_uevent,
L
Linus Torvalds 已提交
1105 1106
};

1107
static char *block_devnode(struct device *dev, umode_t *mode,
1108
			   kuid_t *uid, kgid_t *gid)
1109 1110 1111
{
	struct gendisk *disk = dev_to_disk(dev);

1112 1113
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1114 1115 1116
	return NULL;
}

1117
const struct device_type disk_type = {
1118 1119 1120
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1121
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1122 1123
};

1124
#ifdef CONFIG_PROC_FS
1125 1126 1127 1128 1129 1130 1131 1132
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1133 1134
{
	struct gendisk *gp = v;
1135
	struct block_device *hd;
1136
	unsigned int inflight;
1137
	struct disk_stats stat;
1138
	unsigned long idx;
L
Linus Torvalds 已提交
1139 1140

	/*
1141
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1142
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1143 1144 1145 1146
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1147

1148 1149 1150 1151
	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
1152
		part_stat_read_all(hd, &stat);
1153
		if (queue_is_mq(gp->queue))
1154
			inflight = blk_mq_in_flight(gp->queue, hd);
1155
		else
1156
			inflight = part_in_flight(hd);
1157

1158
		seq_printf(seqf, "%4d %7d %pg "
1159 1160 1161
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1162 1163 1164
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1165
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1176
			   inflight,
1177
			   jiffies_to_msecs(stat.io_ticks),
1178 1179 1180 1181 1182
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1183 1184 1185 1186 1187 1188 1189 1190
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1191
			);
L
Linus Torvalds 已提交
1192
	}
1193
	rcu_read_unlock();
1194

L
Linus Torvalds 已提交
1195 1196 1197
	return 0;
}

1198
static const struct seq_operations diskstats_op = {
1199 1200 1201
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1202 1203
	.show	= diskstats_show
};
1204 1205 1206

/*
 * Create the "diskstats" and "partitions" proc entries.  The results of
 * proc_create_seq() are not checked; the function always returns 0.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);
	return 0;
}
module_init(proc_genhd_init);
1212
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1213

1214 1215
/*
 * Return the device number of partition @partno of @disk, or 0 if the
 * partition table has no entry at that index.  The lookup is done inside an
 * RCU read-side critical section.
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *part;
	dev_t devt = 0;

	rcu_read_lock();
	part = xa_load(&disk->part_tbl, partno);
	if (part)
		devt = part->bd_dev;
	rcu_read_unlock();

	return devt;
}

1228
dev_t blk_lookup_devt(const char *name, int partno)
1229
{
1230 1231 1232
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1233

1234 1235
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1236 1237
		struct gendisk *disk = dev_to_disk(dev);

1238
		if (strcmp(dev_name(dev), name))
1239 1240
			continue;

1241 1242 1243 1244 1245 1246
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
1247 1248 1249 1250
		} else {
			devt = part_devt(disk, partno);
			if (devt)
				break;
1251
		}
1252
	}
1253
	class_dev_iter_exit(&iter);
1254 1255 1256
	return devt;
}

1257 1258
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass)
1259 1260 1261
{
	struct gendisk *disk;

1262
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1263 1264
	if (!disk)
		return NULL;
1265

1266 1267 1268 1269
	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_disk;

1270 1271
	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
1272
		goto out_free_bdi;
1273

1274
	disk->node_id = node_id;
1275
	mutex_init(&disk->open_mutex);
1276 1277 1278
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;
1279 1280 1281 1282 1283

	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
M
Matteo Croce 已提交
1284
	inc_diskseq(disk);
1285
	disk->queue = q;
1286
	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
1287 1288 1289
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
L
Linus Torvalds 已提交
1290
	return disk;
1291

1292 1293
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
C
Christoph Hellwig 已提交
1294
	iput(disk->part0->bd_inode);
1295 1296
out_free_bdi:
	bdi_put(disk->bdi);
1297 1298 1299
out_free_disk:
	kfree(disk);
	return NULL;
L
Linus Torvalds 已提交
1300
}
1301
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1302

1303
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
1304 1305 1306 1307 1308 1309 1310 1311
{
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_alloc_queue(node);
	if (!q)
		return NULL;

1312
	disk = __alloc_disk_node(q, node, lkclass);
1313 1314 1315 1316 1317 1318 1319 1320
	if (!disk) {
		blk_cleanup_queue(q);
		return NULL;
	}
	return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);

1321 1322
/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for, may be %NULL
 *
 * This decrements the refcount for the struct gendisk. When this reaches 0
 * we'll have disk_release() called. A %NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (disk)
		put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353
/**
 * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
 * @disk: gendisk to shutdown
 *
 * Cleans up the request queue hanging off @disk with blk_cleanup_queue()
 * (mark it DYING, drain all pending requests, mark it DEAD, destroy and put
 * it), then drops the caller's reference on the gendisk.
 *
 * Context: can sleep
 */
void blk_cleanup_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	blk_cleanup_queue(q);
	put_disk(disk);
}
EXPORT_SYMBOL(blk_cleanup_disk);

1354 1355 1356 1357 1358 1359 1360 1361 1362 1363
/*
 * Send a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char ro_var[] = "DISK_RO=1";
	char *uevent_env[] = { ro_var, NULL };

	/* The flag digit is the last character before the terminating NUL. */
	if (!ro)
		ro_var[sizeof(ro_var) - 2] = '0';

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, uevent_env);
}

1364 1365 1366
/**
 * set_disk_ro - set a gendisk read-only
 * @disk:	gendisk to operate on
 * @read_only:	%true to set the disk read-only, %false set the disk read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 *
 * A DISK_RO change uevent is sent only when the GD_READ_ONLY state actually
 * changes; setting the state the disk already has is a no-op.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	if (read_only) {
		/* Already read-only: nothing changed, no uevent. */
		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
			return;
	} else {
		/* Already read/write: nothing changed, no uevent. */
		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
			return;
	}
	set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
1388
	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
L
Linus Torvalds 已提交
1389 1390
}
EXPORT_SYMBOL(bdev_read_only);
M
Matteo Croce 已提交
1391 1392 1393 1394 1395

void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}