genhd.c 34.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3
/*
 *  gendisk handling
C
Christoph Hellwig 已提交
4 5
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
L
Linus Torvalds 已提交
6 7 8
 */

#include <linux/module.h>
9
#include <linux/ctype.h>
L
Linus Torvalds 已提交
10 11
#include <linux/fs.h>
#include <linux/genhd.h>
12
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
13 14
#include <linux/kernel.h>
#include <linux/blkdev.h>
15
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
16 17
#include <linux/init.h>
#include <linux/spinlock.h>
18
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
19 20 21
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
22
#include <linux/mutex.h>
T
Tejun Heo 已提交
23
#include <linux/idr.h>
24
#include <linux/log2.h>
25
#include <linux/pm_runtime.h>
26
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
27

28 29
#include "blk.h"

30
/*
 * Top-level "block" kobject.  It is created by genhd_device_init() when
 * sysfs_deprecated is not set, and register_disk()/del_gendisk() add and
 * remove one symlink per disk underneath it.
 */
static struct kobject *block_depr;
L
Linus Torvalds 已提交
31

M
Matteo Croce 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*
 * Unique, monotonically increasing sequential number associated with block
 * devices instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

T
Tejun Heo 已提交
49
/* for extended dynamic devt allocation, currently only one major is used */
50
#define NR_EXT_DEVT		(1 << MINORBITS)
51
static DEFINE_IDA(ext_devt_ida);
T
Tejun Heo 已提交
52

53 54
/*
 * Record a new size for @disk.  @sectors is converted to bytes with
 * SECTOR_SHIFT and written to the inode of the whole-device block device
 * (part0) while holding its bd_size_lock.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
	struct block_device *whole = disk->part0;
	loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&whole->bd_size_lock);
	i_size_write(whole->bd_inode, bytes);
	spin_unlock(&whole->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);

63
/*
64 65
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
66
 */
67
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
68 69
{
	sector_t capacity = get_capacity(disk);
70
	char *envp[] = { "RESIZE=1", NULL };
71 72 73

	set_capacity(disk, size);

74 75 76 77 78 79 80 81
	/*
	 * Only print a message and send a uevent if the gendisk is user visible
	 * and alive.  This avoids spamming the log and udev when setting the
	 * initial capacity during probing.
	 */
	if (size == capacity ||
	    (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
		return false;
82

83
	pr_info("%s: detected capacity change from %lld to %lld\n",
M
Ming Lei 已提交
84
		disk->disk_name, capacity, size);
85

86 87 88 89 90 91 92 93
	/*
	 * Historically we did not send a uevent for changes to/from an empty
	 * device.
	 */
	if (!capacity || !size)
		return false;
	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
	return true;
94
}
95
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
96

97
/*
C
Christoph Hellwig 已提交
98 99 100 101 102
 * Format the device name of the indicated block device into the supplied buffer
 * and return a pointer to that same buffer for convenience.
 *
 * Note: do not use this in new code, use the %pg specifier to sprintf and
 * printk insted.
103
 */
C
Christoph Hellwig 已提交
104
const char *bdevname(struct block_device *bdev, char *buf)
105
{
C
Christoph Hellwig 已提交
106 107 108
	struct gendisk *hd = bdev->bd_disk;
	int partno = bdev->bd_partno;

109 110 111 112 113 114 115 116 117 118
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}
EXPORT_SYMBOL(bdevname);
119

120 121
static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
122 123 124 125 126
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
127
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
128 129 130 131 132 133 134 135 136 137 138 139 140
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

141
static unsigned int part_in_flight(struct block_device *part)
142
{
143
	unsigned int inflight = 0;
144
	int cpu;
145

146
	for_each_possible_cpu(cpu) {
147 148
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
149
	}
150 151
	if ((int)inflight < 0)
		inflight = 0;
152

153
	return inflight;
154 155
}

156 157
static void part_in_flight_rw(struct block_device *part,
		unsigned int inflight[2])
158
{
159 160 161 162 163 164 165 166 167 168 169 170
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
171 172
}

L
Linus Torvalds 已提交
173 174 175 176
/*
 * Can be deleted altogether. Later.
 *
 */
177
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
178 179 180 181
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
182
	void (*probe)(dev_t devt);
183
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
184
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
185 186

/* index in the above - for now: assume no multimajor ranges */
187
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
188
{
189
	return major % BLKDEV_MAJOR_HASH_SIZE;
190 191
}

192
#ifdef CONFIG_PROC_FS
/*
 * Print "<major> <name>" to @seqf for every registered entry whose major
 * number equals @offset.  The table is walked under major_names_lock.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
204

205
/**
206
 * __register_blkdev - register a new block device
207
 *
208 209
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
210
 * @name: the name of the new block device as a zero terminated string
211
 * @probe: allback that is called on access to any minor number of @major
212 213 214
 *
 * The @name must be unique within the system.
 *
215 216
 * The return value depends on the @major input parameter:
 *
217 218
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
219
 *  - if any unused major number was requested with @major = 0 parameter
220
 *    then the return value is the allocated major number in range
221 222 223 224
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
225 226
 *
 * Use register_blkdev instead for any new code.
227
 */
228 229
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
L
Linus Torvalds 已提交
230 231 232 233
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

C
Christoph Hellwig 已提交
234
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
235 236 237 238 239 240 241 242 243

	/* temporary */
	if (major == 0) {
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
244 245
			printk("%s: failed to get major for %s\n",
			       __func__, name);
L
Linus Torvalds 已提交
246 247 248 249 250 251 252
			ret = -EBUSY;
			goto out;
		}
		major = index;
		ret = major;
	}

253
	if (major >= BLKDEV_MAJOR_MAX) {
254 255
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);
256 257 258 259 260

		ret = -EINVAL;
		goto out;
	}

L
Linus Torvalds 已提交
261 262 263 264 265 266 267
	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
268
	p->probe = probe;
L
Linus Torvalds 已提交
269 270 271 272 273 274 275 276 277 278 279 280 281 282
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
283
		printk("register_blkdev: cannot get major %u for %s\n",
L
Linus Torvalds 已提交
284 285 286 287
		       major, name);
		kfree(p);
	}
out:
C
Christoph Hellwig 已提交
288
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
289 290
	return ret;
}
291
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
292

A
Akinobu Mita 已提交
293
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
294 295 296 297 298
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
299
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
300 301 302
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
303 304 305
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
306 307 308
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
309
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
310 311 312 313 314
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * With CONFIG_DEBUG_BLOCK_EXT_DEVT the low MINORBITS bits of @minor are
 * mirrored (bit i trades places with bit MINORBITS - 1 - i), so that
 * consecutively allocated minors end up far apart.  Mangling twice gives
 * the original value.  Without that option @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int bit;

	for (bit = 0; bit < MINORBITS / 2; bit++) {
		int shift = MINORBITS - 1 - 2 * bit;
		int lo = minor & (1 << bit);
		int hi = minor & (1 << (MINORBITS - 1 - bit));

		/* drop the pair, then put each bit back at the other's place */
		minor &= ~(lo | hi);
		minor |= (lo << shift) | (hi >> shift);
	}
#endif
	return minor;
}

347
int blk_alloc_ext_minor(void)
T
Tejun Heo 已提交
348
{
T
Tejun Heo 已提交
349
	int idx;
T
Tejun Heo 已提交
350

351
	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
352 353 354 355 356 357
	if (idx < 0) {
		if (idx == -ENOSPC)
			return -EBUSY;
		return idx;
	}
	return blk_mangle_minor(idx);
T
Tejun Heo 已提交
358 359
}

360
void blk_free_ext_minor(unsigned int minor)
T
Tejun Heo 已提交
361
{
362
	ida_free(&ext_devt_ida, blk_mangle_minor(minor));
Y
Yufen Yu 已提交
363 364
}

365 366 367 368 369 370 371 372 373 374 375 376
/*
 * Format @devt in hex into @buf (BDEVT_SIZE bytes) and return @buf.
 * When both numbers fit in 8 bits the compact "MMmm" form is used, left
 * aligned in a 9 character field; otherwise the "MMM:mmmmm" form is used.
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
	return buf;
}

377 378 379
/*
 * Send uevent @action for every block device in @disk's partition table.
 * Partitions with zero sectors and devices whose kobject refcount already
 * dropped to zero are skipped.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *bdev;
	unsigned long index;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, index, bdev) {
		if ((bdev_is_partition(bdev) && !bdev_nr_sectors(bdev)) ||
		    !kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;

		/*
		 * The reference taken above keeps the device around while
		 * the RCU read lock is dropped for the uevent.
		 */
		rcu_read_unlock();
		kobject_uevent(bdev_kobj(bdev), action);
		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

398 399 400 401 402 403 404 405 406 407 408 409 410
/*
 * Request a partition scan of @disk by flagging GD_NEED_PART_SCAN and then
 * opening and immediately closing the whole device.  Nothing is done for
 * empty disks or when partition scanning is disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *bdev;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return;
	blkdev_put(bdev, FMODE_READ);
}

411 412
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
413 414 415 416
{
	struct device *ddev = disk_to_dev(disk);
	int err;

417
	ddev->parent = parent;
418

419
	dev_set_name(ddev, "%s", disk->disk_name);
420 421 422 423

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

424 425 426 427
	if (groups) {
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
428 429 430 431 432 433 434 435 436 437
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}
438 439 440 441 442 443 444 445

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

446 447
	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
448 449
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

450 451 452 453 454 455 456 457 458 459
	/*
	 * XXX: this is a mess, can't wait for real error handling in add_disk.
	 * Make sure ->slave_dir is NULL if we failed some of the registration
	 * so that the cleanup in bd_unlink_disk_holder works properly.
	 */
	if (bd_register_pending_holders(disk) < 0) {
		kobject_put(disk->slave_dir);
		disk->slave_dir = NULL;
	}

460
	if (disk->flags & GENHD_FL_HIDDEN)
461 462
		return;

463
	disk_scan_partitions(disk);
464

465
	/* announce the disk and partitions after all partitions are created */
466
	dev_set_uevent_suppress(ddev, 0);
467
	disk_uevent(disk, KOBJ_ADD);
468

469 470 471
	if (disk->bdi->dev) {
		err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj,
					"bdi");
472 473
		WARN_ON(err);
	}
474 475
}

L
Linus Torvalds 已提交
476
/**
477
 * device_add_disk - add disk information to kernel list
478
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
479
 * @disk: per-device partitioning information
480
 * @groups: Additional per-device sysfs groups
L
Linus Torvalds 已提交
481 482 483
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
484 485
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
486
 */
487 488 489 490

void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

L
Linus Torvalds 已提交
491
{
492
	int ret;
493

494 495 496 497 498 499
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
500
	elevator_init_mq(disk->queue);
501

502 503 504 505 506 507
	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minors numbers supported, and those will be used to
	 * setup the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
508
	 */
509 510
	if (disk->major) {
		WARN_ON(!disk->minors);
511 512 513 514 515 516

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
517 518
	} else {
		WARN_ON(disk->minors);
519

520 521 522 523 524 525 526
		ret = blk_alloc_ext_minor();
		if (ret < 0) {
			WARN_ON(1);
			return;
		}
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
527
		disk->flags |= GENHD_FL_EXT_DEVT;
528
	}
529 530

	disk->flags |= GENHD_FL_UP;
531

532 533
	disk_alloc_events(disk);

534 535 536 537 538 539 540 541
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
542
		struct device *dev = disk_to_dev(disk);
543

544
		/* Register BDI before referencing it from bdev */
545
		dev->devt = MKDEV(disk->major, disk->first_minor);
546
		ret = bdi_register(disk->bdi, "%u:%u",
547
				   disk->major, disk->first_minor);
548
		WARN_ON(ret);
549
		bdi_set_owner(disk->bdi, dev);
550
		bdev_add(disk->part0, dev->devt);
551
	}
552
	register_disk(parent, disk, groups);
553
	blk_register_queue(disk);
554

555 556 557 558
	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
559 560 561 562
	if (blk_get_queue(disk->queue))
		set_bit(GD_QUEUE_REF, &disk->state);
	else
		WARN_ON_ONCE(1);
563

564
	disk_add_events(disk);
565
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
566
}
567
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
568

569 570 571 572 573 574 575 576 577 578 579 580 581
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
582 583 584 585 586
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
587
 */
588
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
589
{
590 591
	might_sleep();

592 593 594
	if (WARN_ON_ONCE(!disk->queue))
		return;

595
	blk_integrity_del(disk);
596 597
	disk_del_events(disk);

598
	mutex_lock(&disk->open_mutex);
599
	remove_inode_hash(disk->part0->bd_inode);
600
	disk->flags &= ~GENHD_FL_UP;
601
	blk_drop_partitions(disk);
602
	mutex_unlock(&disk->open_mutex);
603

604 605 606
	fsync_bdev(disk->part0);
	__invalidate_device(disk->part0, true);

607 608
	set_capacity(disk, 0);

609
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
610
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
611

612 613 614 615
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
616
		bdi_unregister(disk->bdi);
617
	}
618

619
	blk_unregister_queue(disk);
620

621
	kobject_put(disk->part0->bd_holder_dir);
622 623
	kobject_put(disk->slave_dir);

624
	part_stat_set_all(disk->part0, 0);
625
	disk->part0->bd_stamp = 0;
626 627
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
628
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
629
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
630
}
631
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
632

633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
/*
 * sysfs access to bad-blocks list.
 *
 * Show handler: prints the bad-blocks list of the disk, or just a newline
 * when the disk has no bad-blocks structure.
 */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/*
 * Store handler for the bad-blocks list: hands @page to badblocks_store(),
 * or fails with -ENXIO when the disk has no bad-blocks structure.
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

658
/*
 * Try to bring up the driver for @devt.  If the major of @devt is
 * registered with a probe callback, that callback is invoked (with
 * major_names_lock held) and nothing else is done.  Otherwise module
 * loading is requested through the "block-major-*" aliases.
 */
void blk_request_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	for (entry = major_names[major_to_index(major)]; entry;
	     entry = entry->next) {
		if (entry->major != major || !entry->probe)
			continue;

		entry->probe(devt);
		mutex_unlock(&major_names_lock);
		return;
	}
	mutex_unlock(&major_names_lock);

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

678 679 680 681 682 683 684
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	for (dev = class_dev_iter_next(&iter); dev;
	     dev = class_dev_iter_next(&iter)) {
		struct gendisk *disk = dev_to_disk(dev);
		struct block_device *part;
		char devt_buf[BDEVT_SIZE];
		unsigned long idx;

		/* Don't show empty devices or things that have been suppressed */
		if (get_capacity(disk) == 0)
			continue;
		if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
			continue;

		/*
		 * Note, unlike /proc/partitions, I am showing the numbers in
		 * hex - the same format as the root= option takes.
		 */
		rcu_read_lock();
		xa_for_each(&disk->part_tbl, idx, part) {
			if (!bdev_nr_sectors(part))
				continue;

			printk("%s%s %10llu %pg %s",
			       bdev_is_partition(part) ? "  " : "",
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1, part,
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");

			/* only the whole-disk line names the driver */
			if (bdev_is_partition(part))
				printk("\n");
			else if (!dev->parent || !dev->parent->driver)
				printk(" (driver?)\n");
			else
				printk(" driver: %s\n",
					dev->parent->driver->name);
		}
		rcu_read_unlock();
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
730 731
#ifdef CONFIG_PROC_FS
/* iterator */
732
/*
 * seq_file ->start: allocate a class iterator over the disks of
 * block_class, store it in seqf->private and advance to the disk at
 * position *pos.  Returns that disk, NULL when the position is past the
 * last disk, or ERR_PTR(-ENOMEM) if the iterator cannot be allocated.
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter;
	struct device *dev;
	loff_t remaining = *pos;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
	for (;;) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!remaining--)
			break;
	}

	return dev_to_disk(dev);
}

753
/*
 * seq_file ->next: bump *pos and return the next disk from the iterator in
 * seqf->private, or NULL when there are no more disks.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	(*pos)++;
	dev = class_dev_iter_next(seqf->private);
	if (!dev)
		return NULL;

	return dev_to_disk(dev);
}

765
static void disk_seqf_stop(struct seq_file *seqf, void *v)
766
{
767
	struct class_dev_iter *iter = seqf->private;
768

769 770 771 772
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
773
		seqf->private = NULL;
774
	}
L
Linus Torvalds 已提交
775 776
}

777
/*
 * ->start for the partitions listing: same as disk_seqf_start(), but also
 * emits the table header when iteration starts at position 0 and a disk
 * was found.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *first = disk_seqf_start(seqf, pos);

	if (IS_ERR_OR_NULL(first) || *pos)
		return first;

	seq_puts(seqf, "major minor  #blocks  name\n\n");
	return first;
}

787
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
788 789
{
	struct gendisk *sgp = v;
790
	struct block_device *part;
791
	unsigned long idx;
L
Linus Torvalds 已提交
792 793

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
794
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
795
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
796 797 798 799
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

800 801 802 803
	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
804
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
805
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
806
			   bdev_nr_sectors(part) >> 1, part);
807 808
	}
	rcu_read_unlock();
L
Linus Torvalds 已提交
809 810 811
	return 0;
}

812
static const struct seq_operations partitions_op = {
813 814 815
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
816
	.show	= show_partition
L
Linus Torvalds 已提交
817 818 819 820 821
};
#endif

/*
 * Subsystem init: register block_class, initialise the block layer,
 * register the extended major ("blkext") and, unless sysfs_deprecated is
 * set, create the top-level "block" kobject.  Only a class_register()
 * failure is reported to the caller.
 */
static int __init genhd_device_init(void)
{
	int error;

	block_class.dev_kobj = sysfs_dev_block_kobj;

	error = class_register(&block_class);
	if (unlikely(error))
		return error;

	blk_dev_init();
	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);

	return 0;
}

subsys_initcall(genhd_device_init);

840 841
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
842
{
843
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
844

845
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
846 847
}

848 849 850 851 852
/* sysfs "ext_range": the value of disk_max_parts() for the disk. */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int max_parts = disk_max_parts(gd);

	return sprintf(buf, "%d\n", max_parts);
}

856 857
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
858
{
859
	struct gendisk *disk = dev_to_disk(dev);
860

861 862
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
863 864
}

865 866 867 868 869 870 871 872 873
/* sysfs "hidden": 1 if GENHD_FL_HIDDEN is set on the disk, else 0. */
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int hidden = !!(gd->flags & GENHD_FL_HIDDEN);

	return sprintf(buf, "%d\n", hidden);
}

K
Kay Sievers 已提交
874 875 876 877 878
/* sysfs "ro": 1 if get_disk_ro() reports the disk read-only, else 0. */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int ro = get_disk_ro(gd) ? 1 : 0;

	return sprintf(buf, "%d\n", ro);
}

882 883 884
/* sysfs "size": the size of the block device in sectors. */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
891 892
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
893
	struct disk_stats stat;
894 895
	unsigned int inflight;

896
	part_stat_read_all(bdev, &stat);
897
	if (queue_is_mq(q))
898
		inflight = blk_mq_in_flight(q, bdev);
899
	else
900
		inflight = part_in_flight(bdev);
901

902 903 904 905 906 907 908
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
909 910 911 912 913 914 915 916
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
917
		inflight,
918
		jiffies_to_msecs(stat.io_ticks),
919 920 921 922 923
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
924 925 926 927 928 929
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
930 931 932 933 934
}

ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
935 936
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
937 938
	unsigned int inflight[2];

939
	if (queue_is_mq(q))
940
		blk_mq_in_flight_rw(q, bdev, inflight);
941
	else
942
		part_in_flight_rw(bdev, inflight);
943

944 945 946
	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

947 948
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
949
{
950 951 952
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
953
}
954

955 956 957 958 959 960 961 962 963
/* sysfs "alignment_offset": queue_alignment_offset() of the disk's queue. */
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int offset = queue_alignment_offset(gd->queue);

	return sprintf(buf, "%d\n", offset);
}

964 965 966 967 968 969
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

970
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
971 972
}

M
Matteo Croce 已提交
973 974 975 976 977 978 979 980
/* sysfs "diskseq": the sequence number stored in the gendisk. */
static ssize_t diskseq_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);

	return sprintf(buf, "%llu\n", gd->diskseq);
}

981 982 983 984 985 986 987 988 989 990 991 992
/* Disk sysfs attributes; all are read-only except badblocks. */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
994

995
#ifdef CONFIG_FAIL_MAKE_REQUEST
996 997 998
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
999
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
1000 1001 1002 1003 1004 1005 1006 1007 1008
}

/*
 * sysfs "make-it-fail" store: parse a decimal integer from @buf and store
 * it in bd_make_it_fail.  Input that does not parse is ignored; @count is
 * returned in every case.
 */
ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int value;

	if (count == 0)
		return count;

	if (sscanf(buf, "%d", &value) > 0)
		dev_to_bdev(dev)->bd_make_it_fail = value;

	return count;
}

1014
static struct device_attribute dev_attr_fail =
1015
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1016 1017
#endif /* CONFIG_FAIL_MAKE_REQUEST */

1018 1019
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
1020
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
1021
#endif
1022 1023 1024

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1025
	&dev_attr_ext_range.attr,
1026
	&dev_attr_removable.attr,
1027
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1028
	&dev_attr_ro.attr,
1029
	&dev_attr_size.attr,
1030
	&dev_attr_alignment_offset.attr,
1031
	&dev_attr_discard_alignment.attr,
1032 1033
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1034
	&dev_attr_inflight.attr,
1035
	&dev_attr_badblocks.attr,
1036 1037 1038
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
M
Matteo Croce 已提交
1039
	&dev_attr_diskseq.attr,
1040 1041
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1042 1043 1044
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1045 1046 1047 1048
#endif
	NULL
};

1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a == &dev_attr_badblocks.attr && !disk->bb)
		return 0;
	return a->mode;
}

1059 1060
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1061
	.is_visible = disk_visible,
1062 1063
};

1064
static const struct attribute_group *disk_attr_groups[] = {
1065 1066
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1067 1068
};

1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1080 1081
 *
 * Context: can sleep
1082
 */
1083
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1084
{
1085 1086
	struct gendisk *disk = dev_to_disk(dev);

1087 1088
	might_sleep();

1089
	bdi_put(disk->bdi);
1090 1091
	if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(dev->devt));
1092
	disk_release_events(disk);
L
Linus Torvalds 已提交
1093
	kfree(disk->random);
1094
	xa_destroy(&disk->part_tbl);
1095
	if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
1096
		blk_put_queue(disk->queue);
C
Christoph Hellwig 已提交
1097
	iput(disk->part0->bd_inode);	/* frees the disk */
L
Linus Torvalds 已提交
1098
}
1099 1100 1101 1102 1103 1104 1105 1106

/*
 * dev_uevent callback of block_class: adds a "DISKSEQ=<n>" variable carrying
 * the disk's sequence number to the uevent environment.  Returns whatever
 * add_uevent_var() returns.
 */
static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

1107 1108
struct class block_class = {
	.name		= "block",
1109
	.dev_uevent	= block_uevent,
L
Linus Torvalds 已提交
1110 1111
};

1112
static char *block_devnode(struct device *dev, umode_t *mode,
1113
			   kuid_t *uid, kgid_t *gid)
1114 1115 1116
{
	struct gendisk *disk = dev_to_disk(dev);

1117 1118
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1119 1120 1121
	return NULL;
}

1122
const struct device_type disk_type = {
1123 1124 1125
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1126
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1127 1128
};

1129
#ifdef CONFIG_PROC_FS
1130 1131 1132 1133 1134 1135 1136 1137
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1138 1139
{
	struct gendisk *gp = v;
1140
	struct block_device *hd;
1141
	unsigned int inflight;
1142
	struct disk_stats stat;
1143
	unsigned long idx;
L
Linus Torvalds 已提交
1144 1145

	/*
1146
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1147
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1148 1149 1150 1151
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1152

1153 1154 1155 1156
	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
1157
		part_stat_read_all(hd, &stat);
1158
		if (queue_is_mq(gp->queue))
1159
			inflight = blk_mq_in_flight(gp->queue, hd);
1160
		else
1161
			inflight = part_in_flight(hd);
1162

1163
		seq_printf(seqf, "%4d %7d %pg "
1164 1165 1166
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1167 1168 1169
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1170
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
1171 1172 1173 1174 1175 1176 1177 1178 1179 1180
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1181
			   inflight,
1182
			   jiffies_to_msecs(stat.io_ticks),
1183 1184 1185 1186 1187
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1188 1189 1190 1191 1192 1193 1194 1195
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1196
			);
L
Linus Torvalds 已提交
1197
	}
1198
	rcu_read_unlock();
1199

L
Linus Torvalds 已提交
1200 1201 1202
	return 0;
}

1203
static const struct seq_operations diskstats_op = {
1204 1205 1206
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1207 1208
	.show	= diskstats_show
};
1209 1210 1211

/*
 * Register the "diskstats" and "partitions" seq files with procfs (mode 0,
 * no parent directory).  The results of proc_create_seq() are not checked;
 * initialisation always reports success.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);

	return 0;
}
module_init(proc_genhd_init);
1217
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1218

1219 1220
/*
 * Look up partition @partno in @disk's partition table and return its device
 * number, or 0 if the table has no entry for that partition.  The lookup is
 * done under rcu_read_lock().
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	dev_t devt;

	rcu_read_lock();
	bdev = xa_load(&disk->part_tbl, partno);
	devt = bdev ? bdev->bd_dev : 0;
	rcu_read_unlock();

	return devt;
}

1233
dev_t blk_lookup_devt(const char *name, int partno)
1234
{
1235 1236 1237
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1238

1239 1240
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1241 1242
		struct gendisk *disk = dev_to_disk(dev);

1243
		if (strcmp(dev_name(dev), name))
1244 1245
			continue;

1246 1247 1248 1249 1250 1251
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
1252 1253 1254 1255
		} else {
			devt = part_devt(disk, partno);
			if (devt)
				break;
1256
		}
1257
	}
1258
	class_dev_iter_exit(&iter);
1259 1260 1261
	return devt;
}

1262
struct gendisk *__alloc_disk_node(int minors, int node_id)
1263 1264 1265
{
	struct gendisk *disk;

1266
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1267 1268
	if (!disk)
		return NULL;
1269

1270 1271 1272 1273
	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_disk;

1274 1275
	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
1276
		goto out_free_bdi;
1277

1278
	disk->node_id = node_id;
1279
	mutex_init(&disk->open_mutex);
1280 1281 1282
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;
1283 1284 1285 1286 1287 1288

	disk->minors = minors;
	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
M
Matteo Croce 已提交
1289
	inc_diskseq(disk);
1290 1291 1292
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
L
Linus Torvalds 已提交
1293
	return disk;
1294

1295 1296
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
C
Christoph Hellwig 已提交
1297
	iput(disk->part0->bd_inode);
1298 1299
out_free_bdi:
	bdi_put(disk->bdi);
1300 1301 1302
out_free_disk:
	kfree(disk);
	return NULL;
L
Linus Torvalds 已提交
1303
}
1304
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1305

1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
/*
 * Allocate a request_queue on @node together with a gendisk (created with
 * minors == 0) and attach the queue to the disk.
 *
 * Returns the new disk, or NULL if either allocation fails; a queue that was
 * already allocated is cleaned up again in that case.
 */
struct gendisk *__blk_alloc_disk(int node)
{
	struct gendisk *disk;
	struct request_queue *q = blk_alloc_queue(node);

	if (!q)
		return NULL;

	disk = __alloc_disk_node(0, node);
	if (disk) {
		disk->queue = q;
		return disk;
	}

	blk_cleanup_queue(q);
	return NULL;
}
EXPORT_SYMBOL(__blk_alloc_disk);

1325 1326
/**
 * put_disk - drop a reference on a gendisk
 * @disk: the struct gendisk whose refcount is decremented, may be NULL
 *
 * Drops one reference on the device embedded in @disk.  Once the count
 * reaches 0, disk_release() is called.  A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357
/**
 * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
 * @disk: gendisk to shutdown
 *
 * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
 * the queue DEAD, destroy and put it and the gendisk structure.
 *
 * Context: can sleep
 */
void blk_cleanup_disk(struct gendisk *disk)
{
	/* shut the queue down first, while @disk is still guaranteed valid */
	blk_cleanup_queue(disk->queue);
	/* then drop the caller's reference on the disk itself */
	put_disk(disk);
}
EXPORT_SYMBOL(blk_cleanup_disk);

1358 1359 1360 1361 1362 1363 1364 1365 1366 1367
/*
 * Send a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char ro_on[] = "DISK_RO=1";
	char ro_off[] = "DISK_RO=0";
	char *envp[] = { ro ? ro_on : ro_off, NULL };

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

1368 1369 1370
/**
 * set_disk_ro - set a gendisk read-only
 * @disk:	gendisk to operate on
 * @read_only:	%true to set the disk read-only, %false set the disk read/write
 *
 * Updates the GD_READ_ONLY flag of @disk.  A DISK_RO uevent is sent only
 * when the flag actually changes; setting the state it already has is a
 * no-op.  Device drivers typically use this to report whether the underlying
 * physical device is write-protected.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	bool was_read_only;

	if (read_only)
		was_read_only = test_and_set_bit(GD_READ_ONLY, &disk->state);
	else
		was_read_only = test_and_clear_bit(GD_READ_ONLY, &disk->state);

	/* nothing changed, so there is nothing to announce */
	if (was_read_only == read_only)
		return;

	set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
1392
	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
L
Linus Torvalds 已提交
1393 1394
}
EXPORT_SYMBOL(bdev_read_only);
M
Matteo Croce 已提交
1395 1396 1397 1398 1399

/*
 * Assign @disk a new sequence number by atomically incrementing the
 * file-scope diskseq counter and storing the incremented value in
 * disk->diskseq.
 */
void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}