genhd.c 34.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3
/*
 *  gendisk handling
C
Christoph Hellwig 已提交
4 5
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
L
Linus Torvalds 已提交
6 7 8
 */

#include <linux/module.h>
9
#include <linux/ctype.h>
L
Linus Torvalds 已提交
10 11
#include <linux/fs.h>
#include <linux/genhd.h>
12
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
13 14
#include <linux/kernel.h>
#include <linux/blkdev.h>
15
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
16 17
#include <linux/init.h>
#include <linux/spinlock.h>
18
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
19 20 21
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
22
#include <linux/mutex.h>
T
Tejun Heo 已提交
23
#include <linux/idr.h>
24
#include <linux/log2.h>
25
#include <linux/pm_runtime.h>
26
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
27

28 29
#include "blk.h"

30
static struct kobject *block_depr;
L
Linus Torvalds 已提交
31

M
Matteo Croce 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*
 * Unique, monotonically increasing sequential number associated with block
 * devices instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

T
Tejun Heo 已提交
49
/* for extended dynamic devt allocation, currently only one major is used */
50
#define NR_EXT_DEVT		(1 << MINORBITS)
51
static DEFINE_IDA(ext_devt_ida);
T
Tejun Heo 已提交
52

53 54
/*
 * Store a new capacity for @disk.  @sectors is converted to a byte count
 * and written as the inode size of disk->part0 while that block device's
 * bd_size_lock is held.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
	struct block_device *part0 = disk->part0;
	loff_t nr_bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&part0->bd_size_lock);
	i_size_write(part0->bd_inode, nr_bytes);
	spin_unlock(&part0->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);

63
/*
64 65
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
66
 */
67
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
	char *envp[] = { "RESIZE=1", NULL };
	sector_t old_size = get_capacity(disk);

	/* The new capacity is always stored, whether or not anyone is told. */
	set_capacity(disk, size);

	/*
	 * Stay quiet unless the size really changed on a live, user visible
	 * gendisk.  This keeps the log and udev free of noise while the
	 * initial capacity is being set during probing.
	 */
	if (size == old_size)
		return false;
	if (!disk_live(disk))
		return false;
	if (disk->flags & GENHD_FL_HIDDEN)
		return false;

	pr_info("%s: detected capacity change from %lld to %lld\n",
		disk->disk_name, old_size, size);

	/*
	 * Historically no uevent was sent for a change to or from an empty
	 * device, so only a non-zero to non-zero transition is announced.
	 */
	if (old_size && size) {
		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
		return true;
	}
	return false;
}
96
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
97

98
/*
C
Christoph Hellwig 已提交
99 100 101 102 103
 * Format the device name of the indicated block device into the supplied buffer
 * and return a pointer to that same buffer for convenience.
 *
 * Note: do not use this in new code, use the %pg specifier to sprintf and
 * printk instead.
104
 */
C
Christoph Hellwig 已提交
105
const char *bdevname(struct block_device *bdev, char *buf)
106
{
C
Christoph Hellwig 已提交
107 108 109
	struct gendisk *hd = bdev->bd_disk;
	int partno = bdev->bd_partno;

110 111 112 113 114 115 116 117 118 119
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}
EXPORT_SYMBOL(bdevname);
120

121 122
static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
123 124 125 126 127
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
128
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
129 130 131 132 133 134 135 136 137 138 139 140 141
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

142
static unsigned int part_in_flight(struct block_device *part)
143
{
144
	unsigned int inflight = 0;
145
	int cpu;
146

147
	for_each_possible_cpu(cpu) {
148 149
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
150
	}
151 152
	if ((int)inflight < 0)
		inflight = 0;
153

154
	return inflight;
155 156
}

157 158
static void part_in_flight_rw(struct block_device *part,
		unsigned int inflight[2])
159
{
160 161 162 163 164 165 166 167 168 169 170 171
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
172 173
}

L
Linus Torvalds 已提交
174 175 176 177
/*
 * Can be deleted altogether. Later.
 *
 */
178
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
179 180 181 182
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
183
	void (*probe)(dev_t devt);
184
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
185
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
186 187

/* index in the above - for now: assume no multimajor ranges */
188
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
189
{
190
	return major % BLKDEV_MAJOR_HASH_SIZE;
191 192
}

193
#ifdef CONFIG_PROC_FS
194
/*
 * Print one "major name" line to @seqf for every registered entry whose
 * major number equals @offset.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		/* other majors may share the hash slot; skip them */
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
204
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
205

206
/**
207
 * __register_blkdev - register a new block device
208
 *
209 210
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
211
 * @name: the name of the new block device as a zero terminated string
212
 * @probe: callback that is called on access to any minor number of @major
213 214 215
 *
 * The @name must be unique within the system.
 *
216 217
 * The return value depends on the @major input parameter:
 *
218 219
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
220
 *  - if any unused major number was requested with @major = 0 parameter
221
 *    then the return value is the allocated major number in range
222 223 224 225
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
226 227
 *
 * Use register_blkdev instead for any new code.
228
 */
229 230
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		/*
		 * Dynamic allocation: walk the table from the top and take
		 * the first slot with no entry.  Slot 0 is never handed out.
		 */
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			pr_err("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		/* for a dynamic request the major is also the return value */
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	p->probe = probe;
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	/* find the end of the chain, or an entry that already owns @major */
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
		pr_err("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
292
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
293

A
Akinobu Mita 已提交
294
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
295 296 297 298 299
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
300
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
301 302 303
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
304 305 306
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
307 308 309
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
310
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
311 312 313 314 315
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * When CONFIG_DEBUG_BLOCK_EXT_DEVT is enabled, mirror the low MINORBITS
 * bits of @minor (bit i trades places with bit MINORBITS - 1 - i) so that
 * consecutively allocated numbers end up far apart.  Mangling twice gives
 * the original value.  Without the option @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int lo_pos;

	for (lo_pos = 0; lo_pos < MINORBITS / 2; lo_pos++) {
		const int hi_pos = MINORBITS - 1 - lo_pos;
		const int span = hi_pos - lo_pos;
		const int lo_bit = minor & (1 << lo_pos);
		const int hi_bit = minor & (1 << hi_pos);

		/* drop both bits, then put each back at the mirrored spot */
		minor &= ~(lo_bit | hi_bit);
		minor |= (lo_bit << span) | (hi_bit >> span);
	}
#endif
	return minor;
}

348
int blk_alloc_ext_minor(void)
T
Tejun Heo 已提交
349
{
T
Tejun Heo 已提交
350
	int idx;
T
Tejun Heo 已提交
351

352
	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
353 354 355 356 357 358
	if (idx < 0) {
		if (idx == -ENOSPC)
			return -EBUSY;
		return idx;
	}
	return blk_mangle_minor(idx);
T
Tejun Heo 已提交
359 360
}

361
void blk_free_ext_minor(unsigned int minor)
T
Tejun Heo 已提交
362
{
363
	ida_free(&ext_devt_ida, blk_mangle_minor(minor));
Y
Yufen Yu 已提交
364 365
}

366 367 368 369 370 371 372 373 374 375 376 377
/*
 * Format @devt as hex into @buf (BDEVT_SIZE bytes) and return @buf.
 * When major and minor both fit in 8 bits the short "MMmm" form is used,
 * left-justified in a 9 character field; otherwise "MMM:mmmmm".
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
	return buf;
}

378 379 380
/*
 * Send a uevent of type @action for every block device in @disk's
 * partition table, skipping partitions that have no sectors.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *bdev;
	unsigned long idx;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, idx, bdev) {
		if (bdev_is_partition(bdev) && !bdev_nr_sectors(bdev))
			continue;
		/* no reference could be taken, so leave this one alone */
		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;

		/*
		 * The uevent is sent outside the RCU read section; the
		 * reference taken above is held across that window.
		 */
		rcu_read_unlock();
		kobject_uevent(bdev_kobj(bdev), action);
		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

399 400 401 402 403 404 405 406 407 408 409 410 411
/*
 * Flag @disk as needing a partition scan, then open and immediately close
 * its block device.  Nothing is done for a disk without capacity or with
 * partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *bdev;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return;
	blkdev_put(bdev, FMODE_READ);
}

L
Linus Torvalds 已提交
412
/**
413
 * device_add_disk - add disk information to kernel list
414
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
415
 * @disk: per-device partitioning information
416
 * @groups: Additional per-device sysfs groups
L
Linus Torvalds 已提交
417 418 419
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
420 421
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
422
 */
423 424 425 426

void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

L
Linus Torvalds 已提交
427
{
428
	struct device *ddev = disk_to_dev(disk);
429
	int ret;
430

431 432 433 434 435 436
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
437
	elevator_init_mq(disk->queue);
438

439 440 441 442 443 444
	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minors numbers supported, and those will be used to
	 * setup the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
445
	 */
446 447
	if (disk->major) {
		WARN_ON(!disk->minors);
448 449 450 451 452 453

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
454 455
	} else {
		WARN_ON(disk->minors);
456

457 458 459 460 461 462 463
		ret = blk_alloc_ext_minor();
		if (ret < 0) {
			WARN_ON(1);
			return;
		}
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
464
		disk->flags |= GENHD_FL_EXT_DEVT;
465
	}
466

467 468
	disk_alloc_events(disk);

469 470 471 472 473 474
	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

	ddev->parent = parent;
	ddev->groups = groups;
	dev_set_name(ddev, "%s", disk->disk_name);
475 476
	if (!(disk->flags & GENHD_FL_HIDDEN))
		ddev->devt = MKDEV(disk->major, disk->first_minor);
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		ret = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (ret) {
			device_del(ddev);
			return;
		}
	}

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

	/*
	 * XXX: this is a mess, can't wait for real error handling in add_disk.
	 * Make sure ->slave_dir is NULL if we failed some of the registration
	 * so that the cleanup in bd_unlink_disk_holder works properly.
	 */
	if (bd_register_pending_holders(disk) < 0) {
		kobject_put(disk->slave_dir);
		disk->slave_dir = NULL;
507
	}
508

509 510 511 512 513 514 515 516 517 518 519 520 521 522
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
		ret = bdi_register(disk->bdi, "%u:%u",
				   disk->major, disk->first_minor);
		WARN_ON(ret);
		bdi_set_owner(disk->bdi, ddev);
		bdev_add(disk->part0, ddev->devt);

523 524 525 526
		disk_scan_partitions(disk);

		/*
		 * Announce the disk and partitions after all partitions are
527
		 * created. (for hidden disks uevents remain suppressed forever)
528 529 530 531 532 533 534 535 536 537 538
		 */
		dev_set_uevent_suppress(ddev, 0);
		disk_uevent(disk, KOBJ_ADD);

		if (disk->bdi->dev) {
			ret = sysfs_create_link(&ddev->kobj,
						&disk->bdi->dev->kobj, "bdi");
			WARN_ON(ret);
		}
	}

539
	blk_register_queue(disk);
540

541
	disk_add_events(disk);
542
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
543
}
544
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
545

546 547 548 549 550 551 552 553 554 555 556 557 558
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
559 560 561 562 563
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
564
 */
565
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
566
{
567 568
	might_sleep();

569
	if (WARN_ON_ONCE(!disk_live(disk)))
570 571
		return;

572
	blk_integrity_del(disk);
573 574
	disk_del_events(disk);

575
	mutex_lock(&disk->open_mutex);
576
	remove_inode_hash(disk->part0->bd_inode);
577
	blk_drop_partitions(disk);
578
	mutex_unlock(&disk->open_mutex);
579

580 581 582
	fsync_bdev(disk->part0);
	__invalidate_device(disk->part0, true);

583 584
	set_capacity(disk, 0);

585
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
586
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
587

588 589 590 591
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
592
		bdi_unregister(disk->bdi);
593
	}
594

595
	blk_unregister_queue(disk);
596

597
	kobject_put(disk->part0->bd_holder_dir);
598 599
	kobject_put(disk->slave_dir);

600
	part_stat_set_all(disk->part0, 0);
601
	disk->part0->bd_stamp = 0;
602 603
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
604
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
605
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
606
}
607
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
608

609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633
/* sysfs access to bad-blocks list. */
/*
 * sysfs show handler for "badblocks": prints the disk's bad-blocks list,
 * or just a newline when the disk has none attached.
 */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/*
 * sysfs store handler for "badblocks": hands the input to
 * badblocks_store(), or fails with -ENXIO when the disk has no bad-blocks
 * list attached.
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

634
/*
 * Called for a @devt that needs its driver: run the probe callback
 * registered for its major if there is one, otherwise ask for a module
 * by alias.
 */
void blk_request_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(major)];
	for (; entry; entry = entry->next) {
		if (entry->major != major || !entry->probe)
			continue;

		/* the probe callback runs with major_names_lock held */
		entry->probe(devt);
		mutex_unlock(&major_names_lock);
		return;
	}
	mutex_unlock(&major_names_lock);

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

654 655 656 657 658 659 660
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	for (dev = class_dev_iter_next(&iter); dev;
	     dev = class_dev_iter_next(&iter)) {
		struct gendisk *disk = dev_to_disk(dev);
		struct block_device *part;
		char devt_buf[BDEVT_SIZE];
		unsigned long idx;

		/* Leave out empty devices and suppressed ones */
		if (get_capacity(disk) == 0)
			continue;
		if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
			continue;

		/*
		 * Unlike /proc/partitions, the numbers are shown in hex -
		 * the same format as the root= option takes.
		 */
		rcu_read_lock();
		xa_for_each(&disk->part_tbl, idx, part) {
			if (!bdev_nr_sectors(part))
				continue;
			printk("%s%s %10llu %pg %s",
			       bdev_is_partition(part) ? "  " : "",
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1, part,
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");

			/* only the whole-disk line names the driver */
			if (bdev_is_partition(part)) {
				printk("\n");
				continue;
			}
			if (dev->parent && dev->parent->driver)
				printk(" driver: %s\n",
					dev->parent->driver->name);
			else
				printk(" (driver?)\n");
		}
		rcu_read_unlock();
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
706 707
#ifdef CONFIG_PROC_FS
/* iterator */
708
/*
 * seq_file start: allocate a class iterator, remember it in
 * seqf->private and advance it to the disk at position *pos.
 * Returns that disk, NULL when there are not that many disks, or
 * ERR_PTR(-ENOMEM).  The iterator is released by disk_seqf_stop().
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter;
	struct device *dev;
	loff_t remaining;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);

	/* fetch *pos + 1 devices; the last one fetched is the result */
	for (remaining = *pos; ; remaining--) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!remaining)
			break;
	}

	return dev_to_disk(dev);
}

729
/* seq_file next: bump *pos and return the next disk, or NULL at the end. */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	(*pos)++;
	dev = class_dev_iter_next(seqf->private);

	return dev ? dev_to_disk(dev) : NULL;
}

741
static void disk_seqf_stop(struct seq_file *seqf, void *v)
742
{
743
	struct class_dev_iter *iter = seqf->private;
744

745 746 747 748
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
749
		seqf->private = NULL;
750
	}
L
Linus Torvalds 已提交
751 752
}

753
/*
 * Start handler for the partitions listing: same as disk_seqf_start(),
 * plus the column header when a valid disk is returned at position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *p = disk_seqf_start(seqf, pos);

	if (IS_ERR_OR_NULL(p))
		return p;

	if (*pos == 0)
		seq_puts(seqf, "major minor  #blocks  name\n\n");
	return p;
}

763
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
764 765
{
	struct gendisk *sgp = v;
766
	struct block_device *part;
767
	unsigned long idx;
L
Linus Torvalds 已提交
768 769

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
770
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
771
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
772 773 774 775
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

776 777 778 779
	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
780
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
781
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
782
			   bdev_nr_sectors(part) >> 1, part);
783 784
	}
	rcu_read_unlock();
L
Linus Torvalds 已提交
785 786 787
	return 0;
}

788
static const struct seq_operations partitions_op = {
789 790 791
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
792
	.show	= show_partition
L
Linus Torvalds 已提交
793 794 795 796 797
};
#endif

/*
 * Subsystem init: register the block class, initialise the block layer,
 * register the extended major and, unless sysfs_deprecated is set,
 * create the top-level "block" kobject.
 */
static int __init genhd_device_init(void)
{
	int rc;

	block_class.dev_kobj = sysfs_dev_block_kobj;
	rc = class_register(&block_class);
	if (unlikely(rc))
		return rc;

	blk_dev_init();

	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	if (sysfs_deprecated)
		return 0;

	/* create top-level block dir */
	block_depr = kobject_create_and_add("block", NULL);
	return 0;
}

subsys_initcall(genhd_device_init);

816 817
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
818
{
819
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
820

821
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
822 823
}

824 825 826 827 828
/* sysfs "ext_range": the value of disk_max_parts() for the disk. */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);

	return sprintf(buf, "%d\n", disk_max_parts(gd));
}

832 833
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
834
{
835
	struct gendisk *disk = dev_to_disk(dev);
836

837 838
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
839 840
}

841 842 843 844 845 846 847 848 849
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}

K
Kay Sievers 已提交
850 851 852 853 854
/* sysfs "ro": 1 when get_disk_ro() reports the disk read-only, else 0. */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int ro = !!get_disk_ro(gd);

	return sprintf(buf, "%d\n", ro);
}

858 859 860
/* sysfs "size": number of sectors of the block device behind @dev. */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
867 868
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
869
	struct disk_stats stat;
870 871
	unsigned int inflight;

872
	part_stat_read_all(bdev, &stat);
873
	if (queue_is_mq(q))
874
		inflight = blk_mq_in_flight(q, bdev);
875
	else
876
		inflight = part_in_flight(bdev);
877

878 879 880 881 882 883 884
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
885 886 887 888 889 890 891 892
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
893
		inflight,
894
		jiffies_to_msecs(stat.io_ticks),
895 896 897 898 899
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
900 901 902 903 904 905
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
906 907 908 909 910
}

ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
911 912
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
913 914
	unsigned int inflight[2];

915
	if (queue_is_mq(q))
916
		blk_mq_in_flight_rw(q, bdev, inflight);
917
	else
918
		part_in_flight_rw(bdev, inflight);
919

920 921 922
	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

923 924
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
925
{
926 927 928
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
929
}
930

931 932 933 934 935 936 937 938 939
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}

940 941 942 943 944 945
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

946
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
947 948
}

M
Matteo Croce 已提交
949 950 951 952 953 954 955 956
static ssize_t diskseq_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%llu\n", disk->diskseq);
}

957 958 959 960 961 962 963 964 965 966 967 968
/* Read-only disk attributes. */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);

/* The only attribute here with a store handler. */
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
970

971
#ifdef CONFIG_FAIL_MAKE_REQUEST
972 973 974
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
975
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
976 977 978 979 980 981 982 983 984
}

/*
 * sysfs "make-it-fail" store: parse a decimal integer from @buf and save
 * it in bd_make_it_fail.  Input that does not parse is ignored; @count is
 * returned either way.
 */
ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int val;

	if (!count)
		return count;

	if (sscanf(buf, "%d", &val) > 0)
		dev_to_bdev(dev)->bd_make_it_fail = val;

	return count;
}

990
static struct device_attribute dev_attr_fail =
991
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
992 993
#endif /* CONFIG_FAIL_MAKE_REQUEST */

994 995
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
996
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
997
#endif
998 999 1000

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1001
	&dev_attr_ext_range.attr,
1002
	&dev_attr_removable.attr,
1003
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1004
	&dev_attr_ro.attr,
1005
	&dev_attr_size.attr,
1006
	&dev_attr_alignment_offset.attr,
1007
	&dev_attr_discard_alignment.attr,
1008 1009
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1010
	&dev_attr_inflight.attr,
1011
	&dev_attr_badblocks.attr,
1012 1013 1014
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
M
Matteo Croce 已提交
1015
	&dev_attr_diskseq.attr,
1016 1017
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1018 1019 1020
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1021 1022 1023 1024
#endif
	NULL
};

1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a == &dev_attr_badblocks.attr && !disk->bb)
		return 0;
	return a->mode;
}

1035 1036
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1037
	.is_visible = disk_visible,
1038 1039
};

1040
static const struct attribute_group *disk_attr_groups[] = {
1041 1042
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1043 1044
};

1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1056 1057
 *
 * Context: can sleep
1058
 */
1059
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1060
{
1061 1062
	struct gendisk *disk = dev_to_disk(dev);

1063 1064
	might_sleep();

1065
	disk_release_events(disk);
L
Linus Torvalds 已提交
1066
	kfree(disk->random);
1067
	xa_destroy(&disk->part_tbl);
1068
	disk->queue->disk = NULL;
1069
	blk_put_queue(disk->queue);
C
Christoph Hellwig 已提交
1070
	iput(disk->part0->bd_inode);	/* frees the disk */
L
Linus Torvalds 已提交
1071
}
1072 1073 1074 1075 1076 1077 1078 1079

static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

1080 1081
struct class block_class = {
	.name		= "block",
1082
	.dev_uevent	= block_uevent,
L
Linus Torvalds 已提交
1083 1084
};

1085
static char *block_devnode(struct device *dev, umode_t *mode,
1086
			   kuid_t *uid, kgid_t *gid)
1087 1088 1089
{
	struct gendisk *disk = dev_to_disk(dev);

1090 1091
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1092 1093 1094
	return NULL;
}

1095
const struct device_type disk_type = {
1096 1097 1098
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1099
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1100 1101
};

1102
#ifdef CONFIG_PROC_FS
1103 1104 1105 1106 1107 1108 1109 1110
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1111 1112
{
	struct gendisk *gp = v;
1113
	struct block_device *hd;
1114
	unsigned int inflight;
1115
	struct disk_stats stat;
1116
	unsigned long idx;
L
Linus Torvalds 已提交
1117 1118

	/*
1119
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1120
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1121 1122 1123 1124
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1125

1126 1127 1128 1129
	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
1130
		part_stat_read_all(hd, &stat);
1131
		if (queue_is_mq(gp->queue))
1132
			inflight = blk_mq_in_flight(gp->queue, hd);
1133
		else
1134
			inflight = part_in_flight(hd);
1135

1136
		seq_printf(seqf, "%4d %7d %pg "
1137 1138 1139
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1140 1141 1142
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1143
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1154
			   inflight,
1155
			   jiffies_to_msecs(stat.io_ticks),
1156 1157 1158 1159 1160
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1161 1162 1163 1164 1165 1166 1167 1168
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1169
			);
L
Linus Torvalds 已提交
1170
	}
1171
	rcu_read_unlock();
1172

L
Linus Torvalds 已提交
1173 1174 1175
	return 0;
}

1176
static const struct seq_operations diskstats_op = {
1177 1178 1179
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1180 1181
	.show	= diskstats_show
};
1182 1183 1184

/*
 * Create the "diskstats" and "partitions" seq_file proc entries (NULL parent,
 * mode 0).  The results of proc_create_seq() are not checked; always returns 0.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);

	return 0;
}
module_init(proc_genhd_init);
1190
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1191

1192 1193
/*
 * Look up partition @partno in @disk's partition table and return its dev_t,
 * or 0 when there is no entry.  The lookup runs under rcu_read_lock().
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	dev_t devt;

	rcu_read_lock();
	bdev = xa_load(&disk->part_tbl, partno);
	devt = bdev ? bdev->bd_dev : 0;
	rcu_read_unlock();

	return devt;
}

1206
dev_t blk_lookup_devt(const char *name, int partno)
1207
{
1208 1209 1210
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1211

1212 1213
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1214 1215
		struct gendisk *disk = dev_to_disk(dev);

1216
		if (strcmp(dev_name(dev), name))
1217 1218
			continue;

1219 1220 1221 1222 1223 1224
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
1225 1226 1227 1228
		} else {
			devt = part_devt(disk, partno);
			if (devt)
				break;
1229
		}
1230
	}
1231
	class_dev_iter_exit(&iter);
1232 1233 1234
	return devt;
}

1235 1236
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass)
1237 1238 1239
{
	struct gendisk *disk;

1240 1241 1242
	if (!blk_get_queue(q))
		return NULL;

1243
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1244
	if (!disk)
1245
		goto out_put_queue;
1246

1247 1248 1249 1250
	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_disk;

1251 1252
	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
1253
		goto out_free_bdi;
1254

1255
	disk->node_id = node_id;
1256
	mutex_init(&disk->open_mutex);
1257 1258 1259
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;
1260 1261 1262 1263 1264

	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
M
Matteo Croce 已提交
1265
	inc_diskseq(disk);
1266
	disk->queue = q;
1267
	q->disk = disk;
1268
	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
1269 1270 1271
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
L
Linus Torvalds 已提交
1272
	return disk;
1273

1274 1275
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
C
Christoph Hellwig 已提交
1276
	iput(disk->part0->bd_inode);
1277 1278
out_free_bdi:
	bdi_put(disk->bdi);
1279 1280
out_free_disk:
	kfree(disk);
1281 1282
out_put_queue:
	blk_put_queue(q);
1283
	return NULL;
L
Linus Torvalds 已提交
1284
}
1285
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1286

1287
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
1288 1289 1290 1291 1292 1293 1294 1295
{
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_alloc_queue(node);
	if (!q)
		return NULL;

1296
	disk = __alloc_disk_node(q, node, lkclass);
1297 1298 1299 1300 1301 1302 1303 1304
	if (!disk) {
		blk_cleanup_queue(q);
		return NULL;
	}
	return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);

1305 1306
/**
 * put_disk - drop a reference on a gendisk
 * @disk: the struct gendisk whose refcount is decremented; may be NULL
 *
 * Decrements the refcount of the struct gendisk.  Once it reaches 0,
 * disk_release() is called.  A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337
/**
 * blk_cleanup_disk - shut down a gendisk allocated by blk_alloc_disk
 * @disk: gendisk to shut down
 *
 * Marks the queue hanging off @disk DYING, drains all pending requests, marks
 * the queue DEAD, then destroys and puts both it and the gendisk structure.
 *
 * Context: can sleep
 */
void blk_cleanup_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	/* The queue goes first; the disk reference is dropped afterwards. */
	blk_cleanup_queue(q);
	put_disk(disk);
}
EXPORT_SYMBOL(blk_cleanup_disk);

1338 1339 1340 1341 1342 1343 1344 1345 1346 1347
/*
 * Send a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is non-zero
 * and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=0";
	char *envp[] = { event, NULL };

	/* The flag digit is the last character before the terminating NUL. */
	if (ro)
		event[sizeof(event) - 2] = '1';

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

1348 1349 1350
/**
 * set_disk_ro - set or clear the read-only state of a gendisk
 * @disk:	gendisk to operate on
 * @read_only:	%true to set the disk read-only, %false set the disk read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 *
 * A uevent is only sent when the GD_READ_ONLY bit actually changes.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	bool changed;

	if (read_only)
		changed = !test_and_set_bit(GD_READ_ONLY, &disk->state);
	else
		changed = test_and_clear_bit(GD_READ_ONLY, &disk->state);

	if (changed)
		set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
1372
	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
L
Linus Torvalds 已提交
1373 1374
}
EXPORT_SYMBOL(bdev_read_only);
M
Matteo Croce 已提交
1375 1376 1377 1378 1379

void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}