genhd.c 34.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3
/*
 *  gendisk handling
C
Christoph Hellwig 已提交
4 5
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
L
Linus Torvalds 已提交
6 7 8
 */

#include <linux/module.h>
9
#include <linux/ctype.h>
L
Linus Torvalds 已提交
10 11
#include <linux/fs.h>
#include <linux/genhd.h>
12
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
13 14
#include <linux/kernel.h>
#include <linux/blkdev.h>
15
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
16 17
#include <linux/init.h>
#include <linux/spinlock.h>
18
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
19 20 21
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
22
#include <linux/mutex.h>
T
Tejun Heo 已提交
23
#include <linux/idr.h>
24
#include <linux/log2.h>
25
#include <linux/pm_runtime.h>
26
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
27

28 29
#include "blk.h"

30
static struct kobject *block_depr;
L
Linus Torvalds 已提交
31

M
Matteo Croce 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*
 * Unique, monotonically increasing sequential number associated with block
 * devices instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

T
Tejun Heo 已提交
49
/* for extended dynamic devt allocation, currently only one major is used */
50
#define NR_EXT_DEVT		(1 << MINORBITS)
51
static DEFINE_IDA(ext_devt_ida);
T
Tejun Heo 已提交
52

53 54
/*
 * Store a new capacity for @disk.  @sectors is converted with SECTOR_SHIFT
 * and written as the inode size of the disk's part0 block device while
 * bd_size_lock is held.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
	struct block_device *whole = disk->part0;
	loff_t nr_bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&whole->bd_size_lock);
	i_size_write(whole->bd_inode, nr_bytes);
	spin_unlock(&whole->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);

63
/*
64 65
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
66
 */
67
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
	/* Sample the old capacity before it is overwritten below. */
	sector_t capacity = get_capacity(disk);
	char *envp[] = { "RESIZE=1", NULL };

	set_capacity(disk, size);

	/*
	 * Only print a message and send a uevent if the gendisk is user visible
	 * and alive.  This avoids spamming the log and udev when setting the
	 * initial capacity during probing.
	 */
	if (size == capacity ||
	    !disk_live(disk) ||
	    (disk->flags & GENHD_FL_HIDDEN))
		return false;

	/*
	 * Sector counts are unsigned 64-bit quantities, so format them with
	 * %llu; %lld would render very large values as negative numbers.
	 */
	pr_info("%s: detected capacity change from %llu to %llu\n",
		disk->disk_name, capacity, size);

	/*
	 * Historically we did not send a uevent for changes to/from an empty
	 * device.
	 */
	if (!capacity || !size)
		return false;
	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
	return true;
}
96
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
97

98
/*
C
Christoph Hellwig 已提交
99 100 101 102 103
 * Format the device name of the indicated block device into the supplied buffer
 * and return a pointer to that same buffer for convenience.
 *
 * Note: do not use this in new code, use the %pg specifier to sprintf and
 * printk instead.
104
 */
C
Christoph Hellwig 已提交
105
const char *bdevname(struct block_device *bdev, char *buf)
106
{
C
Christoph Hellwig 已提交
107 108 109
	struct gendisk *hd = bdev->bd_disk;
	int partno = bdev->bd_partno;

110 111 112 113 114 115 116 117 118 119
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}
EXPORT_SYMBOL(bdevname);
120

121 122
static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
123 124 125 126 127
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
128
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
129 130 131 132 133 134 135 136 137 138 139 140 141
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

142
static unsigned int part_in_flight(struct block_device *part)
143
{
144
	unsigned int inflight = 0;
145
	int cpu;
146

147
	for_each_possible_cpu(cpu) {
148 149
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
150
	}
151 152
	if ((int)inflight < 0)
		inflight = 0;
153

154
	return inflight;
155 156
}

157 158
static void part_in_flight_rw(struct block_device *part,
		unsigned int inflight[2])
159
{
160 161 162 163 164 165 166 167 168 169 170 171
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
172 173
}

L
Linus Torvalds 已提交
174 175 176 177
/*
 * Can be deleted altogether. Later.
 *
 */
178
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
179 180 181 182
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
183
	void (*probe)(dev_t devt);
184
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
185
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
186 187

/* index in the above - for now: assume no multimajor ranges */
188
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
189
{
190
	return major % BLKDEV_MAJOR_HASH_SIZE;
191 192
}

193
#ifdef CONFIG_PROC_FS
194
/*
 * Print the major_names entry whose major equals @offset, if any, to @seqf.
 * The chain is walked with major_names_lock held.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
204
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
205

206
/**
207
 * __register_blkdev - register a new block device
208
 *
209 210
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
211
 * @name: the name of the new block device as a zero terminated string
212
 * @probe: callback that is called on access to any minor number of @major
213 214 215
 *
 * The @name must be unique within the system.
 *
216 217
 * The return value depends on the @major input parameter:
 *
218 219
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
220
 *  - if any unused major number was requested with @major = 0 parameter
221
 *    then the return value is the allocated major number in range
222 223 224 225
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
226 227
 *
 * Use register_blkdev instead for any new code.
228
 */
229 230
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		/*
		 * Dynamic allocation: scan the table downwards for an empty
		 * slot.  Slot 0 is never handed out, so reaching it means
		 * nothing was free.
		 */
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			pr_err("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		/* On success the caller receives the allocated major. */
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	p->probe = probe;
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	/* Walk the chain; stop early if this major is already registered. */
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;		/* append at the tail of the chain */
	else
		ret = -EBUSY;

	if (ret < 0) {
		pr_err("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
292
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
293

A
Akinobu Mita 已提交
294
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
295 296 297 298 299
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
300
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
301 302 303
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
304 305 306
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
307 308 309
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
310
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
311 312 313 314 315
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * With CONFIG_DEBUG_BLOCK_EXT_DEVT enabled, mirror the low MINORBITS bits
 * of @minor: bit i trades places with bit MINORBITS - 1 - i.  Applying the
 * function twice therefore yields the original value.  Without the option
 * @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int lo_pos;

	for (lo_pos = 0; lo_pos < MINORBITS / 2; lo_pos++) {
		int hi_pos = MINORBITS - 1 - lo_pos;
		int span = hi_pos - lo_pos;
		int lo_bit = minor & (1 << lo_pos);
		int hi_bit = minor & (1 << hi_pos);

		/* drop both bits, then put each one in the other's place */
		minor ^= lo_bit | hi_bit;
		minor |= (lo_bit << span) | (hi_bit >> span);
	}
#endif
	return minor;
}

348
int blk_alloc_ext_minor(void)
T
Tejun Heo 已提交
349
{
T
Tejun Heo 已提交
350
	int idx;
T
Tejun Heo 已提交
351

352
	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
353 354 355 356 357 358
	if (idx < 0) {
		if (idx == -ENOSPC)
			return -EBUSY;
		return idx;
	}
	return blk_mangle_minor(idx);
T
Tejun Heo 已提交
359 360
}

361
void blk_free_ext_minor(unsigned int minor)
T
Tejun Heo 已提交
362
{
363
	ida_free(&ext_devt_ida, blk_mangle_minor(minor));
Y
Yufen Yu 已提交
364 365
}

366 367 368 369 370 371 372 373 374 375 376 377
/*
 * Format @devt as hex into @buf (BDEVT_SIZE bytes) and return @buf.
 * When both major and minor fit in 8 bits the compact "MMmm" form is used,
 * left-justified in 9 columns; otherwise the "MMM:mmmmm" form is used.
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
	return buf;
}

378 379 380
/*
 * Send @action as a uevent for every block device in @disk's partition
 * table.  The RCU read lock is dropped around each kobject_uevent() call; a
 * reference on the device is held across that window.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *bdev;
	unsigned long index;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, index, bdev) {
		/*
		 * Skip zero-sized partitions, and entries whose kobject
		 * reference could not be taken.
		 */
		if ((bdev_is_partition(bdev) && !bdev_nr_sectors(bdev)) ||
		    !kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;

		rcu_read_unlock();
		kobject_uevent(bdev_kobj(bdev), action);
		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

399 400 401 402 403 404 405 406 407 408 409 410 411
/*
 * Flag @disk as needing a partition scan, then open and immediately close
 * the whole device for reading.  Does nothing for empty disks or disks with
 * partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *bdev;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return;
	blkdev_put(bdev, FMODE_READ);
}

412 413
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
414 415 416 417
{
	struct device *ddev = disk_to_dev(disk);
	int err;

418
	ddev->parent = parent;
419

420
	dev_set_name(ddev, "%s", disk->disk_name);
421 422 423 424

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

425 426 427 428
	if (groups) {
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
429 430 431 432 433 434 435 436 437 438
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}
439 440 441 442 443 444 445 446

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

447 448
	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
449 450
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

451 452 453 454 455 456 457 458 459 460
	/*
	 * XXX: this is a mess, can't wait for real error handling in add_disk.
	 * Make sure ->slave_dir is NULL if we failed some of the registration
	 * so that the cleanup in bd_unlink_disk_holder works properly.
	 */
	if (bd_register_pending_holders(disk) < 0) {
		kobject_put(disk->slave_dir);
		disk->slave_dir = NULL;
	}

461
	if (disk->flags & GENHD_FL_HIDDEN)
462 463
		return;

464
	disk_scan_partitions(disk);
465

466
	/* announce the disk and partitions after all partitions are created */
467
	dev_set_uevent_suppress(ddev, 0);
468
	disk_uevent(disk, KOBJ_ADD);
469

470 471 472
	if (disk->bdi->dev) {
		err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj,
					"bdi");
473 474
		WARN_ON(err);
	}
475 476
}

L
Linus Torvalds 已提交
477
/**
478
 * device_add_disk - add disk information to kernel list
479
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
480
 * @disk: per-device partitioning information
481
 * @groups: Additional per-device sysfs groups
L
Linus Torvalds 已提交
482 483 484
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
485 486
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
487
 */
488 489 490 491

void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

L
Linus Torvalds 已提交
492
{
493
	int ret;
494

495 496 497 498 499 500
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
501
	elevator_init_mq(disk->queue);
502

503 504 505 506 507 508
	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minors numbers supported, and those will be used to
	 * setup the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
509
	 */
510 511
	if (disk->major) {
		WARN_ON(!disk->minors);
512 513 514 515 516 517

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
518 519
	} else {
		WARN_ON(disk->minors);
520

521 522 523 524 525 526 527
		ret = blk_alloc_ext_minor();
		if (ret < 0) {
			WARN_ON(1);
			return;
		}
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
528
		disk->flags |= GENHD_FL_EXT_DEVT;
529
	}
530

531 532
	disk_alloc_events(disk);

533 534 535 536 537 538 539 540
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
541
		struct device *dev = disk_to_dev(disk);
542

543
		/* Register BDI before referencing it from bdev */
544
		dev->devt = MKDEV(disk->major, disk->first_minor);
545
		ret = bdi_register(disk->bdi, "%u:%u",
546
				   disk->major, disk->first_minor);
547
		WARN_ON(ret);
548
		bdi_set_owner(disk->bdi, dev);
549
		bdev_add(disk->part0, dev->devt);
550
	}
551
	register_disk(parent, disk, groups);
552
	blk_register_queue(disk);
553

554
	disk_add_events(disk);
555
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
556
}
557
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
558

559 560 561 562 563 564 565 566 567 568 569 570 571
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * device_add_disk() was used.
572 573 574 575 576
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
577
 */
578
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
579
{
580 581
	might_sleep();

582
	if (WARN_ON_ONCE(!disk_live(disk)))
583 584
		return;

585
	blk_integrity_del(disk);
586 587
	disk_del_events(disk);

588
	mutex_lock(&disk->open_mutex);
589
	remove_inode_hash(disk->part0->bd_inode);
590
	blk_drop_partitions(disk);
591
	mutex_unlock(&disk->open_mutex);
592

593 594 595
	fsync_bdev(disk->part0);
	__invalidate_device(disk->part0, true);

596 597
	set_capacity(disk, 0);

598
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
599
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
600

601 602 603 604
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
605
		bdi_unregister(disk->bdi);
606
	}
607

608
	blk_unregister_queue(disk);
609

610
	kobject_put(disk->part0->bd_holder_dir);
611 612
	kobject_put(disk->slave_dir);

613
	part_stat_set_all(disk->part0, 0);
614
	disk->part0->bd_stamp = 0;
615 616
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
617
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
618
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
619
}
620
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
621

622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
/* sysfs access to bad-blocks list. */
/* sysfs show: dump the disk's bad-block list, or a bare newline if none. */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/* sysfs store: pass the input to badblocks_store(); -ENXIO without a list. */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

647
/*
 * Try to make the driver for @devt available.  If the registered entry for
 * the major has a probe callback, call it (with major_names_lock held) and
 * stop.  Otherwise fall back to requesting a module by alias.
 */
void blk_request_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *entry;
	bool probed = false;

	mutex_lock(&major_names_lock);
	for (entry = major_names[major_to_index(major)]; entry;
	     entry = entry->next) {
		if (entry->major != major || !entry->probe)
			continue;
		entry->probe(devt);
		probed = true;
		break;
	}
	mutex_unlock(&major_names_lock);

	if (probed)
		return;

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

667 668 669 670 671 672 673
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct block_device *part;
		char devt_buf[BDEVT_SIZE];
		unsigned long idx;

		/* Empty devices are not listed. */
		if (get_capacity(disk) == 0)
			continue;
		/* Neither are disks whose partition info is suppressed. */
		if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
			continue;

		/*
		 * Note, unlike /proc/partitions, I am showing the numbers in
		 * hex - the same format as the root= option takes.
		 */
		rcu_read_lock();
		xa_for_each(&disk->part_tbl, idx, part) {
			const char *indent;

			if (!bdev_nr_sectors(part))
				continue;

			/* partitions are indented below their disk */
			indent = bdev_is_partition(part) ? "  " : "";
			printk("%s%s %10llu %pg %s",
			       indent,
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1, part,
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");

			/* whole-disk lines end with the parent's driver name */
			if (bdev_is_partition(part))
				printk("\n");
			else if (dev->parent && dev->parent->driver)
				printk(" driver: %s\n",
					dev->parent->driver->name);
			else
				printk(" (driver?)\n");
		}
		rcu_read_unlock();
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
719 720
#ifdef CONFIG_PROC_FS
/* iterator */
721
/*
 * seq_file ->start: allocate a class iterator, stash it in seqf->private and
 * advance it to the disk at position *pos.  Returns that disk, NULL when the
 * position is past the last disk, or ERR_PTR(-ENOMEM).
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter;
	struct device *dev;
	loff_t remaining;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);

	/* consume *pos entries, then stop on the following one */
	for (remaining = *pos; ; remaining--) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!remaining)
			break;
	}

	return dev_to_disk(dev);
}

742
/* seq_file ->next: bump *pos and return the next disk, or NULL at the end. */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	++*pos;
	dev = class_dev_iter_next(seqf->private);

	return dev ? dev_to_disk(dev) : NULL;
}

754
static void disk_seqf_stop(struct seq_file *seqf, void *v)
755
{
756
	struct class_dev_iter *iter = seqf->private;
757

758 759 760 761
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
762
		seqf->private = NULL;
763
	}
L
Linus Torvalds 已提交
764 765
}

766
/*
 * seq_file ->start for the partitions listing: same as disk_seqf_start(),
 * plus the column header when iteration begins at position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *first = disk_seqf_start(seqf, pos);

	if (IS_ERR_OR_NULL(first) || *pos)
		return first;

	seq_puts(seqf, "major minor  #blocks  name\n\n");
	return first;
}

776
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
777 778
{
	struct gendisk *sgp = v;
779
	struct block_device *part;
780
	unsigned long idx;
L
Linus Torvalds 已提交
781 782

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
783
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
784
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
785 786 787 788
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

789 790 791 792
	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
793
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
794
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
795
			   bdev_nr_sectors(part) >> 1, part);
796 797
	}
	rcu_read_unlock();
L
Linus Torvalds 已提交
798 799 800
	return 0;
}

801
static const struct seq_operations partitions_op = {
802 803 804
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
805
	.show	= show_partition
L
Linus Torvalds 已提交
806 807 808 809 810
};
#endif

/*
 * Subsystem init: register the block class, initialise the block layer,
 * register the extended major and create the top-level "block" kobject
 * unless sysfs_deprecated is set.
 */
static int __init genhd_device_init(void)
{
	int ret;

	block_class.dev_kobj = sysfs_dev_block_kobj;

	ret = class_register(&block_class);
	if (unlikely(ret))
		return ret;

	blk_dev_init();

	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);

	return 0;
}

subsys_initcall(genhd_device_init);

829 830
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
831
{
832
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
833

834
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
835 836
}

837 838 839 840 841
/* sysfs show: the value of disk_max_parts() for this disk. */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", disk_max_parts(dev_to_disk(dev)));
}

845 846
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
847
{
848
	struct gendisk *disk = dev_to_disk(dev);
849

850 851
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
852 853
}

854 855 856 857 858 859 860 861 862
/* sysfs show: 1 if GENHD_FL_HIDDEN is set on the disk, else 0. */
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int hidden = !!(disk->flags & GENHD_FL_HIDDEN);

	return sprintf(buf, "%d\n", hidden);
}

K
Kay Sievers 已提交
863 864 865 866 867
/* sysfs show: 1 if get_disk_ro() reports the disk read-only, else 0. */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	int ro = !!get_disk_ro(dev_to_disk(dev));

	return sprintf(buf, "%d\n", ro);
}

871 872 873
/* sysfs show: size of the block device in sectors. */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
880 881
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
882
	struct disk_stats stat;
883 884
	unsigned int inflight;

885
	part_stat_read_all(bdev, &stat);
886
	if (queue_is_mq(q))
887
		inflight = blk_mq_in_flight(q, bdev);
888
	else
889
		inflight = part_in_flight(bdev);
890

891 892 893 894 895 896 897
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
898 899 900 901 902 903 904 905
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
906
		inflight,
907
		jiffies_to_msecs(stat.io_ticks),
908 909 910 911 912
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
913 914 915 916 917 918
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
919 920 921 922 923
}

ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
924 925
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
926 927
	unsigned int inflight[2];

928
	if (queue_is_mq(q))
929
		blk_mq_in_flight_rw(q, bdev, inflight);
930
	else
931
		part_in_flight_rw(bdev, inflight);
932

933 934 935
	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

936 937
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
938
{
939 940 941
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
942
}
943

944 945 946 947 948 949 950 951 952
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}

953 954 955 956 957 958
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

959
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
960 961
}

M
Matteo Croce 已提交
962 963 964 965 966 967 968 969
static ssize_t diskseq_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%llu\n", disk->diskseq);
}

970 971 972 973 974 975 976 977 978 979 980 981
/*
 * Device attributes backed by the show/store handlers above.  All are
 * read-only (0444, no store handler) except "badblocks", which is 0644 and
 * also has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
M
Matteo Croce 已提交
982
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
983

984
#ifdef CONFIG_FAIL_MAKE_REQUEST
985 986 987
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
988
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
989 990 991 992 993 994 995 996 997
}

/*
 * sysfs store: parse a decimal integer from @buf into bd_make_it_fail.
 * Input that does not parse is ignored; @count is returned either way.
 */
ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int value;

	if (!count)
		return count;

	if (sscanf(buf, "%d", &value) > 0)
		dev_to_bdev(dev)->bd_make_it_fail = value;

	return count;
}

1003
static struct device_attribute dev_attr_fail =
1004
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1005 1006
#endif /* CONFIG_FAIL_MAKE_REQUEST */

1007 1008
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
1009
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
1010
#endif
1011 1012 1013

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1014
	&dev_attr_ext_range.attr,
1015
	&dev_attr_removable.attr,
1016
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1017
	&dev_attr_ro.attr,
1018
	&dev_attr_size.attr,
1019
	&dev_attr_alignment_offset.attr,
1020
	&dev_attr_discard_alignment.attr,
1021 1022
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1023
	&dev_attr_inflight.attr,
1024
	&dev_attr_badblocks.attr,
1025 1026 1027
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
M
Matteo Croce 已提交
1028
	&dev_attr_diskseq.attr,
1029 1030
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1031 1032 1033
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1034 1035 1036 1037
#endif
	NULL
};

1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
/*
 * Visibility callback for disk_attr_group.
 *
 * Returns the mode an attribute should be created with.  Every attribute
 * keeps its declared mode, except "badblocks", which gets mode 0 when the
 * disk has no bad-block table (disk->bb is NULL).  @n is not used.
 */
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a != &dev_attr_badblocks.attr)
		return a->mode;

	return disk->bb ? a->mode : 0;
}

1048 1049
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1050
	.is_visible = disk_visible,
1051 1052
};

1053
static const struct attribute_group *disk_attr_groups[] = {
1054 1055
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1056 1057
};

1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1069 1070
 *
 * Context: can sleep
1071
 */
1072
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1073
{
1074 1075
	struct gendisk *disk = dev_to_disk(dev);

1076 1077
	might_sleep();

1078
	disk_release_events(disk);
L
Linus Torvalds 已提交
1079
	kfree(disk->random);
1080
	xa_destroy(&disk->part_tbl);
1081
	disk->queue->disk = NULL;
1082
	blk_put_queue(disk->queue);
C
Christoph Hellwig 已提交
1083
	iput(disk->part0->bd_inode);	/* frees the disk */
L
Linus Torvalds 已提交
1084
}
1085 1086 1087 1088 1089 1090 1091 1092

static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

1093 1094
struct class block_class = {
	.name		= "block",
1095
	.dev_uevent	= block_uevent,
L
Linus Torvalds 已提交
1096 1097
};

1098
static char *block_devnode(struct device *dev, umode_t *mode,
1099
			   kuid_t *uid, kgid_t *gid)
1100 1101 1102
{
	struct gendisk *disk = dev_to_disk(dev);

1103 1104
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1105 1106 1107
	return NULL;
}

1108
const struct device_type disk_type = {
1109 1110 1111
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1112
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1113 1114
};

1115
#ifdef CONFIG_PROC_FS
1116 1117 1118 1119 1120 1121 1122 1123
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1124 1125
{
	struct gendisk *gp = v;
1126
	struct block_device *hd;
1127
	unsigned int inflight;
1128
	struct disk_stats stat;
1129
	unsigned long idx;
L
Linus Torvalds 已提交
1130 1131

	/*
1132
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1133
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1134 1135 1136 1137
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1138

1139 1140 1141 1142
	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
1143
		part_stat_read_all(hd, &stat);
1144
		if (queue_is_mq(gp->queue))
1145
			inflight = blk_mq_in_flight(gp->queue, hd);
1146
		else
1147
			inflight = part_in_flight(hd);
1148

1149
		seq_printf(seqf, "%4d %7d %pg "
1150 1151 1152
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1153 1154 1155
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1156
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1167
			   inflight,
1168
			   jiffies_to_msecs(stat.io_ticks),
1169 1170 1171 1172 1173
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1174 1175 1176 1177 1178 1179 1180 1181
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1182
			);
L
Linus Torvalds 已提交
1183
	}
1184
	rcu_read_unlock();
1185

L
Linus Torvalds 已提交
1186 1187 1188
	return 0;
}

1189
static const struct seq_operations diskstats_op = {
1190 1191 1192
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1193 1194
	.show	= diskstats_show
};
1195 1196 1197

/*
 * Create the "diskstats" and "partitions" proc entries.  The results of
 * proc_create_seq() are not checked; the function always returns 0.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);

	return 0;
}
module_init(proc_genhd_init);
1203
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1204

1205 1206
/*
 * Look up partition @partno in @disk's partition table and return its
 * device number, or 0 if the table has no entry for that number.  The
 * lookup is done under rcu_read_lock().
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *part;
	dev_t devt;

	rcu_read_lock();
	part = xa_load(&disk->part_tbl, partno);
	devt = part ? part->bd_dev : 0;
	rcu_read_unlock();

	return devt;
}

1219
dev_t blk_lookup_devt(const char *name, int partno)
1220
{
1221 1222 1223
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1224

1225 1226
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1227 1228
		struct gendisk *disk = dev_to_disk(dev);

1229
		if (strcmp(dev_name(dev), name))
1230 1231
			continue;

1232 1233 1234 1235 1236 1237
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
1238 1239 1240 1241
		} else {
			devt = part_devt(disk, partno);
			if (devt)
				break;
1242
		}
1243
	}
1244
	class_dev_iter_exit(&iter);
1245 1246 1247
	return devt;
}

1248 1249
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass)
1250 1251 1252
{
	struct gendisk *disk;

1253 1254 1255
	if (!blk_get_queue(q))
		return NULL;

1256
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1257
	if (!disk)
1258
		goto out_put_queue;
1259

1260 1261 1262 1263
	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_disk;

1264 1265
	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
1266
		goto out_free_bdi;
1267

1268
	disk->node_id = node_id;
1269
	mutex_init(&disk->open_mutex);
1270 1271 1272
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;
1273 1274 1275 1276 1277

	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
M
Matteo Croce 已提交
1278
	inc_diskseq(disk);
1279
	disk->queue = q;
1280
	q->disk = disk;
1281
	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
1282 1283 1284
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
L
Linus Torvalds 已提交
1285
	return disk;
1286

1287 1288
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
C
Christoph Hellwig 已提交
1289
	iput(disk->part0->bd_inode);
1290 1291
out_free_bdi:
	bdi_put(disk->bdi);
1292 1293
out_free_disk:
	kfree(disk);
1294 1295
out_put_queue:
	blk_put_queue(q);
1296
	return NULL;
L
Linus Torvalds 已提交
1297
}
1298
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1299

1300
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
1301 1302 1303 1304 1305 1306 1307 1308
{
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_alloc_queue(node);
	if (!q)
		return NULL;

1309
	disk = __alloc_disk_node(q, node, lkclass);
1310 1311 1312 1313 1314 1315 1316 1317
	if (!disk) {
		blk_cleanup_queue(q);
		return NULL;
	}
	return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);

1318 1319
/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for, may be NULL
 *
 * Drops one reference on the device embedded in @disk.  Once the refcount
 * reaches 0, disk_release() is called.  A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
/**
 * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
 * @disk: gendisk to shutdown
 *
 * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
 * the queue DEAD, destroy and put it and the gendisk structure.
 *
 * Context: can sleep
 */
void blk_cleanup_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	/* The queue goes first, then the reference on the disk itself. */
	blk_cleanup_queue(q);
	put_disk(disk);
}
EXPORT_SYMBOL(blk_cleanup_disk);

1351 1352 1353 1354 1355 1356 1357 1358 1359 1360
/*
 * Send a KOBJ_CHANGE uevent for @gd whose environment holds "DISK_RO=1" when
 * @ro is non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=1";
	char *envp[] = { event, NULL };

	/* The flag digit is the last character before the terminating NUL. */
	event[sizeof(event) - 2] = ro ? '1' : '0';

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

1361 1362 1363
/**
 * set_disk_ro - set a gendisk read-only
 * @disk:	gendisk to operate on
 * @read_only:	%true to set the disk read-only, %false set the disk read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 *
 * A DISK_RO uevent is sent only when the GD_READ_ONLY bit actually changes.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	bool was_read_only;

	if (read_only)
		was_read_only = test_and_set_bit(GD_READ_ONLY, &disk->state);
	else
		was_read_only = test_and_clear_bit(GD_READ_ONLY, &disk->state);

	/* No transition, nothing to announce. */
	if (was_read_only == read_only)
		return;

	set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
1385
	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
L
Linus Torvalds 已提交
1386 1387
}
EXPORT_SYMBOL(bdev_read_only);
M
Matteo Croce 已提交
1388 1389 1390 1391 1392

void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}