genhd.c 34.6 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3
/*
 *  gendisk handling
C
Christoph Hellwig 已提交
4 5
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
L
Linus Torvalds 已提交
6 7 8
 */

#include <linux/module.h>
9
#include <linux/ctype.h>
L
Linus Torvalds 已提交
10 11
#include <linux/fs.h>
#include <linux/genhd.h>
12
#include <linux/kdev_t.h>
L
Linus Torvalds 已提交
13 14
#include <linux/kernel.h>
#include <linux/blkdev.h>
15
#include <linux/backing-dev.h>
L
Linus Torvalds 已提交
16 17
#include <linux/init.h>
#include <linux/spinlock.h>
18
#include <linux/proc_fs.h>
L
Linus Torvalds 已提交
19 20 21
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
22
#include <linux/mutex.h>
T
Tejun Heo 已提交
23
#include <linux/idr.h>
24
#include <linux/log2.h>
25
#include <linux/pm_runtime.h>
26
#include <linux/badblocks.h>
L
Linus Torvalds 已提交
27

28 29
#include "blk.h"

30
static struct kobject *block_depr;
L
Linus Torvalds 已提交
31

M
Matteo Croce 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*
 * Unique, monotonically increasing sequential number associated with block
 * device instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

T
Tejun Heo 已提交
49
/* for extended dynamic devt allocation, currently only one major is used */
50
#define NR_EXT_DEVT		(1 << MINORBITS)
51
static DEFINE_IDA(ext_devt_ida);
T
Tejun Heo 已提交
52

53 54
/*
 * Record a new capacity for @disk.  @sectors is converted to bytes with
 * SECTOR_SHIFT and written as the size of the part0 inode while holding
 * bd_size_lock.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
	struct block_device *whole = disk->part0;
	loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&whole->bd_size_lock);
	i_size_write(whole->bd_inode, bytes);
	spin_unlock(&whole->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);

63
/*
64 65
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
66
 */
67
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
68 69
{
	sector_t capacity = get_capacity(disk);
70
	char *envp[] = { "RESIZE=1", NULL };
71 72 73

	set_capacity(disk, size);

74 75 76 77 78 79 80 81
	/*
	 * Only print a message and send a uevent if the gendisk is user visible
	 * and alive.  This avoids spamming the log and udev when setting the
	 * initial capacity during probing.
	 */
	if (size == capacity ||
	    (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
		return false;
82

83
	pr_info("%s: detected capacity change from %lld to %lld\n",
M
Ming Lei 已提交
84
		disk->disk_name, capacity, size);
85

86 87 88 89 90 91 92 93
	/*
	 * Historically we did not send a uevent for changes to/from an empty
	 * device.
	 */
	if (!capacity || !size)
		return false;
	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
	return true;
94
}
95
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
96

97
/*
C
Christoph Hellwig 已提交
98 99 100 101 102
 * Format the device name of the indicated block device into the supplied buffer
 * and return a pointer to that same buffer for convenience.
 *
 * Note: do not use this in new code, use the %pg specifier to sprintf and
 * printk insted.
103
 */
C
Christoph Hellwig 已提交
104
const char *bdevname(struct block_device *bdev, char *buf)
105
{
C
Christoph Hellwig 已提交
106 107 108
	struct gendisk *hd = bdev->bd_disk;
	int partno = bdev->bd_partno;

109 110 111 112 113 114 115 116 117 118
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);

	return buf;
}
EXPORT_SYMBOL(bdevname);
119

120 121
static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
122 123 124 125 126
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
127
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
128 129 130 131 132 133 134 135 136 137 138 139 140
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

141
static unsigned int part_in_flight(struct block_device *part)
142
{
143
	unsigned int inflight = 0;
144
	int cpu;
145

146
	for_each_possible_cpu(cpu) {
147 148
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);
149
	}
150 151
	if ((int)inflight < 0)
		inflight = 0;
152

153
	return inflight;
154 155
}

156 157
static void part_in_flight_rw(struct block_device *part,
		unsigned int inflight[2])
158
{
159 160 161 162 163 164 165 166 167 168 169 170
	int cpu;

	inflight[0] = 0;
	inflight[1] = 0;
	for_each_possible_cpu(cpu) {
		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}
	if ((int)inflight[0] < 0)
		inflight[0] = 0;
	if ((int)inflight[1] < 0)
		inflight[1] = 0;
171 172
}

L
Linus Torvalds 已提交
173 174 175 176
/*
 * Can be deleted altogether. Later.
 *
 */
177
#define BLKDEV_MAJOR_HASH_SIZE 255
L
Linus Torvalds 已提交
178 179 180 181
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
182
	void (*probe)(dev_t devt);
183
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
C
Christoph Hellwig 已提交
184
static DEFINE_MUTEX(major_names_lock);
L
Linus Torvalds 已提交
185 186

/* index in the above - for now: assume no multimajor ranges */
187
static inline int major_to_index(unsigned major)
L
Linus Torvalds 已提交
188
{
189
	return major % BLKDEV_MAJOR_HASH_SIZE;
190 191
}

192
#ifdef CONFIG_PROC_FS
193
/*
 * Print "<major> <name>" to @seqf for every registered entry whose major
 * equals @offset.  The bucket is walked under major_names_lock.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
203
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
204

205
/**
206
 * __register_blkdev - register a new block device
207
 *
208 209
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
210
 * @name: the name of the new block device as a zero terminated string
211
 * @probe: allback that is called on access to any minor number of @major
212 213 214
 *
 * The @name must be unique within the system.
 *
215 216
 * The return value depends on the @major input parameter:
 *
217 218
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
219
 *  - if any unused major number was requested with @major = 0 parameter
220
 *    then the return value is the allocated major number in range
221 222 223 224
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
225 226
 *
 * Use register_blkdev instead for any new code.
227
 */
228 229
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
L
Linus Torvalds 已提交
230 231 232 233
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

C
Christoph Hellwig 已提交
234
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
235 236 237 238 239 240 241 242 243

	/* temporary */
	if (major == 0) {
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
244 245
			printk("%s: failed to get major for %s\n",
			       __func__, name);
L
Linus Torvalds 已提交
246 247 248 249 250 251 252
			ret = -EBUSY;
			goto out;
		}
		major = index;
		ret = major;
	}

253
	if (major >= BLKDEV_MAJOR_MAX) {
254 255
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);
256 257 258 259 260

		ret = -EINVAL;
		goto out;
	}

L
Linus Torvalds 已提交
261 262 263 264 265 266 267
	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
268
	p->probe = probe;
L
Linus Torvalds 已提交
269 270 271 272 273 274 275 276 277 278 279 280 281 282
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
283
		printk("register_blkdev: cannot get major %u for %s\n",
L
Linus Torvalds 已提交
284 285 286 287
		       major, name);
		kfree(p);
	}
out:
C
Christoph Hellwig 已提交
288
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
289 290
	return ret;
}
291
EXPORT_SYMBOL(__register_blkdev);
L
Linus Torvalds 已提交
292

A
Akinobu Mita 已提交
293
void unregister_blkdev(unsigned int major, const char *name)
L
Linus Torvalds 已提交
294 295 296 297 298
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

C
Christoph Hellwig 已提交
299
	mutex_lock(&major_names_lock);
L
Linus Torvalds 已提交
300 301 302
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
303 304 305
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
L
Linus Torvalds 已提交
306 307 308
		p = *n;
		*n = p->next;
	}
C
Christoph Hellwig 已提交
309
	mutex_unlock(&major_names_lock);
L
Linus Torvalds 已提交
310 311 312 313 314
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);

315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * When CONFIG_DEBUG_BLOCK_EXT_DEVT is set, swap bit i of @minor with bit
 * MINORBITS - 1 - i for every i in the lower half, so consecutively
 * allocated minors end up far apart.  Mangling twice gives the original
 * value.  Without the option @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int bit;

	for (bit = 0; bit < MINORBITS / 2; bit++) {
		int mirror = MINORBITS - 1 - bit;
		int shift = mirror - bit;
		int lo = minor & (1 << bit);
		int hi = minor & (1 << mirror);

		/* drop both bits, then put each one in the other's place */
		minor &= ~((1 << bit) | (1 << mirror));
		minor |= (lo << shift) | (hi >> shift);
	}
#endif
	return minor;
}

347
int blk_alloc_ext_minor(void)
T
Tejun Heo 已提交
348
{
T
Tejun Heo 已提交
349
	int idx;
T
Tejun Heo 已提交
350

351
	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
352 353 354 355 356 357
	if (idx < 0) {
		if (idx == -ENOSPC)
			return -EBUSY;
		return idx;
	}
	return blk_mangle_minor(idx);
T
Tejun Heo 已提交
358 359
}

360
void blk_free_ext_minor(unsigned int minor)
T
Tejun Heo 已提交
361
{
362
	ida_free(&ext_devt_ida, blk_mangle_minor(minor));
Y
Yufen Yu 已提交
363 364
}

365 366 367 368 369 370 371 372 373 374 375 376
/*
 * Format @devt as hex into @buf (at most BDEVT_SIZE bytes) and return @buf.
 * When both major and minor fit in 8 bits the short "MMmm" form is used,
 * left-justified in 9 columns; otherwise "MMM:mmmmm" is used.
 */
static char *bdevt_str(dev_t devt, char *buf)
{
	char tbuf[BDEVT_SIZE];

	if (MAJOR(devt) > 0xff || MINOR(devt) > 0xff) {
		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
		return buf;
	}

	snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
	snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
	return buf;
}

377 378 379
/*
 * Send @action as a uevent for every block device in @disk's partition
 * table, skipping partitions that have no sectors.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *bdev;
	unsigned long index;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, index, bdev) {
		if (bdev_is_partition(bdev) && !bdev_nr_sectors(bdev))
			continue;
		/* could not take a reference: skip this device */
		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;

		/* the uevent is sent outside the RCU read section */
		rcu_read_unlock();
		kobject_uevent(bdev_kobj(bdev), action);
		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

398 399 400 401 402 403 404 405 406 407 408 409 410
/*
 * Flag @disk as needing a partition scan, then open and immediately close
 * the whole device.  Nothing is done for a disk with zero capacity or with
 * partition scanning disabled.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *whole;

	if (!get_capacity(disk))
		return;
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	whole = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(whole))
		return;
	blkdev_put(whole, FMODE_READ);
}

411 412
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
413 414 415 416
{
	struct device *ddev = disk_to_dev(disk);
	int err;

417
	ddev->parent = parent;
418

419
	dev_set_name(ddev, "%s", disk->disk_name);
420 421 422 423

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

424 425 426 427
	if (groups) {
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
428 429 430 431 432 433 434 435 436 437
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			device_del(ddev);
			return;
		}
	}
438 439 440 441 442 443 444 445

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

446 447
	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
448 449
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

450
	if (disk->flags & GENHD_FL_HIDDEN)
451 452
		return;

453
	disk_scan_partitions(disk);
454

455
	/* announce the disk and partitions after all partitions are created */
456
	dev_set_uevent_suppress(ddev, 0);
457
	disk_uevent(disk, KOBJ_ADD);
458

459 460 461 462 463 464
	if (disk->queue->backing_dev_info->dev) {
		err = sysfs_create_link(&ddev->kobj,
			  &disk->queue->backing_dev_info->dev->kobj,
			  "bdi");
		WARN_ON(err);
	}
465 466
}

L
Linus Torvalds 已提交
467
/**
468
 * __device_add_disk - add disk information to kernel list
469
 * @parent: parent device for the disk
L
Linus Torvalds 已提交
470
 * @disk: per-device partitioning information
471
 * @groups: Additional per-device sysfs groups
472
 * @register_queue: register the queue if set to true
L
Linus Torvalds 已提交
473 474 475
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
476 477
 *
 * FIXME: error handling
L
Linus Torvalds 已提交
478
 */
479
static void __device_add_disk(struct device *parent, struct gendisk *disk,
480
			      const struct attribute_group **groups,
481
			      bool register_queue)
L
Linus Torvalds 已提交
482
{
483
	int ret;
484

485 486 487 488 489 490 491 492 493
	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
	if (register_queue)
		elevator_init_mq(disk->queue);

494 495 496 497 498 499
	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minors numbers supported, and those will be used to
	 * setup the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
500
	 */
501 502
	if (disk->major) {
		WARN_ON(!disk->minors);
503 504 505 506 507 508

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
509 510
	} else {
		WARN_ON(disk->minors);
511

512 513 514 515 516 517 518
		ret = blk_alloc_ext_minor();
		if (ret < 0) {
			WARN_ON(1);
			return;
		}
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
519
		disk->flags |= GENHD_FL_EXT_DEVT;
520
	}
521 522

	disk->flags |= GENHD_FL_UP;
523

524 525
	disk_alloc_events(disk);

526 527 528 529 530 531 532 533
	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
534 535
		struct backing_dev_info *bdi = disk->queue->backing_dev_info;
		struct device *dev = disk_to_dev(disk);
536

537
		/* Register BDI before referencing it from bdev */
538 539 540
		dev->devt = MKDEV(disk->major, disk->first_minor);
		ret = bdi_register(bdi, "%u:%u",
				   disk->major, disk->first_minor);
541
		WARN_ON(ret);
542
		bdi_set_owner(bdi, dev);
543
		bdev_add(disk->part0, dev->devt);
544
	}
545
	register_disk(parent, disk, groups);
546 547
	if (register_queue)
		blk_register_queue(disk);
548

549 550 551 552
	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
553 554 555 556
	if (blk_get_queue(disk->queue))
		set_bit(GD_QUEUE_REF, &disk->state);
	else
		WARN_ON_ONCE(1);
557

558
	disk_add_events(disk);
559
	blk_integrity_add(disk);
L
Linus Torvalds 已提交
560
}
561

562 563 564
void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

565
{
566
	__device_add_disk(parent, disk, groups, true);
567
}
568
EXPORT_SYMBOL(device_add_disk);
L
Linus Torvalds 已提交
569

570 571
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
572
	__device_add_disk(parent, disk, NULL, false);
573 574 575
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);

576 577 578 579 580 581 582 583 584 585 586 587 588
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
589 590 591 592 593
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
594
 */
595
void del_gendisk(struct gendisk *disk)
L
Linus Torvalds 已提交
596
{
597 598
	might_sleep();

599 600 601
	if (WARN_ON_ONCE(!disk->queue))
		return;

602
	blk_integrity_del(disk);
603 604
	disk_del_events(disk);

605
	mutex_lock(&disk->open_mutex);
606
	remove_inode_hash(disk->part0->bd_inode);
607
	disk->flags &= ~GENHD_FL_UP;
608
	blk_drop_partitions(disk);
609
	mutex_unlock(&disk->open_mutex);
610

611 612 613
	fsync_bdev(disk->part0);
	__invalidate_device(disk->part0, true);

614 615
	set_capacity(disk, 0);

616
	if (!(disk->flags & GENHD_FL_HIDDEN)) {
617
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
618

619 620 621 622
		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
623
		bdi_unregister(disk->queue->backing_dev_info);
624
	}
625

626
	blk_unregister_queue(disk);
627

628
	kobject_put(disk->part0->bd_holder_dir);
629 630
	kobject_put(disk->slave_dir);

631
	part_stat_set_all(disk->part0, 0);
632
	disk->part0->bd_stamp = 0;
633 634
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
635
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
636
	device_del(disk_to_dev(disk));
L
Linus Torvalds 已提交
637
}
638
EXPORT_SYMBOL(del_gendisk);
L
Linus Torvalds 已提交
639

640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
/* sysfs access to bad-blocks list. */
/*
 * sysfs show for "badblocks": an empty line when the disk has no bad-block
 * list, otherwise whatever badblocks_show() prints.
 */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_show(disk->bb, page, 0);

	return sprintf(page, "\n");
}

/*
 * sysfs store for "badblocks": -ENXIO when the disk has no bad-block list,
 * otherwise the result of badblocks_store().
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}

665
/*
 * Handle an access to @devt.  If the major has a registered probe callback,
 * call it (with major_names_lock held) and return.  Otherwise ask for the
 * module by its "block-major-M-m" alias and, if that request returns a
 * positive value, by "block-major-M".
 */
void blk_request_module(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name *ent;

	mutex_lock(&major_names_lock);
	for (ent = major_names[major_to_index(major)]; ent; ent = ent->next) {
		if (ent->major != major || !ent->probe)
			continue;
		ent->probe(devt);
		mutex_unlock(&major_names_lock);
		return;
	}
	mutex_unlock(&major_names_lock);

	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(devt));
}

685 686 687 688 689 690 691
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct block_device *part;
		char devt_buf[BDEVT_SIZE];
		unsigned long idx;

		/* Don't show empty devices or things that have been suppressed */
		if (get_capacity(disk) == 0)
			continue;
		if (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
			continue;

		/*
		 * Note, unlike /proc/partitions, I am showing the numbers in
		 * hex - the same format as the root= option takes.
		 */
		rcu_read_lock();
		xa_for_each(&disk->part_tbl, idx, part) {
			if (!bdev_nr_sectors(part))
				continue;

			printk("%s%s %10llu %pg %s",
			       bdev_is_partition(part) ? "  " : "",
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1, part,
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");

			/* only the whole-disk line names the driver */
			if (bdev_is_partition(part)) {
				printk("\n");
				continue;
			}
			if (dev->parent && dev->parent->driver)
				printk(" driver: %s\n",
					dev->parent->driver->name);
			else
				printk(" (driver?)\n");
		}
		rcu_read_unlock();
	}
	class_dev_iter_exit(&iter);
}

L
Linus Torvalds 已提交
737 738
#ifdef CONFIG_PROC_FS
/* iterator */
739
/*
 * seq_file ->start: allocate a class iterator over disks, store it in
 * seqf->private and advance it to position *pos.  Returns the gendisk at
 * that position, NULL when there are not that many disks, or
 * ERR_PTR(-ENOMEM) if the iterator cannot be allocated.
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	struct class_dev_iter *iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	loff_t skip = *pos;
	struct device *dev;

	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
	for (;;) {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
		if (!skip--)
			break;
	}

	return dev_to_disk(dev);
}

760
/*
 * seq_file ->next: bump *pos and return the next gendisk from the iterator
 * in seqf->private, or NULL when the iterator is exhausted.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	++*pos;
	dev = class_dev_iter_next(seqf->private);
	if (!dev)
		return NULL;

	return dev_to_disk(dev);
}

772
static void disk_seqf_stop(struct seq_file *seqf, void *v)
773
{
774
	struct class_dev_iter *iter = seqf->private;
775

776 777 778 779
	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
780
		seqf->private = NULL;
781
	}
L
Linus Torvalds 已提交
782 783
}

784
/*
 * ->start for the partitions seq_file: same as disk_seqf_start(), but also
 * prints the column header when a disk was found at position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *first = disk_seqf_start(seqf, pos);

	if (IS_ERR_OR_NULL(first) || *pos)
		return first;

	seq_puts(seqf, "major minor  #blocks  name\n\n");
	return first;
}

794
static int show_partition(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
795 796
{
	struct gendisk *sgp = v;
797
	struct block_device *part;
798
	unsigned long idx;
L
Linus Torvalds 已提交
799 800

	/* Don't show non-partitionable removeable devices or empty devices */
T
Tejun Heo 已提交
801
	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
802
				   (sgp->flags & GENHD_FL_REMOVABLE)))
L
Linus Torvalds 已提交
803 804 805 806
		return 0;
	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
		return 0;

807 808 809 810
	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
811
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
812
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
813
			   bdev_nr_sectors(part) >> 1, part);
814 815
	}
	rcu_read_unlock();
L
Linus Torvalds 已提交
816 817 818
	return 0;
}

819
static const struct seq_operations partitions_op = {
820 821 822
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
823
	.show	= show_partition
L
Linus Torvalds 已提交
824 825 826 827 828
};
#endif

/*
 * Subsystem init: register block_class, run blk_dev_init(), register the
 * "blkext" major and, unless sysfs_deprecated is set, create the top-level
 * "block" kobject.  Only a class_register() failure is reported.
 */
static int __init genhd_device_init(void)
{
	int error;

	block_class.dev_kobj = sysfs_dev_block_kobj;

	error = class_register(&block_class);
	if (unlikely(error))
		return error;

	blk_dev_init();
	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);

	return 0;
}

subsys_initcall(genhd_device_init);

847 848
static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
L
Linus Torvalds 已提交
849
{
850
	struct gendisk *disk = dev_to_disk(dev);
L
Linus Torvalds 已提交
851

852
	return sprintf(buf, "%d\n", disk->minors);
L
Linus Torvalds 已提交
853 854
}

855 856 857 858 859
/* sysfs show for "ext_range": the value of disk_max_parts(). */
static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);

	return sprintf(buf, "%d\n", disk_max_parts(gd));
}

863 864
static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
865
{
866
	struct gendisk *disk = dev_to_disk(dev);
867

868 869
	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
870 871
}

872 873 874 875 876 877 878 879 880
static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n",
		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}

K
Kay Sievers 已提交
881 882 883 884 885
/* sysfs show for "ro": 1 if get_disk_ro() is non-zero, else 0. */
static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *gd = dev_to_disk(dev);
	int ro = get_disk_ro(gd) ? 1 : 0;

	return sprintf(buf, "%d\n", ro);
}

889 890 891
/* sysfs show for "size": the sector count of the block device behind @dev. */
ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
898 899
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
900
	struct disk_stats stat;
901 902
	unsigned int inflight;

903
	part_stat_read_all(bdev, &stat);
904
	if (queue_is_mq(q))
905
		inflight = blk_mq_in_flight(q, bdev);
906
	else
907
		inflight = part_in_flight(bdev);
908

909 910 911 912 913 914 915
	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
916 917 918 919 920 921 922 923
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
924
		inflight,
925
		jiffies_to_msecs(stat.io_ticks),
926 927 928 929 930
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
931 932 933 934 935 936
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
937 938 939 940 941
}

ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
942 943
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
944 945
	unsigned int inflight[2];

946
	if (queue_is_mq(q))
947
		blk_mq_in_flight_rw(q, bdev, inflight);
948
	else
949
		part_in_flight_rw(bdev, inflight);
950

951 952 953
	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

954 955
static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
956
{
957 958 959
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%x\n", disk->flags);
960
}
961

962 963 964 965 966 967 968 969 970
static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}

971 972 973 974 975 976
static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

977
	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
978 979
}

980 981 982 983 984 985 986 987 988 989 990 991
/*
 * Disk sysfs attributes.  All are read-only (0444) except "badblocks",
 * which is 0644 and has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
992

993
#ifdef CONFIG_FAIL_MAKE_REQUEST
994 995 996
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
997
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
998 999 1000 1001 1002 1003 1004 1005 1006
}

/*
 * sysfs store for "make-it-fail": parse a decimal integer from @buf and
 * store it in bd_make_it_fail.  Input that does not parse is ignored;
 * @count is returned in every case.
 */
ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int val;

	if (count == 0)
		return count;

	if (sscanf(buf, "%d", &val) > 0)
		dev_to_bdev(dev)->bd_make_it_fail = val;

	return count;
}

1012
static struct device_attribute dev_attr_fail =
1013
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1014 1015
#endif /* CONFIG_FAIL_MAKE_REQUEST */

1016 1017
#ifdef CONFIG_FAIL_IO_TIMEOUT
/* "io-timeout-fail" attribute, mode 0644, backed by part_timeout_show/store. */
static struct device_attribute dev_attr_fail_timeout =
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
#endif /* CONFIG_FAIL_IO_TIMEOUT */
1020 1021 1022

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
1023
	&dev_attr_ext_range.attr,
1024
	&dev_attr_removable.attr,
1025
	&dev_attr_hidden.attr,
K
Kay Sievers 已提交
1026
	&dev_attr_ro.attr,
1027
	&dev_attr_size.attr,
1028
	&dev_attr_alignment_offset.attr,
1029
	&dev_attr_discard_alignment.attr,
1030 1031
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
1032
	&dev_attr_inflight.attr,
1033
	&dev_attr_badblocks.attr,
1034 1035 1036
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
1037 1038
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
1039 1040 1041
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
1042 1043 1044 1045
#endif
	NULL
};

1046 1047 1048 1049 1050 1051 1052 1053 1054 1055
static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a == &dev_attr_badblocks.attr && !disk->bb)
		return 0;
	return a->mode;
}

1056 1057
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
1058
	.is_visible = disk_visible,
1059 1060
};

1061
static const struct attribute_group *disk_attr_groups[] = {
1062 1063
	&disk_attr_group,
	NULL
L
Linus Torvalds 已提交
1064 1065
};

1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076
/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
1077 1078
 *
 * Context: can sleep
1079
 */
1080
static void disk_release(struct device *dev)
L
Linus Torvalds 已提交
1081
{
1082 1083
	struct gendisk *disk = dev_to_disk(dev);

1084 1085
	might_sleep();

1086 1087
	if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(dev->devt));
1088
	disk_release_events(disk);
L
Linus Torvalds 已提交
1089
	kfree(disk->random);
1090
	xa_destroy(&disk->part_tbl);
1091
	if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
1092
		blk_put_queue(disk->queue);
C
Christoph Hellwig 已提交
1093
	iput(disk->part0->bd_inode);	/* frees the disk */
L
Linus Torvalds 已提交
1094
}
1095 1096 1097 1098 1099 1100 1101 1102

static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

1103 1104
struct class block_class = {
	.name		= "block",
1105
	.dev_uevent	= block_uevent,
L
Linus Torvalds 已提交
1106 1107
};

1108
static char *block_devnode(struct device *dev, umode_t *mode,
1109
			   kuid_t *uid, kgid_t *gid)
1110 1111 1112
{
	struct gendisk *disk = dev_to_disk(dev);

1113 1114
	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
1115 1116 1117
	return NULL;
}

1118
const struct device_type disk_type = {
1119 1120 1121
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
1122
	.devnode	= block_devnode,
L
Linus Torvalds 已提交
1123 1124
};

1125
#ifdef CONFIG_PROC_FS
1126 1127 1128 1129 1130 1131 1132 1133
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
L
Linus Torvalds 已提交
1134 1135
{
	struct gendisk *gp = v;
1136
	struct block_device *hd;
1137
	unsigned int inflight;
1138
	struct disk_stats stat;
1139
	unsigned long idx;
L
Linus Torvalds 已提交
1140 1141

	/*
1142
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1143
		seq_puts(seqf,	"major minor name"
L
Linus Torvalds 已提交
1144 1145 1146 1147
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/
1148

1149 1150 1151 1152
	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;
1153
		part_stat_read_all(hd, &stat);
1154
		if (queue_is_mq(gp->queue))
1155
			inflight = blk_mq_in_flight(gp->queue, hd);
1156
		else
1157
			inflight = part_in_flight(hd);
1158

1159
		seq_printf(seqf, "%4d %7d %pg "
1160 1161 1162
			   "%lu %lu %lu %u "
			   "%lu %lu %lu %u "
			   "%u %u %u "
1163 1164 1165
			   "%lu %lu %lu %u "
			   "%lu %u"
			   "\n",
1166
			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
			   stat.ios[STAT_READ],
			   stat.merges[STAT_READ],
			   stat.sectors[STAT_READ],
			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
							NSEC_PER_MSEC),
			   stat.ios[STAT_WRITE],
			   stat.merges[STAT_WRITE],
			   stat.sectors[STAT_WRITE],
			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
							NSEC_PER_MSEC),
1177
			   inflight,
1178
			   jiffies_to_msecs(stat.io_ticks),
1179 1180 1181 1182 1183
			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
						 stat.nsecs[STAT_WRITE] +
						 stat.nsecs[STAT_DISCARD] +
						 stat.nsecs[STAT_FLUSH],
							NSEC_PER_MSEC),
1184 1185 1186 1187 1188 1189 1190 1191
			   stat.ios[STAT_DISCARD],
			   stat.merges[STAT_DISCARD],
			   stat.sectors[STAT_DISCARD],
			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						 NSEC_PER_MSEC),
			   stat.ios[STAT_FLUSH],
			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						 NSEC_PER_MSEC)
1192
			);
L
Linus Torvalds 已提交
1193
	}
1194
	rcu_read_unlock();
1195

L
Linus Torvalds 已提交
1196 1197 1198
	return 0;
}

1199
static const struct seq_operations diskstats_op = {
1200 1201 1202
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
L
Linus Torvalds 已提交
1203 1204
	.show	= diskstats_show
};
1205 1206 1207

/*
 * Create the "diskstats" and "partitions" seq_file proc entries (mode 0,
 * NULL parent).  The results of proc_create_seq() are not checked; the
 * function always returns 0.
 */
static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);

	return 0;
}
module_init(proc_genhd_init);
1213
#endif /* CONFIG_PROC_FS */
L
Linus Torvalds 已提交
1214

1215 1216
/*
 * Look up partition @partno in @disk's partition table under the RCU read
 * lock and return its device number, or 0 when no such entry exists.
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	dev_t result;

	rcu_read_lock();
	bdev = xa_load(&disk->part_tbl, partno);
	result = bdev ? bdev->bd_dev : 0;
	rcu_read_unlock();

	return result;
}

1229
dev_t blk_lookup_devt(const char *name, int partno)
1230
{
1231 1232 1233
	dev_t devt = MKDEV(0, 0);
	struct class_dev_iter iter;
	struct device *dev;
1234

1235 1236
	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
1237 1238
		struct gendisk *disk = dev_to_disk(dev);

1239
		if (strcmp(dev_name(dev), name))
1240 1241
			continue;

1242 1243 1244 1245 1246 1247
		if (partno < disk->minors) {
			/* We need to return the right devno, even
			 * if the partition doesn't exist yet.
			 */
			devt = MKDEV(MAJOR(dev->devt),
				     MINOR(dev->devt) + partno);
1248 1249 1250 1251
		} else {
			devt = part_devt(disk, partno);
			if (devt)
				break;
1252
		}
1253
	}
1254
	class_dev_iter_exit(&iter);
1255 1256 1257
	return devt;
}

1258
struct gendisk *__alloc_disk_node(int minors, int node_id)
1259 1260 1261
{
	struct gendisk *disk;

1262
	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1263 1264
	if (!disk)
		return NULL;
1265

1266 1267
	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
1268 1269
		goto out_free_disk;

1270
	disk->node_id = node_id;
1271
	mutex_init(&disk->open_mutex);
1272 1273 1274
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;
1275 1276 1277 1278 1279 1280

	disk->minors = minors;
	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
M
Matteo Croce 已提交
1281 1282
	inc_diskseq(disk);

L
Linus Torvalds 已提交
1283
	return disk;
1284

1285 1286
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
C
Christoph Hellwig 已提交
1287
	iput(disk->part0->bd_inode);
1288 1289 1290
out_free_disk:
	kfree(disk);
	return NULL;
L
Linus Torvalds 已提交
1291
}
1292
EXPORT_SYMBOL(__alloc_disk_node);
L
Linus Torvalds 已提交
1293

1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312
/*
 * Allocate a request queue on @node together with a gendisk (->minors set
 * to 0) and attach the queue to the disk.  If the gendisk cannot be
 * allocated the queue is cleaned up again.
 *
 * Returns the new gendisk, or NULL on failure.
 */
struct gendisk *__blk_alloc_disk(int node)
{
	struct request_queue *q = blk_alloc_queue(node);
	struct gendisk *disk;

	if (!q)
		return NULL;

	disk = __alloc_disk_node(0, node);
	if (disk) {
		disk->queue = q;
		return disk;
	}

	blk_cleanup_queue(q);
	return NULL;
}
EXPORT_SYMBOL(__blk_alloc_disk);

1313 1314
/**
 * put_disk - drop a reference on a gendisk
 * @disk: the struct gendisk whose refcount is decremented; may be NULL
 *
 * Drops one reference on the device embedded in @disk. Once the count
 * reaches 0, disk_release() is called. A NULL @disk is ignored.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345
/**
 * blk_cleanup_disk - shut down a gendisk allocated by blk_alloc_disk
 * @disk: gendisk to shut down
 *
 * Marks the queue hanging off @disk DYING, drains all pending requests,
 * marks the queue DEAD, then destroys and puts both the queue and the
 * gendisk structure.
 *
 * Context: can sleep
 */
void blk_cleanup_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	blk_cleanup_queue(q);
	put_disk(disk);
}
EXPORT_SYMBOL(blk_cleanup_disk);

1346 1347 1348 1349 1350 1351 1352 1353 1354 1355
/*
 * Emit a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=1";
	char *envp[] = { event, NULL };

	/* The flag digit is the last character before the terminating NUL. */
	event[sizeof(event) - 2] = ro ? '1' : '0';

	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

1356 1357 1358
/**
 * set_disk_ro - set a gendisk read-only
 * @disk:	gendisk to operate on
1359
 * @read_only:	%true to set the disk read-only, %false set the disk read/write
1360 1361 1362 1363 1364 1365
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
L
Linus Torvalds 已提交
1366
{
1367 1368 1369 1370 1371 1372
	if (read_only) {
		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
			return;
	} else {
		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
			return;
1373
	}
1374
	set_disk_ro_uevent(disk, read_only);
L
Linus Torvalds 已提交
1375 1376 1377 1378 1379
}
EXPORT_SYMBOL(set_disk_ro);

int bdev_read_only(struct block_device *bdev)
{
1380
	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
L
Linus Torvalds 已提交
1381 1382
}
EXPORT_SYMBOL(bdev_read_only);
M
Matteo Croce 已提交
1383 1384 1385 1386 1387

/*
 * Atomically increment the file-wide diskseq counter and store the new
 * value in @disk->diskseq.
 */
void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}