// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991-1998  Linus Torvalds
 * Re-organised Feb 1998 Russell King
 * Copyright (C) 2020 Christoph Hellwig
 */
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
11
#include <linux/vmalloc.h>
12
#include <linux/blktrace_api.h>
13
#include <linux/raid/detect.h>
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
#include "check.h"

/*
 * Table of partition format probes, terminated by NULL.  check_partition()
 * calls the entries in array order and stops at the first one that returns a
 * positive value, so the ordering constraints noted below matter.  Each entry
 * is only compiled in when its CONFIG_*_PARTITION option is enabled.
 */
static int (*check_part[])(struct parsed_partitions *) = {
	/*
	 * Probe partition formats with tables at disk address 0
	 * that also have an ADFS boot block at 0xdc0.
	 */
#ifdef CONFIG_ACORN_PARTITION_ICS
	adfspart_check_ICS,
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
	adfspart_check_POWERTEC,
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
	adfspart_check_EESOX,
#endif

	/*
	 * Now move on to formats that only have partition info at
	 * disk address 0xdc0.  Since these may also have stale
	 * PC/BIOS partition tables, they need to come before
	 * the msdos entry.
	 */
#ifdef CONFIG_ACORN_PARTITION_CUMANA
	adfspart_check_CUMANA,
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
	adfspart_check_ADFS,
#endif

#ifdef CONFIG_CMDLINE_PARTITION
	cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
	efi_partition,		/* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
	sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
	ldm_partition,		/* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
	msdos_partition,
#endif
#ifdef CONFIG_OSF_PARTITION
	osf_partition,
#endif
#ifdef CONFIG_SUN_PARTITION
	sun_partition,
#endif
#ifdef CONFIG_AMIGA_PARTITION
	amiga_partition,
#endif
#ifdef CONFIG_ATARI_PARTITION
	atari_partition,
#endif
#ifdef CONFIG_MAC_PARTITION
	mac_partition,
#endif
#ifdef CONFIG_ULTRIX_PARTITION
	ultrix_partition,
#endif
#ifdef CONFIG_IBM_PARTITION
	ibm_partition,
#endif
#ifdef CONFIG_KARMA_PARTITION
	karma_partition,
#endif
#ifdef CONFIG_SYSV68_PARTITION
	sysv68_partition,
#endif
	/* Sentinel: ends the probe loop in check_partition(). */
	NULL
};

89 90
static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
91
	spin_lock(&bdev->bd_size_lock);
92
	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
93
	spin_unlock(&bdev->bd_size_lock);
94 95
}

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
	struct parsed_partitions *state;
	int nr;

	state = kzalloc(sizeof(*state), GFP_KERNEL);
	if (!state)
		return NULL;

	nr = disk_max_parts(hd);
	state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
	if (!state->parts) {
		kfree(state);
		return NULL;
	}

	state->limit = nr;

	return state;
}

/*
 * Release a parse state obtained from allocate_partitions(): first the
 * vmalloc'ed slot array, then the state structure itself.
 */
static void free_partitions(struct parsed_partitions *state)
{
	void *parts = state->parts;

	vfree(parts);
	kfree(state);
}

static struct parsed_partitions *check_partition(struct gendisk *hd,
		struct block_device *bdev)
{
	struct parsed_partitions *state;
	int i, res, err;

	state = allocate_partitions(hd);
	if (!state)
		return NULL;
	state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
	if (!state->pp_buf) {
		free_partitions(state);
		return NULL;
	}
	state->pp_buf[0] = '\0';

	state->bdev = bdev;
	disk_name(hd, 0, state->name);
	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
	if (isdigit(state->name[strlen(state->name)-1]))
		sprintf(state->name, "p");

	i = res = err = 0;
	while (!res && check_part[i]) {
		memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
		res = check_part[i++](state);
		if (res < 0) {
			/*
			 * We have hit an I/O error which we don't report now.
			 * But record it, and let the others do their job.
			 */
			err = res;
			res = 0;
		}

	}
	if (res > 0) {
		printk(KERN_INFO "%s", state->pp_buf);

		free_page((unsigned long)state->pp_buf);
		return state;
	}
	if (state->access_beyond_eod)
		err = -ENOSPC;
	/*
	 * The partition is unrecognized. So report I/O errors if there were any
	 */
	if (err)
		res = err;
	if (res) {
		strlcat(state->pp_buf,
			" unable to read partition table\n", PAGE_SIZE);
		printk(KERN_INFO "%s", state->pp_buf);
	}
177

178 179 180 181
	free_page((unsigned long)state->pp_buf);
	free_partitions(state);
	return ERR_PTR(res);
}
182 183 184 185

static ssize_t part_partition_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
186
	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno);
187 188 189 190 191
}

static ssize_t part_start_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
192
	return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
193 194 195 196 197
}

/* sysfs "ro" attribute: prints the result of bdev_read_only() for the device. */
static ssize_t part_ro_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct block_device *part = dev_to_bdev(dev);

	return sprintf(buf, "%d\n", bdev_read_only(part));
}

static ssize_t part_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr, char *buf)
{
204
	struct block_device *bdev = dev_to_bdev(dev);
205 206

	return sprintf(buf, "%u\n",
207 208
		queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
				bdev->bd_start_sect));
209 210 211 212 213
}

static ssize_t part_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr, char *buf)
{
214
	struct block_device *bdev = dev_to_bdev(dev);
215 216

	return sprintf(buf, "%u\n",
217 218
		queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
				bdev->bd_start_sect));
219 220
}

221 222 223 224 225 226 227 228
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
static DEVICE_ATTR(start, 0444, part_start_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(ro, 0444, part_ro_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
229 230
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
231
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
#endif

/*
 * NULL-terminated list of the attributes that make up part_attr_group.  The
 * "make-it-fail" entry is only present with CONFIG_FAIL_MAKE_REQUEST.
 */
static struct attribute *part_attrs[] = {
	&dev_attr_partition.attr,
	&dev_attr_start.attr,
	&dev_attr_size.attr,
	&dev_attr_ro.attr,
	&dev_attr_alignment_offset.attr,
	&dev_attr_discard_alignment.attr,
	&dev_attr_stat.attr,
	&dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
#endif
	NULL
};

/* Attribute group wrapping part_attrs[]. */
static struct attribute_group part_attr_group = {
	.attrs = part_attrs,
};

/*
 * NULL-terminated list of attribute groups used as part_type.groups.  The
 * blktrace group is only included with CONFIG_BLK_DEV_IO_TRACE.
 */
static const struct attribute_group *part_attr_groups[] = {
	&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
	&blk_trace_attr_group,
#endif
	NULL
};

static void part_release(struct device *dev)
{
263 264
	if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(dev->devt));
265
	bdput(dev_to_bdev(dev));
266 267
}

268 269
static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
{
270
	struct block_device *part = dev_to_bdev(dev);
271

272 273 274
	add_uevent_var(env, "PARTN=%u", part->bd_partno);
	if (part->bd_meta_info && part->bd_meta_info->volname[0])
		add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
275 276 277
	return 0;
}

278 279 280 281
struct device_type part_type = {
	.name		= "partition",
	.groups		= part_attr_groups,
	.release	= part_release,
282
	.uevent		= part_uevent,
283 284
};

285 286 287 288
/*
 * Must be called either with bd_mutex held, before a disk can be opened or
 * after all disk users are gone.
 */
289
static void delete_partition(struct block_device *part)
290
{
291 292 293
	fsync_bdev(part);
	__invalidate_device(part, true);

294
	xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
295 296
	kobject_put(part->bd_holder_dir);
	device_del(&part->bd_device);
297

Y
Yufen Yu 已提交
298
	/*
299 300
	 * Remove the block device from the inode hash, so that it cannot be
	 * looked up any more even when openers still hold references.
Y
Yufen Yu 已提交
301
	 */
302
	remove_inode_hash(part->bd_inode);
303

304
	put_device(&part->bd_device);
305 306 307 308 309 310 311
}

static ssize_t whole_disk_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	return 0;
}
312
static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
313

314 315 316 317
/*
 * Must be called either with bd_mutex held, before a disk can be opened or
 * after all disk users are gone.
 */
318
static struct block_device *add_partition(struct gendisk *disk, int partno,
319 320 321 322 323 324
				sector_t start, sector_t len, int flags,
				struct partition_meta_info *info)
{
	dev_t devt = MKDEV(0, 0);
	struct device *ddev = disk_to_dev(disk);
	struct device *pdev;
325
	struct block_device *bdev;
326 327 328
	const char *dname;
	int err;

329 330 331
	if (partno >= disk_max_parts(disk))
		return ERR_PTR(-EINVAL);

332 333 334 335 336 337 338 339 340 341 342 343
	/*
	 * Partitions are not supported on zoned block devices that are used as
	 * such.
	 */
	switch (disk->queue->limits.zoned) {
	case BLK_ZONED_HM:
		pr_warn("%s: partitions not supported on host managed zoned block device\n",
			disk->disk_name);
		return ERR_PTR(-ENXIO);
	case BLK_ZONED_HA:
		pr_info("%s: disabling host aware zoned block device support due to partitions\n",
			disk->disk_name);
344
		blk_queue_set_zoned(disk, BLK_ZONED_NONE);
345 346 347 348 349
		break;
	case BLK_ZONED_NONE:
		break;
	}

350
	if (xa_load(&disk->part_tbl, partno))
351 352
		return ERR_PTR(-EBUSY);

353 354
	bdev = bdev_alloc(disk, partno);
	if (!bdev)
355
		return ERR_PTR(-ENOMEM);
356

357
	bdev->bd_start_sect = start;
358
	bdev_set_nr_sectors(bdev, len);
359 360

	if (info) {
361 362 363
		err = -ENOMEM;
		bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
		if (!bdev->bd_meta_info)
364
			goto out_bdput;
365 366
	}

367
	pdev = &bdev->bd_device;
368 369 370 371 372 373 374 375 376 377 378
	dname = dev_name(ddev);
	if (isdigit(dname[strlen(dname) - 1]))
		dev_set_name(pdev, "%sp%d", dname, partno);
	else
		dev_set_name(pdev, "%s%d", dname, partno);

	device_initialize(pdev);
	pdev->class = &block_class;
	pdev->type = &part_type;
	pdev->parent = ddev;

379 380 381 382 383 384 385 386 387
	/* in consecutive minor range? */
	if (bdev->bd_partno < disk->minors) {
		devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
	} else {
		err = blk_alloc_ext_minor();
		if (err < 0)
			goto out_put;
		devt = MKDEV(BLOCK_EXT_MAJOR, err);
	}
388 389 390 391 392 393 394 395 396
	pdev->devt = devt;

	/* delay uevent until 'holders' subdir is created */
	dev_set_uevent_suppress(pdev, 1);
	err = device_add(pdev);
	if (err)
		goto out_put;

	err = -ENOMEM;
397 398
	bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj);
	if (!bdev->bd_holder_dir)
399 400 401 402 403 404 405 406 407 408
		goto out_del;

	dev_set_uevent_suppress(pdev, 0);
	if (flags & ADDPART_FLAG_WHOLEDISK) {
		err = device_create_file(pdev, &dev_attr_whole_disk);
		if (err)
			goto out_del;
	}

	/* everything is up and running, commence */
409 410 411
	err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
	if (err)
		goto out_del;
412
	bdev_add(bdev, devt);
413 414 415 416

	/* suppress uevent if the disk suppresses it */
	if (!dev_get_uevent_suppress(ddev))
		kobject_uevent(&pdev->kobj, KOBJ_ADD);
417
	return bdev;
418

419 420
out_bdput:
	bdput(bdev);
421 422
	return ERR_PTR(err);
out_del:
423
	kobject_put(bdev->bd_holder_dir);
424 425 426 427 428 429
	device_del(pdev);
out_put:
	put_device(pdev);
	return ERR_PTR(err);
}

C
Christoph Hellwig 已提交
430 431 432
static bool partition_overlaps(struct gendisk *disk, sector_t start,
		sector_t length, int skip_partno)
{
433
	struct block_device *part;
C
Christoph Hellwig 已提交
434
	bool overlap = false;
435 436 437 438 439 440 441 442 443 444
	unsigned long idx;

	rcu_read_lock();
	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
		if (part->bd_partno != skip_partno &&
		    start < part->bd_start_sect + bdev_nr_sectors(part) &&
		    start + length > part->bd_start_sect) {
			overlap = true;
			break;
		}
C
Christoph Hellwig 已提交
445
	}
446
	rcu_read_unlock();
C
Christoph Hellwig 已提交
447 448 449 450 451 452 453

	return overlap;
}

/*
 * Add partition @partno covering @length sectors from @start to the disk of
 * @bdev, with bdev->bd_mutex held for the duration.
 *
 * Returns 0 on success, -EBUSY if the range overlaps an existing partition,
 * or the error reported by add_partition().
 */
int bdev_add_partition(struct block_device *bdev, int partno,
		sector_t start, sector_t length)
{
	struct block_device *part;
	int ret;

	mutex_lock(&bdev->bd_mutex);
	if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
		ret = -EBUSY;
	} else {
		part = add_partition(bdev->bd_disk, partno, start, length,
				ADDPART_FLAG_NONE, NULL);
		ret = PTR_ERR_OR_ZERO(part);
	}
	mutex_unlock(&bdev->bd_mutex);

	return ret;
}

int bdev_del_partition(struct block_device *bdev, int partno)
{
470
	struct block_device *part;
471
	int ret;
C
Christoph Hellwig 已提交
472

473 474
	part = bdget_disk(bdev->bd_disk, partno);
	if (!part)
475
		return -ENXIO;
C
Christoph Hellwig 已提交
476

477
	mutex_lock(&part->bd_mutex);
478 479
	mutex_lock_nested(&bdev->bd_mutex, 1);

C
Christoph Hellwig 已提交
480
	ret = -EBUSY;
481
	if (part->bd_openers)
C
Christoph Hellwig 已提交
482 483
		goto out_unlock;

484
	delete_partition(part);
C
Christoph Hellwig 已提交
485 486
	ret = 0;
out_unlock:
487
	mutex_unlock(&bdev->bd_mutex);
488 489
	mutex_unlock(&part->bd_mutex);
	bdput(part);
C
Christoph Hellwig 已提交
490 491 492 493 494 495
	return ret;
}

/*
 * Change the length of partition @partno on the disk of @bdev to @length
 * sectors.  The start cannot be moved: @start must equal the partition's
 * current start sector.
 *
 * Returns 0 on success, -ENXIO if the partition cannot be looked up, -EINVAL
 * if @start differs from the current start, or -EBUSY if the resized range
 * would overlap another partition.
 */
int bdev_resize_partition(struct block_device *bdev, int partno,
		sector_t start, sector_t length)
{
	struct block_device *part = bdget_disk(bdev->bd_disk, partno);
	int ret;

	if (!part)
		return -ENXIO;

	mutex_lock(&part->bd_mutex);
	mutex_lock_nested(&bdev->bd_mutex, 1);

	if (start != part->bd_start_sect) {
		ret = -EINVAL;
	} else if (partition_overlaps(bdev->bd_disk, start, length, partno)) {
		ret = -EBUSY;
	} else {
		bdev_set_nr_sectors(part, length);
		ret = 0;
	}

	mutex_unlock(&part->bd_mutex);
	mutex_unlock(&bdev->bd_mutex);
	bdput(part);
	return ret;
}

523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
	const struct block_device_operations *bdops = disk->fops;

	if (bdops->unlock_native_capacity &&
	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
		printk(KERN_CONT "enabling native capacity\n");
		bdops->unlock_native_capacity(disk);
		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
		return true;
	} else {
		printk(KERN_CONT "truncated\n");
		return false;
	}
}

539
void blk_drop_partitions(struct gendisk *disk)
540
{
541
	struct block_device *part;
542
	unsigned long idx;
543

544
	lockdep_assert_held(&disk->part0->bd_mutex);
545

546 547 548
	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
		if (!bdgrab(part))
			continue;
549
		delete_partition(part);
550 551
		bdput(part);
	}
552 553
}

554 555
static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
		struct parsed_partitions *state, int p)
556
{
557 558
	sector_t size = state->parts[p].size;
	sector_t from = state->parts[p].from;
559
	struct block_device *part;
560 561 562 563 564 565 566 567 568 569 570

	if (!size)
		return true;

	if (from >= get_capacity(disk)) {
		printk(KERN_WARNING
		       "%s: p%d start %llu is beyond EOD, ",
		       disk->disk_name, p, (unsigned long long) from);
		if (disk_unlock_native_capacity(disk))
			return false;
		return true;
571 572
	}

573 574 575 576
	if (from + size > get_capacity(disk)) {
		printk(KERN_WARNING
		       "%s: p%d size %llu extends beyond EOD, ",
		       disk->disk_name, p, (unsigned long long) size);
577

578 579 580 581 582 583 584 585 586 587 588 589 590
		if (disk_unlock_native_capacity(disk))
			return false;

		/*
		 * We can not ignore partitions of broken tables created by for
		 * example camera firmware, but we limit them to the end of the
		 * disk to avoid creating invalid block devices.
		 */
		size = get_capacity(disk) - from;
	}

	part = add_partition(disk, p, from, size, state->parts[p].flags,
			     &state->parts[p].info);
591
	if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
592 593 594 595 596
		printk(KERN_ERR " %s: p%d could not be added: %ld\n",
		       disk->disk_name, p, -PTR_ERR(part));
		return true;
	}

597 598
	if (IS_BUILTIN(CONFIG_BLK_DEV_MD) &&
	    (state->parts[p].flags & ADDPART_FLAG_RAID))
599
		md_autodetect_dev(part->bd_dev);
600

601 602 603
	return true;
}

604
int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
605 606
{
	struct parsed_partitions *state;
607
	int ret = -EAGAIN, p;
608

609 610 611
	if (!disk_part_scan_enabled(disk))
		return 0;

612 613
	state = check_partition(disk, bdev);
	if (!state)
614 615 616
		return 0;
	if (IS_ERR(state)) {
		/*
617 618
		 * I/O error reading the partition table.  If we tried to read
		 * beyond EOD, retry after unlocking the native capacity.
619 620 621 622 623
		 */
		if (PTR_ERR(state) == -ENOSPC) {
			printk(KERN_WARNING "%s: partition table beyond EOD, ",
			       disk->disk_name);
			if (disk_unlock_native_capacity(disk))
624
				return -EAGAIN;
625 626 627
		}
		return -EIO;
	}
628

629
	/*
630
	 * Partitions are not supported on host managed zoned block devices.
631
	 */
632 633
	if (disk->queue->limits.zoned == BLK_ZONED_HM) {
		pr_warn("%s: ignoring partition table on host managed zoned block device\n",
634
			disk->disk_name);
635 636
		ret = 0;
		goto out_free_state;
637 638
	}

639
	/*
640 641 642
	 * If we read beyond EOD, try unlocking native capacity even if the
	 * partition table was successfully read as we could be missing some
	 * partitions.
643 644 645 646 647 648
	 */
	if (state->access_beyond_eod) {
		printk(KERN_WARNING
		       "%s: partition table partially beyond EOD, ",
		       disk->disk_name);
		if (disk_unlock_native_capacity(disk))
649
			goto out_free_state;
650 651 652 653 654
	}

	/* tell userspace that the media / partition table may have changed */
	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);

655 656 657
	for (p = 1; p < state->limit; p++)
		if (!blk_add_partition(disk, bdev, state, p))
			goto out_free_state;
658

659 660
	ret = 0;
out_free_state:
661
	free_partitions(state);
662
	return ret;
663 664
}

665
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
666
{
667
	struct address_space *mapping = state->bdev->bd_inode->i_mapping;
668 669
	struct page *page;

670 671 672
	if (n >= get_capacity(state->bdev->bd_disk)) {
		state->access_beyond_eod = true;
		return NULL;
673
	}
674 675 676 677 678 679 680 681 682 683 684 685 686 687

	page = read_mapping_page(mapping,
			(pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL);
	if (IS_ERR(page))
		goto out;
	if (PageError(page))
		goto out_put_page;

	p->v = page;
	return (unsigned char *)page_address(page) +
			((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT);
out_put_page:
	put_page(page);
out:
688 689 690
	p->v = NULL;
	return NULL;
}