// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
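
/*
 * An illustrative reading of the table above: for BTRFS_RAID_RAID6, ncopies
 * is 1 and nparity is 2, so a full stripe across N devices holds N - 2 data
 * stripes plus 2 parity stripes, which is why tolerated_failures is 2 and
 * devs_min is 3.
 */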

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
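
/*
 * Illustrative usage sketch (hypothetical caller, for documentation only):
 * mapping block group flags to the matching RAID attributes, e.g. to read
 * the number of copies:
 *
 *	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(flags);
 *	int ncopies = btrfs_raid_array[index].ncopies;
 */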

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed; it's up to the caller to provide a sufficiently
	 * large buffer.
	 */
out_overflow:;
}
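
/*
 * Illustrative usage sketch (hypothetical caller, for documentation only):
 *
 *	char buf[128];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 *
 * buf would then contain "data|raid1".
 */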

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 *  Return:	0 for success or if @path is NULL.
 * 		-EBUSY if @path is a mounted device.
 * 		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases :
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time the fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted:
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with a
		 *         different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without the RCU read lock held because we
		 * hold the uuid_mutex, so nothing we touch in here is going to
		 * disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

Y
		if (device->bdev) {
1062
			blkdev_put(device->bdev, device->mode);
Y
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
1066
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
Y
			list_del_init(&device->dev_alloc_list);
1068
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1069
			fs_devices->rw_devices--;
Y
		}
Y
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
1073
		btrfs_free_device(device);
1074
	}
Y

1076 1077 1078 1079 1080 1081
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
1082
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1083 1084
{
	struct btrfs_device *latest_dev = NULL;
1085
	struct btrfs_fs_devices *seed_dev;
1086 1087

	mutex_lock(&uuid_mutex);
1088
	__btrfs_free_extra_devids(fs_devices, &latest_dev);
1089 1090

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1091
		__btrfs_free_extra_devids(seed_dev, &latest_dev);
Y

1093
	fs_devices->latest_bdev = latest_dev->bdev;
1094

1095 1096
	mutex_unlock(&uuid_mutex);
}
1097

1098 1099
static void btrfs_close_bdev(struct btrfs_device *device)
{
D
	if (!device->bdev)
		return;

1103
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1104 1105 1106 1107
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

D
	blkdev_put(device->bdev, device->mode);
1109 1110
}

1111
static void btrfs_close_one_device(struct btrfs_device *device)
1112 1113 1114
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

1115
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1116 1117 1118 1119 1120
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

1121
	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1122 1123
		fs_devices->missing_devices--;

1124
	btrfs_close_bdev(device);
1125
	if (device->bdev) {
1126
		fs_devices->open_devices--;
1127
		device->bdev = NULL;
1128
	}
1129
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1130
	btrfs_destroy_dev_zone_info(device);
1131

1132 1133 1134
	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);
1135

1136 1137 1138 1139 1140 1141
	/* Verify the device is back in a pristine state  */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
1142 1143
}

1144
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1145
{
1146
	struct btrfs_device *device, *tmp;
Y

1148 1149
	lockdep_assert_held(&uuid_mutex);

Y
	if (--fs_devices->opened > 0)
1151
		return;
1152

1153
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1154
		btrfs_close_one_device(device);
1155

Y
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
Y
	fs_devices->opened = 0;
1159
	fs_devices->seeding = false;
1160
	fs_devices->fs_info = NULL;
1161 1162
}

1163
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
Y
{
1165 1166
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;
Y

	mutex_lock(&uuid_mutex);
1169
	close_fs_devices(fs_devices);
1170 1171
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);
Y

1173
	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1174
		close_fs_devices(fs_devices);
1175
		list_del(&fs_devices->seed_list);
Y
		free_fs_devices(fs_devices);
	}
1178
	mutex_unlock(&uuid_mutex);
Y
}

1181
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
Y
				fmode_t flags, void *holder)
1183 1184
{
	struct btrfs_device *device;
1185
	struct btrfs_device *latest_dev = NULL;
1186
	struct btrfs_device *tmp_device;
1187

1188 1189
	flags |= FMODE_EXCL;

1190 1191 1192
	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;
1193

1194 1195 1196
		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
1197
			latest_dev = device;
1198 1199 1200 1201 1202
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
1203
	}
1204 1205 1206
	if (fs_devices->open_devices == 0)
		return -EINVAL;

Y
	fs_devices->opened = 1;
1208
	fs_devices->latest_bdev = latest_dev->bdev;
Y
	fs_devices->total_rw_bytes = 0;
1210
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
A
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1212 1213

	return 0;
Y
}

1216 1217
static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
A
{
1219
	const struct btrfs_device *dev1, *dev2;
A

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

Y
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1232
		       fmode_t flags, void *holder)
Y
{
	int ret;

1236
	lockdep_assert_held(&uuid_mutex);
1237 1238
	/*
	 * The device_list_mutex cannot be taken here in case opening the
1239
	 * underlying device takes further locks like open_mutex.
1240 1241 1242 1243
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */
1244

Y
	if (fs_devices->opened) {
Y
		fs_devices->opened++;
		ret = 0;
Y
	} else {
A
		list_sort(NULL, &fs_devices->devices, devid_cmp);
1250
		ret = open_fs_devices(fs_devices, flags, holder);
Y
	}
1252

1253 1254 1255
	return ret;
}

1256
void btrfs_release_disk_super(struct btrfs_super_block *super)
1257
{
1258 1259
	struct page *page = virt_to_page(super);

1260 1261 1262
	put_page(page);
}

1263
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1264
						       u64 bytenr, u64 bytenr_orig)
1265
{
1266 1267
	struct btrfs_super_block *disk_super;
	struct page *page;
1268 1269 1270 1271 1272
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1273
		return ERR_PTR(-EINVAL);
1274 1275

	/* make sure our super fits in the page */
1276 1277
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);
1278 1279 1280

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
1281 1282
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);
1283 1284

	/* pull in the page with our super */
1285
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1286

1287 1288
	if (IS_ERR(page))
		return ERR_CAST(page);
1289

1290
	p = page_address(page);
1291 1292

	/* align our pointer to the offset of the super block */
1293
	disk_super = p + offset_in_page(bytenr);
1294

1295
	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1296
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1297
		btrfs_release_disk_super(p);
1298
		return ERR_PTR(-EINVAL);
1299 1300
	}

1301 1302
	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1303

1304
	return disk_super;
1305 1306
}

1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret)
		return ERR_PTR(ret);

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
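
/*
 * Illustrative usage sketch (hypothetical caller, for documentation only):
 *
 *	u64 start, len;
 *	int ret = find_free_dev_extent(device, SZ_1G, &start, &len);
 *
 * On success @start points at a suitable hole of at least SZ_1G; on -ENOSPC,
 * @start and @len describe the largest hole that was found instead.
 */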

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
1732 1733 1734 1735
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
1736 1737
	} else {
		goto out;
1738
	}
1739

M
Miao Xie 已提交
1740 1741
	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

1742
	ret = btrfs_del_item(trans, root, path);
1743
	if (ret == 0)
1744
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1745
out:
1746 1747 1748 1749
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update ctime/mtime for a given device path.  Mainly used by ctime/mtime
 * based probes such as libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number, e.g. to account
 * for device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_array[i].mindev_error;

			if (ret)
				return ret;
		}
	}

	return 0;
}
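
/*
 * Worked example (added for clarity, not part of the original): on a
 * two-device filesystem whose metadata profile is RAID1 (devs_min == 2),
 * calling this with num_devices == 1, i.e. after subtracting the device about
 * to be removed, trips the devs_min check and returns the RAID1 mindev_error,
 * so the removal is refused.
 */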

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device.  In the context
 * where this function is called, there should always be another active device
 * (or next_device) available.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
	up_read(&fs_info->dev_replace.rwsem);

	return num_devices;
}
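
/*
 * Illustrative sketch, not part of the original file: how the two helpers
 * above are typically combined before detaching a device.  The count already
 * excludes an in-flight replace target and the "- 1" accounts for the device
 * about to be removed.  The name example_can_remove_device() is made up.
 */
#if 0
static int example_can_remove_device(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = btrfs_num_devices(fs_info);

	return btrfs_check_raid_min_devices(fs_info, num_devices - 1);
}
#endif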

void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);
		/* write_one_page() unlocks the page */
		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				"error clearing superblock number %d (%d)",
				copy_num, ret);
		btrfs_release_disk_super(disk_super);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

2065
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2066
		    u64 devid)
2067 2068
{
	struct btrfs_device *device;
2069
	struct btrfs_fs_devices *cur_devices;
2070
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
2072 2073 2074 2075
	int ret = 0;

	mutex_lock(&uuid_mutex);

2076
	num_devices = btrfs_num_devices(fs_info);
2077

2078
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2079
	if (ret)
2080 2081
		goto out;

2082 2083 2084 2085 2086 2087 2088 2089
	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
2091
	}
2092

2093 2094 2095 2096 2097 2098 2099 2100
	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

2101
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2102
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2103
		goto out;
2104 2105
	}

2106 2107
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
2108
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2109
		goto out;
	}

2112
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2113
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
2115
		device->fs_devices->rw_devices--;
2116
		mutex_unlock(&fs_info->chunk_mutex);
2117
	}
2118

2119
	mutex_unlock(&uuid_mutex);
2120
	ret = btrfs_shrink_device(device, 0);
2121 2122
	if (!ret)
		btrfs_reada_remove_dev(device);
2123
	mutex_lock(&uuid_mutex);
2124
	if (ret)
2125
		goto error_undo;
2126

2127 2128 2129 2130 2131
	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
2132
	ret = btrfs_rm_dev_item(device);
2133
	if (ret)
2134
		goto error_undo;
2135

2136
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2137
	btrfs_scrub_cancel_dev(device);
2138 2139 2140 2141

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all the device supers.
	 * Whoever is writing all supers should lock the device list mutex
	 * before getting the number of devices in the super block
	 * (super_copy). Conversely, whoever updates the number of devices in
	 * the super block (super_copy) should hold the device list mutex.
	 */
2148

	/*
	 * In the normal case cur_devices == fs_devices. But when deleting a
	 * seed device, cur_devices points to the seed's own fs_devices,
	 * anchored at fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
2155
	mutex_lock(&fs_devices->device_list_mutex);
2156
	list_del_rcu(&device->dev_list);
2157

2158 2159
	cur_devices->num_devices--;
	cur_devices->total_devices--;
2160 2161 2162
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

2164
	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2165
		cur_devices->missing_devices--;
2166

2167
	btrfs_assign_next_active_device(device, NULL);

2169
	if (device->bdev) {
2170
		cur_devices->open_devices--;
2171
		/* remove sysfs entry */
2172
		btrfs_sysfs_remove_device(device);
2173
	}
2174

2175 2176
	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2177
	mutex_unlock(&fs_devices->device_list_mutex);

2179 2180 2181 2182 2183
	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
2184
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2185 2186
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
2187 2188

	btrfs_close_bdev(device);
2189 2190
	synchronize_rcu();
	btrfs_free_device(device);
2191

2192
	if (cur_devices->open_devices == 0) {
2193
		list_del_init(&cur_devices->seed_list);
2194
		close_fs_devices(cur_devices);
2195
		free_fs_devices(cur_devices);
	}

2198 2199 2200
out:
	mutex_unlock(&uuid_mutex);
	return ret;
2201

2202
error_undo:
2203
	btrfs_reada_undo_remove_dev(device);
2204
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2205
		mutex_lock(&fs_info->chunk_mutex);
2206
		list_add(&device->dev_alloc_list,
2207
			 &fs_devices->alloc_list);
2208
		device->fs_devices->rw_devices++;
2209
		mutex_unlock(&fs_info->chunk_mutex);
2210
	}
2211
	goto out;
2212 2213
}

2214
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2215
{
2216 2217
	struct btrfs_fs_devices *fs_devices;

2218
	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2219

	/*
	 * For a filesystem without a seed, srcdev->fs_devices points to the
	 * fs_devices of fs_info. However, when the device being replaced is a
	 * seed device, it points to the seed's local fs_devices. In short,
	 * srcdev has the correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;
2227

2228
	list_del_rcu(&srcdev->dev_list);
2229
	list_del(&srcdev->dev_alloc_list);
2230
	fs_devices->num_devices--;
2231
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2232
		fs_devices->missing_devices--;
2233

2234
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2235
		fs_devices->rw_devices--;
2236

2237
	if (srcdev->bdev)
2238
		fs_devices->open_devices--;
2239 2240
}

2241
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2242 2243
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2244

2245 2246
	mutex_lock(&uuid_mutex);

2247
	btrfs_close_bdev(srcdev);
2248 2249
	synchronize_rcu();
	btrfs_free_device(srcdev);
2250 2251 2252

	/* If there are no devices left, delete the fs_devices altogether. */
	if (!fs_devices->num_devices) {
2253 2254 2255 2256 2257 2258 2259 2260
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

2261
		list_del_init(&fs_devices->seed_list);
2262
		close_fs_devices(fs_devices);
2263
		free_fs_devices(fs_devices);
2264
	}
2265
	mutex_unlock(&uuid_mutex);
2266 2267
}

2268
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2269
{
2270
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2271 2272

	mutex_lock(&fs_devices->device_list_mutex);
2273

2274
	btrfs_sysfs_remove_device(tgtdev);
2275

2276
	if (tgtdev->bdev)
2277
		fs_devices->open_devices--;
2278

2279
	fs_devices->num_devices--;
2280

2281
	btrfs_assign_next_active_device(tgtdev, NULL);
2282 2283 2284

	list_del_rcu(&tgtdev->dev_list);

2285
	mutex_unlock(&fs_devices->device_list_mutex);
2286 2287 2288 2289 2290 2291 2292 2293

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname(), which tries to take the
	 * device_list_mutex. At this point the device is already off the
	 * device list, so we don't have to hold device_list_mutex here.
	 */
2294 2295
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);
2296 2297

	btrfs_close_bdev(tgtdev);
2298 2299
	synchronize_rcu();
	btrfs_free_device(tgtdev);
2300 2301
}

2302 2303
static struct btrfs_device *btrfs_find_device_by_path(
		struct btrfs_fs_info *fs_info, const char *device_path)
2304 2305 2306 2307 2308 2309
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
2310
	struct btrfs_device *device;
2311 2312

	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2313
				    fs_info->bdev_holder, 0, &bdev, &disk_super);
2314
	if (ret)
2315
		return ERR_PTR(ret);
2316

2317 2318
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
2319
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2320
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2321
					   disk_super->metadata_uuid);
2322
	else
2323
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2324
					   disk_super->fsid);
2325

2326
	btrfs_release_disk_super(disk_super);
2327 2328
	if (!device)
		device = ERR_PTR(-ENOENT);
2329
	blkdev_put(bdev, FMODE_READ);
2330
	return device;
2331 2332
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	struct btrfs_device *device;

	if (devid) {
		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
					   NULL);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	if (!device_path || !device_path[0])
		return ERR_PTR(-EINVAL);

	if (strcmp(device_path, "missing") == 0) {
		/* Find first missing device */
		list_for_each_entry(device, &fs_info->fs_devices->devices,
				    dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				     &device->dev_state) && !device->bdev)
				return device;
		}
		return ERR_PTR(-ENOENT);
	}

	return btrfs_find_device_by_path(fs_info, device_path);
}
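
/*
 * Illustrative sketch, not part of the original file: the three ways a device
 * spec is resolved by btrfs_find_device_by_devspec().  A non-zero devid wins,
 * the literal path "missing" selects the first missing device, and any other
 * string is treated as a block device path.  The function name and the devid
 * and path values below are made up for illustration.
 */
#if 0
static int example_lookup_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	/* By device id */
	device = btrfs_find_device_by_devspec(fs_info, 1, NULL);
	if (IS_ERR(device))
		return PTR_ERR(device);

	/* First device known to the metadata but without a block device */
	device = btrfs_find_device_by_devspec(fs_info, 0, "missing");
	if (IS_ERR(device))
		return PTR_ERR(device);

	/* By path */
	device = btrfs_find_device_by_devspec(fs_info, 0, "/dev/sdb");
	return IS_ERR(device) ? PTR_ERR(device) : 0;
}
#endif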

/*
 * Do all the dirty work required for changing the file system's UUID.
 */
2370
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
Y
Yan Zheng 已提交
2371
{
2372
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
Y
Yan Zheng 已提交
2373
	struct btrfs_fs_devices *old_devices;
Y
Yan Zheng 已提交
2374
	struct btrfs_fs_devices *seed_devices;
2375
	struct btrfs_super_block *disk_super = fs_info->super_copy;
Y
Yan Zheng 已提交
2376 2377 2378
	struct btrfs_device *device;
	u64 super_flags;

2379
	lockdep_assert_held(&uuid_mutex);
Y
Yan Zheng 已提交
2380
	if (!fs_devices->seeding)
Y
Yan Zheng 已提交
2381 2382
		return -EINVAL;

2383 2384 2385 2386
	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
2387
	seed_devices = alloc_fs_devices(NULL, NULL);
2388 2389
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);
Y
Yan Zheng 已提交
2390

2391 2392 2393 2394 2395 2396
	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
Y
Yan Zheng 已提交
2397 2398 2399 2400
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
Y
Yan Zheng 已提交
2401
	}
Y
Yan Zheng 已提交
2402

2403
	list_add(&old_devices->fs_list, &fs_uuids);
Y
Yan Zheng 已提交
2404

Y
Yan Zheng 已提交
2405 2406 2407 2408
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
2409
	mutex_init(&seed_devices->device_list_mutex);
2410

2411
	mutex_lock(&fs_devices->device_list_mutex);
2412 2413
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
M
Miao Xie 已提交
2414 2415
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;
2416

2417
	fs_devices->seeding = false;
Y
Yan Zheng 已提交
2418 2419
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
2420
	fs_devices->missing_devices = 0;
2421
	fs_devices->rotating = false;
2422
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);
Y
Yan Zheng 已提交
2423 2424

	generate_random_uuid(fs_devices->fsid);
2425
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
Y
Yan Zheng 已提交
2426
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2427
	mutex_unlock(&fs_devices->device_list_mutex);
2428

Y
Yan Zheng 已提交
2429 2430 2431 2432 2433 2434 2435 2436
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
2437
 * Store the expected generation for seed devices in device items.
Y
Yan Zheng 已提交
2438
 */
2439
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
Y
Yan Zheng 已提交
2440
{
2441
	struct btrfs_fs_info *fs_info = trans->fs_info;
2442
	struct btrfs_root *root = fs_info->chunk_root;
Y
Yan Zheng 已提交
2443 2444 2445 2446 2447
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
2448
	u8 fs_uuid[BTRFS_FSID_SIZE];
Y
Yan Zheng 已提交
2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2476
			btrfs_release_path(path);
Y
Yan Zheng 已提交
2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
2488
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
Y
Yan Zheng 已提交
2489
				   BTRFS_UUID_SIZE);
2490
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2491
				   BTRFS_FSID_SIZE);
2492
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2493
					   fs_uuid);
2494
		BUG_ON(!device); /* Logic error */
Y
Yan Zheng 已提交
2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

2511
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2512
{
2513
	struct btrfs_root *root = fs_info->dev_root;
2514
	struct request_queue *q;
2515 2516 2517
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
2518
	struct super_block *sb = fs_info->sb;
2519
	struct rcu_string *name;
2520
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2521 2522
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
Y
Yan Zheng 已提交
2523
	int seeding_dev = 0;
2524
	int ret = 0;
2525
	bool locked = false;
2526

2527
	if (sb_rdonly(sb) && !fs_devices->seeding)
2528
		return -EROFS;
2529

2530
	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2531
				  fs_info->bdev_holder);
2532 2533
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
2534

N
Naohiro Aota 已提交
2535 2536 2537 2538 2539
	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

2540
	if (fs_devices->seeding) {
Y
Yan Zheng 已提交
2541 2542 2543
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
2544
		locked = true;
Y
Yan Zheng 已提交
2545 2546
	}

2547
	sync_blockdev(bdev);
2548

2549 2550
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2551 2552
		if (device->bdev == bdev) {
			ret = -EEXIST;
2553
			rcu_read_unlock();
Y
Yan Zheng 已提交
2554
			goto error;
2555 2556
		}
	}
2557
	rcu_read_unlock();
2558

2559
	device = btrfs_alloc_device(fs_info, NULL, NULL);
2560
	if (IS_ERR(device)) {
2561
		/* we can safely leave the fs_devices entry around */
2562
		ret = PTR_ERR(device);
Y
Yan Zheng 已提交
2563
		goto error;
2564 2565
	}

2566
	name = rcu_string_strdup(device_path, GFP_KERNEL);
2567
	if (!name) {
Y
Yan Zheng 已提交
2568
		ret = -ENOMEM;
2569
		goto error_free_device;
2570
	}
2571
	rcu_assign_pointer(device->name, name);
Y
Yan Zheng 已提交
2572

2573 2574 2575 2576 2577 2578 2579
	device->fs_info = fs_info;
	device->bdev = bdev;

	ret = btrfs_get_dev_zone_info(device);
	if (ret)
		goto error_free_device;

2580
	trans = btrfs_start_transaction(root, 0);
2581 2582
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
2583
		goto error_free_zone;
2584 2585
	}

2586
	q = bdev_get_queue(bdev);
2587
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
Y
Yan Zheng 已提交
2588
	device->generation = trans->transid;
2589 2590 2591
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
2592 2593
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
2594
	device->disk_total_bytes = device->total_bytes;
2595
	device->commit_total_bytes = device->total_bytes;
2596
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2597
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2598
	device->mode = FMODE_EXCL;
2599
	device->dev_stats_valid = 1;
2600
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2601

Y
Yan Zheng 已提交
2602
	if (seeding_dev) {
2603
		btrfs_clear_sb_rdonly(sb);
2604
		ret = btrfs_prepare_sprout(fs_info);
2605 2606 2607 2608
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
Y
Yan Zheng 已提交
2609
	}
2610

2611
	device->fs_devices = fs_devices;
2612

2613
	mutex_lock(&fs_devices->device_list_mutex);
2614
	mutex_lock(&fs_info->chunk_mutex);
2615 2616 2617 2618 2619 2620 2621
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;
2622

2623
	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2624

2625
	if (!blk_queue_nonrot(q))
2626
		fs_devices->rotating = true;
C
Chris Mason 已提交
2627

2628
	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2629
	btrfs_set_super_total_bytes(fs_info->super_copy,
2630 2631
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));
2632

2633 2634 2635
	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);
2636

M
Miao Xie 已提交
2637 2638 2639 2640
	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
2641
	btrfs_clear_space_info_full(fs_info);
M
Miao Xie 已提交
2642

2643
	mutex_unlock(&fs_info->chunk_mutex);
2644 2645

	/* Add sysfs device entry */
2646
	btrfs_sysfs_add_device(device);
2647

2648
	mutex_unlock(&fs_devices->device_list_mutex);
2649

Y
Yan Zheng 已提交
2650
	if (seeding_dev) {
2651
		mutex_lock(&fs_info->chunk_mutex);
2652
		ret = init_first_rw_device(trans);
2653
		mutex_unlock(&fs_info->chunk_mutex);
2654
		if (ret) {
2655
			btrfs_abort_transaction(trans, ret);
2656
			goto error_sysfs;
2657
		}
M
Miao Xie 已提交
2658 2659
	}

2660
	ret = btrfs_add_dev_item(trans, device);
M
Miao Xie 已提交
2661
	if (ret) {
2662
		btrfs_abort_transaction(trans, ret);
2663
		goto error_sysfs;
M
Miao Xie 已提交
2664 2665 2666
	}

	if (seeding_dev) {
2667
		ret = btrfs_finish_sprout(trans);
2668
		if (ret) {
2669
			btrfs_abort_transaction(trans, ret);
2670
			goto error_sysfs;
2671
		}
2672

2673 2674 2675 2676 2677
		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_prepare_sprout
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
Y
Yan Zheng 已提交
2678 2679
	}

2680
	ret = btrfs_commit_transaction(trans);
2681

Y
Yan Zheng 已提交
2682 2683 2684
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
2685
		locked = false;
2686

2687 2688 2689
		if (ret) /* transaction commit */
			return ret;

2690
		ret = btrfs_relocate_sys_chunks(fs_info);
2691
		if (ret < 0)
2692
			btrfs_handle_fs_error(fs_info, ret,
J
Jeff Mahoney 已提交
2693
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2694 2695 2696 2697
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
2698 2699 2700
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
2701
		}
2702
		ret = btrfs_commit_transaction(trans);
Y
Yan Zheng 已提交
2703
	}
2704

2705 2706 2707 2708 2709 2710 2711 2712 2713 2714
	/*
	 * Now that we have written a new super block to this device, check
	 * all other fs_devices lists to see whether device_path alienates any
	 * other scanned device.  We can ignore the return value as it
	 * typically returns -EINVAL and only succeeds if the device was an
	 * alien.
	 */
	btrfs_forget_devices(device_path);

	/* Update ctime/mtime for blkid or udev */
2715
	update_dev_time(device_path);
2716

Y
Yan Zheng 已提交
2717
	return ret;
2718

2719
error_sysfs:
2720
	btrfs_sysfs_remove_device(device);
2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2737
error_trans:
2738
	if (seeding_dev)
2739
		btrfs_set_sb_rdonly(sb);
2740 2741
	if (trans)
		btrfs_end_transaction(trans);
2742 2743
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
2744
error_free_device:
2745
	btrfs_free_device(device);
Y
Yan Zheng 已提交
2746
error:
2747
	blkdev_put(bdev, FMODE_EXCL);
2748
	if (locked) {
Y
Yan Zheng 已提交
2749 2750 2751
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
2752
	return ret;
2753 2754
}

C
Chris Mason 已提交
2755 2756
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
2757 2758 2759
{
	int ret;
	struct btrfs_path *path;
2760
	struct btrfs_root *root = device->fs_info->chunk_root;
2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2790 2791 2792 2793
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
2794 2795 2796 2797 2798 2799 2800
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
2802 2803
		      struct btrfs_device *device, u64 new_size)
{
2804 2805
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
M
Miao Xie 已提交
2806 2807
	u64 old_total;
	u64 diff;
2808

2809
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
Y
Yan Zheng 已提交
2810
		return -EACCES;
M
Miao Xie 已提交
2811

2812 2813
	new_size = round_down(new_size, fs_info->sectorsize);

2814
	mutex_lock(&fs_info->chunk_mutex);
M
Miao Xie 已提交
2815
	old_total = btrfs_super_total_bytes(super_copy);
2816
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
M
Miao Xie 已提交
2817

2818
	if (new_size <= device->total_bytes ||
2819
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2820
		mutex_unlock(&fs_info->chunk_mutex);
Y
Yan Zheng 已提交
2821
		return -EINVAL;
M
Miao Xie 已提交
2822
	}
Y
Yan Zheng 已提交
2823

2824 2825
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
Y
Yan Zheng 已提交
2826 2827
	device->fs_devices->total_rw_bytes += diff;

2828 2829
	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
2830
	btrfs_clear_space_info_full(device->fs_info);
2831 2832 2833
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
2834
	mutex_unlock(&fs_info->chunk_mutex);
2835

2836 2837 2838
	return btrfs_update_device(trans, device);
}
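
/*
 * Worked example (added for clarity, not part of the original): with a 4KiB
 * sector size, growing a device to new_size = 10 GiB + 1000 bytes first
 * rounds new_size down to a sector boundary; diff = round_down(new_size -
 * device->total_bytes, sectorsize) is then what gets added both to
 * fs_devices->total_rw_bytes and to the superblock's total_bytes.
 */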

2839
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2840
{
2841
	struct btrfs_fs_info *fs_info = trans->fs_info;
2842
	struct btrfs_root *root = fs_info->chunk_root;
2843 2844 2845 2846 2847 2848 2849 2850
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

2851
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2852 2853 2854 2855
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2856 2857 2858
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
2859 2860
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
2861 2862 2863
		ret = -ENOENT;
		goto out;
	}
2864 2865

	ret = btrfs_del_item(trans, root, path);
2866
	if (ret < 0)
2867 2868
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
2869
out:
2870
	btrfs_free_path(path);
2871
	return ret;
2872 2873
}

2874
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2875
{
2876
	struct btrfs_super_block *super_copy = fs_info->super_copy;
2877 2878 2879 2880 2881 2882 2883 2884 2885 2886
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

2887
	lockdep_assert_held(&fs_info->chunk_mutex);
2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
2907
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}
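
/*
 * Added note, not in the original: layout of super_copy->sys_chunk_array that
 * the loop above walks.  Each entry is a struct btrfs_disk_key immediately
 * followed by a struct btrfs_chunk whose size depends on its stripe count:
 *
 *   [disk_key][chunk + stripes] [disk_key][chunk + stripes] ...
 *
 * so the cursor advances by sizeof(disk_key) +
 * btrfs_chunk_item_size(num_stripes) per entry, and deleting an entry
 * memmove()s the remaining tail of the array down over it.
 */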

/*
 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @logical: Logical block offset in bytes.
 * @length: Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR.
 */
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
				       u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}
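
/*
 * Illustrative sketch, not part of the original file: a minimal caller of
 * btrfs_get_chunk_map().  As noted above, the caller owns a reference on the
 * returned extent_map and must drop it with free_extent_map().  The name
 * example_chunk_num_stripes() is made up for illustration.
 */
#if 0
static int example_chunk_num_stripes(struct btrfs_fs_info *fs_info,
				     u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int num_stripes;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	num_stripes = map->num_stripes;
	free_extent_map(em);	/* drop the reference we were handed */

	return num_stripes;
}
#endif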

static int remove_chunk_item(struct btrfs_trans_handle *trans,
			     struct map_lookup *map, u64 chunk_offset)
{
	int i;

	/*
	 * Removing chunk items and updating the device items in the chunks btree
	 * requires holding the chunk_mutex.
	 * See the comment at btrfs_chunk_alloc() for the details.
	 */
	lockdep_assert_held(&trans->fs_info->chunk_mutex);

	for (i = 0; i < map->num_stripes; i++) {
		int ret;

		ret = btrfs_update_device(trans, map->stripes[i].dev);
		if (ret)
			return ret;
	}

	return btrfs_free_chunk(trans, chunk_offset);
}

int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2980
{
2981
	struct btrfs_fs_info *fs_info = trans->fs_info;
2982 2983
	struct extent_map *em;
	struct map_lookup *map;
M
Miao Xie 已提交
2984
	u64 dev_extent_len = 0;
2985
	int i, ret = 0;
2986
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2987

2988
	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2989
	if (IS_ERR(em)) {
2990 2991
		/*
		 * This is a logic error, but we don't want to just rely on the
2992
		 * user having built with ASSERT enabled, so if ASSERT doesn't
2993 2994 2995
		 * do anything we still error out.
		 */
		ASSERT(0);
2996
		return PTR_ERR(em);
2997
	}
2998
	map = em->map_lookup;
2999

3000
	/*
3001 3002 3003 3004 3005 3006 3007 3008
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
3009 3010
	 */
	mutex_lock(&fs_devices->device_list_mutex);
3011
	for (i = 0; i < map->num_stripes; i++) {
3012
		struct btrfs_device *device = map->stripes[i].dev;
M
Miao Xie 已提交
3013 3014 3015
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
3016
		if (ret) {
3017
			mutex_unlock(&fs_devices->device_list_mutex);
3018
			btrfs_abort_transaction(trans, ret);
3019 3020
			goto out;
		}
3021

M
Miao Xie 已提交
3022
		if (device->bytes_used > 0) {
3023
			mutex_lock(&fs_info->chunk_mutex);
M
Miao Xie 已提交
3024 3025
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
3026
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3027
			btrfs_clear_space_info_full(fs_info);
3028
			mutex_unlock(&fs_info->chunk_mutex);
M
Miao Xie 已提交
3029
		}
3030 3031
	}
	mutex_unlock(&fs_devices->device_list_mutex);
3032

3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085
	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 *    reserve system space, do all chunk btree updates and deletions, and
	 *    update the system chunk array in the superblock while holding this
	 *    mutex. This is for similar reasons as explained on the comment at
	 *    the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 *    the device item, which does not exists on the chunk btree.
	 *    The finishing phase of device replace acquires both the
	 *    device_list_mutex and the chunk_mutex, in that order, so we are
	 *    safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_space() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3086 3087 3088
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
3089
		}
3090

3091 3092 3093 3094 3095 3096
		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
3097
		btrfs_abort_transaction(trans, ret);
3098 3099
		goto out;
	}
3100

3101
	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3102

3103
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3104
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3105
		if (ret) {
3106
			btrfs_abort_transaction(trans, ret);
3107 3108
			goto out;
		}
3109 3110
	}

3111 3112 3113 3114 3115 3116 3117 3118 3119
	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

3120
	ret = btrfs_remove_block_group(trans, chunk_offset, em);
3121
	if (ret) {
3122
		btrfs_abort_transaction(trans, ret);
3123 3124
		goto out;
	}
Y
Yan Zheng 已提交
3125

3126
out:
3127 3128 3129 3130
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
Y
Yan Zheng 已提交
3131 3132
	/* once for us */
	free_extent_map(em);
3133 3134
	return ret;
}
Y
Yan Zheng 已提交
3135

3136
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3137
{
3138
	struct btrfs_root *root = fs_info->chunk_root;
3139
	struct btrfs_trans_handle *trans;
3140
	struct btrfs_block_group *block_group;
3141
	u64 length;
3142
	int ret;
Y
Yan Zheng 已提交
3143

3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155
	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
3156
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3157

3158
	/* step one, relocate all the extents inside this chunk */
3159
	btrfs_scrub_pause(fs_info);
3160
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3161
	btrfs_scrub_continue(fs_info);
3162 3163 3164
	if (ret)
		return ret;

3165 3166 3167 3168
	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3169
	length = block_group->length;
3170 3171
	btrfs_put_block_group(block_group);

3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185
	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				"failed to reset zone %llu after relocation",
				chunk_offset);
	}

3186 3187 3188 3189 3190 3191 3192 3193
	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

3194
	/*
3195 3196
	 * step two, delete the device extents and the
	 * chunk tree entries
3197
	 */
3198
	ret = btrfs_remove_chunk(trans, chunk_offset);
3199
	btrfs_end_transaction(trans);
3200
	return ret;
Y
Yan Zheng 已提交
3201 3202
}

3203
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
Y
Yan Zheng 已提交
3204
{
3205
	struct btrfs_root *chunk_root = fs_info->chunk_root;
Y
Yan Zheng 已提交
3206 3207 3208 3209 3210 3211
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
3212 3213
	bool retried = false;
	int failed = 0;
Y
Yan Zheng 已提交
3214 3215 3216 3217 3218 3219
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

3220
again:
Y
Yan Zheng 已提交
3221 3222 3223 3224 3225
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
3226
		mutex_lock(&fs_info->reclaim_bgs_lock);
Y
Yan Zheng 已提交
3227
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3228
		if (ret < 0) {
3229
			mutex_unlock(&fs_info->reclaim_bgs_lock);
Y
Yan Zheng 已提交
3230
			goto error;
3231
		}
3232
		BUG_ON(ret == 0); /* Corruption */
Y
Yan Zheng 已提交
3233 3234 3235

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
3236
		if (ret)
3237
			mutex_unlock(&fs_info->reclaim_bgs_lock);
Y
Yan Zheng 已提交
3238 3239 3240 3241
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;
Z
Zheng Yan 已提交
3242

Y
Yan Zheng 已提交
3243 3244
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
Z
Zheng Yan 已提交
3245

Y
Yan Zheng 已提交
3246 3247 3248
		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
3249
		btrfs_release_path(path);
3250

Y
Yan Zheng 已提交
3251
		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3252
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3253 3254
			if (ret == -ENOSPC)
				failed++;
H
HIMANGI SARAOGI 已提交
3255 3256
			else
				BUG_ON(ret);
Y
Yan Zheng 已提交
3257
		}
3258
		mutex_unlock(&fs_info->reclaim_bgs_lock);
3259

Y
Yan Zheng 已提交
3260 3261 3262 3263 3264
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
3265 3266 3267 3268
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
3269
	} else if (WARN_ON(failed && retried)) {
3270 3271
		ret = -ENOSPC;
	}
Y
Yan Zheng 已提交
3272 3273 3274
error:
	btrfs_free_path(path);
	return ret;
3275 3276
}

3277 3278 3279 3280 3281 3282 3283 3284
/*
 * Return 1 : a data chunk was allocated successfully,
 * return <0: error while allocating a data chunk,
 * return 0 : no data chunk needed to be allocated.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
3285
	struct btrfs_block_group *cache;
3286 3287 3288 3289 3290 3291 3292 3293
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313
	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	spin_lock(&fs_info->data_sinfo->lock);
	bytes_used = fs_info->data_sinfo->bytes_used;
	spin_unlock(&fs_info->data_sinfo->lock);

	if (!bytes_used) {
		struct btrfs_trans_handle *trans;
		int ret;

		trans =	btrfs_join_transaction(fs_info->tree_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
		btrfs_end_transaction(trans);
		if (ret < 0)
			return ret;
		return 1;
3314
	}
3315

3316 3317 3318
	return 0;
}
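
/*
 * Illustrative sketch, not part of the original file: handling the three-way
 * return convention documented above (1 = a data chunk was allocated, 0 =
 * nothing to do, <0 = error).  A caller would typically invoke it before
 * relocating a data chunk; the function name is made up for illustration.
 */
#if 0
/* Assumes fs_info->reclaim_bgs_lock is held, as btrfs_relocate_chunk() requires. */
static int example_prepare_relocation(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	int ret;

	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
	if (ret < 0)
		return ret;	/* failed to allocate a data chunk */

	/* ret == 1 means a fresh data chunk was allocated, 0 means none was needed */
	return btrfs_relocate_chunk(fs_info, chunk_offset);
}
#endif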

3319
static int insert_balance_item(struct btrfs_fs_info *fs_info,
3320 3321
			       struct btrfs_balance_control *bctl)
{
3322
	struct btrfs_root *root = fs_info->tree_root;
3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3342
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3343 3344 3345 3346 3347 3348 3349 3350 3351 3352
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

3353
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
3367
	err = btrfs_commit_transaction(trans);
3368 3369 3370 3371 3372
	if (err && !ret)
		ret = err;
	return ret;
}

3373
static int del_balance_item(struct btrfs_fs_info *fs_info)
3374
{
3375
	struct btrfs_root *root = fs_info->tree_root;
3376 3377 3378 3379 3380 3381 3382 3383 3384
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

3385
	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3386 3387 3388 3389 3390 3391
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3392
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
3406
	err = btrfs_commit_transaction(trans);
3407 3408 3409 3410 3411
	if (err && !ret)
		ret = err;
	return ret;
}

I
Ilya Dryomov 已提交
3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435
/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3436
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
I
Ilya Dryomov 已提交
3437 3438 3439 3440 3441
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3442
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
I
Ilya Dryomov 已提交
3443 3444 3445 3446 3447
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->length,
						  bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->length;
	else
		user_thresh_max = div_factor_fine(cache->length,
						  bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->length;
	else
		user_thresh = div_factor_fine(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

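/*
 * Number of stripes that carry distinct data for the given chunk type:
 * total stripes minus parity stripes, divided by the number of copies.
 */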
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int index = btrfs_bg_flags_to_raid_index(type);
	const int ncopies = btrfs_raid_array[index].ncopies;
	const int nparity = btrfs_raid_array[index].nparity;

	return (num_stripes - nparity) / ncopies;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	u64 type;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	type = btrfs_chunk_type(leaf, chunk);
	factor = calc_data_stripes(type, num_stripes);

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/* The single value limit and min/max limits share the same bytes (union) */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
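	/*
	 * The chunk tree is walked twice: the first (counting) pass only
	 * collects per-type statistics about matching chunks, the second
	 * pass does the actual relocation.
	 */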
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the btrfs_balance_args (they are a union).
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so let's allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	return has_single_bit_set(flags);
}

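/*
 * Return non-zero when the balance state should be torn down: either a
 * cancel was requested or balance finished normally (no pause pending).
 */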
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

/*
 * Validate target profile against allowed profiles and return true if it's OK.
 * Otherwise print the error message and return false.
 */
static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
		const struct btrfs_balance_args *bargs,
		u64 allowed, const char *type)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return true;

	if (fs_info->sectorsize < PAGE_SIZE &&
		bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		btrfs_err(fs_info,
		"RAID56 is not yet supported for sectorsize %u with page size %lu",
			  fs_info->sectorsize, PAGE_SIZE);
		return false;
	}
	/* Profile is valid and does not have bits outside of the allowed set */
	if (alloc_profile_is_valid(bargs->target, 1) &&
	    (bargs->target & ~allowed) == 0)
		return true;

	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
			type, btrfs_bg_type_to_raid_name(bargs->target));
	return false;
}

/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				 u32 size_buf)
{
	int ret;
	u32 size_bp = size_buf;
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}

static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

/*
 * Should be called with balance mutex held
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	int i;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
4179
	  "balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
4187
	 * are exclusive
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish.  Thus we need
	 * to set it manually, or balance would refuse the profile.
	 */
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
		if (num_devices >= btrfs_raid_array[i].devs_min)
			allowed |= btrfs_raid_array[i].bg_flag;
4200

	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Allow to reduce metadata or system integrity only if force set for
	 * profiles with redundancy (copies, parity)
	 */
	allowed = 0;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
		if (btrfs_raid_array[i].ncopies >= 2 ||
		    btrfs_raid_array[i].tolerated_failures >= 1)
			allowed |= btrfs_raid_array[i].bg_flag;
	}
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed)))
			reducing_redundancy = true;
		else
			reducing_redundancy = false;

		/* if we're not converting, the target field is uninitialized */
		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->data.target : fs_info->avail_data_alloc_bits;
4236
	} while (read_seqretry(&fs_info->profiles_lock, seq));
4237

4238
	if (reducing_redundancy) {
		if (bctl->flags & BTRFS_BALANCE_FORCE) {
			btrfs_info(fs_info,
4241
			   "balance: force reducing metadata redundancy");
		} else {
			btrfs_err(fs_info,
4244
	"balance: reduces metadata redundancy, use --force if you want this");
			ret = -EINVAL;
			goto out;
		}
	}

	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4252
		btrfs_warn(fs_info,
4253
	"balance: metadata profile %s has lower redundancy than data profile %s",
				btrfs_bg_type_to_raid_name(meta_target),
				btrfs_bg_type_to_raid_name(data_target));
	}

4258
	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}
4274

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4277
	describe_balance_start_or_resume(fs_info);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
		btrfs_info(fs_info, "balance: paused");
	/*
	 * Balance can be canceled by:
	 *
	 * - Regular cancel request
	 *   Then ret == -ECANCELED and balance_cancel_req > 0
	 *
	 * - Fatal signal to "btrfs" process
	 *   Either the signal caught by wait_reserve_ticket() and callers
	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
	 *   got -ECANCELED.
	 *   Either way, in this case balance_cancel_req = 0, and
	 *   ret == -EINTR or ret == -ECANCELED.
	 *
	 * So here we only check the return value to catch canceled balance.
	 */
	else if (ret == -ECANCELED || ret == -EINTR)
		btrfs_info(fs_info, "balance: canceled");
	else
		btrfs_info(fs_info, "balance: ended with status: %d", ret);

4305
	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
4309
		btrfs_update_ioctl_balance_args(fs_info, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
4314
		reset_balance_state(fs_info);
4315
		btrfs_exclop_finish(fs_info);
	}

4318
	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
4323
		reset_balance_state(fs_info);
4324
	else
		kfree(bctl);
4326
	btrfs_exclop_finish(fs_info);
4327

	return ret;
}

static int balance_kthread(void *data)
{
4333
	struct btrfs_fs_info *fs_info = data;
4334
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
4337
	if (fs_info->balance_ctl)
4338
		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
	mutex_unlock(&fs_info->balance_mutex);
4340

	return ret;
}

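/*
 * Resume a paused balance in a background kthread, unless the skip_balance
 * mount option is set.
 */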
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

4348
	mutex_lock(&fs_info->balance_mutex);
4349
	if (!fs_info->balance_ctl) {
4350
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
4353
	mutex_unlock(&fs_info->balance_mutex);
4354

4355
	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4356
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

4369
	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4370
	return PTR_ERR_OR_ZERO(tsk);
}

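/*
 * Recover the balance state from the on-disk balance item at mount time and
 * set up fs_info::balance_ctl so the interrupted balance can be resumed or
 * cancelled later.
 */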
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
4388
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

4391
	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
4393
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
4428
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4429
		btrfs_warn(fs_info,
4430
	"balance: cannot set exclusive op status, resume manually");
4431

	btrfs_release_path(path);

4434
	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
4439
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

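/*
 * Pause a running balance and wait until the relocation loop has actually
 * stopped.  The balance item and fs_info::balance_ctl are kept so the
 * operation can be resumed.
 */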
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

4455
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
4460
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
4464
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

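/*
 * Cancel a running or paused balance.  A running balance deletes its own
 * item when it exits; for a paused one the state is reset here.
 */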
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
4497
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
4500
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

4510
		if (fs_info->balance_ctl) {
4511
			reset_balance_state(fs_info);
4512
			btrfs_exclop_finish(fs_info);
4513
			btrfs_info(fs_info, "balance: canceled");
4514
		}
	}

	BUG_ON(fs_info->balance_ctl ||
		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

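/*
 * Scan all root items and add missing subvolume uuid and received_uuid
 * entries to the uuid tree.
 */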
int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
4535
	struct btrfs_trans_handle *trans = NULL;
4536
	bool closing = false;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		if (btrfs_fs_closing(fs_info)) {
			closing = true;
			break;
		}
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
4599
		btrfs_release_path(path);
4600
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
4601
			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
4605
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4612
			ret = btrfs_uuid_tree_add(trans,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
4617
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

4623
skip:
4624
		btrfs_release_path(path);
		if (trans) {
4626
			ret = btrfs_end_transaction(trans);
4627
			trans = NULL;
			if (ret)
				break;
		}

		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
4649
	if (trans && !IS_ERR(trans))
4650
		btrfs_end_transaction(trans);
	if (ret)
4652
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4653
	else if (!closing)
4654
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

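/*
 * Create the uuid tree and start btrfs_uuid_scan_kthread to populate it.
 */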
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

4675
	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4676
	if (IS_ERR(uuid_root)) {
4677
		ret = PTR_ERR(uuid_root);
4678
		btrfs_abort_transaction(trans, ret);
4679
		btrfs_end_transaction(trans);
4680
		return ret;
	}

	fs_info->uuid_root = uuid_root;

4685
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
4692
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4693
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
4699
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
4721
	struct btrfs_super_block *super_copy = fs_info->super_copy;
4722
	u64 old_total = btrfs_super_total_bytes(super_copy);
4723
	u64 old_size = btrfs_device_get_total_bytes(device);
4724
	u64 diff;
4725
	u64 start;

	new_size = round_down(new_size, fs_info->sectorsize);
4728
	start = new_size;
4729
	diff = round_down(old_size - new_size, fs_info->sectorsize);
4730

4731
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

4738
	path->reada = READA_BACK;
4739

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

4746
	mutex_lock(&fs_info->chunk_mutex);
4747

4748
	btrfs_device_set_total_bytes(device, new_size);
4749
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
4751
		atomic64_sub(diff, &fs_info->free_chunk_space);
4752
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
4759
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}
4768

4769
again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

4774
	do {
4775
		mutex_lock(&fs_info->reclaim_bgs_lock);
4776
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4777
		if (ret < 0) {
4778
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4779
			goto done;
4780
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
4784
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
4787
			ret = 0;
4788
			btrfs_release_path(path);
4789
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

4796
		if (key.objectid != device->devid) {
4797
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4798
			btrfs_release_path(path);
4799
			break;
4800
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

4805
		if (key.offset + length <= new_size) {
4806
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4807
			btrfs_release_path(path);
4808
			break;
4809
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4812
		btrfs_release_path(path);
4813

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so let's allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
4822
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

4826
		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4827
		mutex_unlock(&fs_info->reclaim_bgs_lock);
4828
		if (ret == -ENOSPC) {
4829
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
4838
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

4849
	/* Shrinking succeeded, else we would be at "done". */
4850
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

4856
	mutex_lock(&fs_info->chunk_mutex);
4857 4858 4859 4860
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

4861
	btrfs_device_set_disk_total_bytes(device, new_size);
4862 4863 4864
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
4865 4866

	WARN_ON(diff > old_total);
4867 4868
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
4869
	mutex_unlock(&fs_info->chunk_mutex);
M
Miao Xie 已提交
4870 4871 4872

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
4873 4874 4875 4876 4877 4878
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
4879 4880
done:
	btrfs_free_path(path);
4881
	if (ret) {
4882
		mutex_lock(&fs_info->chunk_mutex);
4883
		btrfs_device_set_total_bytes(device, old_size);
4884
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4885
			device->fs_devices->total_rw_bytes += diff;
4886
		atomic64_add(diff, &fs_info->free_chunk_space);
4887
		mutex_unlock(&fs_info->chunk_mutex);
4888
	}
4889 4890 4891
	return ret;
}

4892
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4893 4894 4895
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
4896
	struct btrfs_super_block *super_copy = fs_info->super_copy;
4897 4898 4899 4900
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

4901 4902
	lockdep_assert_held(&fs_info->chunk_mutex);

4903
	array_size = btrfs_super_sys_array_size(super_copy);
4904
	if (array_size + item_size + sizeof(disk_key)
4905
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4906 4907 4908 4909 4910 4911 4912 4913 4914
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4915

4916 4917 4918
	return 0;
}

4919 4920 4921 4922
/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
4923
{
4924 4925
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;
4926

4927
	if (di_a->max_avail > di_b->max_avail)
4928
		return -1;
4929
	if (di_a->max_avail < di_b->max_avail)
4930
		return 1;
4931 4932 4933 4934 4935
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
4936
}
4937

D
David Woodhouse 已提交
4938 4939
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
4940
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
D
David Woodhouse 已提交
4941 4942
		return;

4943
	btrfs_set_fs_incompat(info, RAID56);
D
David Woodhouse 已提交
4944 4945
}

4946 4947 4948 4949 4950 4951 4952 4953
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
		return;

	btrfs_set_fs_incompat(info, RAID1C34);
}

N
Naohiro Aota 已提交
4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978
/*
 * Structure used internally for __btrfs_alloc_chunk() function.
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	u64 start;
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	u64 max_stripe_size;
	u64 max_chunk_size;
4979
	u64 dev_extent_min;
N
Naohiro Aota 已提交
4980 4981 4982 4983 4984
	u64 stripe_size;
	u64 chunk_size;
	int ndevs;
};

4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012
static void init_alloc_chunk_ctl_policy_regular(
				struct btrfs_fs_devices *fs_devices,
				struct alloc_chunk_ctl *ctl)
{
	u64 type = ctl->type;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_stripe_size = SZ_1G;
		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* For larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			ctl->max_stripe_size = SZ_1G;
		else
			ctl->max_stripe_size = SZ_256M;
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_stripe_size = SZ_32M;
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				  ctl->max_chunk_size);
5013
	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5014 5015
}

5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036
static void init_alloc_chunk_ctl_policy_zoned(
				      struct btrfs_fs_devices *fs_devices,
				      struct alloc_chunk_ctl *ctl)
{
	u64 zone_size = fs_devices->fs_info->zone_size;
	u64 limit;
	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
	u64 min_chunk_size = min_data_stripes * zone_size;
	u64 type = ctl->type;

	ctl->max_stripe_size = zone_size;
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
						 zone_size);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
5037 5038
	} else {
		BUG();
5039 5040 5041 5042 5043 5044 5045 5046 5047 5048
	}

	/* We don't want a chunk larger than 10% of writable space */
	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
			       zone_size),
		    min_chunk_size);
	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}

5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
				 struct alloc_chunk_ctl *ctl)
{
	int index = btrfs_bg_flags_to_raid_index(ctl->type);

	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
	ctl->devs_max = btrfs_raid_array[index].devs_max;
	if (!ctl->devs_max)
		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
	ctl->devs_min = btrfs_raid_array[index].devs_min;
	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
	ctl->ncopies = btrfs_raid_array[index].ncopies;
	ctl->nparity = btrfs_raid_array[index].nparity;
	ctl->ndevs = 0;

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
		break;
5069 5070 5071
	case BTRFS_CHUNK_ALLOC_ZONED:
		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
		break;
5072 5073 5074 5075 5076
	default:
		BUG();
	}
}

5077 5078 5079
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
5080
{
5081
	struct btrfs_fs_info *info = fs_devices->fs_info;
5082
	struct btrfs_device *device;
5083
	u64 total_avail;
5084
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5085
	int ret;
5086 5087 5088
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;
5089

5090
	/*
5091 5092
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
5093
	 */
5094
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5095
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
J
Julia Lawall 已提交
5096
			WARN(1, KERN_ERR
5097
			       "BTRFS: read-only device in alloc_list\n");
5098 5099
			continue;
		}
5100

5101 5102
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
5103
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5104
			continue;
5105

5106 5107 5108 5109
		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;
5110 5111

		/* If there is no space on this device, skip it. */
5112
		if (total_avail < ctl->dev_extent_min)
5113
			continue;
5114

5115 5116
		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
5117
		if (ret && ret != -ENOSPC)
5118
			return ret;
5119

5120
		if (ret == 0)
5121
			max_avail = dev_extent_want;
5122

5123
		if (max_avail < ctl->dev_extent_min) {
5124 5125
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
5126
			"%s: devid %llu has no free space, have=%llu want=%llu",
5127
					    __func__, device->devid, max_avail,
5128
					    ctl->dev_extent_min);
5129
			continue;
5130
		}
5131

5132 5133 5134 5135 5136
		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
5137 5138 5139 5140 5141 5142
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
5143
	ctl->ndevs = ndevs;
5144

5145 5146 5147
	/*
	 * now sort the devices by hole size / available space
	 */
5148
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5149
	     btrfs_cmp_device_info, NULL);
5150

5151 5152 5153
	return 0;
}

5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
				      struct btrfs_device_info *devices_info)
{
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}
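	/*
	 * Illustrative example (hypothetical values): with a 256M zone size,
	 * RAID0 (ncopies = 1, nparity = 0, dev_stripes = 1) over 8 devices
	 * and a 1G max_chunk_size, the initial 8 x 256M = 2G chunk is too
	 * large, so ndevs is reduced to 1G / 256M = 4 and the chunk becomes
	 * exactly 1G.
	 */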

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
	 */
	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
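	/*
	 * For example (illustrative only): a profile with devs_increment = 2
	 * rounds 5 usable devices down to 4, and one with devs_increment = 3
	 * rounds 5 devices down to 3.
	 */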

	if (ctl->ndevs < ctl->devs_min) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ctl->ndevs, ctl->devs_min);
		}
		return -ENOSPC;
	}

	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return decide_stripe_size_regular(ctl, devices_info);
	case BTRFS_CHUNK_ALLOC_ZONED:
		return decide_stripe_size_zoned(ctl, devices_info);
	default:
		BUG();
	}
}

static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct btrfs_block_group *block_group;
	struct extent_map *em;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;
	int i;
	int j;

	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);
	map->num_stripes = ctl->num_stripes;

	for (i = 0; i < ctl->ndevs; ++i) {
		for (j = 0; j < ctl->dev_stripes; ++j) {
			int s = i * ctl->dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = ctl->sub_stripes;

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = ctl->chunk_size;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = ctl->stripe_size;

	em_tree = &info->mapping_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
	if (IS_ERR(block_group))
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);
	check_raid1c34_incompat_flag(info, type);

	return block_group;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);

	return block_group;
}

struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
					    u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	block_group = create_chunk(trans, &ctl, devices_info);

out:
	kfree(devices_info);
	return block_group;
}

/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exist, or persisting a stripe of the chunk item with such an ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	bg->chunk_item_inserted = 1;

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 alloc_profile;
	struct btrfs_block_group *meta_bg;
	struct btrfs_block_group *sys_bg;

	/*
	 * When adding a new device for sprouting, the seed device is read-only
	 * so we must first allocate a metadata and a system chunk. But before
	 * adding the block group items to the extent, device and chunk btrees,
	 * we must first:
	 *
	 * 1) Create both chunks without doing any changes to the btrees, as
	 *    otherwise we would get -ENOSPC since the block groups from the
	 *    seed device are read-only;
	 *
	 * 2) Add the device item for the new sprout device - finishing the setup
	 *    of a new block group requires updating the device item in the chunk
	 *    btree, so it must exist when we attempt to do it. The previous step
	 *    ensures this does not fail with -ENOSPC.
	 *
	 * After that we can add the block group items to their btrees:
	 * update existing device item in the chunk btree, add a new block group
	 * item to the extent btree, add a new chunk item to the chunk btree and
	 * finally add the new device extent items to the devices btree.
	 */

	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
	if (IS_ERR(meta_bg))
		return PTR_ERR(meta_bg);

	alloc_profile = btrfs_system_alloc_profile(fs_info);
	sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
	if (IS_ERR(sys_bg))
		return PTR_ERR(sys_bg);

	return 0;
}

static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	const int index = btrfs_bg_flags_to_raid_index(map->type);

	return btrfs_raid_array[index].tolerated_failures;
}

int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int miss_ndevs = 0;
	int i;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			readonly = 1;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we cannot write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->lock);
		em = lookup_extent_mapping(tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(tree, em);
		write_unlock(&tree->lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	else
		ret = 1;
	free_extent_map(em);

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
		ret++;
	up_read(&fs_info->dev_replace.rwsem);

	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	unsigned long len = fs_info->sectorsize;

	em = btrfs_get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
	return len;
}

int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	switch (fs_info->fs_devices->read_policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info,
			      "unknown read_policy type %u, reset to pid",
			      fs_info->fs_devices->read_policy);
		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
	int i;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < num_stripes - 1; i++) {
			/* Swap if parity is on a smaller index */
			if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
				swap(bbio->stripes[i], bbio->stripes[i + 1]);
				swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
				again = 1;
			}
		}
	}
}

static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
		 /* the size of the btrfs_bio */
		sizeof(struct btrfs_bio) +
		/* plus the variable array for the stripes */
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
		/* plus the variable array for the tgt dev */
		sizeof(int) * (real_stripes) +
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
		GFP_NOFS|__GFP_NOFAIL);

	atomic_set(&bbio->error, 0);
	refcount_set(&bbio->refs, 1);

	bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
	bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);

	return bbio;
}

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
	if (refcount_dec_and_test(&bbio->refs))
		kfree(bbio);
}

/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Please note that discard won't be sent to the target device of a device
 * replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 *length_ret,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
5846
	u64 length = *length_ret;
5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always return a bbio */
	ASSERT(bbio_ret);

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	length = min_t(u64, em->start + em->len - logical, length);
	*length_ret = length;

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;
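	/*
	 * Worked example (hypothetical values): with stripe_len = 64K, a
	 * discard starting at offset 200K within the chunk gives
	 * stripe_nr = 3 and stripe_offset = 200K - 3 * 64K = 8K.
	 */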

	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
5920
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}
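	/*
	 * Sketch of the result (hypothetical values): for RAID0 over two
	 * devices with stripe_len = 64K, a 256K discard aligned to a stripe
	 * boundary is split into two bbio stripes of 128K each, one per
	 * device.
	 */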

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;
	bool ret;

	/* A non-zoned filesystem does not use the "to_copy" flag */
	if (!btrfs_is_zoned(fs_info))
		return false;

	cache = btrfs_lookup_block_group(fs_info, logical);

	spin_lock(&cache->lock);
	ret = cache->to_copy;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
	return ret;
}

static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      u64 logical,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * A block group which has "to_copy" set will eventually be
		 * copied by the dev-replace process. We can avoid cloning the IO here.
		 */
		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
			return;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

/*
 * Calculate the geometry of a particular (address, len) tuple. This
 * information is used to calculate how big a particular bio can get before it
 * straddles a stripe.
 *
 * @fs_info: the filesystem
 * @em:      mapping containing the logical extent
 * @op:      type of operation - write or read
 * @logical: address that we want to figure out the geometry of
 * @io_geom: pointer used to return values
 *
 * Returns < 0 in case a chunk for the given logical address cannot be found,
 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
 */
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
			  enum btrfs_map_op op, u64 logical,
			  struct btrfs_io_geometry *io_geom)
{
	struct map_lookup *map;
	u64 len;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u64 raid56_full_stripe_start = (u64)-1;
	int data_stripes;

	ASSERT(op != BTRFS_MAP_DISCARD);

	map = em->map_lookup;
	/* Offset of this logical address in the chunk */
	offset = logical - em->start;
	/* Len of a stripe in a chunk */
	stripe_len = map->stripe_len;
	/* Stripe where this block falls in */
	stripe_nr = div64_u64(offset, stripe_len);
	/* Offset of stripe in the chunk */
	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
			stripe_offset, offset, em->start, logical, stripe_len);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;
	data_stripes = nr_data_stripes(map);

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len = stripe_len - stripe_offset;

		/*
		 * In case of raid56, we need to know the stripe aligned start
		 */
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
			unsigned long full_stripe_len = stripe_len * data_stripes;
			raid56_full_stripe_start = offset;

			/*
			 * Allow a write of a full stripe, but make sure we
			 * don't allow straddling of stripes
			 */
			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
					full_stripe_len);
			raid56_full_stripe_start *= full_stripe_len;

			/*
			 * For writes to RAID[56], allow a full stripeset across
			 * all disks. For other RAID types and for RAID[56]
			 * reads, just allow a single stripe (on a single disk).
			 */
			if (op == BTRFS_MAP_WRITE) {
				max_len = stripe_len * data_stripes -
					  (offset - raid56_full_stripe_start);
			}
		}
		len = min_t(u64, em->len - offset, max_len);
	} else {
		len = em->len - offset;
	}
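	/*
	 * Worked example (hypothetical values): RAID5 over 3 devices has 2
	 * data stripes, so with stripe_len = 64K the full stripe is 128K.
	 * A write starting 100K into the chunk has
	 * raid56_full_stripe_start = 0 and may span at most
	 * 128K - 100K = 28K before it would straddle the next full stripe.
	 */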

	io_geom->len = len;
	io_geom->offset = offset;
	io_geom->stripe_len = stripe_len;
	io_geom->stripe_nr = stripe_nr;
	io_geom->stripe_offset = stripe_offset;
	io_geom->raid56_stripe_offset = raid56_full_stripe_start;

	return 0;
}

static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u32 stripe_index;
	int data_stripes;
	int i;
	int ret = 0;
	int num_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;
	struct btrfs_io_geometry geom;

	ASSERT(bbio_ret);
	ASSERT(op != BTRFS_MAP_DISCARD);

	em = btrfs_get_chunk_map(fs_info, logical, *length);
	ASSERT(!IS_ERR(em));

	ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
	if (ret < 0)
		return ret;

	map = em->map_lookup;

	*length = geom.len;
	stripe_len = geom.stripe_len;
	stripe_nr = geom.stripe_nr;
	stripe_offset = geom.stripe_offset;
	raid56_full_stripe_start = geom.raid56_stripe_offset;
	data_stripes = nr_data_stripes(map);

	down_read(&dev_replace->rwsem);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 */
	if (!dev_replace_is_ongoing)
		up_read(&dev_replace->rwsem);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		else
			patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
		if (need_full_stripe(op))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * data_stripes);

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					data_stripes, &stripe_index);
			if (mirror_num > 1)
				stripe_index = data_stripes + mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical = map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;
		stripe_index++;
	}

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * data_stripes;
		for (i = 0; i < data_stripes; i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;

		sort_parity_stripes(bbio, num_stripes);
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
					  &num_stripes, &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
		up_read(&dev_replace->rwsem);
	}
	free_extent_map(em);
	return ret;
}

int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     length, bbio_ret);

	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, 0);
}

/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}

static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
	bio_endio(bio);

	btrfs_put_bbio(bbio);
}

static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_bio *bbio = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bbio->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			struct btrfs_device *dev = btrfs_io_bio(bio)->device;

			ASSERT(dev->bdev);
			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
			else if (!(bio->bi_opf & REQ_RAHEAD))
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
			if (bio->bi_opf & REQ_PREFLUSH)
				btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bbio->fs_info);

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bbio(bbio, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	bio->bi_private = bbio;
	btrfs_io_bio(bio)->device = dev;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
	/*
	 * For zone append writing, bi_sector must point to the beginning of the
	 * zone
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		if (btrfs_dev_is_sequential(dev, physical)) {
			u64 zone_start = round_down(physical, fs_info->zone_size);

			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		} else {
			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
			bio->bi_opf |= REQ_OP_WRITE;
		}
	}
	btrfs_debug_in_rcu(fs_info,
	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
		dev->devid, bio->bi_iter.bi_size);
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bbio->orig_bio);

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bbio(bbio, bio);
	}
}

blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	bbio->fs_info = fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
		} else {
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
						   &dev->dev_state) ||
		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
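/*
 * Usage note (illustrative, not a new interface): passing a NULL uuid and
 * NULL fsid matches on devid alone, while supplying a uuid as well requires
 * both devid and uuid to match, as described above.
 */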
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
				       u64 devid, u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *seed_devs;

	if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (device->devid == devid &&
			    (!uuid || memcmp(device->uuid, uuid,
					     BTRFS_UUID_SIZE) == 0))
				return device;
		}
	}

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		if (!fsid ||
		    !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
			list_for_each_entry(device, &seed_devs->devices,
					    dev_list) {
				if (device->devid == devid &&
				    (!uuid || memcmp(device->uuid, uuid,
						     BTRFS_UUID_SIZE) == 0))
					return device;
			}
		}
	}

	return NULL;
}

static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	unsigned int nofs_flag;

	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */
	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6813
 * on error.  Returned struct is not linked onto any lists and must be
6814
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	return dev;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
6882 6883
}

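/* Per-device stripe length: the chunk length divided by the number of data stripes. */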
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
	const int data_stripes = calc_data_stripes(type, num_stripes);

	return div_u64(chunk_len, data_stripes);
}

#if BITS_PER_LONG == 32
/*
 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
 * can't be accessed on 32bit systems.
 *
 * This function does a mount time check to reject the fs if it already has
 * a metadata chunk beyond that limit.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return 0;

	if (logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}

/*
 * This is to give early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * The metadata can still be accessed now, but it won't be accessible anymore
 * once the limit is reached.
 */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return;

	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		return;

	btrfs_warn_32bit_limit(fs_info);
}
#endif

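/*
 * Translate one chunk item (from the sys chunk array or the chunk tree) into
 * an extent_map/map_lookup pair and insert it into fs_info->mapping_tree.
 * Missing stripe devices either fail the mount or get placeholder devices,
 * depending on the DEGRADED mount option.
 */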
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = type;
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	map->verified_stripes = 0;
	em->orig_block_len = calc_stripe_length(type, em->len,
						map->num_stripes);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
							devid, uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	free_extent_map(em);

	return ret;
}

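/* Copy the on-disk dev item fields into the in-memory btrfs_device. */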
static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

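/*
 * Find the fs_devices of the seed filesystem referenced by @fsid.  On first
 * use it is cloned, opened read-only and anchored on the sprout's seed_list;
 * for DEGRADED mounts a bare placeholder fs_devices may be created instead.
 */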
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;


	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}

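/*
 * Read one DEV_ITEM: look up (or, for DEGRADED mounts, fabricate) the
 * matching btrfs_device, attach it to the right fs_devices and copy the
 * on-disk sizes and usage into the in-memory structures.
 */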
static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				   fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * This happens when a device that was properly set up
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, so we have to mark the device
			 * missing here.
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		u64 max_total_bytes = i_size_read(device->bdev->bd_inode);

		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

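/*
 * Read the sys_chunk_array embedded in the superblock and map every SYSTEM
 * chunk it describes.  This must happen before the chunk tree itself can be
 * read, since the chunk tree lives inside those SYSTEM chunks.
 */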
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
					  root->root_key.objectid, 0);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	/*
	 * The sb extent buffer is artificial and just used to read the system array.
	 * set_extent_buffer_uptodate() call does not properly mark all its
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning eg. on PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}

		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
			"invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}

		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
			btrfs_err(fs_info,
			"invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
			ret = -EIO;
			break;
		}

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
			break;

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
	read_unlock(&map_tree->lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->lock);
		em = lookup_extent_mapping(map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->lock);
	}
out:
	return ret;
}

static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
}

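/*
 * Mount-time walk of the chunk tree: read every DEV_ITEM and CHUNK_ITEM and
 * then cross-check the device count and total size against the superblock.
 */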
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		struct extent_buffer *node;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		/*
		 * The nodes on level 1 are not locked but we don't need to do
		 * that during mount time as nothing else can access the tree
		 */
		node = path->nodes[1];
		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

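/*
 * Late mount initialization: point every device, including seed devices, at
 * the fs_info that now owns it.
 */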
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			     ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}

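/*
 * Load the persistent dev-stats item for @device, or start all counters from
 * zero if no item exists yet.
 */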
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size_nr(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

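/* Load persistent device stats for all devices, including seed devices. */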
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

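/*
 * Write the in-memory error counters of @device into its dev-stats item,
 * replacing an undersized on-disk item with a freshly inserted one if needed.
 */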
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

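/*
 * Copy the per-device error counters into @stats for the dev-stats ioctl,
 * optionally resetting them when BTRFS_DEV_STATS_RESET is set.
 */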
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here.  This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}



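/*
 * Check that one dev extent item points at an existing chunk stripe and stays
 * within the device (and, on zoned devices, zone) boundaries.
 */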
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

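/*
 * After walking the device tree, make sure every stripe of every chunk was
 * matched by a dev extent.
 */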
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount.  This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

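/*
 * Worker for btrfs_repair_one_zone(): relocate the block group given in @data
 * under the BALANCE exclusive operation lock.
 */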
static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

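/*
 * Relocate the block group containing @logical in a background kthread; used
 * on zoned filesystems to repair I/O failures by rewriting the data elsewhere.
 */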
int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}