// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
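/*
 * Illustrative sketch (not a call site in this file): code elsewhere consults
 * the table above by converting block group flags to an index, e.g.:
 *
 *	const struct btrfs_raid_attr *attr =
 *		&btrfs_raid_array[btrfs_bg_flags_to_raid_index(flags)];
 *
 *	if (num_devices < attr->devs_min)
 *		...reject the requested profile...
 *
 * Here "flags" and "num_devices" are hypothetical caller-side variables.
 */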

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}
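/*
 * Example of the output format produced above (illustrative only): for
 * bg_flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) the buffer
 * ends up as "data|raid1"; any leftover unknown bit is appended in hex, e.g.
 * "data|raid1|0x4000000000", and the trailing '|' is trimmed.
 */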

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
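/*
 * Sketch of the nesting rules above as they would appear at a call site
 * (illustrative only, not an actual function in this file):
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	... modify chunks or per-device state ...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 */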

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *
 *  path:	Optional. When provided, it will release only unmounted devices
 *		matching this path.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 *
 *  Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases :
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without the RCU read lock held because we
		 * hold the uuid_mutex, so nothing we touch in here is going
		 * to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret)
		return ERR_PTR(ret);

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}
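/*
 * Illustrative caller sketch (hypothetical, not a call site in this file):
 * scanning only registers the device in the global list, it does not open it
 * for a mounted filesystem, and it must run under uuid_mutex. The path and
 * holder below are made-up placeholders:
 *
 *	struct btrfs_device *device;
 *
 *	mutex_lock(&uuid_mutex);
 *	device = btrfs_scan_one_device("/dev/sdb", FMODE_READ, holder);
 *	mutex_unlock(&uuid_mutex);
 *	if (IS_ERR(device))
 *		return PTR_ERR(device);
 */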

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which has the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * This uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
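/*
 * Worked example (illustrative only, assuming the regular non-zoned chunk
 * allocation policy): on a device whose dev extents cover [1M, 33M) and
 * [49M, 81M), find_free_dev_extent(device, SZ_16M, &start, &len) begins the
 * search at 1M (see dev_extent_search_start()), walks past the allocated
 * ranges and returns 0 with *start == 33M and *len == 16M, since the hole
 * [33M, 49M) already satisfies the request.
 */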

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

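/*
 * Return the start offset for the next chunk to allocate: the end of the
 * last (highest) chunk mapping, or 0 if there are no chunks yet.
 */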
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

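/*
 * Find the highest devid stored in the chunk tree and return the next free
 * one in @devid_ret.
 */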
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

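/*
 * Delete the dev item of @device from the chunk tree, in its own
 * transaction.
 */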
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number, e.g. to account
 * for a running device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}

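/*
 * Return any device from @fs_devs other than @device that is not missing and
 * has an open bdev, or NULL if there is none.
 */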
static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * available.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
	up_read(&fs_info->dev_replace.rwsem);

	return num_devices;
}

void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);
		/* write_one_page() unlocks the page */
		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				"error clearing superblock number %d (%d)",
				copy_num, ret);
		btrfs_release_disk_super(disk_super);

	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

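/*
 * Remove a device, given by @devid or @device_path, from a mounted
 * filesystem: shrink it to zero, delete its dev item, drop it from the
 * device lists and wipe its superblocks.
 */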
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		    u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = btrfs_num_devices(fs_info);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    device_path && strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	if (!ret)
		btrfs_reada_remove_dev(device);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);

	btrfs_close_bdev(device);
	synchronize_rcu();
	btrfs_free_device(device);

	if (cur_devices->open_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	btrfs_reada_undo_remove_dev(device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

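/*
 * Unlink the replace source device @srcdev from its fs_devices and adjust
 * the device counters. The caller must hold the device_list_mutex.
 */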
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of a fs with no seed, srcdev->fs_devices will point to the
	 * fs_devices of fs_info. However, when the device being replaced is
	 * a seed device, it will point to the seed's local fs_devices. In
	 * short, srcdev will have its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* If there are no more devices, delete the fs_devices as well. */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}

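/*
 * Read the superblock from @device_path and look up the matching
 * btrfs_device in the filesystem's device list.
 */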
static struct btrfs_device *btrfs_find_device_by_path(
		struct btrfs_fs_info *fs_info, const char *device_path)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct btrfs_device *device;

	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &disk_super);
	if (ret)
		return ERR_PTR(ret);

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->metadata_uuid);
	else
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->fsid);

	btrfs_release_disk_super(disk_super);
	if (!device)
		device = ERR_PTR(-ENOENT);
	blkdev_put(bdev, FMODE_READ);
	return device;
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	struct btrfs_device *device;

	if (devid) {
		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
					   NULL);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	if (!device_path || !device_path[0])
		return ERR_PTR(-EINVAL);

	if (strcmp(device_path, "missing") == 0) {
		/* Find first missing device */
		list_for_each_entry(device, &fs_info->fs_devices->devices,
				    dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				     &device->dev_state) && !device->bdev)
				return device;
		}
		return ERR_PTR(-ENOENT);
	}

	return btrfs_find_device_by_path(fs_info, device_path);
}

/*
 * Does all the dirty work required for changing file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple seed filesystems.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

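/*
 * Add the block device at @device_path as a new device of the mounted
 * filesystem, sprouting a new writable filesystem if the current devices
 * are all seeds.
 */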
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int seeding_dev = 0;
	int ret = 0;
	bool locked = false;

	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	if (fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	device->fs_info = fs_info;
	device->bdev = bdev;

	ret = btrfs_get_dev_zone_info(device);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		btrfs_clear_sb_rdonly(sb);
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_prepare_sprout
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device_path);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

error_sysfs:
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (seeding_dev)
		btrfs_set_sb_rdonly(sb);
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

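/* Sync the in-memory fields of @device into its dev item in the chunk tree. */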
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

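/*
 * Grow @device to @new_size (rounded down to a sectorsize multiple), updating
 * the superblock total and the device's dev item.
 */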
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

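/* Delete the chunk item for the chunk at @chunk_offset from the chunk tree. */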
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

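/*
 * Remove the copy of the chunk at @chunk_offset from the superblock's
 * sys_chunk_array. The caller must hold the chunk mutex.
 */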
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	lockdep_assert_held(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

/*
 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @logical: Logical block offset in bytes.
 * @length: Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR.
 */
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
				       u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}

static int remove_chunk_item(struct btrfs_trans_handle *trans,
			     struct map_lookup *map, u64 chunk_offset)
{
	int i;

	/*
	 * Removing chunk items and updating the device items in the chunks btree
	 * requires holding the chunk_mutex.
	 * See the comment at btrfs_chunk_alloc() for the details.
	 */
	lockdep_assert_held(&trans->fs_info->chunk_mutex);

	for (i = 0; i < map->num_stripes; i++) {
		int ret;

		ret = btrfs_update_device(trans, map->stripes[i].dev);
		if (ret)
			return ret;
	}

	return btrfs_free_chunk(trans, chunk_offset);
}

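/*
 * Remove the chunk at @chunk_offset: free its dev extents, delete the chunk
 * item (and the sys_chunk_array copy for SYSTEM chunks) and remove the
 * block group.
 */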
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;

	/*
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 *    reserve system space, do all chunk btree updates and deletions, and
	 *    update the system chunk array in the superblock while holding this
	 *    mutex. This is for similar reasons as explained on the comment at
	 *    the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 *    the device item, which does not exists on the chunk btree.
	 *    The finishing phase of device replace acquires both the
	 *    device_list_mutex and the chunk_mutex, in that order, so we are
	 *    safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_space() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
	/* once for us */
	free_extent_map(em);
	return ret;
}

int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *block_group;
	u64 length;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
	length = block_group->length;
	btrfs_put_block_group(block_group);

	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				"failed to reset zone %llu after relocation",
				chunk_offset);
	}

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}

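/* Relocate all SYSTEM chunks, retrying once for chunks that hit ENOSPC. */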
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->reclaim_bgs_lock);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * return 1 : allocate a data chunk successfully,
 * return <0: errors during allocating a data chunk,
 * return 0 : no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	spin_lock(&fs_info->data_sinfo->lock);
	bytes_used = fs_info->data_sinfo->bytes_used;
	spin_unlock(&fs_info->data_sinfo->lock);

	if (!bytes_used) {
		struct btrfs_trans_handle *trans;
		int ret;

		trans =	btrfs_join_transaction(fs_info->tree_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
		btrfs_end_transaction(trans);
		if (ret < 0)
			return ret;
		return 1;
	}

	return 0;
}

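/* Write the balance control arguments as a balance item into the tree root. */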
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

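/*
 * Usage filter with an explicit range: keep (return 0) only chunks whose used
 * space lies within [usage_min, usage_max) percent of the chunk length.
 */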
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->length,
						  bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->length;
	else
		user_thresh_max = div_factor_fine(cache->length,
						  bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

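/*
 * Single-value usage filter: keep (return 0) only chunks whose used space is
 * below bargs->usage percent of the chunk length.
 */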
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->length;
	else
		user_thresh = div_factor_fine(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

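/* Filter out (return 1) chunks that have no stripe on the given devid. */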
static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

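/* Number of stripes that carry data for a chunk of the given profile. */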
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int index = btrfs_bg_flags_to_raid_index(type);
	const int ncopies = btrfs_raid_array[index].ncopies;
	const int nparity = btrfs_raid_array[index].nparity;

	return (num_stripes - nparity) / ncopies;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	u64 type;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	type = btrfs_chunk_type(leaf, chunk);
	factor = calc_data_stripes(type, num_stripes);

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

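/*
 * Keep (return 0) only chunks whose stripe count lies within
 * [stripes_min, stripes_max].
 */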
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

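/*
 * Soft convert filter: when converting, skip (return 1) chunks that already
 * have the target profile.
 */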
static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

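/*
 * Apply all configured balance filters to one chunk.  Returns 1 if the chunk
 * should be relocated, 0 if it is filtered out.
 */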
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

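/*
 * Main balance loop: walk the chunk tree twice, first counting the chunks
 * that pass the filters, then relocating them one by one.
 */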
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/* The single value limit and min/max limits use the same bytes in the balance args */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the balance args.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(leaf, chunk, found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so let's allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->reclaim_bgs_lock);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	return has_single_bit_set(flags);
}

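/*
 * Whether the balance state should be torn down when btrfs_balance() returns:
 * either cancel was requested or the balance finished without a pause request.
 */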
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

/*
 * Validate target profile against allowed profiles and return true if it's OK.
 * Otherwise print the error message and return false.
 */
static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
		const struct btrfs_balance_args *bargs,
		u64 allowed, const char *type)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return true;

	if (fs_info->sectorsize < PAGE_SIZE &&
		bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		btrfs_err(fs_info,
		"RAID56 is not yet supported for sectorsize %u with page size %lu",
			  fs_info->sectorsize, PAGE_SIZE);
		return false;
	}
	/* Profile is valid and does not have bits outside of the allowed set */
	if (alloc_profile_is_valid(bargs->target, 1) &&
	    (bargs->target & ~allowed) == 0)
		return true;

	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
			type, btrfs_bg_type_to_raid_name(bargs->target));
	return false;
}

/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				 u32 size_buf)
{
	int ret;
	u32 size_bp = size_buf;
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}

static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

/*
 * Should be called with balance mutex held
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	int i;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	  "balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
	 * are exclusive
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish.  Thus we need
	 * to set it manually, or balance would refuse the profile.
	 */
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
		if (num_devices >= btrfs_raid_array[i].devs_min)
			allowed |= btrfs_raid_array[i].bg_flag;

	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Allow to reduce metadata or system integrity only if force set for
	 * profiles with redundancy (copies, parity)
	 */
	allowed = 0;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
		if (btrfs_raid_array[i].ncopies >= 2 ||
		    btrfs_raid_array[i].tolerated_failures >= 1)
			allowed |= btrfs_raid_array[i].bg_flag;
	}
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed)))
			reducing_redundancy = true;
		else
			reducing_redundancy = false;

		/* if we're not converting, the target field is uninitialized */
		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->data.target : fs_info->avail_data_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	if (reducing_redundancy) {
		if (bctl->flags & BTRFS_BALANCE_FORCE) {
			btrfs_info(fs_info,
			   "balance: force reducing metadata redundancy");
		} else {
			btrfs_err(fs_info,
	"balance: reduces metadata redundancy, use --force if you want this");
			ret = -EINVAL;
			goto out;
		}
	}

	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
	"balance: metadata profile %s has lower redundancy than data profile %s",
				btrfs_bg_type_to_raid_name(meta_target),
				btrfs_bg_type_to_raid_name(data_target));
	}

	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
	describe_balance_start_or_resume(fs_info);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
		btrfs_info(fs_info, "balance: paused");
	/*
	 * Balance can be canceled by:
	 *
	 * - Regular cancel request
	 *   Then ret == -ECANCELED and balance_cancel_req > 0
	 *
	 * - Fatal signal to "btrfs" process
	 *   Either the signal caught by wait_reserve_ticket() and callers
	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
	 *   got -ECANCELED.
	 *   Either way, in this case balance_cancel_req = 0, and
	 *   ret == -EINTR or ret == -ECANCELED.
	 *
	 * So here we only check the return value to catch canceled balance.
	 */
	else if (ret == -ECANCELED || ret == -EINTR)
		btrfs_info(fs_info, "balance: canceled");
	else
		btrfs_info(fs_info, "balance: ended with status: %d", ret);

	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		btrfs_update_ioctl_balance_args(fs_info, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		reset_balance_state(fs_info);
		btrfs_exclop_finish(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		reset_balance_state(fs_info);
	else
		kfree(bctl);
	btrfs_exclop_finish(fs_info);

	return ret;
}

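/* Kthread entry point used to run or resume a balance in the background. */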
static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (fs_info->balance_ctl)
		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
	mutex_unlock(&fs_info->balance_mutex);

	return ret;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
	mutex_unlock(&fs_info->balance_mutex);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
		btrfs_warn(fs_info,
	"balance: cannot set exclusive op status, resume manually");

	btrfs_release_path(path);

	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

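/*
 * Request a pause of a running balance and wait until the balance loop has
 * stopped.  Returns -ENOTCONN if no balance is running.
 */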
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

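/*
 * Cancel a running or paused balance.  A running balance deletes its own
 * balance item; for a paused one the state is reset and the item deleted here.
 */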
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl) {
			reset_balance_state(fs_info);
			btrfs_exclop_finish(fs_info);
			btrfs_info(fs_info, "balance: canceled");
		}
	}

	BUG_ON(fs_info->balance_ctl ||
		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

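/*
 * Kthread that scans all root items and adds missing subvolume and
 * received-subvolume UUID entries to the UUID tree.
 */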
int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;
	bool closing = false;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		if (btrfs_fs_closing(fs_info)) {
			closing = true;
			break;
		}
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
		btrfs_release_path(path);
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
			ret = btrfs_uuid_tree_add(trans,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

skip:
		btrfs_release_path(path);
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else if (!closing)
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

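/* Create the UUID tree and start the scan kthread that populates it. */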
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;

	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (contains_pending_extent(device, &start, diff)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

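/*
 * Append a chunk item, preceded by its key, to the sys_chunk_array of the
 * in-memory super block copy.
 */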
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	lockdep_assert_held(&fs_info->chunk_mutex);

	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
		return;

	btrfs_set_fs_incompat(info, RAID1C34);
}

/*
 * Structure used internally for __btrfs_alloc_chunk() function.
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	u64 start;
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	u64 max_stripe_size;
	u64 max_chunk_size;
4973
	u64 dev_extent_min;
N
Naohiro Aota 已提交
4974 4975 4976 4977 4978
	u64 stripe_size;
	u64 chunk_size;
	int ndevs;
};

static void init_alloc_chunk_ctl_policy_regular(
				struct btrfs_fs_devices *fs_devices,
				struct alloc_chunk_ctl *ctl)
{
	u64 type = ctl->type;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_stripe_size = SZ_1G;
		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* For larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			ctl->max_stripe_size = SZ_1G;
		else
			ctl->max_stripe_size = SZ_256M;
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_stripe_size = SZ_32M;
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				  ctl->max_chunk_size);
5007
	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5008 5009
}

5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030
static void init_alloc_chunk_ctl_policy_zoned(
				      struct btrfs_fs_devices *fs_devices,
				      struct alloc_chunk_ctl *ctl)
{
	u64 zone_size = fs_devices->fs_info->zone_size;
	u64 limit;
	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
	u64 min_chunk_size = min_data_stripes * zone_size;
	u64 type = ctl->type;

	ctl->max_stripe_size = zone_size;
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
						 zone_size);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
5031 5032
	} else {
		BUG();
5033 5034 5035 5036 5037 5038 5039 5040 5041 5042
	}

	/* We don't want a chunk larger than 10% of writable space */
	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
			       zone_size),
		    min_chunk_size);
	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}

static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
				 struct alloc_chunk_ctl *ctl)
{
	int index = btrfs_bg_flags_to_raid_index(ctl->type);

	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
	ctl->devs_max = btrfs_raid_array[index].devs_max;
	if (!ctl->devs_max)
		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
	ctl->devs_min = btrfs_raid_array[index].devs_min;
	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
	ctl->ncopies = btrfs_raid_array[index].ncopies;
	ctl->nparity = btrfs_raid_array[index].nparity;
	ctl->ndevs = 0;

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
		break;
5063 5064 5065
	case BTRFS_CHUNK_ALLOC_ZONED:
		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
		break;
5066 5067 5068 5069 5070
	default:
		BUG();
	}
}

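/*
 * Collect per-device free space information for chunk allocation and sort the
 * result by the size of the largest available hole.
 */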
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}

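/*
 * Compute stripe_size, num_stripes and chunk_size for the regular (non-zoned)
 * allocation policy, capping the chunk at max_chunk_size.
 */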
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
				      struct btrfs_device_info *devices_info)
{
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}
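
/*
 * Worked example for the zoned policy above (illustrative numbers only):
 * RAID0 over 8 devices with a 2GiB zone_size and a 10GiB max_chunk_size
 * starts at num_stripes = data_stripes = 8, i.e. 16GiB of logical space.
 * Since stripe_size cannot shrink, ndevs is reduced instead:
 * ndevs = ((10GiB * 1) / 2GiB + 0) / 1 = 5, so the chunk uses 5 zones for
 * exactly 10GiB of logical space.
 */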

static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
	 */
	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);

	if (ctl->ndevs < ctl->devs_min) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ctl->ndevs, ctl->devs_min);
		}
		return -ENOSPC;
	}

	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return decide_stripe_size_regular(ctl, devices_info);
	case BTRFS_CHUNK_ALLOC_ZONED:
		return decide_stripe_size_zoned(ctl, devices_info);
	default:
		BUG();
	}
}
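
/*
 * Example of the rounding above (hypothetical numbers): RAID10 has
 * devs_increment = 2, so 5 usable devices are rounded down to ndevs = 4;
 * with devs_min = 2 the allocation proceeds, while a single remaining
 * device would round down to 0 and return -ENOSPC.
 */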

static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct btrfs_block_group *block_group;
	struct extent_map *em;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;
	int i;
	int j;

	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
	if (!map)
5275
		return ERR_PTR(-ENOMEM);
	map->num_stripes = ctl->num_stripes;
5277

	for (i = 0; i < ctl->ndevs; ++i) {
		for (j = 0; j < ctl->dev_stripes; ++j) {
			int s = i * ctl->dev_stripes + j;
5281 5282
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
5284 5285
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = ctl->sub_stripes;
5291

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5293

5294
	em = alloc_extent_map();
	if (!em) {
5296
		kfree(map);
5297
		return ERR_PTR(-ENOMEM);
5298
	}
5299
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5300
	em->map_lookup = map;
	em->start = start;
	em->len = ctl->chunk_size;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = ctl->stripe_size;
5306

5307
	em_tree = &info->mapping_tree;
5308
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
5310
	if (ret) {
5311
		write_unlock(&em_tree->lock);
5312
		free_extent_map(em);
5313
		return ERR_PTR(ret);
5314
	}
5315 5316
	write_unlock(&em_tree->lock);

	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
	if (IS_ERR(block_group))
5319
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
5326 5327 5328 5329
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}
5330

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);
5333

5334
	free_extent_map(em);
5335
	check_raid56_incompat_flag(info, type);
5336
	check_raid1c34_incompat_flag(info, type);

5338
	return block_group;
5339

5340
error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);

5350
	return block_group;
}

struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
					    u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
5360
	struct btrfs_block_group *block_group;
	int ret;

5363 5364
	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
5367
		return ERR_PTR(-EINVAL);
	}

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
5373
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
5379
		return ERR_PTR(-EINVAL);
	}

5382
	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
5389
		return ERR_PTR(-ENOMEM);

	ret = gather_device_info(fs_devices, &ctl, devices_info);
5392 5393
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
5395
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5398 5399
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
5401
	}

5403
	block_group = create_chunk(trans, &ctl, devices_info);

out:
5406
	kfree(devices_info);
5407
	return block_group;
}

/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exist, or persisting a stripe of the chunk item with such ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	bg->chunk_item_inserted = 1;

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 alloc_profile;
	struct btrfs_block_group *meta_bg;
	struct btrfs_block_group *sys_bg;

	/*
	 * When adding a new device for sprouting, the seed device is read-only
	 * so we must first allocate a metadata and a system chunk. But before
	 * adding the block group items to the extent, device and chunk btrees,
	 * we must first:
	 *
	 * 1) Create both chunks without doing any changes to the btrees, as
	 *    otherwise we would get -ENOSPC since the block groups from the
	 *    seed device are read-only;
	 *
	 * 2) Add the device item for the new sprout device - finishing the setup
	 *    of a new block group requires updating the device item in the chunk
	 *    btree, so it must exist when we attempt to do it. The previous step
	 *    ensures this does not fail with -ENOSPC.
	 *
	 * After that we can add the block group items to their btrees:
	 * update existing device item in the chunk btree, add a new block group
	 * item to the extent btree, add a new chunk item to the chunk btree and
	 * finally add the new device extent items to the devices btree.
	 */

	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
	if (IS_ERR(meta_bg))
		return PTR_ERR(meta_bg);

	alloc_profile = btrfs_system_alloc_profile(fs_info);
	sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
	if (IS_ERR(sys_bg))
		return PTR_ERR(sys_bg);

	return 0;
}

static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	const int index = btrfs_bg_flags_to_raid_index(map->type);

	return btrfs_raid_array[index].tolerated_failures;
}
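
/*
 * For instance, the table in btrfs_raid_array tolerates 1 failure for RAID1
 * and RAID10 and up to 3 for RAID1C4, while SINGLE/RAID0/DUP tolerate none.
 */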

5573
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
Y
Yan Zheng 已提交
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
5578
	int miss_ndevs = 0;
Y
Yan Zheng 已提交
5579 5580
	int i;

5581
	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5582
	if (IS_ERR(em))
Y
Yan Zheng 已提交
5583 5584
		return 1;

5585
	map = em->map_lookup;
Y
Yan Zheng 已提交
5586
	for (i = 0; i < map->num_stripes; i++) {
5587 5588
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
5589 5590 5591
			miss_ndevs++;
			continue;
		}
5592 5593
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
Y
Yan Zheng 已提交
5594
			readonly = 1;
5595
			goto end;
Y
Yan Zheng 已提交
5596 5597
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we can not write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
5607
	free_extent_map(em);
Y
Yan Zheng 已提交
5608
	return readonly;
5609 5610
}

5611
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5612 5613 5614
{
	struct extent_map *em;

C
Chris Mason 已提交
5615
	while (1) {
5616 5617
		write_lock(&tree->lock);
		em = lookup_extent_mapping(tree, 0, (u64)-1);
5618
		if (em)
5619 5620
			remove_extent_mapping(tree, em);
		write_unlock(&tree->lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

5630
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

5636
	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
5644 5645
		return 1;

5646
	map = em->map_lookup;
5647
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5648
		ret = map->num_stripes;
C
Chris Mason 已提交
5649 5650
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
D
David Woodhouse 已提交
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
L
Liu Bo 已提交
5654 5655 5656
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
5657
		 *
L
Liu Bo 已提交
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
5662 5663 5664
	else
		ret = 1;
	free_extent_map(em);
5665

5666
	down_read(&fs_info->dev_replace.rwsem);
5667 5668
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
5669
		ret++;
5670
	up_read(&fs_info->dev_replace.rwsem);
5671

5672 5673 5674
	return ret;
}

5675
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
D
David Woodhouse 已提交
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
5680
	unsigned long len = fs_info->sectorsize;
D
David Woodhouse 已提交
5681

5682
	em = btrfs_get_chunk_map(fs_info, logical, len);
D
David Woodhouse 已提交
5683

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
D
David Woodhouse 已提交
5690 5691 5692
	return len;
}
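
/*
 * Example (hypothetical layout): a RAID5 chunk on 4 devices with a 64KiB
 * stripe_len has nr_data_stripes() == 3, so the full stripe length returned
 * here is 3 * 64KiB = 192KiB; non-RAID56 chunks simply report sectorsize.
 */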

5693
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
D
David Woodhouse 已提交
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

5699
	em = btrfs_get_chunk_map(fs_info, logical, len);
D
David Woodhouse 已提交
5700

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
D
David Woodhouse 已提交
5707 5708 5709
	return ret;
}

5710
static int find_live_mirror(struct btrfs_fs_info *fs_info,
5711
			    struct map_lookup *map, int first,
5712
			    int dev_replace_is_ongoing)
5713 5714
{
	int i;
5715
	int num_stripes;
5716
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

5720
	ASSERT((map->type &
5721
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

A
Anand Jain 已提交
	switch (fs_info->fs_devices->read_policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info,
			      "unknown read_policy type %u, reset to pid",
			      fs_info->fs_devices->read_policy);
		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}
5740

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
5754 5755 5756
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
5757
		for (i = first; i < first + num_stripes; i++) {
5758 5759 5760 5761
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
5762
	}
5763

5764 5765 5766
	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
5767
	return preferred_mirror;
5768 5769
}

D
David Woodhouse 已提交
5770
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5771
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
D
David Woodhouse 已提交
{
	int i;
	int again = 1;

	while (again) {
		again = 0;
5778
		for (i = 0; i < num_stripes - 1; i++) {
5779 5780 5781 5782
			/* Swap if parity is on a smaller index */
			if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
				swap(bbio->stripes[i], bbio->stripes[i + 1]);
				swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
D
David Woodhouse 已提交
				again = 1;
			}
		}
	}
}

static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
5792
		 /* the size of the btrfs_bio */
5793
		sizeof(struct btrfs_bio) +
5794
		/* plus the variable array for the stripes */
5795
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5796
		/* plus the variable array for the tgt dev */
5797
		sizeof(int) * (real_stripes) +
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
5803
		GFP_NOFS|__GFP_NOFAIL);
5804 5805

	atomic_set(&bbio->error, 0);
5806
	refcount_set(&bbio->refs, 1);
5807

5808 5809 5810
	bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
	bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);

	return bbio;
}
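
/*
 * The single allocation above packs, in order: the btrfs_bio itself, one
 * btrfs_bio_stripe per total stripe, one tgtdev_map int per real stripe and
 * one u64 raid_map entry per total stripe, which is why tgtdev_map and
 * raid_map are derived by pointer arithmetic right after the stripes array.
 */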

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
5816 5817
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
5824
	if (refcount_dec_and_test(&bbio->refs))
5825 5826 5827
		kfree(bbio);
}

/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5834
					 u64 logical, u64 *length_ret,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
5840
	u64 length = *length_ret;
5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always return a bbio */
	ASSERT(bbio_ret);

5861
	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
5873
	length = min_t(u64, em->start + em->len - logical, length);
5874
	*length_ret = length;

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	stripe_nr_end = round_up(offset + length, map->stripe_len);
5887
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
5914
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;
	bool ret;

6064
	/* A non-zoned filesystem does not use the "to_copy" flag */
	if (!btrfs_is_zoned(fs_info))
		return false;

	cache = btrfs_lookup_block_group(fs_info, logical);

	spin_lock(&cache->lock);
	ret = cache->to_copy;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
	return ret;
}

static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
6081
				      u64 logical,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * A block group which has "to_copy" set will eventually be
		 * copied by the dev-replace process. We can avoid cloning IO here.
		 */
		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
			return;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

6185
/*
 * Calculate the geometry of a particular (address, len) tuple. This
 * information is used to calculate how big a particular bio can get before it
 * straddles a stripe.
6189
 *
6190 6191 6192 6193 6194
 * @fs_info: the filesystem
 * @em:      mapping containing the logical extent
 * @op:      type of operation - write or read
 * @logical: address that we want to figure out the geometry of
 * @io_geom: pointer used to return values
 *
 * Returns < 0 in case a chunk for the given logical address cannot be found,
 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
 */
6199
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6200
			  enum btrfs_map_op op, u64 logical,
6201
			  struct btrfs_io_geometry *io_geom)
6202 6203
{
	struct map_lookup *map;
6204
	u64 len;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u64 raid56_full_stripe_start = (u64)-1;
	int data_stripes;

	ASSERT(op != BTRFS_MAP_DISCARD);

	map = em->map_lookup;
	/* Offset of this logical address in the chunk */
	offset = logical - em->start;
	/* Len of a stripe in a chunk */
	stripe_len = map->stripe_len;
D
David Sterba 已提交
6219
	/* Stripe where this block falls in */
	stripe_nr = div64_u64(offset, stripe_len);
	/* Offset of stripe in the chunk */
	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
			stripe_offset, offset, em->start, logical, stripe_len);
6227
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;
	data_stripes = nr_data_stripes(map);

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len = stripe_len - stripe_offset;

		/*
		 * In case of raid56, we need to know the stripe aligned start
		 */
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
			unsigned long full_stripe_len = stripe_len * data_stripes;
			raid56_full_stripe_start = offset;

			/*
			 * Allow a write of a full stripe, but make sure we
			 * don't allow straddling of stripes
			 */
			raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
					full_stripe_len);
			raid56_full_stripe_start *= full_stripe_len;

			/*
			 * For writes to RAID[56], allow a full stripeset across
			 * all disks. For other RAID types and for RAID[56]
			 * reads, just allow a single stripe (on a single disk).
			 */
			if (op == BTRFS_MAP_WRITE) {
				max_len = stripe_len * data_stripes -
					  (offset - raid56_full_stripe_start);
			}
		}
		len = min_t(u64, em->len - offset, max_len);
	} else {
		len = em->len - offset;
	}

	io_geom->len = len;
	io_geom->offset = offset;
	io_geom->stripe_len = stripe_len;
	io_geom->stripe_nr = stripe_nr;
	io_geom->stripe_offset = stripe_offset;
	io_geom->raid56_stripe_offset = raid56_full_stripe_start;

6274
	return 0;
6275 6276
}
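
/*
 * Worked example (hypothetical numbers): for a RAID0 chunk with a 64KiB
 * stripe_len, an offset of 300KiB into the chunk yields stripe_nr = 4 and
 * stripe_offset = 300KiB - 4 * 64KiB = 44KiB, and len is capped at
 * stripe_len - stripe_offset = 20KiB so a bio never straddles a stripe.
 */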

6277 6278
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
6279
			     u64 logical, u64 *length,
6280
			     struct btrfs_bio **bbio_ret,
6281
			     int mirror_num, int need_raid_map)
6282 6283 6284
{
	struct extent_map *em;
	struct map_lookup *map;
6285 6286
	u64 stripe_offset;
	u64 stripe_nr;
D
David Woodhouse 已提交
6287
	u64 stripe_len;
6288
	u32 stripe_index;
6289
	int data_stripes;
6290
	int i;
L
Li Zefan 已提交
6291
	int ret = 0;
6292
	int num_stripes;
6293
	int max_errors = 0;
6294
	int tgtdev_indexes = 0;
6295
	struct btrfs_bio *bbio = NULL;
6296 6297 6298
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
6299 6300
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
D
David Woodhouse 已提交
6301
	u64 raid56_full_stripe_start = (u64)-1;
6302 6303 6304
	struct btrfs_io_geometry geom;

	ASSERT(bbio_ret);
6305
	ASSERT(op != BTRFS_MAP_DISCARD);
6306

6307 6308 6309
	em = btrfs_get_chunk_map(fs_info, logical, *length);
	ASSERT(!IS_ERR(em));

6310
	ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6311 6312
	if (ret < 0)
		return ret;
6313

6314
	map = em->map_lookup;
6315

6316 6317 6318 6319 6320
	*length = geom.len;
	stripe_len = geom.stripe_len;
	stripe_nr = geom.stripe_nr;
	stripe_offset = geom.stripe_offset;
	raid56_full_stripe_start = geom.raid56_stripe_offset;
6321
	data_stripes = nr_data_stripes(map);
6322

6323
	down_read(&dev_replace->rwsem);
6324
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6325 6326 6327 6328
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 */
6329
	if (!dev_replace_is_ongoing)
6330
		up_read(&dev_replace->rwsem);
6331

6332
	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6333
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6334 6335 6336 6337 6338
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
6339
			goto out;
6340 6341
		else
			patch_the_first_stripe_for_dev_replace = 1;
6342 6343 6344 6345
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

6346
	num_stripes = 1;
6347
	stripe_index = 0;
6348
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6349 6350
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
6351
		if (!need_full_stripe(op))
6352
			mirror_num = 1;
6353
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6354
		if (need_full_stripe(op))
6355
			num_stripes = map->num_stripes;
6356
		else if (mirror_num)
6357
			stripe_index = mirror_num - 1;
6358
		else {
6359 6360
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
6361
			mirror_num = stripe_index + 1;
6362
		}
6363

6364
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6365
		if (need_full_stripe(op)) {
6366
			num_stripes = map->num_stripes;
6367
		} else if (mirror_num) {
6368
			stripe_index = mirror_num - 1;
6369 6370 6371
		} else {
			mirror_num = 1;
		}
6372

C
Chris Mason 已提交
6373
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6374
		u32 factor = map->num_stripes / map->sub_stripes;
C
Chris Mason 已提交
6375

6376
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
C
Chris Mason 已提交
6377 6378
		stripe_index *= map->sub_stripes;

6379
		if (need_full_stripe(op))
6380
			num_stripes = map->sub_stripes;
C
Chris Mason 已提交
6381 6382
		else if (mirror_num)
			stripe_index += mirror_num - 1;
6383
		else {
J
Jan Schmidt 已提交
6384
			int old_stripe_index = stripe_index;
6385 6386 6387
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
J
Jan Schmidt 已提交
6388
			mirror_num = stripe_index - old_stripe_index + 1;
6389
		}
D
David Woodhouse 已提交
6390

6391
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6392
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
D
David Woodhouse 已提交
6393
			/* push stripe_nr back to the start of the full stripe */
6394
			stripe_nr = div64_u64(raid56_full_stripe_start,
6395
					stripe_len * data_stripes);
D
David Woodhouse 已提交

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
6410
			stripe_nr = div_u64_rem(stripe_nr,
6411
					data_stripes, &stripe_index);
D
David Woodhouse 已提交
6412
			if (mirror_num > 1)
6413
				stripe_index = data_stripes + mirror_num - 2;
D
David Woodhouse 已提交
6414 6415

			/* We distribute the parity blocks across stripes */
6416 6417
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
6418
			if (!need_full_stripe(op) && mirror_num <= 1)
6419
				mirror_num = 1;
D
David Woodhouse 已提交
6420
		}
6421 6422
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
6426
		 */
6427 6428
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
6429
		mirror_num = stripe_index + 1;
6430
	}
6431
	if (stripe_index >= map->num_stripes) {
J
Jeff Mahoney 已提交
6432 6433
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}
6438

6439
	num_alloc_stripes = num_stripes;
6440
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6441
		if (op == BTRFS_MAP_WRITE)
6442
			num_alloc_stripes <<= 1;
6443
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
6444
			num_alloc_stripes++;
6445
		tgtdev_indexes = num_stripes;
6446
	}
6447

6448
	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
L
Li Zefan 已提交
6449 6450 6451 6452
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical = map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;
		stripe_index++;
	}
L
Li Zefan 已提交
6460

6461
	/* build raid_map */
6462 6463
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
6464
		u64 tmp;
6465
		unsigned rot;
6466 6467

		/* Work out the disk rotation on this stripe-set */
6468
		div_u64_rem(stripe_nr, num_stripes, &rot);
6469 6470

		/* Fill in the logical address of each stripe */
6471 6472
		tmp = stripe_nr * data_stripes;
		for (i = 0; i < data_stripes; i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;

6481
		sort_parity_stripes(bbio, num_stripes);
6482
	}
L
Li Zefan 已提交
6483

6484
	if (need_full_stripe(op))
6485
		max_errors = btrfs_chunk_max_errors(map);
L
Li Zefan 已提交
6486

6487
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6488
	    need_full_stripe(op)) {
6489 6490
		handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
					  &num_stripes, &max_errors);
6491 6492
	}

L
Li Zefan 已提交
6493
	*bbio_ret = bbio;
Z
Zhao Lei 已提交
6494
	bbio->map_type = map->type;
L
Li Zefan 已提交
6495 6496 6497
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
6510
out:
6511
	if (dev_replace_is_ongoing) {
6512 6513
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
6514
		up_read(&dev_replace->rwsem);
6515
	}
6516
	free_extent_map(em);
L
Li Zefan 已提交
6517
	return ret;
6518 6519
}

6520
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6521
		      u64 logical, u64 *length,
6522
		      struct btrfs_bio **bbio_ret, int mirror_num)
6523
{
6524 6525 6526 6527
	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     length, bbio_ret);

6528
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6529
				 mirror_num, 0);
6530 6531
}

6532
/* For Scrub/replace */
6533
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6534
		     u64 logical, u64 *length,
6535
		     struct btrfs_bio **bbio_ret)
6536
{
6537
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6538 6539
}

6540
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6541
{
6542 6543
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
6544
	bio_endio(bio);
6545

6546
	btrfs_put_bbio(bbio);
6547 6548
}

6549
static void btrfs_end_bio(struct bio *bio)
6550
{
6551
	struct btrfs_bio *bbio = bio->bi_private;
6552
	int is_orig_bio = 0;
6553

6554
	if (bio->bi_status) {
6555
		atomic_inc(&bbio->error);
6556 6557
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
6558
			struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6559

6560
			ASSERT(dev->bdev);
6561
			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6562
				btrfs_dev_stat_inc_and_print(dev,
6563
						BTRFS_DEV_STAT_WRITE_ERRS);
6564 6565
			else if (!(bio->bi_opf & REQ_RAHEAD))
				btrfs_dev_stat_inc_and_print(dev,
6566
						BTRFS_DEV_STAT_READ_ERRS);
6567 6568
			if (bio->bi_opf & REQ_PREFLUSH)
				btrfs_dev_stat_inc_and_print(dev,
6569
						BTRFS_DEV_STAT_FLUSH_ERRS);
6570 6571
		}
	}
6572

6573
	if (bio == bbio->orig_bio)
6574 6575
		is_orig_bio = 1;

6576 6577
	btrfs_bio_counter_dec(bbio->fs_info);

6578
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
6579 6580
		if (!is_orig_bio) {
			bio_put(bio);
6581
			bio = bbio->orig_bio;
6582
		}
6583

6584
		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6585
		/* only send an error to the higher layers if it is
D
David Woodhouse 已提交
6586
		 * beyond the tolerance of the btrfs bio
6587
		 */
6588
		if (atomic_read(&bbio->error) > bbio->max_errors) {
6589
			bio->bi_status = BLK_STS_IOERR;
6590
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
6595
			bio->bi_status = BLK_STS_OK;
6596
		}
6597

6598
		btrfs_end_bbio(bbio, bio);
6599
	} else if (!is_orig_bio) {
6600 6601 6602 6603
		bio_put(bio);
	}
}

6604
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6605
			      u64 physical, struct btrfs_device *dev)
6606
{
6607
	struct btrfs_fs_info *fs_info = bbio->fs_info;
6608 6609

	bio->bi_private = bbio;
6610
	btrfs_io_bio(bio)->device = dev;
6611
	bio->bi_end_io = btrfs_end_bio;
6612
	bio->bi_iter.bi_sector = physical >> 9;
	/*
	 * For zone append writing, bi_sector must point the beginning of the
	 * zone
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		if (btrfs_dev_is_sequential(dev, physical)) {
			u64 zone_start = round_down(physical, fs_info->zone_size);

			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		} else {
			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
			bio->bi_opf |= REQ_OP_WRITE;
		}
	}
6627 6628
	btrfs_debug_in_rcu(fs_info,
	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
D
David Sterba 已提交
6629
		bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6630 6631
		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
		dev->devid, bio->bi_iter.bi_size);
6632
	bio_set_dev(bio, dev->bdev);
6633

6634
	btrfs_bio_counter_inc_noblocked(fs_info);
6635

6636
	btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
6643
		/* Should be the original bio. */
6644 6645
		WARN_ON(bio != bbio->orig_bio);

6646
		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6647
		bio->bi_iter.bi_sector = logical >> 9;
6648 6649 6650 6651
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
6652
		btrfs_end_bbio(bbio, bio);
6653 6654 6655
	}
}

6656
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6657
			   int mirror_num)
6658 6659
{
	struct btrfs_device *dev;
6660
	struct bio *first_bio = bio;
D
David Sterba 已提交
6661
	u64 logical = bio->bi_iter.bi_sector << 9;
6662 6663 6664
	u64 length = 0;
	u64 map_length;
	int ret;
6665 6666
	int dev_nr;
	int total_devs;
6667
	struct btrfs_bio *bbio = NULL;
6668

6669
	length = bio->bi_iter.bi_size;
6670
	map_length = length;
6671

6672
	btrfs_bio_counter_inc_blocked(fs_info);
6673
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
M
Mike Christie 已提交
6674
				&map_length, &bbio, mirror_num, 1);
6675
	if (ret) {
6676
		btrfs_bio_counter_dec(fs_info);
6677
		return errno_to_blk_status(ret);
6678
	}
6679

6680
	total_devs = bbio->num_stripes;
D
David Woodhouse 已提交
6681 6682 6683
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
6684
	bbio->fs_info = fs_info;
D
David Woodhouse 已提交
6685 6686
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

6687
	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6688
	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
D
David Woodhouse 已提交
6689 6690
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
6691
		if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6692 6693
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
D
David Woodhouse 已提交
6694
		} else {
6695 6696
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
D
David Woodhouse 已提交
6697
		}
6698

6699
		btrfs_bio_counter_dec(fs_info);
6700
		return errno_to_blk_status(ret);
D
David Woodhouse 已提交
6701 6702
	}

6703
	if (map_length < length) {
6704
		btrfs_crit(fs_info,
J
Jeff Mahoney 已提交
6705 6706
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
6707 6708
		BUG();
	}
6709

6710
	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6711
		dev = bbio->stripes[dev_nr].dev;
6712 6713
		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
						   &dev->dev_state) ||
6714
		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6715
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

6720
		if (dev_nr < total_devs - 1)
6721
			bio = btrfs_bio_clone(first_bio);
6722
		else
6723
			bio = first_bio;
6724

6725
		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6726
	}
6727
	btrfs_bio_counter_dec(fs_info);
6728
	return BLK_STS_OK;
6729 6730
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
6738
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6739
				       u64 devid, u8 *uuid, u8 *fsid)
6740
{
Y
Yan Zheng 已提交
6741
	struct btrfs_device *device;
	struct btrfs_fs_devices *seed_devs;

	if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (device->devid == devid &&
			    (!uuid || memcmp(device->uuid, uuid,
					     BTRFS_UUID_SIZE) == 0))
				return device;
		}
	}
Y
Yan Zheng 已提交
6752

6753
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
Y
Yan Zheng 已提交
6754
		if (!fsid ||
6755 6756
		    !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
			list_for_each_entry(device, &seed_devs->devices,
					    dev_list) {
				if (device->devid == devid &&
				    (!uuid || memcmp(device->uuid, uuid,
						     BTRFS_UUID_SIZE) == 0))
					return device;
			}
Y
Yan Zheng 已提交
6763 6764
		}
	}
6765

Y
Yan Zheng 已提交
6766
	return NULL;
6767 6768
}

6769
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6770 6771 6772
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
6773
	unsigned int nofs_flag;
6774

6775 6776 6777 6778 6779 6780 6781
	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */
	nofs_flag = memalloc_nofs_save();
6782
	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6783
	memalloc_nofs_restore(nofs_flag);
6784
	if (IS_ERR(device))
6785
		return device;
6786 6787

	list_add(&device->dev_list, &fs_devices->devices);
Y
Yan Zheng 已提交
6788
	device->fs_devices = fs_devices;
6789
	fs_devices->num_devices++;
6790

6791
	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6792
	fs_devices->missing_devices++;
6793

6794 6795 6796
	return device;
}

6797 6798 6799 6800 6801 6802 6803 6804 6805 6806
/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6807
 * on error.  Returned struct is not linked onto any lists and must be
6808
 * destroyed with btrfs_free_device.
6809 6810 6811 6812 6813 6814 6815 6816
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

6817
	if (WARN_ON(!devid && !fs_info))
6818 6819
		return ERR_PTR(-EINVAL);

D
David Sterba 已提交
6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
6845 6846 6847 6848 6849 6850 6851 6852

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
6853
			btrfs_free_device(dev);
6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	return dev;
}

6867
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6868
					u64 devid, u8 *uuid, bool error)
6869
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
6876 6877
}

static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
	const int data_stripes = calc_data_stripes(type, num_stripes);

	return div_u64(chunk_len, data_stripes);
}
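
/*
 * Worked example for calc_stripe_length(), assuming calc_data_stripes()
 * returns the number of stripes that carry data (all of them for a plain
 * striped profile such as RAID0): a 1GiB RAID0 chunk spread over 4 devices
 * gives a per-device stripe length of 1GiB / 4 = 256MiB.
 */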

#if BITS_PER_LONG == 32
/*
 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
 * can't be accessed on 32bit systems.
 *
 * This function does a mount time check to reject the fs if it already has
 * a metadata chunk beyond that limit.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return 0;

	if (logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}

/*
 * This is to give early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * Although the metadata can still be accessed, that will no longer be
 * possible once the limit is reached.
 */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return;

	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		return;

	btrfs_warn_32bit_limit(fs_info);
}
#endif

static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = type;
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	map->verified_stripes = 0;
	em->orig_block_len = calc_stripe_length(type, em->len,
						map->num_stripes);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
							devid, uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->lock);
	ret = add_extent_mapping(map_tree, em, 0);
	write_unlock(&map_tree->lock);
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
			  em->start, em->len, ret);
	}
	free_extent_map(em);

	return ret;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	/* This will match only for multi-device seed fs */
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid, NULL);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = true;
		fs_devices->opened = 1;
		return fs_devices;
	}

	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		return ERR_PTR(ret);
	}

	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		return ERR_PTR(-EINVAL);
	}

	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);

	return fs_devices;
}

static int read_one_dev(struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				   fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * This happens when a device that was properly set up
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here.
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	if (device->bdev) {
		u64 max_total_bytes = i_size_read(device->bdev->bd_inode);

		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
					  root->root_key.objectid, 0);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	/*
	 * The sb extent buffer is artificial and just used to read the system array.
	 * set_extent_buffer_uptodate() call does not properly mark all its
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning eg. on PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}

		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
			"invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}

		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
			btrfs_err(fs_info,
			"invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
			ret = -EIO;
			break;
		}

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
			break;

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}
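
/*
 * For reference, the sys_chunk_array walked above is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk including its stripe array)
 * pairs:
 *
 *	[disk_key][chunk + N stripes][disk_key][chunk + M stripes]...
 *
 * which is why each iteration advances first by sizeof(*disk_key) and then
 * by btrfs_chunk_item_size(num_stripes).
 */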

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
	read_unlock(&map_tree->lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->lock);
		em = lookup_extent_mapping(map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->lock);
	}
out:
	return ret;
}
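
/*
 * Illustrative example for the check above, assuming the usual per-profile
 * tolerance values: a RAID1 chunk tolerates one failed device, so a single
 * stripe on a missing device still passes, while a second missing stripe
 * makes btrfs_check_rw_degradable() return false for the filesystem.
 */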

static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		struct extent_buffer *node;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		/*
		 * The nodes on level 1 are not locked but we don't need to do
		 * that during mount time as nothing else can access the tree
		 */
		node = path->nodes[1];
		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			     ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}
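
/*
 * The two helpers above treat the on-disk btrfs_dev_stats_item as a plain
 * array of little-endian 64bit counters starting at the 'values' member,
 * one slot per BTRFS_DEV_STAT_* index, which is why the byte offset is
 * computed as offsetof(values) + index * sizeof(u64).
 */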

static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size_nr(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stat_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here.  This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
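
/*
 * For example, under the usual profile definitions SINGLE and RAID0 map to a
 * factor of 1, while DUP, RAID1 and RAID10 map to 2, since ncopies counts how
 * many copies of the data the profile keeps.
 */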



static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount.  This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return 0;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return 0;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return 0;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return 0;
}